hwarang 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Cargo.toml ADDED
@@ -0,0 +1,6 @@
1
+ [workspace]
2
+ members = ["ext/hwarang"]
3
+ resolver = "2"
4
+
5
+ [profile.release]
6
+ strip = true
data/README.md ADDED
@@ -0,0 +1,115 @@
1
+ # hwarang
2
+
3
+ HWP/HWPX 문서에서 텍스트를 빠르게 추출하는 Ruby gem입니다.
4
+
5
+ Rust로 작성된 [hwarang](https://crates.io/crates/hwarang) 크레이트의 Ruby 바인딩입니다.
6
+
7
+ ## 지원 포맷
8
+
9
+ - **HWP** (OLE 바이너리) - 한/글 5.x 이상
10
+ - **HWPX** (ZIP/XML) - 한/글 최신 XML 기반 포맷
11
+ - **HWPML** (순수 XML)
12
+
13
+ ## 주요 기능
14
+
15
+ - 매직 바이트 기반 포맷 자동 감지
16
+ - 압축/비압축 스트림 모두 지원
17
+ - 배포문서 복호화 (AES/ECB)
18
+ - 표를 마크다운 테이블로 변환
19
+ - 머리글/꼬리글, 각주/미주, 글상자, 숨은설명 추출
20
+ - rayon 기반 병렬 배치 처리
21
+
22
+ ## 설치
23
+
24
+ ```sh
25
+ gem install hwarang
26
+ ```
27
+
28
+ 또는 Gemfile에 추가:
29
+
30
+ ```ruby
31
+ gem "hwarang"
32
+ ```
33
+
34
+ 주요 플랫폼(x86_64-linux, aarch64-linux, arm64-darwin)에는 프리컴파일된 네이티브 gem이 제공됩니다. 그 외 플랫폼에서는 소스 gem이 설치되며 Rust 툴체인이 필요합니다.
35
+
36
+ ## 사용법
37
+
38
+ ### 텍스트 추출
39
+
40
+ ```ruby
41
+ require "hwarang"
42
+
43
+ text = Hwarang.extract_text("document.hwp")
44
+ puts text
45
+ ```
46
+
47
+ ### OLE 스트림 목록
48
+
49
+ ```ruby
50
+ streams = Hwarang.list_streams("document.hwp")
51
+ # => ["/FileHeader", "/BodyText/Section0", ...]
52
+ ```
53
+
54
+ ### 배치 처리
55
+
56
+ 여러 파일을 병렬로 처리합니다:
57
+
58
+ ```ruby
59
+ paths = Dir.glob("documents/**/*.hwp")
60
+ results = Hwarang.extract_batch(paths)
61
+
62
+ results.each do |path, result|
63
+ if result.key?("text")
64
+ puts "#{path}: #{result["text"].length} chars"
65
+ else
66
+ puts "#{path}: ERROR - #{result["error"]}"
67
+ end
68
+ end
69
+ ```
70
+
71
+ ## 에러 처리
72
+
73
+ 모든 예외는 `Hwarang::Error`를 상속합니다:
74
+
75
+ ```ruby
76
+ begin
77
+ Hwarang.extract_text("file.hwp")
78
+ rescue Hwarang::PasswordProtectedError
79
+ puts "암호가 걸린 문서입니다"
80
+ rescue Hwarang::FileError => e
81
+ puts "파일 오류: #{e.message}"
82
+ rescue Hwarang::Error => e
83
+ puts "처리 오류: #{e.message}"
84
+ end
85
+ ```
86
+
87
+ | 예외 클래스 | 설명 |
88
+ |-------------|------|
89
+ | `Hwarang::Error` | 기본 예외 클래스 |
90
+ | `Hwarang::FileError` | 파일 I/O 오류 |
91
+ | `Hwarang::InvalidSignatureError` | HWP 파일 시그니처 불일치 |
92
+ | `Hwarang::UnsupportedVersionError` | 지원하지 않는 HWP 버전 |
93
+ | `Hwarang::PasswordProtectedError` | 암호 보호된 문서 |
94
+ | `Hwarang::StreamNotFoundError` | OLE 스트림 없음 |
95
+ | `Hwarang::InvalidRecordHeaderError` | 레코드 헤더 파싱 실패 |
96
+ | `Hwarang::DecompressFailedError` | 스트림 압축 해제 실패 |
97
+ | `Hwarang::DecryptFailedError` | 복호화 실패 |
98
+ | `Hwarang::ParseError` | 일반 파싱 오류 |
99
+ | `Hwarang::UnsupportedFormatError` | 지원하지 않는 파일 형식 |
100
+ | `Hwarang::HwpxError` | HWPX 처리 오류 |
101
+
102
+ ## 벤치마크
103
+
104
+ | 항목 | 결과 |
105
+ |------|------|
106
+ | 파일 수 | 49,353개 (HWP/HWPX) |
107
+ | 총 용량 | 1.0 GB |
108
+ | 소요 시간 | 43.27초 |
109
+ | 처리 속도 | 1,140 files/s |
110
+ | 성공률 | 99.94% (49,321/49,353) |
111
+ | 환경 | Apple M1, 16GB RAM, 8코어, Ruby 4.0 |
112
+
113
+ ## License
114
+
115
+ MIT
@@ -0,0 +1,15 @@
1
+ [package]
2
+ name = "hwarang-ruby"
3
+ version = "0.1.0"
4
+ edition = "2021"
5
+ license = "MIT"
6
+ publish = false
7
+
8
+ [lib]
9
+ name = "hwarang"
10
+ crate-type = ["cdylib"]
11
+
12
+ [dependencies]
13
+ magnus = "0.8"
14
+ hwarang_core = { package = "hwarang", version = "0.1.0", default-features = false }
15
+ rayon = "1"
@@ -0,0 +1,4 @@
1
+ require "mkmf"
2
+ require "rb_sys/mkmf"
3
+ # Generate the Makefile for the Rust extension. The argument is presumably the load path of the compiled library (lib/hwarang.rb requires "hwarang/hwarang") - TODO confirm against rb_sys docs.
4
+ create_rust_makefile("hwarang/hwarang")
@@ -0,0 +1,147 @@
1
+ use std::path::Path;
2
+
3
+ use magnus::class::Class;
4
+ use magnus::value::{InnerRef, Lazy, ReprValue};
5
+ use magnus::{function, prelude::*, Error, ExceptionClass, RHash, Ruby};
6
+ use rayon::prelude::*;
7
+
8
/// Declares a `static` `Lazy<ExceptionClass>` holding the Ruby exception class
/// `Hwarang::<$ruby_name>`, defined as a subclass of the class held by `$parent`.
///
/// * `$static_name` - identifier of the generated static.
/// * `$ruby_name` - name of the class to define under the `Hwarang` module.
/// * `$parent` - another `Lazy<ExceptionClass>` static used as the superclass.
///
/// # Panics
///
/// The generated initialiser panics, naming the class, if Ruby rejects the
/// module or class definition.
macro_rules! define_lazy_error {
    ($static_name:ident, $ruby_name:expr, $parent:expr) => {
        static $static_name: Lazy<ExceptionClass> = Lazy::new(|ruby| {
            // Resolving the parent first makes sure the superclass exists
            // before the subclass is defined.
            let superclass = $parent.get_inner_ref_with(ruby).as_r_class();
            let class = ruby
                .define_module("Hwarang")
                .and_then(|namespace| namespace.define_class($ruby_name, superclass))
                .unwrap_or_else(|err| {
                    panic!("failed to define Hwarang::{}: {:?}", $ruby_name, err)
                });
            // The new class inherits from an exception class, so converting it
            // back to `ExceptionClass` is expected to succeed.
            ExceptionClass::from_value(class.as_value())
                .expect("subclass of an exception class must be an exception class")
        });
    };
}
20
+
21
+ static HWARANG_ERROR: Lazy<ExceptionClass> = Lazy::new(|ruby| {
22
+ let module = ruby.define_module("Hwarang").unwrap();
23
+ let cls = module
24
+ .define_class("Error", ruby.exception_standard_error().as_r_class())
25
+ .unwrap();
26
+ ExceptionClass::from_value(cls.as_value()).unwrap()
27
+ });
28
+
29
+ define_lazy_error!(FILE_ERROR, "FileError", HWARANG_ERROR); // Hwarang::FileError, raised for HwpError::Io
30
+ define_lazy_error!(
31
+ INVALID_SIGNATURE_ERROR,
32
+ "InvalidSignatureError",
33
+ HWARANG_ERROR
34
+ ); // Hwarang::InvalidSignatureError, raised for HwpError::InvalidSignature
35
+ define_lazy_error!(
36
+ UNSUPPORTED_VERSION_ERROR,
37
+ "UnsupportedVersionError",
38
+ HWARANG_ERROR
39
+ ); // Hwarang::UnsupportedVersionError, raised for HwpError::UnsupportedVersion
40
+ define_lazy_error!(
41
+ PASSWORD_PROTECTED_ERROR,
42
+ "PasswordProtectedError",
43
+ HWARANG_ERROR
44
+ ); // Hwarang::PasswordProtectedError, raised for HwpError::PasswordProtected
45
+ define_lazy_error!(
46
+ STREAM_NOT_FOUND_ERROR,
47
+ "StreamNotFoundError",
48
+ HWARANG_ERROR
49
+ ); // Hwarang::StreamNotFoundError, raised for HwpError::StreamNotFound
50
+ define_lazy_error!(
51
+ INVALID_RECORD_HEADER_ERROR,
52
+ "InvalidRecordHeaderError",
53
+ HWARANG_ERROR
54
+ ); // Hwarang::InvalidRecordHeaderError, raised for HwpError::InvalidRecordHeader
55
+ define_lazy_error!(
56
+ DECOMPRESS_FAILED_ERROR,
57
+ "DecompressFailedError",
58
+ HWARANG_ERROR
59
+ ); // Hwarang::DecompressFailedError, raised for HwpError::DecompressFailed
60
+ define_lazy_error!(DECRYPT_FAILED_ERROR, "DecryptFailedError", HWARANG_ERROR); // Hwarang::DecryptFailedError, raised for HwpError::DecryptFailed
61
+ define_lazy_error!(PARSE_ERROR, "ParseError", HWARANG_ERROR); // Hwarang::ParseError, raised for HwpError::Parse
62
+ define_lazy_error!(
63
+ UNSUPPORTED_FORMAT_ERROR,
64
+ "UnsupportedFormatError",
65
+ HWARANG_ERROR
66
+ ); // Hwarang::UnsupportedFormatError, raised for HwpError::UnsupportedFormat
67
+ define_lazy_error!(HWPX_ERROR, "HwpxError", HWARANG_ERROR); // Hwarang::HwpxError, raised for HwpError::Hwpx
68
+
69
+ fn hwp_error_to_magnus(ruby: &Ruby, err: hwarang_core::error::HwpError) -> Error {
70
+ use hwarang_core::error::HwpError;
71
+ let msg = err.to_string();
72
+ let cls = match &err {
73
+ HwpError::Io(_) => *FILE_ERROR.get_inner_ref_with(ruby),
74
+ HwpError::InvalidSignature => *INVALID_SIGNATURE_ERROR.get_inner_ref_with(ruby),
75
+ HwpError::UnsupportedVersion(..) => *UNSUPPORTED_VERSION_ERROR.get_inner_ref_with(ruby),
76
+ HwpError::PasswordProtected => *PASSWORD_PROTECTED_ERROR.get_inner_ref_with(ruby),
77
+ HwpError::StreamNotFound(_) => *STREAM_NOT_FOUND_ERROR.get_inner_ref_with(ruby),
78
+ HwpError::InvalidRecordHeader => *INVALID_RECORD_HEADER_ERROR.get_inner_ref_with(ruby),
79
+ HwpError::DecompressFailed(_) => *DECOMPRESS_FAILED_ERROR.get_inner_ref_with(ruby),
80
+ HwpError::DecryptFailed(_) => *DECRYPT_FAILED_ERROR.get_inner_ref_with(ruby),
81
+ HwpError::Parse(_) => *PARSE_ERROR.get_inner_ref_with(ruby),
82
+ HwpError::UnsupportedFormat => *UNSUPPORTED_FORMAT_ERROR.get_inner_ref_with(ruby),
83
+ HwpError::Hwpx(_) => *HWPX_ERROR.get_inner_ref_with(ruby),
84
+ };
85
+ Error::new(cls, msg)
86
+ }
87
+
88
+ fn extract_text(ruby: &Ruby, path: String) -> Result<String, Error> {
89
+ hwarang_core::extract_text_from_file(Path::new(&path)).map_err(|e| hwp_error_to_magnus(ruby, e))
90
+ }
91
+
92
+ fn list_streams(ruby: &Ruby, path: String) -> Result<Vec<String>, Error> {
93
+ hwarang_core::list_streams(Path::new(&path)).map_err(|e| hwp_error_to_magnus(ruby, e))
94
+ }
95
+
96
+ fn extract_batch(ruby: &Ruby, paths: Vec<String>) -> Result<RHash, Error> {
97
+ let results: Vec<(String, Result<String, String>)> = paths
98
+ .par_iter()
99
+ .map(|p| {
100
+ let result = hwarang_core::extract_text_from_file(Path::new(p));
101
+ match result {
102
+ Ok(text) => (p.clone(), Ok(text)),
103
+ Err(e) => (p.clone(), Err(e.to_string())),
104
+ }
105
+ })
106
+ .collect();
107
+
108
+ let hash = ruby.hash_new();
109
+ for (path, result) in results {
110
+ let inner = ruby.hash_new();
111
+ match result {
112
+ Ok(text) => {
113
+ inner.aset(ruby.str_new("text"), ruby.str_new(&text))?;
114
+ }
115
+ Err(msg) => {
116
+ inner.aset(ruby.str_new("error"), ruby.str_new(&msg))?;
117
+ }
118
+ }
119
+ hash.aset(ruby.str_new(&path), inner)?;
120
+ }
121
+ Ok(hash)
122
+ }
123
+
124
+ #[magnus::init(name = "hwarang")]
125
+ fn init(ruby: &Ruby) -> Result<(), Error> {
126
+ let module = ruby.define_module("Hwarang")?;
127
+
128
+ // Force-initialize all error classes
129
+ Lazy::force(&HWARANG_ERROR, ruby);
130
+ Lazy::force(&FILE_ERROR, ruby);
131
+ Lazy::force(&INVALID_SIGNATURE_ERROR, ruby);
132
+ Lazy::force(&UNSUPPORTED_VERSION_ERROR, ruby);
133
+ Lazy::force(&PASSWORD_PROTECTED_ERROR, ruby);
134
+ Lazy::force(&STREAM_NOT_FOUND_ERROR, ruby);
135
+ Lazy::force(&INVALID_RECORD_HEADER_ERROR, ruby);
136
+ Lazy::force(&DECOMPRESS_FAILED_ERROR, ruby);
137
+ Lazy::force(&DECRYPT_FAILED_ERROR, ruby);
138
+ Lazy::force(&PARSE_ERROR, ruby);
139
+ Lazy::force(&UNSUPPORTED_FORMAT_ERROR, ruby);
140
+ Lazy::force(&HWPX_ERROR, ruby);
141
+
142
+ module.define_module_function("extract_text", function!(extract_text, 1))?;
143
+ module.define_module_function("list_streams", function!(list_streams, 1))?;
144
+ module.define_module_function("extract_batch", function!(extract_batch, 1))?;
145
+
146
+ Ok(())
147
+ }
@@ -0,0 +1,3 @@
1
+ module Hwarang # Top-level namespace of the hwarang gem.
2
+ VERSION = "0.1.0" # Version string of this gem release.
3
+ end
data/lib/hwarang.rb ADDED
@@ -0,0 +1,7 @@
1
require_relative "hwarang/version"

# Load the native extension. A build placed in a directory named after the
# running Ruby's major.minor version is tried first; if that is not found,
# fall back to the unversioned library.
#
# The major.minor prefix is taken textually from RUBY_VERSION. Float parsing
# is avoided because it would turn a minor version such as "3.10" into 3.1
# and look in the wrong directory.
begin
  require "hwarang/#{RUBY_VERSION[/\d+\.\d+/]}/hwarang"
rescue LoadError
  require "hwarang/hwarang"
end
metadata ADDED
@@ -0,0 +1,66 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: hwarang
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Lee Wonsup
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2026-02-24 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rb_sys
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ description: Ruby bindings for the hwarang Rust library. Extracts text from HWP and
28
+ HWPX documents.
29
+ email: onesup.lee@gmail.com
30
+ executables: []
31
+ extensions:
32
+ - ext/hwarang/extconf.rb
33
+ extra_rdoc_files: []
34
+ files:
35
+ - Cargo.lock
36
+ - Cargo.toml
37
+ - README.md
38
+ - ext/hwarang/Cargo.toml
39
+ - ext/hwarang/extconf.rb
40
+ - ext/hwarang/src/lib.rs
41
+ - lib/hwarang.rb
42
+ - lib/hwarang/version.rb
43
+ homepage: https://github.com/teammilestone/hwarang-ruby
44
+ licenses:
45
+ - MIT
46
+ metadata: {}
47
+ post_install_message:
48
+ rdoc_options: []
49
+ require_paths:
50
+ - lib
51
+ required_ruby_version: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - ">="
54
+ - !ruby/object:Gem::Version
55
+ version: '3.1'
56
+ required_rubygems_version: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: '0'
61
+ requirements: []
62
+ rubygems_version: 3.5.22
63
+ signing_key:
64
+ specification_version: 4
65
+ summary: Fast HWP/HWPX document text extractor
66
+ test_files: []