hwarang 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Cargo.lock +1038 -0
- data/Cargo.toml +6 -0
- data/README.md +115 -0
- data/ext/hwarang/Cargo.toml +15 -0
- data/ext/hwarang/extconf.rb +4 -0
- data/ext/hwarang/src/lib.rs +147 -0
- data/lib/hwarang/version.rb +3 -0
- data/lib/hwarang.rb +7 -0
- metadata +66 -0
data/Cargo.toml
ADDED
data/README.md
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
# hwarang
|
|
2
|
+
|
|
3
|
+
HWP/HWPX 문서에서 텍스트를 빠르게 추출하는 Ruby gem입니다.
|
|
4
|
+
|
|
5
|
+
Rust로 작성된 [hwarang](https://crates.io/crates/hwarang) 크레이트의 Ruby 바인딩입니다.
|
|
6
|
+
|
|
7
|
+
## 지원 포맷
|
|
8
|
+
|
|
9
|
+
- **HWP** (OLE 바이너리) - 한/글 5.x 이상
|
|
10
|
+
- **HWPX** (ZIP/XML) - 한/글 최신 XML 기반 포맷
|
|
11
|
+
- **HWPML** (순수 XML)
|
|
12
|
+
|
|
13
|
+
## 주요 기능
|
|
14
|
+
|
|
15
|
+
- 매직 바이트 기반 포맷 자동 감지
|
|
16
|
+
- 압축/비압축 스트림 모두 지원
|
|
17
|
+
- 배포문서 복호화 (AES/ECB)
|
|
18
|
+
- 표를 마크다운 테이블로 변환
|
|
19
|
+
- 머리글/꼬리글, 각주/미주, 글상자, 숨은설명 추출
|
|
20
|
+
- rayon 기반 병렬 배치 처리
|
|
21
|
+
|
|
22
|
+
## 설치
|
|
23
|
+
|
|
24
|
+
```sh
|
|
25
|
+
gem install hwarang
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
또는 Gemfile에 추가:
|
|
29
|
+
|
|
30
|
+
```ruby
|
|
31
|
+
gem "hwarang"
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
주요 플랫폼(x86_64-linux, aarch64-linux, arm64-darwin)에는 프리컴파일된 네이티브 gem이 제공됩니다. 그 외 플랫폼에서는 소스 gem이 설치되며 Rust 툴체인이 필요합니다.
|
|
35
|
+
|
|
36
|
+
## 사용법
|
|
37
|
+
|
|
38
|
+
### 텍스트 추출
|
|
39
|
+
|
|
40
|
+
```ruby
|
|
41
|
+
require "hwarang"
|
|
42
|
+
|
|
43
|
+
text = Hwarang.extract_text("document.hwp")
|
|
44
|
+
puts text
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
### OLE 스트림 목록
|
|
48
|
+
|
|
49
|
+
```ruby
|
|
50
|
+
streams = Hwarang.list_streams("document.hwp")
|
|
51
|
+
# => ["/FileHeader", "/BodyText/Section0", ...]
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
### 배치 처리
|
|
55
|
+
|
|
56
|
+
여러 파일을 병렬로 처리합니다:
|
|
57
|
+
|
|
58
|
+
```ruby
|
|
59
|
+
paths = Dir.glob("documents/**/*.hwp")
|
|
60
|
+
results = Hwarang.extract_batch(paths)
|
|
61
|
+
|
|
62
|
+
results.each do |path, result|
|
|
63
|
+
if result.key?("text")
|
|
64
|
+
puts "#{path}: #{result["text"].length} chars"
|
|
65
|
+
else
|
|
66
|
+
puts "#{path}: ERROR - #{result["error"]}"
|
|
67
|
+
end
|
|
68
|
+
end
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## 에러 처리
|
|
72
|
+
|
|
73
|
+
모든 예외는 `Hwarang::Error`를 상속합니다:
|
|
74
|
+
|
|
75
|
+
```ruby
|
|
76
|
+
begin
|
|
77
|
+
Hwarang.extract_text("file.hwp")
|
|
78
|
+
rescue Hwarang::PasswordProtectedError
|
|
79
|
+
puts "암호가 걸린 문서입니다"
|
|
80
|
+
rescue Hwarang::FileError => e
|
|
81
|
+
puts "파일 오류: #{e.message}"
|
|
82
|
+
rescue Hwarang::Error => e
|
|
83
|
+
puts "처리 오류: #{e.message}"
|
|
84
|
+
end
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
| 예외 클래스 | 설명 |
|
|
88
|
+
|-------------|------|
|
|
89
|
+
| `Hwarang::Error` | 기본 예외 클래스 |
|
|
90
|
+
| `Hwarang::FileError` | 파일 I/O 오류 |
|
|
91
|
+
| `Hwarang::InvalidSignatureError` | HWP 파일 시그니처 불일치 |
|
|
92
|
+
| `Hwarang::UnsupportedVersionError` | 지원하지 않는 HWP 버전 |
|
|
93
|
+
| `Hwarang::PasswordProtectedError` | 암호 보호된 문서 |
|
|
94
|
+
| `Hwarang::StreamNotFoundError` | OLE 스트림 없음 |
|
|
95
|
+
| `Hwarang::InvalidRecordHeaderError` | 레코드 헤더 파싱 실패 |
|
|
96
|
+
| `Hwarang::DecompressFailedError` | 스트림 압축 해제 실패 |
|
|
97
|
+
| `Hwarang::DecryptFailedError` | 복호화 실패 |
|
|
98
|
+
| `Hwarang::ParseError` | 일반 파싱 오류 |
|
|
99
|
+
| `Hwarang::UnsupportedFormatError` | 지원하지 않는 파일 형식 |
|
|
100
|
+
| `Hwarang::HwpxError` | HWPX 처리 오류 |
|
|
101
|
+
|
|
102
|
+
## 벤치마크
|
|
103
|
+
|
|
104
|
+
| 항목 | 결과 |
|
|
105
|
+
|------|------|
|
|
106
|
+
| 파일 수 | 49,353개 (HWP/HWPX) |
|
|
107
|
+
| 총 용량 | 1.0 GB |
|
|
108
|
+
| 소요 시간 | 43.27초 |
|
|
109
|
+
| 처리 속도 | 1,140 files/s |
|
|
110
|
+
| 성공률 | 99.94% (49,321/49,353) |
|
|
111
|
+
| 환경 | Apple M1, 16GB RAM, 8코어, Ruby 4.0 |
|
|
112
|
+
|
|
113
|
+
## License
|
|
114
|
+
|
|
115
|
+
MIT
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
[package]
|
|
2
|
+
name = "hwarang-ruby"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
edition = "2021"
|
|
5
|
+
license = "MIT"
|
|
6
|
+
publish = false
|
|
7
|
+
|
|
8
|
+
[lib]
|
|
9
|
+
name = "hwarang"
|
|
10
|
+
crate-type = ["cdylib"]
|
|
11
|
+
|
|
12
|
+
[dependencies]
|
|
13
|
+
magnus = "0.8"
|
|
14
|
+
hwarang_core = { package = "hwarang", version = "0.1.0", default-features = false }
|
|
15
|
+
rayon = "1"
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
use std::path::Path;
|
|
2
|
+
|
|
3
|
+
use magnus::class::Class;
|
|
4
|
+
use magnus::value::{InnerRef, Lazy, ReprValue};
|
|
5
|
+
use magnus::{function, prelude::*, Error, ExceptionClass, RHash, Ruby};
|
|
6
|
+
use rayon::prelude::*;
|
|
7
|
+
|
|
8
|
+
/// Declares a lazily-initialised `static` that holds a Ruby exception class
/// defined under the `Hwarang` module.
///
/// * `$static_name` - identifier of the generated `Lazy<ExceptionClass>` static.
/// * `$ruby_name`   - name of the class as it is defined inside `Hwarang`.
/// * `$parent`      - another `Lazy<ExceptionClass>` used as the superclass; it
///   is resolved through `get_inner_ref_with` before the new class is defined.
///
/// Every fallible step is unwrapped, so a failure to define the module or the
/// class panics inside the initialiser.
macro_rules! define_lazy_error {
    ($static_name:ident, $ruby_name:expr, $parent:expr) => {
        static $static_name: Lazy<ExceptionClass> = Lazy::new(|ruby| {
            // Resolve the superclass first, exactly as the class definition needs it.
            let superclass = $parent.get_inner_ref_with(&ruby).as_r_class();
            let class = ruby
                .define_module("Hwarang")
                .unwrap()
                .define_class($ruby_name, superclass)
                .unwrap();
            ExceptionClass::from_value(class.as_value()).unwrap()
        });
    };
}
|
|
20
|
+
|
|
21
|
+
static HWARANG_ERROR: Lazy<ExceptionClass> = Lazy::new(|ruby| {
|
|
22
|
+
let module = ruby.define_module("Hwarang").unwrap();
|
|
23
|
+
let cls = module
|
|
24
|
+
.define_class("Error", ruby.exception_standard_error().as_r_class())
|
|
25
|
+
.unwrap();
|
|
26
|
+
ExceptionClass::from_value(cls.as_value()).unwrap()
|
|
27
|
+
});
|
|
28
|
+
|
|
29
|
+
define_lazy_error!(FILE_ERROR, "FileError", HWARANG_ERROR);
|
|
30
|
+
define_lazy_error!(
|
|
31
|
+
INVALID_SIGNATURE_ERROR,
|
|
32
|
+
"InvalidSignatureError",
|
|
33
|
+
HWARANG_ERROR
|
|
34
|
+
);
|
|
35
|
+
define_lazy_error!(
|
|
36
|
+
UNSUPPORTED_VERSION_ERROR,
|
|
37
|
+
"UnsupportedVersionError",
|
|
38
|
+
HWARANG_ERROR
|
|
39
|
+
);
|
|
40
|
+
define_lazy_error!(
|
|
41
|
+
PASSWORD_PROTECTED_ERROR,
|
|
42
|
+
"PasswordProtectedError",
|
|
43
|
+
HWARANG_ERROR
|
|
44
|
+
);
|
|
45
|
+
define_lazy_error!(
|
|
46
|
+
STREAM_NOT_FOUND_ERROR,
|
|
47
|
+
"StreamNotFoundError",
|
|
48
|
+
HWARANG_ERROR
|
|
49
|
+
);
|
|
50
|
+
define_lazy_error!(
|
|
51
|
+
INVALID_RECORD_HEADER_ERROR,
|
|
52
|
+
"InvalidRecordHeaderError",
|
|
53
|
+
HWARANG_ERROR
|
|
54
|
+
);
|
|
55
|
+
define_lazy_error!(
|
|
56
|
+
DECOMPRESS_FAILED_ERROR,
|
|
57
|
+
"DecompressFailedError",
|
|
58
|
+
HWARANG_ERROR
|
|
59
|
+
);
|
|
60
|
+
define_lazy_error!(DECRYPT_FAILED_ERROR, "DecryptFailedError", HWARANG_ERROR);
|
|
61
|
+
define_lazy_error!(PARSE_ERROR, "ParseError", HWARANG_ERROR);
|
|
62
|
+
define_lazy_error!(
|
|
63
|
+
UNSUPPORTED_FORMAT_ERROR,
|
|
64
|
+
"UnsupportedFormatError",
|
|
65
|
+
HWARANG_ERROR
|
|
66
|
+
);
|
|
67
|
+
define_lazy_error!(HWPX_ERROR, "HwpxError", HWARANG_ERROR);
|
|
68
|
+
|
|
69
|
+
fn hwp_error_to_magnus(ruby: &Ruby, err: hwarang_core::error::HwpError) -> Error {
|
|
70
|
+
use hwarang_core::error::HwpError;
|
|
71
|
+
let msg = err.to_string();
|
|
72
|
+
let cls = match &err {
|
|
73
|
+
HwpError::Io(_) => *FILE_ERROR.get_inner_ref_with(ruby),
|
|
74
|
+
HwpError::InvalidSignature => *INVALID_SIGNATURE_ERROR.get_inner_ref_with(ruby),
|
|
75
|
+
HwpError::UnsupportedVersion(..) => *UNSUPPORTED_VERSION_ERROR.get_inner_ref_with(ruby),
|
|
76
|
+
HwpError::PasswordProtected => *PASSWORD_PROTECTED_ERROR.get_inner_ref_with(ruby),
|
|
77
|
+
HwpError::StreamNotFound(_) => *STREAM_NOT_FOUND_ERROR.get_inner_ref_with(ruby),
|
|
78
|
+
HwpError::InvalidRecordHeader => *INVALID_RECORD_HEADER_ERROR.get_inner_ref_with(ruby),
|
|
79
|
+
HwpError::DecompressFailed(_) => *DECOMPRESS_FAILED_ERROR.get_inner_ref_with(ruby),
|
|
80
|
+
HwpError::DecryptFailed(_) => *DECRYPT_FAILED_ERROR.get_inner_ref_with(ruby),
|
|
81
|
+
HwpError::Parse(_) => *PARSE_ERROR.get_inner_ref_with(ruby),
|
|
82
|
+
HwpError::UnsupportedFormat => *UNSUPPORTED_FORMAT_ERROR.get_inner_ref_with(ruby),
|
|
83
|
+
HwpError::Hwpx(_) => *HWPX_ERROR.get_inner_ref_with(ruby),
|
|
84
|
+
};
|
|
85
|
+
Error::new(cls, msg)
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
fn extract_text(ruby: &Ruby, path: String) -> Result<String, Error> {
|
|
89
|
+
hwarang_core::extract_text_from_file(Path::new(&path)).map_err(|e| hwp_error_to_magnus(ruby, e))
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
fn list_streams(ruby: &Ruby, path: String) -> Result<Vec<String>, Error> {
|
|
93
|
+
hwarang_core::list_streams(Path::new(&path)).map_err(|e| hwp_error_to_magnus(ruby, e))
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
fn extract_batch(ruby: &Ruby, paths: Vec<String>) -> Result<RHash, Error> {
|
|
97
|
+
let results: Vec<(String, Result<String, String>)> = paths
|
|
98
|
+
.par_iter()
|
|
99
|
+
.map(|p| {
|
|
100
|
+
let result = hwarang_core::extract_text_from_file(Path::new(p));
|
|
101
|
+
match result {
|
|
102
|
+
Ok(text) => (p.clone(), Ok(text)),
|
|
103
|
+
Err(e) => (p.clone(), Err(e.to_string())),
|
|
104
|
+
}
|
|
105
|
+
})
|
|
106
|
+
.collect();
|
|
107
|
+
|
|
108
|
+
let hash = ruby.hash_new();
|
|
109
|
+
for (path, result) in results {
|
|
110
|
+
let inner = ruby.hash_new();
|
|
111
|
+
match result {
|
|
112
|
+
Ok(text) => {
|
|
113
|
+
inner.aset(ruby.str_new("text"), ruby.str_new(&text))?;
|
|
114
|
+
}
|
|
115
|
+
Err(msg) => {
|
|
116
|
+
inner.aset(ruby.str_new("error"), ruby.str_new(&msg))?;
|
|
117
|
+
}
|
|
118
|
+
}
|
|
119
|
+
hash.aset(ruby.str_new(&path), inner)?;
|
|
120
|
+
}
|
|
121
|
+
Ok(hash)
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
#[magnus::init(name = "hwarang")]
|
|
125
|
+
fn init(ruby: &Ruby) -> Result<(), Error> {
|
|
126
|
+
let module = ruby.define_module("Hwarang")?;
|
|
127
|
+
|
|
128
|
+
// Force-initialize all error classes
|
|
129
|
+
Lazy::force(&HWARANG_ERROR, ruby);
|
|
130
|
+
Lazy::force(&FILE_ERROR, ruby);
|
|
131
|
+
Lazy::force(&INVALID_SIGNATURE_ERROR, ruby);
|
|
132
|
+
Lazy::force(&UNSUPPORTED_VERSION_ERROR, ruby);
|
|
133
|
+
Lazy::force(&PASSWORD_PROTECTED_ERROR, ruby);
|
|
134
|
+
Lazy::force(&STREAM_NOT_FOUND_ERROR, ruby);
|
|
135
|
+
Lazy::force(&INVALID_RECORD_HEADER_ERROR, ruby);
|
|
136
|
+
Lazy::force(&DECOMPRESS_FAILED_ERROR, ruby);
|
|
137
|
+
Lazy::force(&DECRYPT_FAILED_ERROR, ruby);
|
|
138
|
+
Lazy::force(&PARSE_ERROR, ruby);
|
|
139
|
+
Lazy::force(&UNSUPPORTED_FORMAT_ERROR, ruby);
|
|
140
|
+
Lazy::force(&HWPX_ERROR, ruby);
|
|
141
|
+
|
|
142
|
+
module.define_module_function("extract_text", function!(extract_text, 1))?;
|
|
143
|
+
module.define_module_function("list_streams", function!(list_streams, 1))?;
|
|
144
|
+
module.define_module_function("extract_batch", function!(extract_batch, 1))?;
|
|
145
|
+
|
|
146
|
+
Ok(())
|
|
147
|
+
}
|
data/lib/hwarang.rb
ADDED
metadata
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: hwarang
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Lee Wonsup
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: bin
|
|
10
|
+
cert_chain: []
|
|
11
|
+
date: 2026-02-24 00:00:00.000000000 Z
|
|
12
|
+
dependencies:
|
|
13
|
+
- !ruby/object:Gem::Dependency
|
|
14
|
+
name: rb_sys
|
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
|
16
|
+
requirements:
|
|
17
|
+
- - ">="
|
|
18
|
+
- !ruby/object:Gem::Version
|
|
19
|
+
version: '0'
|
|
20
|
+
type: :runtime
|
|
21
|
+
prerelease: false
|
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
23
|
+
requirements:
|
|
24
|
+
- - ">="
|
|
25
|
+
- !ruby/object:Gem::Version
|
|
26
|
+
version: '0'
|
|
27
|
+
description: Ruby bindings for the hwarang Rust library. Extracts text from HWP and
|
|
28
|
+
HWPX documents.
|
|
29
|
+
email: onesup.lee@gmail.com
|
|
30
|
+
executables: []
|
|
31
|
+
extensions:
|
|
32
|
+
- ext/hwarang/extconf.rb
|
|
33
|
+
extra_rdoc_files: []
|
|
34
|
+
files:
|
|
35
|
+
- Cargo.lock
|
|
36
|
+
- Cargo.toml
|
|
37
|
+
- README.md
|
|
38
|
+
- ext/hwarang/Cargo.toml
|
|
39
|
+
- ext/hwarang/extconf.rb
|
|
40
|
+
- ext/hwarang/src/lib.rs
|
|
41
|
+
- lib/hwarang.rb
|
|
42
|
+
- lib/hwarang/version.rb
|
|
43
|
+
homepage: https://github.com/teammilestone/hwarang-ruby
|
|
44
|
+
licenses:
|
|
45
|
+
- MIT
|
|
46
|
+
metadata: {}
|
|
47
|
+
post_install_message:
|
|
48
|
+
rdoc_options: []
|
|
49
|
+
require_paths:
|
|
50
|
+
- lib
|
|
51
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
52
|
+
requirements:
|
|
53
|
+
- - ">="
|
|
54
|
+
- !ruby/object:Gem::Version
|
|
55
|
+
version: '3.1'
|
|
56
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
57
|
+
requirements:
|
|
58
|
+
- - ">="
|
|
59
|
+
- !ruby/object:Gem::Version
|
|
60
|
+
version: '0'
|
|
61
|
+
requirements: []
|
|
62
|
+
rubygems_version: 3.5.22
|
|
63
|
+
signing_key:
|
|
64
|
+
specification_version: 4
|
|
65
|
+
summary: Fast HWP/HWPX document text extractor
|
|
66
|
+
test_files: []
|