hwayo 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +74 -0
- data/Rakefile +4 -0
- data/lib/hwayo/extractor.rb +123 -0
- data/lib/hwayo/java/HWPTextExtractorCLI.class +0 -0
- data/lib/hwayo/java/hwplib-1.1.10.jar +0 -0
- data/lib/hwayo/simple_extractor.rb +65 -0
- data/lib/hwayo/version.rb +5 -0
- data/lib/hwayo.rb +88 -0
- data/sig/hwayo.rbs +4 -0
- metadata +54 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 6b649626b8aa64f7e09efcca03cbc6aaf09564fa7fedd669603f58df2cd2255e
|
4
|
+
data.tar.gz: d3390414f88e33b5c25d0219d9cc41a9b12674b3d52f180a225bd9eb4768364e
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 83f850bf7491f9c6d0caf3b0ede1720d82b6c9c7e8644b0b2d4735622b343e07036f980eaefd115bda21a374ac8d93533efb1773c38e042226ee2b7b8a307d4b
|
7
|
+
data.tar.gz: 20916130d516ddd212721212ae518851b92efba543868bf49b0006c76d4ebf531b9f23493be6f262d98b74d0c06ef54d6d81b3bcf7054f9e9465329428b0fe9b
|
data/README.md
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
# Hwayo
|
2
|
+
|
3
|
+
Hwayo는 한글(HWP) 파일에서 텍스트를 추출하는 Ruby gem입니다. Java 기반의 hwplib 라이브러리를 Ruby에서 사용할 수 있도록 래핑하였습니다.
|
4
|
+
|
5
|
+
## 설치
|
6
|
+
|
7
|
+
Gemfile에 다음을 추가하세요:
|
8
|
+
|
9
|
+
```ruby
|
10
|
+
gem 'hwayo'
|
11
|
+
```
|
12
|
+
|
13
|
+
그리고 실행:
|
14
|
+
|
15
|
+
$ bundle install
|
16
|
+
|
17
|
+
또는 직접 설치:
|
18
|
+
|
19
|
+
$ gem install hwayo
|
20
|
+
|
21
|
+
## 사용법
|
22
|
+
|
23
|
+
### 기본 사용법
|
24
|
+
|
25
|
+
```ruby
|
26
|
+
require 'hwayo'
|
27
|
+
|
28
|
+
# HWP 파일에서 텍스트 추출
|
29
|
+
result = Hwayo.extract_text('document.hwp')
|
30
|
+
|
31
|
+
if result[:success]
|
32
|
+
puts result[:text]
|
33
|
+
else
|
34
|
+
puts "Error: #{result[:error]}"
|
35
|
+
end
|
36
|
+
```
|
37
|
+
|
38
|
+
### 파일로 저장
|
39
|
+
|
40
|
+
```ruby
|
41
|
+
# 추출한 텍스트를 파일로 저장
|
42
|
+
result = Hwayo.extract_text('document.hwp', 'output.txt')
|
43
|
+
|
44
|
+
if result[:success]
|
45
|
+
puts "Text saved to: #{result[:output_path]}"
|
46
|
+
else
|
47
|
+
puts "Error: #{result[:error]}"
|
48
|
+
end
|
49
|
+
```
|
50
|
+
|
51
|
+
## 요구사항
|
52
|
+
|
53
|
+
- Ruby 2.7.0 이상
|
54
|
+
- Java 8 이상 (`java` 명령이 PATH에 있어야 합니다. RJB 기반 `Hwayo::Extractor`를 사용할 경우 JAVA_HOME 환경변수도 필요)
|
55
|
+
- MRI Ruby에서 `Hwayo::Extractor`를 사용하려면 RJB gem을 직접 설치해야 합니다 (`gem install rjb`; gem 의존성으로 자동 설치되지 않음)
|
56
|
+
- JRuby의 경우 추가 설정 없이 사용 가능
|
57
|
+
|
58
|
+
## 개발
|
59
|
+
|
60
|
+
이 gem을 개발하려면:
|
61
|
+
|
62
|
+
1. 저장소를 클론합니다
|
63
|
+
2. `bin/setup`을 실행하여 의존성을 설치합니다
|
64
|
+
3. `rake spec`을 실행하여 테스트를 실행합니다
|
65
|
+
4. `bin/console`을 실행하여 대화형 프롬프트를 시작합니다
|
66
|
+
|
67
|
+
## 기여
|
68
|
+
|
69
|
+
버그 리포트와 풀 리퀘스트는 GitHub에서 환영합니다.
|
70
|
+
|
71
|
+
## 라이선스
|
72
|
+
|
73
|
+
이 gem은 [MIT License](https://opensource.org/licenses/MIT)로 제공됩니다.
|
74
|
+
hwplib은 [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0)으로 제공됩니다.
|
data/Rakefile
ADDED
data/lib/hwayo/extractor.rb
ADDED
@@ -0,0 +1,123 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'tempfile'
|
4
|
+
|
5
|
+
module Hwayo
  # Extracts plain text from HWP documents by calling the bundled hwplib JAR
  # in-process: through JRuby's Java integration when running on JRuby, and
  # through the RJB bridge on every other Ruby.
  class Extractor
    # File name of the hwplib archive shipped in this gem's java/ directory.
    HWPLIB_JAR = 'hwplib-1.1.10.jar'

    # macOS helper that prints the home directory of the default JDK.
    MACOS_JAVA_HOME_TOOL = '/usr/libexec/java_home'

    # JDK locations probed (in order) when JAVA_HOME is not set.
    # Entries are glob patterns, so versioned directory names are matched.
    JAVA_HOME_CANDIDATES = [
      '/usr/lib/jvm/java-11-openjdk-amd64',
      '/usr/lib/jvm/java-8-openjdk-amd64',
      '/Library/Java/JavaVirtualMachines/jdk-11.jdk/Contents/Home',
      '/Library/Java/JavaVirtualMachines/jdk1.8.0_*.jdk/Contents/Home'
    ].freeze

    # Loads the Java classes immediately so that a missing JAR, bridge or JDK
    # is reported at construction time.
    #
    # @raise [Hwayo::Error] if the JAR, the RJB gem or a JDK cannot be found
    def initialize
      @java_loaded = false
      load_java_dependencies
    end

    # Extracts the text of an HWP file.
    #
    # @param hwp_file_path [String] path of the HWP document to read
    # @param output_path [String, nil] when given, the text is also written
    #   to this file as UTF-8
    # @return [Hash] `{ success: true, text: ... }` (plus `:output_path` when
    #   one was given) or `{ success: false, error: ... }` if extraction failed
    # @raise [Hwayo::Error] if +hwp_file_path+ does not exist
    def extract_text(hwp_file_path, output_path = nil)
      raise Error, "HWP file not found: #{hwp_file_path}" unless File.exist?(hwp_file_path)

      ensure_java_loaded

      begin
        # Read the HWP file.
        hwp_file = @hwp_reader.fromFile(hwp_file_path)

        # Configure the text extraction options.
        option = @text_extract_option.new
        option.setMethod(@text_extract_method.InsertControlTextBetweenParagraphText)
        option.setWithControlChar(false)
        option.setAppendEndingLF(true)

        # Extract the text.
        extracted_text = @text_extractor.extract(hwp_file, option)

        # Persist and/or return the result.
        if output_path
          File.write(output_path, extracted_text, encoding: 'UTF-8')
          { success: true, text: extracted_text, output_path: output_path }
        else
          { success: true, text: extracted_text }
        end
      rescue => e
        # Failures inside the Java call are reported in the result hash
        # rather than raised.
        { success: false, error: "Failed to extract text: #{e.message}" }
      end
    end

    private

    # Picks the Java bridge that matches the running Ruby implementation.
    def load_java_dependencies
      return if @java_loaded

      if RUBY_PLATFORM == 'java'
        # JRuby environment
        require 'java'
        load_jruby_dependencies
      else
        # MRI Ruby with RJB
        require_rjb
        load_rjb_dependencies
      end
    end

    # Requires the RJB bridge. Only the `require` itself is guarded, so that a
    # LoadError raised later (while starting the JVM) is not misreported as a
    # missing gem.
    def require_rjb
      require 'rjb'
    rescue LoadError
      raise Error, "RJB gem is required for Java integration. Please install it: gem install rjb"
    end

    # Absolute path where the bundled JAR is expected. This file lives in
    # lib/hwayo/, so the JAR sits in the sibling java/ directory of __dir__.
    def bundled_jar_path
      File.expand_path(File.join('java', HWPLIB_JAR), __dir__)
    end

    # Returns the bundled JAR path, raising when the file is missing.
    def existing_jar_path
      jar_path = bundled_jar_path
      raise Error, "hwplib JAR not found at: #{jar_path}" unless File.exist?(jar_path)

      jar_path
    end

    # Puts the JAR on JRuby's classpath and keeps references to the classes used.
    def load_jruby_dependencies
      require existing_jar_path

      @hwp_reader = Java::KrDogfootHwplibReader::HWPReader
      @text_extractor = Java::KrDogfootHwplibToolTextextractor::TextExtractor
      @text_extract_option = Java::KrDogfootHwplibToolTextextractor::TextExtractOption
      @text_extract_method = Java::KrDogfootHwplibToolTextextractor::TextExtractMethod

      @java_loaded = true
    end

    # Starts the JVM through RJB with the JAR on its classpath and imports the
    # classes used.
    def load_rjb_dependencies
      return if @java_loaded

      jar_path = existing_jar_path

      # Set JAVA_HOME if not set
      ENV['JAVA_HOME'] ||= detect_java_home

      # Load RJB with the JAR file
      Rjb::load(jar_path, ['-Xmx512m'])

      # Import Java classes
      @hwp_reader = Rjb::import('kr.dogfoot.hwplib.reader.HWPReader')
      @text_extractor = Rjb::import('kr.dogfoot.hwplib.tool.textextractor.TextExtractor')
      @text_extract_option = Rjb::import('kr.dogfoot.hwplib.tool.textextractor.TextExtractOption')
      @text_extract_method = Rjb::import('kr.dogfoot.hwplib.tool.textextractor.TextExtractMethod')

      @java_loaded = true
    end

    # Re-runs the platform-appropriate loader if the classes are not loaded yet.
    def ensure_java_loaded
      load_java_dependencies unless @java_loaded
    end

    # Finds a JDK home directory: first via the macOS helper, then by probing
    # JAVA_HOME_CANDIDATES.
    #
    # @return [String] an existing directory
    # @raise [Hwayo::Error] if no JDK directory is found
    def detect_java_home
      macos_java_home || installed_candidate ||
        raise(Error, "Java not found. Please install Java 8 or later and set JAVA_HOME environment variable.")
    end

    # Directory reported by the macOS helper, or nil when the helper is
    # absent or does not print an existing directory.
    def macos_java_home
      return unless File.executable?(MACOS_JAVA_HOME_TOOL)

      reported = IO.popen([MACOS_JAVA_HOME_TOOL], err: File::NULL, &:read).to_s.strip
      reported if File.directory?(reported)
    end

    # First existing directory matching one of the candidate patterns. Within
    # one pattern the lexicographically greatest match is tried first.
    def installed_candidate
      JAVA_HOME_CANDIDATES
        .flat_map { |pattern| Dir.glob(pattern).sort.reverse }
        .find { |path| File.directory?(path) }
    end
  end
end
|
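data/lib/hwayo/java/HWPTextExtractorCLI.class
ADDED
Binary file
|
data/lib/hwayo/java/hwplib-1.1.10.jar
ADDED
Binary file
|
data/lib/hwayo/simple_extractor.rb
ADDED
@@ -0,0 +1,65 @@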
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'open3'
|
4
|
+
require 'tempfile'
|
5
|
+
|
6
|
+
module Hwayo
  # Extracts text from HWP documents by spawning a `java` process that runs a
  # class from the hwplib JAR, so no in-process Java bridge is needed.
  class SimpleExtractor
    # File name of the hwplib archive shipped with the gem.
    HWPLIB_JAR = 'hwplib-1.1.10.jar'

    # Java class executed with the input and output paths as arguments.
    EXTRACTOR_MAIN_CLASS = 'kr.dogfoot.hwplib.sample.Extracting_Text'

    # Extracts the text of an HWP file.
    #
    # @param hwp_file_path [String] path of the HWP document to read
    # @param output_path [String, nil] when given, the text is also written
    #   to this file as UTF-8
    # @return [Hash] `{ success: true, text: ... }` (plus `:output_path` when
    #   one was given) or `{ success: false, error: ... }`
    # @raise [Hwayo::Error] if +hwp_file_path+ does not exist
    def extract_text(hwp_file_path, output_path = nil)
      raise Error, "HWP file not found: #{hwp_file_path}" unless File.exist?(hwp_file_path)

      # Check that Java can be executed.
      unless java_available?
        return { success: false, error: "Java is not installed. Please install Java 8 or later." }
      end

      # Locate the JAR: the bundled copy first, then the development build.
      jar_path = jar_candidates.find { |candidate| File.exist?(candidate) }
      return { success: false, error: "hwplib JAR not found" } unless jar_path

      # Temporary file the Java process writes the text into.
      temp_output = Tempfile.new(['hwp_output', '.txt'])

      begin
        # Argument-vector form: no shell is involved, so paths need no quoting.
        _stdout, stderr, status = Open3.capture3(
          'java', '-cp', jar_path, EXTRACTOR_MAIN_CLASS, hwp_file_path, temp_output.path
        )
        return { success: false, error: "Java execution failed: #{stderr}" } unless status.success?

        extracted_text = File.read(temp_output.path, encoding: 'UTF-8')

        # Persist and/or return the result.
        if output_path
          File.write(output_path, extracted_text, encoding: 'UTF-8')
          { success: true, text: extracted_text, output_path: output_path }
        else
          { success: true, text: extracted_text }
        end
      ensure
        temp_output.close
        temp_output.unlink
      end
    end

    private

    # True when `java -version` can be run successfully. Output goes to the
    # platform's null device (File::NULL) instead of a shell redirect.
    def java_available?
      system('java', '-version', out: File::NULL, err: File::NULL)
    end

    # Places to look for the JAR, in priority order. This file lives in
    # lib/hwayo/, so the bundled JAR is in the sibling java/ directory and the
    # project root (for a development build in target/) is two levels up.
    def jar_candidates
      [
        File.expand_path(File.join('java', HWPLIB_JAR), __dir__),
        File.expand_path(File.join('..', '..', 'target', HWPLIB_JAR), __dir__)
      ]
    end
  end
end
|
data/lib/hwayo.rb
ADDED
@@ -0,0 +1,88 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "hwayo/version"
|
4
|
+
require 'open3'
|
5
|
+
require 'tempfile'
|
6
|
+
|
7
|
+
module Hwayo
  # Raised for invalid input such as a missing HWP file.
  class Error < StandardError; end

  # Extracts text from an HWP file by running the bundled HWPTextExtractorCLI
  # class in a separate `java` process.
  #
  # @param hwp_file_path [String] path of the HWP document to read
  # @param output_path [String, nil] when given, the Java process writes the
  #   text directly to this file and the file is kept
  # @return [Hash] `{ success: true, text: ... }` (plus `:output_path` when
  #   one was given) or `{ success: false, error: ... }`
  # @raise [Hwayo::Error] if +hwp_file_path+ does not exist
  def self.extract_text(hwp_file_path, output_path = nil)
    raise Error, "HWP file not found: #{hwp_file_path}" unless File.exist?(hwp_file_path)

    # Check that Java can be executed. File::NULL is used instead of a shell
    # redirect to /dev/null so the check also works where that path is absent.
    unless system('java', '-version', out: File::NULL, err: File::NULL)
      return { success: false, error: "Java is not installed. Please install Java 8 or later." }
    end

    # Locate the JAR.
    jar_path = find_jar_path
    unless jar_path
      return { success: false, error: "hwplib JAR not found. Please ensure hwplib-1.1.10.jar is in the gem's lib/hwayo/java directory or in the current directory." }
    end

    # Locate the directory holding the CLI class.
    cli_class_path = find_cli_class(jar_path)
    unless cli_class_path
      return { success: false, error: "HWPTextExtractorCLI.class not found" }
    end

    # Keep a reference to the Tempfile for the whole call: a dropped Tempfile
    # may be unlinked by its finalizer at any garbage collection.
    scratch = output_path ? nil : Tempfile.new(['hwp_output', '.txt'])
    temp_output = output_path || scratch.path
    # Our handle is not needed; the Java process opens the path itself.
    scratch&.close

    begin
      # Run the Java command. Classpath entries are joined with the
      # platform's separator (":" on Unix, ";" on Windows).
      classpath = [jar_path, cli_class_path].join(File::PATH_SEPARATOR)
      stdout, stderr, status = Open3.capture3(
        'java', '-cp', classpath, 'HWPTextExtractorCLI', hwp_file_path, temp_output
      )

      if status.success? && stdout.strip == "SUCCESS"
        extracted_text = File.read(temp_output, encoding: 'UTF-8')

        result = { success: true, text: extracted_text }
        result[:output_path] = output_path if output_path
        result
      else
        # Fall back to stdout so the message is not empty when the CLI
        # reported its problem there.
        detail = stderr.strip.empty? ? stdout : stderr
        { success: false, error: "Extraction failed: #{detail}" }
      end
    ensure
      # Remove the scratch file; a caller-supplied output file is kept.
      scratch&.close!
    end
  end

  # Returns the first existing hwplib JAR among: the copy bundled in the gem,
  # the current directory, ./target, and the HWPLIB_JAR_PATH environment
  # variable. Internal helper; kept callable as `Hwayo.find_jar_path`.
  #
  # @return [String, nil] path of the JAR, or nil when none exists
  def self.find_jar_path
    [
      # Inside the gem
      File.expand_path('../hwayo/java/hwplib-1.1.10.jar', __FILE__),
      # Current directory
      'hwplib-1.1.10.jar',
      'target/hwplib-1.1.10.jar',
      # Environment variable
      ENV['HWPLIB_JAR_PATH']
    ].compact.find { |path| File.exist?(path) }
  end

  # Returns the first directory containing HWPTextExtractorCLI.class among:
  # the JAR's own directory, the current directory, and the gem's java/
  # directory. Internal helper; kept callable as `Hwayo.find_cli_class`.
  #
  # @param jar_dir [String] path of the hwplib JAR file (despite the name, a
  #   file path; its parent directory is searched)
  # @return [String, nil] the directory, or nil when the class is not found
  def self.find_cli_class(jar_dir)
    [
      File.dirname(jar_dir),
      '.',
      File.expand_path('../hwayo/java', __FILE__)
    ].find { |path| File.exist?(File.join(path, 'HWPTextExtractorCLI.class')) }
  end
end
|
data/sig/hwayo.rbs
ADDED
metadata
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: hwayo
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- 이원섭wonsup Lee/Alfonso
|
8
|
+
bindir: exe
|
9
|
+
cert_chain: []
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
11
|
+
dependencies: []
|
12
|
+
description: Hwayo is a Ruby gem that wraps the hwplib Java library to extract text
|
13
|
+
from Korean HWP (Hangul Word Processor) files. It provides a simple interface to
|
14
|
+
extract text content from HWP documents using the Java hwplib library.
|
15
|
+
email:
|
16
|
+
- onesup.lee@gmail.com
|
17
|
+
executables: []
|
18
|
+
extensions: []
|
19
|
+
extra_rdoc_files: []
|
20
|
+
files:
|
21
|
+
- README.md
|
22
|
+
- Rakefile
|
23
|
+
- lib/hwayo.rb
|
24
|
+
- lib/hwayo/extractor.rb
|
25
|
+
- lib/hwayo/java/HWPTextExtractorCLI.class
|
26
|
+
- lib/hwayo/java/hwplib-1.1.10.jar
|
27
|
+
- lib/hwayo/simple_extractor.rb
|
28
|
+
- lib/hwayo/version.rb
|
29
|
+
- sig/hwayo.rbs
|
30
|
+
homepage: https://github.com/onesup/hwayo
|
31
|
+
licenses:
|
32
|
+
- MIT
|
33
|
+
metadata:
|
34
|
+
homepage_uri: https://github.com/onesup/hwayo
|
35
|
+
source_code_uri: https://github.com/onesup/hwayo
|
36
|
+
changelog_uri: https://github.com/onesup/hwayo/blob/main/CHANGELOG.md
|
37
|
+
rdoc_options: []
|
38
|
+
require_paths:
|
39
|
+
- lib
|
40
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
41
|
+
requirements:
|
42
|
+
- - ">="
|
43
|
+
- !ruby/object:Gem::Version
|
44
|
+
version: 2.7.0
|
45
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
46
|
+
requirements:
|
47
|
+
- - ">="
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: '0'
|
50
|
+
requirements: []
|
51
|
+
rubygems_version: 3.6.9
|
52
|
+
specification_version: 4
|
53
|
+
summary: Ruby wrapper for hwplib - Extract text from HWP files
|
54
|
+
test_files: []
|