hwayo 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 6b649626b8aa64f7e09efcca03cbc6aaf09564fa7fedd669603f58df2cd2255e
4
+ data.tar.gz: d3390414f88e33b5c25d0219d9cc41a9b12674b3d52f180a225bd9eb4768364e
5
+ SHA512:
6
+ metadata.gz: 83f850bf7491f9c6d0caf3b0ede1720d82b6c9c7e8644b0b2d4735622b343e07036f980eaefd115bda21a374ac8d93533efb1773c38e042226ee2b7b8a307d4b
7
+ data.tar.gz: 20916130d516ddd212721212ae518851b92efba543868bf49b0006c76d4ebf531b9f23493be6f262d98b74d0c06ef54d6d81b3bcf7054f9e9465329428b0fe9b
data/README.md ADDED
@@ -0,0 +1,74 @@
1
+ # Hwayo
2
+
3
+ Hwayo는 한글(HWP) 파일에서 텍스트를 추출하는 Ruby gem입니다. Java 기반의 hwplib 라이브러리를 Ruby에서 사용할 수 있도록 래핑하였습니다.
4
+
5
+ ## 설치
6
+
7
+ Gemfile에 다음을 추가하세요:
8
+
9
+ ```ruby
10
+ gem 'hwayo'
11
+ ```
12
+
13
+ 그리고 실행:
14
+
15
+ $ bundle install
16
+
17
+ 또는 직접 설치:
18
+
19
+ $ gem install hwayo
20
+
21
+ ## 사용법
22
+
23
+ ### 기본 사용법
24
+
25
+ ```ruby
26
+ require 'hwayo'
27
+
28
+ # HWP 파일에서 텍스트 추출
29
+ result = Hwayo.extract_text('document.hwp')
30
+
31
+ if result[:success]
32
+ puts result[:text]
33
+ else
34
+ puts "Error: #{result[:error]}"
35
+ end
36
+ ```
37
+
38
+ ### 파일로 저장
39
+
40
+ ```ruby
41
+ # 추출한 텍스트를 파일로 저장
42
+ result = Hwayo.extract_text('document.hwp', 'output.txt')
43
+
44
+ if result[:success]
45
+ puts "Text saved to: #{result[:output_path]}"
46
+ else
47
+ puts "Error: #{result[:error]}"
48
+ end
49
+ ```
50
+
51
+ ## 요구사항
52
+
53
+ - Ruby 2.7.0 이상
54
+ - Java 8 이상 (`java` 명령이 PATH에 있어야 하며, RJB를 사용하는 경우 JAVA_HOME 환경변수 설정 필요)
55
+ - MRI Ruby에서 RJB를 사용하려면 RJB gem 필요 (자동 설치되지 않으므로 `gem install rjb`로 직접 설치)
56
+ - JRuby의 경우 추가 설정 없이 사용 가능
57
+
58
+ ## 개발
59
+
60
+ 이 gem을 개발하려면:
61
+
62
+ 1. 저장소를 클론합니다
63
+ 2. `bin/setup`을 실행하여 의존성을 설치합니다
64
+ 3. `rake spec`을 실행하여 테스트를 실행합니다
65
+ 4. `bin/console`을 실행하여 대화형 프롬프트를 시작합니다
66
+
67
+ ## 기여
68
+
69
+ 버그 리포트와 풀 리퀘스트는 GitHub에서 환영합니다.
70
+
71
+ ## 라이선스
72
+
73
+ 이 gem은 [MIT License](https://opensource.org/licenses/MIT)로 제공됩니다.
74
+ hwplib은 [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0)으로 제공됩니다.
data/Rakefile ADDED
@@ -0,0 +1,4 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
# The default task has no prerequisites and no action, so a bare `rake` does nothing.
task default: []
@@ -0,0 +1,123 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'tempfile'
4
+
5
module Hwayo
  # Extracts text from HWP documents by calling the hwplib Java classes
  # in-process: through JRuby's Java integration when RUBY_PLATFORM is
  # 'java', otherwise through the RJB bridge.
  class Extractor
    # File name of the hwplib JAR this class loads.
    HWPLIB_JAR = 'hwplib-1.1.10.jar'

    # macOS helper that prints the home directory of the default JDK.
    MACOS_JAVA_HOME_TOOL = '/usr/libexec/java_home'

    # Directories (glob patterns allowed) probed when JAVA_HOME is unset.
    JAVA_HOME_CANDIDATES = [
      '/usr/lib/jvm/java-11-openjdk-amd64',
      '/usr/lib/jvm/java-8-openjdk-amd64',
      '/Library/Java/JavaVirtualMachines/jdk-11.jdk/Contents/Home',
      '/Library/Java/JavaVirtualMachines/jdk1.8.0_*.jdk/Contents/Home'
    ].freeze

    private_constant :HWPLIB_JAR, :MACOS_JAVA_HOME_TOOL, :JAVA_HOME_CANDIDATES

    # Loads the Java bridge and the hwplib classes.
    #
    # @raise [Hwayo::Error] if RJB, the hwplib JAR or a Java home cannot be found
    def initialize
      @java_loaded = false
      load_java_dependencies
    end

    # Extracts the text of an HWP file.
    #
    # @param hwp_file_path [String] path to the HWP document
    # @param output_path [String, nil] when given, the text is also written
    #   to this path as UTF-8
    # @return [Hash] { success: true, text: ... } (plus :output_path when one
    #   was given), or { success: false, error: String } if reading,
    #   extracting or writing raised a StandardError
    # @raise [Hwayo::Error] if hwp_file_path does not exist
    def extract_text(hwp_file_path, output_path = nil)
      raise Error, "HWP file not found: #{hwp_file_path}" unless File.exist?(hwp_file_path)

      ensure_java_loaded

      begin
        hwp_file = @hwp_reader.fromFile(hwp_file_path)
        extracted_text = @text_extractor.extract(hwp_file, build_extract_option)

        result = { success: true, text: extracted_text }
        if output_path
          File.write(output_path, extracted_text, encoding: 'UTF-8')
          result[:output_path] = output_path
        end
        result
      rescue => e
        { success: false, error: "Failed to extract text: #{e.message}" }
      end
    end

    private

    # Builds the hwplib TextExtractOption used for every extraction.
    def build_extract_option
      option = @text_extract_option.new
      option.setMethod(@text_extract_method.InsertControlTextBetweenParagraphText)
      option.setWithControlChar(false)
      option.setAppendEndingLF(true)
      option
    end

    # Picks the Java bridge that matches the running Ruby implementation.
    def load_java_dependencies
      return if @java_loaded

      if RUBY_PLATFORM == 'java'
        require 'java'
        load_jruby_dependencies
      else
        # Only a failing `require` means RJB is missing; errors raised while
        # loading the JAR must not be reported as a missing gem.
        begin
          require 'rjb'
        rescue LoadError
          raise Error, "RJB gem is required for Java integration. Please install it: gem install rjb"
        end
        load_rjb_dependencies
      end
    end

    # JRuby: require the JAR and reference the hwplib classes directly.
    def load_jruby_dependencies
      require locate_hwplib_jar!

      @hwp_reader = Java::KrDogfootHwplibReader::HWPReader
      @text_extractor = Java::KrDogfootHwplibToolTextextractor::TextExtractor
      @text_extract_option = Java::KrDogfootHwplibToolTextextractor::TextExtractOption
      @text_extract_method = Java::KrDogfootHwplibToolTextextractor::TextExtractMethod

      @java_loaded = true
    end

    # MRI: load the JAR through RJB and import the hwplib classes.
    def load_rjb_dependencies
      return if @java_loaded

      jar_path = locate_hwplib_jar!

      # Fill in JAVA_HOME only when the caller has not set it.
      ENV['JAVA_HOME'] ||= detect_java_home

      Rjb.load(jar_path, ['-Xmx512m'])

      @hwp_reader = Rjb.import('kr.dogfoot.hwplib.reader.HWPReader')
      @text_extractor = Rjb.import('kr.dogfoot.hwplib.tool.textextractor.TextExtractor')
      @text_extract_option = Rjb.import('kr.dogfoot.hwplib.tool.textextractor.TextExtractOption')
      @text_extract_method = Rjb.import('kr.dogfoot.hwplib.tool.textextractor.TextExtractMethod')

      @java_loaded = true
    end

    # Loads the Java bridge if the constructor was bypassed or did not finish.
    def ensure_java_loaded
      load_java_dependencies unless @java_loaded
    end

    # Candidate locations of the hwplib JAR relative to base_dir.
    # `java/` next to this file is where the gem ships the JAR; `../java` is
    # the location earlier code looked at and is kept as a fallback.
    def hwplib_jar_candidates(base_dir = __dir__)
      [File.join('java', HWPLIB_JAR), File.join('..', 'java', HWPLIB_JAR)]
        .map { |relative| File.expand_path(relative, base_dir) }
    end

    # Returns the first existing hwplib JAR candidate, or nil.
    def find_hwplib_jar(base_dir = __dir__)
      hwplib_jar_candidates(base_dir).find { |path| File.exist?(path) }
    end

    # Like find_hwplib_jar, but raises Hwayo::Error when no JAR exists.
    def locate_hwplib_jar!
      jar_path = find_hwplib_jar
      raise Error, "hwplib JAR not found at: #{hwplib_jar_candidates.first}" unless jar_path

      jar_path
    end

    # Finds a Java home directory: first via the macOS java_home tool, then
    # by probing JAVA_HOME_CANDIDATES.
    #
    # @raise [Hwayo::Error] if nothing is found
    def detect_java_home
      java_home = macos_java_home || first_existing_directory(JAVA_HOME_CANDIDATES)
      return java_home if java_home

      raise Error, "Java not found. Please install Java 8 or later and set JAVA_HOME environment variable."
    end

    # Asks /usr/libexec/java_home for the default JDK; nil when the tool is
    # absent or does not print an existing directory.
    def macos_java_home
      return unless File.executable?(MACOS_JAVA_HOME_TOOL)

      java_home = IO.popen([MACOS_JAVA_HOME_TOOL], err: File::NULL, &:read).to_s.strip
      java_home if File.directory?(java_home)
    end

    # Returns the first directory matching any of the given paths or glob
    # patterns (patterns are expanded with Dir.glob), or nil.
    def first_existing_directory(patterns)
      patterns.each do |pattern|
        match = Dir.glob(File.expand_path(pattern)).sort.find { |path| File.directory?(path) }
        return match if match
      end
      nil
    end
  end
end
Binary file
@@ -0,0 +1,65 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'open3'
4
+ require 'tempfile'
5
+
6
module Hwayo
  # Extracts text from HWP documents by running the
  # kr.dogfoot.hwplib.sample.Extracting_Text class from the hwplib JAR in a
  # separate `java` process.
  class SimpleExtractor
    # File name of the hwplib JAR this class runs.
    HWPLIB_JAR = 'hwplib-1.1.10.jar'
    private_constant :HWPLIB_JAR

    # Extracts the text of an HWP file.
    #
    # @param hwp_file_path [String] path to the HWP document
    # @param output_path [String, nil] when given, the text is also written
    #   to this path as UTF-8
    # @return [Hash] { success: true, text: String } (plus :output_path when
    #   one was given), or { success: false, error: String } when Java or the
    #   JAR is missing or the Java process exits unsuccessfully
    # @raise [Hwayo::Error] if hwp_file_path does not exist
    def extract_text(hwp_file_path, output_path = nil)
      raise Error, "HWP file not found: #{hwp_file_path}" unless File.exist?(hwp_file_path)

      unless java_available?
        return { success: false, error: "Java is not installed. Please install Java 8 or later." }
      end

      jar_path = find_hwplib_jar
      return { success: false, error: "hwplib JAR not found" } unless jar_path

      # The Java process writes its result here; we read it back afterwards.
      temp_output = Tempfile.new(['hwp_output', '.txt'])

      begin
        # NOTE(review): assumes Extracting_Text takes <input> <output> arguments - confirm against hwplib.
        _stdout, stderr, status = Open3.capture3(
          'java',
          '-cp', jar_path,
          'kr.dogfoot.hwplib.sample.Extracting_Text',
          hwp_file_path,
          temp_output.path
        )
        return { success: false, error: "Java execution failed: #{stderr}" } unless status.success?

        extracted_text = File.read(temp_output.path, encoding: 'UTF-8')

        result = { success: true, text: extracted_text }
        if output_path
          File.write(output_path, extracted_text, encoding: 'UTF-8')
          result[:output_path] = output_path
        end
        result
      ensure
        temp_output.close
        temp_output.unlink
      end
    end

    private

    # True when `java -version` can be run; no shell is involved, so this
    # does not depend on /dev/null redirection syntax.
    def java_available?
      system('java', '-version', out: File::NULL, err: File::NULL) ? true : false
    end

    # Returns the first existing hwplib JAR, or nil. `java/` next to this
    # file is where the gem ships the JAR and `../../target` is the project
    # root's build directory; the `../java` and `../../../target` entries are
    # the locations earlier code looked at and are kept as fallbacks.
    def find_hwplib_jar(base_dir = __dir__)
      [
        File.join('java', HWPLIB_JAR),
        File.join('..', 'java', HWPLIB_JAR),
        File.join('..', '..', 'target', HWPLIB_JAR),
        File.join('..', '..', '..', 'target', HWPLIB_JAR)
      ].map { |relative| File.expand_path(relative, base_dir) }
       .find { |path| File.exist?(path) }
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
# Namespace of the hwayo gem.
module Hwayo
  # Version string of this gem release.
  VERSION = "0.1.0"
end
data/lib/hwayo.rb ADDED
@@ -0,0 +1,88 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "hwayo/version"
4
+ require 'open3'
5
+ require 'tempfile'
6
+
7
# Top-level API of the hwayo gem: extracts text from HWP documents by
# running the HWPTextExtractorCLI Java class against the hwplib JAR in a
# `java` subprocess.
module Hwayo
  # Raised when the HWP input file does not exist.
  class Error < StandardError; end

  # Extracts the text of an HWP file.
  #
  # @param hwp_file_path [String] path to the HWP document
  # @param output_path [String, nil] file the Java process writes the text
  #   to; when nil a temporary file is used and removed afterwards
  # @return [Hash] { success: true, text: String } (plus :output_path when
  #   one was given), or { success: false, error: String } when Java, the
  #   JAR or the CLI class is missing or the Java run does not report SUCCESS
  # @raise [Hwayo::Error] if hwp_file_path does not exist
  def self.extract_text(hwp_file_path, output_path = nil)
    raise Error, "HWP file not found: #{hwp_file_path}" unless File.exist?(hwp_file_path)

    # Run `java -version` without a shell so the check does not depend on
    # /dev/null redirection syntax.
    unless system('java', '-version', out: File::NULL, err: File::NULL)
      return { success: false, error: "Java is not installed. Please install Java 8 or later." }
    end

    jar_path = find_jar_path
    unless jar_path
      return { success: false, error: "hwplib JAR not found. Please ensure hwplib-1.1.10.jar is in the gem's lib/hwayo/java directory or in the current directory." }
    end

    cli_class_path = find_cli_class(jar_path)
    unless cli_class_path
      return { success: false, error: "HWPTextExtractorCLI.class not found" }
    end

    # Keep a reference to the Tempfile for the whole call: an unreferenced
    # Tempfile may be unlinked by its finalizer before the text is read back.
    tempfile = output_path ? nil : Tempfile.new(['hwp_output', '.txt'])
    target_path = output_path || tempfile.path

    begin
      # Release our own handle; the file itself stays until close! below.
      tempfile&.close

      cmd = [
        'java',
        '-cp', [jar_path, cli_class_path].join(File::PATH_SEPARATOR),
        'HWPTextExtractorCLI',
        hwp_file_path,
        target_path
      ]

      stdout, stderr, status = Open3.capture3(*cmd)

      if status.success? && stdout.strip == "SUCCESS"
        extracted_text = File.read(target_path, encoding: 'UTF-8')

        result = { success: true, text: extracted_text }
        result[:output_path] = output_path if output_path
        result
      else
        { success: false, error: "Extraction failed: #{stderr}" }
      end
    ensure
      # Removes the temporary file; a caller-supplied output_path is left alone.
      tempfile&.close!
    end
  end

  # Internal helpers. A bare `private` does not apply to `def self.` methods,
  # so these have always been callable from outside and remain so.

  # Returns the first existing hwplib JAR among: the gem's own java
  # directory, the current directory, ./target, and $HWPLIB_JAR_PATH.
  #
  # @api private
  # @return [String, nil]
  def self.find_jar_path
    candidates = [
      File.expand_path('hwayo/java/hwplib-1.1.10.jar', __dir__),
      'hwplib-1.1.10.jar',
      'target/hwplib-1.1.10.jar',
      ENV['HWPLIB_JAR_PATH']
    ].compact

    candidates.find { |path| File.exist?(path) }
  end

  # Returns the first directory containing HWPTextExtractorCLI.class among:
  # the JAR's directory, the current directory, and the gem's java directory.
  #
  # @api private
  # @param jar_dir [String] path of the hwplib JAR file (despite the name)
  # @return [String, nil]
  def self.find_cli_class(jar_dir)
    candidates = [
      File.dirname(jar_dir),
      '.',
      File.expand_path('hwayo/java', __dir__)
    ]

    candidates.find { |dir| File.exist?(File.join(dir, 'HWPTextExtractorCLI.class')) }
  end
end
data/sig/hwayo.rbs ADDED
@@ -0,0 +1,4 @@
1
# Type signatures for the hwayo gem.
module Hwayo
  VERSION: String
  # See the writing guide of rbs: https://github.com/ruby/rbs#guides

  # Raised when the HWP input file does not exist.
  class Error < StandardError
  end

  # Returns { success: true, text: ..., output_path: ... } or { success: false, error: ... }.
  def self.extract_text: (String hwp_file_path, ?String? output_path) -> Hash[Symbol, untyped]

  class Extractor
    def initialize: () -> void

    def extract_text: (String hwp_file_path, ?String? output_path) -> Hash[Symbol, untyped]
  end

  class SimpleExtractor
    def extract_text: (String hwp_file_path, ?String? output_path) -> Hash[Symbol, untyped]
  end
end
metadata ADDED
@@ -0,0 +1,54 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: hwayo
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - 이원섭wonsup Lee/Alfonso
8
+ bindir: exe
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies: []
12
+ description: Hwayo is a Ruby gem that wraps the hwplib Java library to extract text
13
+ from Korean HWP (Hangul Word Processor) files. It provides a simple interface to
14
+ extract text content from HWP documents using the Java hwplib library.
15
+ email:
16
+ - onesup.lee@gmail.com
17
+ executables: []
18
+ extensions: []
19
+ extra_rdoc_files: []
20
+ files:
21
+ - README.md
22
+ - Rakefile
23
+ - lib/hwayo.rb
24
+ - lib/hwayo/extractor.rb
25
+ - lib/hwayo/java/HWPTextExtractorCLI.class
26
+ - lib/hwayo/java/hwplib-1.1.10.jar
27
+ - lib/hwayo/simple_extractor.rb
28
+ - lib/hwayo/version.rb
29
+ - sig/hwayo.rbs
30
+ homepage: https://github.com/onesup/hwayo
31
+ licenses:
32
+ - MIT
33
+ metadata:
34
+ homepage_uri: https://github.com/onesup/hwayo
35
+ source_code_uri: https://github.com/onesup/hwayo
36
+ changelog_uri: https://github.com/onesup/hwayo/blob/main/CHANGELOG.md
37
+ rdoc_options: []
38
+ require_paths:
39
+ - lib
40
+ required_ruby_version: !ruby/object:Gem::Requirement
41
+ requirements:
42
+ - - ">="
43
+ - !ruby/object:Gem::Version
44
+ version: 2.7.0
45
+ required_rubygems_version: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - ">="
48
+ - !ruby/object:Gem::Version
49
+ version: '0'
50
+ requirements: []
51
+ rubygems_version: 3.6.9
52
+ specification_version: 4
53
+ summary: Ruby wrapper for hwplib - Extract text from HWP files
54
+ test_files: []