henkei 2.9.2.1 → 2.9.2.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/test.yml +1 -1
- data/README.md +17 -3
- data/henkei.gemspec +3 -3
- data/lib/henkei/version.rb +1 -1
- data/lib/henkei.rb +52 -10
- data/spec/henkei_spec.rb +20 -0
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e944ff75afd6a3dc86edd752aaf8b9eddf1567dea93b1c1710cdf25be710b662
|
4
|
+
data.tar.gz: d8a4a86fb7c46fddaa17c9ad438ad8f48902e14d1e38021412590df3197306a4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8aa16da88e03ff8589c3605775cce4096a9a35d95528441171af5dcb4754279e4159ebdcae4d08d2789a33ac562d82386ff565daf02f3f800458e628af64f77e
|
7
|
+
data.tar.gz: 9cda283d4cfee5c766e6212eaa25834b0761190115c13243f8e6ad9b82e59d5ccaf9e44cb9a82fb9453e53ffc69ed92350ca521a3bf97cea065fffefafe28b70
|
data/.github/workflows/test.yml
CHANGED
data/README.md
CHANGED
@@ -28,7 +28,7 @@ been split up. To keep the gem size down Henkei will only include the client app
|
|
28
28
|
call to Henkei, a new Java process will be started, run your command, then terminate.
|
29
29
|
|
30
30
|
Another change is the metadata keys. A lot of duplicate keys have been removed in favour of a more standards
|
31
|
-
based approach. A list of the old vs new key names can be found [here](https://cwiki.apache.org/confluence/display/TIKA/Migrating+to+Tika+2.0.0#MigratingtoTika2.0.0-Metadata)
|
31
|
+
based approach. A list of the old vs new key names can be found [here](https://cwiki.apache.org/confluence/display/TIKA/Migrating+to+Tika+2.0.0#MigratingtoTika2.0.0-Metadata)
|
32
32
|
|
33
33
|
## Usage
|
34
34
|
|
@@ -111,12 +111,26 @@ henkei.mimetype.content_type #=> "application/vnd.openxmlformats-officedocument.
|
|
111
111
|
henkei.mimetype.extensions #=> ['docx']
|
112
112
|
```
|
113
113
|
|
114
|
+
### Output text in a specific character encoding
|
115
|
+
|
116
|
+
You can specify the output character encoding by passing in the optional `encoding` argument when calling the
|
117
|
+
`text` or `html` instance methods, as well as the `read` class method.
|
118
|
+
|
119
|
+
```ruby
|
120
|
+
henkei = Henkei.new 'sample.pages'
|
121
|
+
utf_8_text = henkei.text(encoding: 'UTF-8')
|
122
|
+
utf_16_html = henkei.html(encoding: 'UTF-16')
|
123
|
+
|
124
|
+
data = File.read 'sample.pages'
|
125
|
+
utf_32_text = Henkei.read :text, data, encoding: 'UTF-32'
|
126
|
+
```
|
127
|
+
|
114
128
|
## Installation and Dependencies
|
115
129
|
|
116
130
|
### Java Runtime
|
117
131
|
|
118
132
|
Henkei packages the Apache Tika application jar and requires a working JRE for it to work.
|
119
|
-
Check that you either have the `JAVA_HOME` environment variable set, or that `java` is in your path.
|
133
|
+
Check that you either have the `JAVA_HOME` environment variable set, or that `java` is in your path.
|
120
134
|
|
121
135
|
### Gem
|
122
136
|
|
@@ -131,7 +145,7 @@ And then execute:
|
|
131
145
|
Or install it yourself as:
|
132
146
|
|
133
147
|
$ gem install henkei
|
134
|
-
|
148
|
+
|
135
149
|
### Heroku
|
136
150
|
|
137
151
|
Add the JVM Buildpack to your Heroku project:
|
data/henkei.gemspec
CHANGED
@@ -15,7 +15,7 @@ Gem::Specification.new do |spec|
|
|
15
15
|
'(.doc, .docx, .pages, .odt, .rtf, .pdf) using Apache Tika toolkit'
|
16
16
|
spec.homepage = 'https://github.com/abrom/henkei'
|
17
17
|
spec.license = 'MIT'
|
18
|
-
spec.required_ruby_version = ['>= 3.0.0', '< 3.
|
18
|
+
spec.required_ruby_version = ['>= 3.0.0', '< 3.5.0']
|
19
19
|
|
20
20
|
# Prevent pushing this gem to RubyGems.org by setting 'allowed_push_host', or
|
21
21
|
# delete this section to allow pushing this gem to any host.
|
@@ -28,6 +28,6 @@ Gem::Specification.new do |spec|
|
|
28
28
|
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
29
29
|
spec.require_paths = ['lib']
|
30
30
|
|
31
|
-
spec.
|
32
|
-
spec.
|
31
|
+
spec.add_dependency 'json', '>= 1.8', '< 3'
|
32
|
+
spec.add_dependency 'mini_mime', '>= 0.1.1', '< 2'
|
33
33
|
end
|
data/lib/henkei/version.rb
CHANGED
data/lib/henkei.rb
CHANGED
@@ -47,8 +47,8 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
47
47
|
# text = Henkei.read :text, data
|
48
48
|
# metadata = Henkei.read :metadata, data
|
49
49
|
#
|
50
|
-
def self.read(type, data, include_ocr: false)
|
51
|
-
result = client_read(type, data, include_ocr: include_ocr)
|
50
|
+
def self.read(type, data, include_ocr: false, encoding: nil)
|
51
|
+
result = client_read(type, data, include_ocr: include_ocr, encoding: encoding)
|
52
52
|
|
53
53
|
case type
|
54
54
|
when :text, :html then result
|
@@ -96,10 +96,14 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
96
96
|
#
|
97
97
|
# henkei.text(include_ocr: true)
|
98
98
|
#
|
99
|
-
|
99
|
+
# Set the output character encoding (e.g. 'UTF-8')
|
100
|
+
#
|
101
|
+
# henkei.text(encoding: 'UTF-8')
|
102
|
+
#
|
103
|
+
def text(include_ocr: false, encoding: nil)
|
100
104
|
return @text if defined? @text
|
101
105
|
|
102
|
-
@text = Henkei.read :text, data, include_ocr: include_ocr
|
106
|
+
@text = Henkei.read :text, data, include_ocr: include_ocr, encoding: encoding
|
103
107
|
end
|
104
108
|
|
105
109
|
# Returns the text content of the Henkei document in HTML.
|
@@ -111,10 +115,14 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
111
115
|
#
|
112
116
|
# henkei.html(include_ocr: true)
|
113
117
|
#
|
114
|
-
|
118
|
+
# Set the output character encoding (e.g. 'UTF-8')
|
119
|
+
#
|
120
|
+
# henkei.html(encoding: 'UTF-8')
|
121
|
+
#
|
122
|
+
def html(include_ocr: false, encoding: nil)
|
115
123
|
return @html if defined? @html
|
116
124
|
|
117
|
-
@html = Henkei.read :html, data, include_ocr: include_ocr
|
125
|
+
@html = Henkei.read :html, data, include_ocr: include_ocr, encoding: encoding
|
118
126
|
end
|
119
127
|
|
120
128
|
# Returns the metadata hash of the Henkei document.
|
@@ -211,20 +219,37 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
211
219
|
|
212
220
|
# Internal helper for calling to Tika library directly
|
213
221
|
#
|
214
|
-
def self.client_read(type, data, include_ocr: false)
|
215
|
-
|
222
|
+
def self.client_read(type, data, include_ocr: false, encoding: nil)
|
223
|
+
unless encoding.nil? || Encoding.name_list.include?(encoding)
|
224
|
+
raise ArgumentError, "unsupported encoding - #{encoding}"
|
225
|
+
end
|
226
|
+
|
227
|
+
Open3.popen2(*tika_command(type, include_ocr: include_ocr, encoding: encoding)) do |stdin, stdout|
|
228
|
+
stdin.binmode
|
229
|
+
stdout.binmode
|
230
|
+
stdout.set_encoding encoding unless encoding.nil?
|
231
|
+
|
232
|
+
out_reader = Thread.new { stdout.read }
|
233
|
+
|
234
|
+
write_data_to_stdin(data, stdin)
|
235
|
+
|
236
|
+
stdin.close
|
237
|
+
|
238
|
+
out_reader.value
|
239
|
+
end
|
216
240
|
end
|
217
241
|
private_class_method :client_read
|
218
242
|
|
219
243
|
# Internal helper for building the Java command to call Tika
|
220
244
|
#
|
221
|
-
def self.tika_command(type, include_ocr: false)
|
245
|
+
def self.tika_command(type, include_ocr: false, encoding: nil)
|
222
246
|
[
|
223
247
|
java_path,
|
224
248
|
'-Djava.awt.headless=true',
|
225
249
|
'-jar',
|
226
250
|
Henkei::JAR_PATH,
|
227
|
-
"--config=#{include_ocr ? Henkei::CONFIG_PATH : Henkei::CONFIG_WITHOUT_OCR_PATH}"
|
251
|
+
"--config=#{include_ocr ? Henkei::CONFIG_PATH : Henkei::CONFIG_WITHOUT_OCR_PATH}",
|
252
|
+
*("--encoding=#{encoding}" unless encoding.nil?)
|
228
253
|
] + switch_for_type(type)
|
229
254
|
end
|
230
255
|
private_class_method :tika_command
|
@@ -240,4 +265,21 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
240
265
|
}[type]
|
241
266
|
end
|
242
267
|
private_class_method :switch_for_type
|
268
|
+
|
269
|
+
# Internal helper for writing the input data to stdin when calling Tika
|
270
|
+
#
|
271
|
+
def self.write_data_to_stdin(data, stdin)
|
272
|
+
return unless data
|
273
|
+
|
274
|
+
begin
|
275
|
+
if data.respond_to? :readpartial
|
276
|
+
IO.copy_stream(data, stdin)
|
277
|
+
else
|
278
|
+
stdin.write data
|
279
|
+
end
|
280
|
+
rescue Errno::EPIPE
|
281
|
+
# Catch broken pipe.
|
282
|
+
end
|
283
|
+
end
|
284
|
+
private_class_method :write_data_to_stdin
|
243
285
|
end
|
data/spec/henkei_spec.rb
CHANGED
@@ -72,6 +72,26 @@ describe Henkei do
|
|
72
72
|
end
|
73
73
|
end
|
74
74
|
end
|
75
|
+
|
76
|
+
context 'when a valid `encoding` value is provided' do
|
77
|
+
let(:encoding) { 'UTF-32' }
|
78
|
+
|
79
|
+
it 'returns the parsed text in the specified encoding' do
|
80
|
+
text = described_class.read :text, data, encoding: encoding
|
81
|
+
|
82
|
+
expect(text.encoding.name).to eq encoding
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
context 'when an invalid `encoding` value is provided' do
|
87
|
+
let(:encoding) { 'Beef' }
|
88
|
+
|
89
|
+
it 'raises an error' do
|
90
|
+
expect do
|
91
|
+
described_class.read :text, data, encoding: encoding
|
92
|
+
end.to raise_error(ArgumentError, "unsupported encoding - #{encoding}")
|
93
|
+
end
|
94
|
+
end
|
75
95
|
end
|
76
96
|
|
77
97
|
describe '.new' do
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: henkei
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.9.2.1
|
4
|
+
version: 2.9.2.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Erol Fornoles
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2025-01-13 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: json
|
@@ -102,7 +102,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
102
102
|
version: 3.0.0
|
103
103
|
- - "<"
|
104
104
|
- !ruby/object:Gem::Version
|
105
|
-
version: 3.
|
105
|
+
version: 3.5.0
|
106
106
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
107
107
|
requirements:
|
108
108
|
- - ">="
|