henkei 2.9.2.1 → 2.9.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +17 -3
- data/lib/henkei/version.rb +1 -1
- data/lib/henkei.rb +52 -10
- data/spec/henkei_spec.rb +20 -0
- metadata +6 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3ea707f5995f2ff0c9036cb992b991e91ccf9f62b9298222cd81d24876678f96
|
4
|
+
data.tar.gz: 4dcd7c3394b3971a58ea11478147320388212bb67622602c795743a7b9d545f6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 03a2c7e3be4f4065385351ddb1694ae9e5c53eae1ddfc8826a3b137868ded1c3a30efb95f515e97cf779fed09ef321565e4ad39c7c232cf315df3ba19c443625
|
7
|
+
data.tar.gz: cbccc2cad4ca82935aee2ffb2c60404264da5dc876f4a3cf05768fb70859dfebfbb436d2d62284e0d5b16d8b246920b0f38e7349d633cabb42facd8bd287ccca
|
data/README.md
CHANGED
@@ -28,7 +28,7 @@ been split up. To keep the gem size down Henkei will only include the client app
|
|
28
28
|
call to Henkei, a new Java process will be started, run your command, then terminate.
|
29
29
|
|
30
30
|
Another change is the metadata keys. A lot of duplicate keys have been removed in favour of a more standards
|
31
|
-
based approach. A list of the old vs new key names can be found [here](https://cwiki.apache.org/confluence/display/TIKA/Migrating+to+Tika+2.0.0#MigratingtoTika2.0.0-Metadata)
|
31
|
+
based approach. A list of the old vs new key names can be found [here](https://cwiki.apache.org/confluence/display/TIKA/Migrating+to+Tika+2.0.0#MigratingtoTika2.0.0-Metadata)
|
32
32
|
|
33
33
|
## Usage
|
34
34
|
|
@@ -111,12 +111,26 @@ henkei.mimetype.content_type #=> "application/vnd.openxmlformats-officedocument.
|
|
111
111
|
henkei.mimetype.extensions #=> ['docx']
|
112
112
|
```
|
113
113
|
|
114
|
+
### Output text in a specific character encoding
|
115
|
+
|
116
|
+
You can specify the output character encoding by passing in the optional `encoding` argument when calling the
|
117
|
+
`text` or `html` instance methods, as well as the `read` class method.
|
118
|
+
|
119
|
+
```ruby
|
120
|
+
henkei = Henkei.new 'sample.pages'
|
121
|
+
utf_8_text = henkei.text(encoding: 'UTF-8')
|
122
|
+
utf_16_html = henkei.html(encoding: 'UTF-16')
|
123
|
+
|
124
|
+
data = File.read 'sample.pages'
|
125
|
+
utf_32_text = Henkei.read :text, data, encoding: 'UTF-32'
|
126
|
+
```
|
127
|
+
|
114
128
|
## Installation and Dependencies
|
115
129
|
|
116
130
|
### Java Runtime
|
117
131
|
|
118
132
|
Henkei packages the Apache Tika application jar and requires a working JRE for it to work.
|
119
|
-
Check that you either have the `JAVA_HOME` environment variable set, or that `java` is in your path.
|
133
|
+
Check that you either have the `JAVA_HOME` environment variable set, or that `java` is in your path.
|
120
134
|
|
121
135
|
### Gem
|
122
136
|
|
@@ -131,7 +145,7 @@ And then execute:
|
|
131
145
|
Or install it yourself as:
|
132
146
|
|
133
147
|
$ gem install henkei
|
134
|
-
|
148
|
+
|
135
149
|
### Heroku
|
136
150
|
|
137
151
|
Add the JVM Buildpack to your Heroku project:
|
data/lib/henkei/version.rb
CHANGED
data/lib/henkei.rb
CHANGED
@@ -47,8 +47,8 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
47
47
|
# text = Henkei.read :text, data
|
48
48
|
# metadata = Henkei.read :metadata, data
|
49
49
|
#
|
50
|
-
def self.read(type, data, include_ocr: false)
|
51
|
-
result = client_read(type, data, include_ocr: include_ocr)
|
50
|
+
def self.read(type, data, include_ocr: false, encoding: nil)
|
51
|
+
result = client_read(type, data, include_ocr: include_ocr, encoding: encoding)
|
52
52
|
|
53
53
|
case type
|
54
54
|
when :text, :html then result
|
@@ -96,10 +96,14 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
96
96
|
#
|
97
97
|
# henkei.text(include_ocr: true)
|
98
98
|
#
|
99
|
-
|
99
|
+
# Set the output character encoding (e.g. 'UTF-8')
|
100
|
+
#
|
101
|
+
# henkei.text(encoding: 'UTF-8')
|
102
|
+
#
|
103
|
+
def text(include_ocr: false, encoding: nil)
|
100
104
|
return @text if defined? @text
|
101
105
|
|
102
|
-
@text = Henkei.read :text, data, include_ocr: include_ocr
|
106
|
+
@text = Henkei.read :text, data, include_ocr: include_ocr, encoding: encoding
|
103
107
|
end
|
104
108
|
|
105
109
|
# Returns the text content of the Henkei document in HTML.
|
@@ -111,10 +115,14 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
111
115
|
#
|
112
116
|
# henkei.html(include_ocr: true)
|
113
117
|
#
|
114
|
-
|
118
|
+
# Set the output character encoding (e.g. 'UTF-8')
|
119
|
+
#
|
120
|
+
# henkei.html(encoding: 'UTF-8')
|
121
|
+
#
|
122
|
+
def html(include_ocr: false, encoding: nil)
|
115
123
|
return @html if defined? @html
|
116
124
|
|
117
|
-
@html = Henkei.read :html, data, include_ocr: include_ocr
|
125
|
+
@html = Henkei.read :html, data, include_ocr: include_ocr, encoding: encoding
|
118
126
|
end
|
119
127
|
|
120
128
|
# Returns the metadata hash of the Henkei document.
|
@@ -211,20 +219,37 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
211
219
|
|
212
220
|
# Internal helper for calling the Tika library directly
|
213
221
|
#
|
214
|
-
def self.client_read(type, data, include_ocr: false)
|
215
|
-
|
222
|
+
def self.client_read(type, data, include_ocr: false, encoding: nil)
|
223
|
+
unless encoding.nil? || Encoding.name_list.include?(encoding)
|
224
|
+
raise ArgumentError, "unsupported encoding - #{encoding}"
|
225
|
+
end
|
226
|
+
|
227
|
+
Open3.popen2(*tika_command(type, include_ocr: include_ocr, encoding: encoding)) do |stdin, stdout|
|
228
|
+
stdin.binmode
|
229
|
+
stdout.binmode
|
230
|
+
stdout.set_encoding encoding unless encoding.nil?
|
231
|
+
|
232
|
+
out_reader = Thread.new { stdout.read }
|
233
|
+
|
234
|
+
write_data_to_stdin(data, stdin)
|
235
|
+
|
236
|
+
stdin.close
|
237
|
+
|
238
|
+
out_reader.value
|
239
|
+
end
|
216
240
|
end
|
217
241
|
private_class_method :client_read
|
218
242
|
|
219
243
|
# Internal helper for building the Java command to call Tika
|
220
244
|
#
|
221
|
-
def self.tika_command(type, include_ocr: false)
|
245
|
+
def self.tika_command(type, include_ocr: false, encoding: nil)
|
222
246
|
[
|
223
247
|
java_path,
|
224
248
|
'-Djava.awt.headless=true',
|
225
249
|
'-jar',
|
226
250
|
Henkei::JAR_PATH,
|
227
|
-
"--config=#{include_ocr ? Henkei::CONFIG_PATH : Henkei::CONFIG_WITHOUT_OCR_PATH}"
|
251
|
+
"--config=#{include_ocr ? Henkei::CONFIG_PATH : Henkei::CONFIG_WITHOUT_OCR_PATH}",
|
252
|
+
*("--encoding=#{encoding}" unless encoding.nil?)
|
228
253
|
] + switch_for_type(type)
|
229
254
|
end
|
230
255
|
private_class_method :tika_command
|
@@ -240,4 +265,21 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
240
265
|
}[type]
|
241
266
|
end
|
242
267
|
private_class_method :switch_for_type
|
268
|
+
|
269
|
+
# Internal helper for writing the input data to stdin when calling Tika
|
270
|
+
#
|
271
|
+
def self.write_data_to_stdin(data, stdin)
|
272
|
+
return unless data
|
273
|
+
|
274
|
+
begin
|
275
|
+
if data.respond_to? :readpartial
|
276
|
+
IO.copy_stream(data, stdin)
|
277
|
+
else
|
278
|
+
stdin.write data
|
279
|
+
end
|
280
|
+
rescue Errno::EPIPE
|
281
|
+
# Catch broken pipe.
|
282
|
+
end
|
283
|
+
end
|
284
|
+
private_class_method :write_data_to_stdin
|
243
285
|
end
|
data/spec/henkei_spec.rb
CHANGED
@@ -72,6 +72,26 @@ describe Henkei do
|
|
72
72
|
end
|
73
73
|
end
|
74
74
|
end
|
75
|
+
|
76
|
+
context 'when a valid `encoding` value is provided' do
|
77
|
+
let(:encoding) { 'UTF-32' }
|
78
|
+
|
79
|
+
it 'returns the parsed text in the specified encoding' do
|
80
|
+
text = described_class.read :text, data, encoding: encoding
|
81
|
+
|
82
|
+
expect(text.encoding.name).to eq encoding
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
context 'when an invalid `encoding` value is provided' do
|
87
|
+
let(:encoding) { 'Beef' }
|
88
|
+
|
89
|
+
it 'raises an error' do
|
90
|
+
expect do
|
91
|
+
described_class.read :text, data, encoding: encoding
|
92
|
+
end.to raise_error(ArgumentError, "unsupported encoding - #{encoding}")
|
93
|
+
end
|
94
|
+
end
|
75
95
|
end
|
76
96
|
|
77
97
|
describe '.new' do
|
metadata
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: henkei
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.9.2.1
|
4
|
+
version: 2.9.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Erol Fornoles
|
8
8
|
- Andrew Bromwich
|
9
|
-
autorequire:
|
9
|
+
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2024-
|
12
|
+
date: 2024-06-13 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: json
|
@@ -91,7 +91,7 @@ licenses:
|
|
91
91
|
metadata:
|
92
92
|
allowed_push_host: https://rubygems.org
|
93
93
|
rubygems_mfa_required: 'true'
|
94
|
-
post_install_message:
|
94
|
+
post_install_message:
|
95
95
|
rdoc_options: []
|
96
96
|
require_paths:
|
97
97
|
- lib
|
@@ -109,8 +109,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: '0'
|
111
111
|
requirements: []
|
112
|
-
rubygems_version: 3.
|
113
|
-
signing_key:
|
112
|
+
rubygems_version: 3.0.0
|
113
|
+
signing_key:
|
114
114
|
specification_version: 4
|
115
115
|
summary: Read text and metadata from files and documents (.doc, .docx, .pages, .odt,
|
116
116
|
.rtf, .pdf) using Apache Tika toolkit
|