henkei 2.9.2.1 → 2.9.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +17 -3
- data/lib/henkei/version.rb +1 -1
- data/lib/henkei.rb +52 -10
- data/spec/henkei_spec.rb +20 -0
- metadata +6 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3ea707f5995f2ff0c9036cb992b991e91ccf9f62b9298222cd81d24876678f96
|
4
|
+
data.tar.gz: 4dcd7c3394b3971a58ea11478147320388212bb67622602c795743a7b9d545f6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 03a2c7e3be4f4065385351ddb1694ae9e5c53eae1ddfc8826a3b137868ded1c3a30efb95f515e97cf779fed09ef321565e4ad39c7c232cf315df3ba19c443625
|
7
|
+
data.tar.gz: cbccc2cad4ca82935aee2ffb2c60404264da5dc876f4a3cf05768fb70859dfebfbb436d2d62284e0d5b16d8b246920b0f38e7349d633cabb42facd8bd287ccca
|
data/README.md
CHANGED
@@ -28,7 +28,7 @@ been split up. To keep the gem size down Henkei will only include the client app
|
|
28
28
|
call to Henkei, a new Java process will be started, run your command, then terminate.
|
29
29
|
|
30
30
|
Another change is the metadata keys. A lot of duplicate keys have been removed in favour of a more standards
|
31
|
-
based approach. A list of the old vs new key names can be found [here](https://cwiki.apache.org/confluence/display/TIKA/Migrating+to+Tika+2.0.0#MigratingtoTika2.0.0-Metadata)
|
31
|
+
based approach. A list of the old vs new key names can be found [here](https://cwiki.apache.org/confluence/display/TIKA/Migrating+to+Tika+2.0.0#MigratingtoTika2.0.0-Metadata)
|
32
32
|
|
33
33
|
## Usage
|
34
34
|
|
@@ -111,12 +111,26 @@ henkei.mimetype.content_type #=> "application/vnd.openxmlformats-officedocument.
|
|
111
111
|
henkei.mimetype.extensions #=> ['docx']
|
112
112
|
```
|
113
113
|
|
114
|
+
### Output text in a specific character encoding
|
115
|
+
|
116
|
+
You can specify the output character encoding by passing in the optional `encoding` argument when calling the
|
117
|
+
`text` or `html` instance methods, as well as the `read` class method.
|
118
|
+
|
119
|
+
```ruby
|
120
|
+
henkei = Henkei.new 'sample.pages'
|
121
|
+
utf_8_text = henkei.text(encoding: 'UTF-8')
|
122
|
+
utf_16_html = henkei.html(encoding: 'UTF-16')
|
123
|
+
|
124
|
+
data = File.read 'sample.pages'
|
125
|
+
utf_32_text = Henkei.read :text, data, encoding: 'UTF-32'
|
126
|
+
```
|
127
|
+
|
114
128
|
## Installation and Dependencies
|
115
129
|
|
116
130
|
### Java Runtime
|
117
131
|
|
118
132
|
Henkei packages the Apache Tika application jar and requires a working JRE for it to work.
|
119
|
-
Check that you either have the `JAVA_HOME` environment variable set, or that `java` is in your path.
|
133
|
+
Check that you either have the `JAVA_HOME` environment variable set, or that `java` is in your path.
|
120
134
|
|
121
135
|
### Gem
|
122
136
|
|
@@ -131,7 +145,7 @@ And then execute:
|
|
131
145
|
Or install it yourself as:
|
132
146
|
|
133
147
|
$ gem install henkei
|
134
|
-
|
148
|
+
|
135
149
|
### Heroku
|
136
150
|
|
137
151
|
Add the JVM Buildpack to your Heroku project:
|
data/lib/henkei/version.rb
CHANGED
data/lib/henkei.rb
CHANGED
@@ -47,8 +47,8 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
47
47
|
# text = Henkei.read :text, data
|
48
48
|
# metadata = Henkei.read :metadata, data
|
49
49
|
#
|
50
|
-
def self.read(type, data, include_ocr: false)
|
51
|
-
result = client_read(type, data, include_ocr: include_ocr)
|
50
|
+
def self.read(type, data, include_ocr: false, encoding: nil)
|
51
|
+
result = client_read(type, data, include_ocr: include_ocr, encoding: encoding)
|
52
52
|
|
53
53
|
case type
|
54
54
|
when :text, :html then result
|
@@ -96,10 +96,14 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
96
96
|
#
|
97
97
|
# henkei.text(include_ocr: true)
|
98
98
|
#
|
99
|
-
|
99
|
+
# Set the output character encoding (e.g. 'UTF-8')
|
100
|
+
#
|
101
|
+
# henkei.text(encoding: 'UTF-8')
|
102
|
+
#
|
103
|
+
def text(include_ocr: false, encoding: nil)
|
100
104
|
return @text if defined? @text
|
101
105
|
|
102
|
-
@text = Henkei.read :text, data, include_ocr: include_ocr
|
106
|
+
@text = Henkei.read :text, data, include_ocr: include_ocr, encoding: encoding
|
103
107
|
end
|
104
108
|
|
105
109
|
# Returns the text content of the Henkei document in HTML.
|
@@ -111,10 +115,14 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
111
115
|
#
|
112
116
|
# henkei.html(include_ocr: true)
|
113
117
|
#
|
114
|
-
|
118
|
+
# Set the output character encoding (e.g. 'UTF-8')
|
119
|
+
#
|
120
|
+
# henkei.html(encoding: 'UTF-8')
|
121
|
+
#
|
122
|
+
def html(include_ocr: false, encoding: nil)
|
115
123
|
return @html if defined? @html
|
116
124
|
|
117
|
-
@html = Henkei.read :html, data, include_ocr: include_ocr
|
125
|
+
@html = Henkei.read :html, data, include_ocr: include_ocr, encoding: encoding
|
118
126
|
end
|
119
127
|
|
120
128
|
# Returns the metadata hash of the Henkei document.
|
@@ -211,20 +219,37 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
211
219
|
|
212
220
|
# Internal helper for calling to Tika library directly
|
213
221
|
#
|
214
|
-
def self.client_read(type, data, include_ocr: false)
|
215
|
-
|
222
|
+
def self.client_read(type, data, include_ocr: false, encoding: nil)
|
223
|
+
unless encoding.nil? || Encoding.name_list.include?(encoding)
|
224
|
+
raise ArgumentError, "unsupported encoding - #{encoding}"
|
225
|
+
end
|
226
|
+
|
227
|
+
Open3.popen2(*tika_command(type, include_ocr: include_ocr, encoding: encoding)) do |stdin, stdout|
|
228
|
+
stdin.binmode
|
229
|
+
stdout.binmode
|
230
|
+
stdout.set_encoding encoding unless encoding.nil?
|
231
|
+
|
232
|
+
out_reader = Thread.new { stdout.read }
|
233
|
+
|
234
|
+
write_data_to_stdin(data, stdin)
|
235
|
+
|
236
|
+
stdin.close
|
237
|
+
|
238
|
+
out_reader.value
|
239
|
+
end
|
216
240
|
end
|
217
241
|
private_class_method :client_read
|
218
242
|
|
219
243
|
# Internal helper for building the Java command to call Tika
|
220
244
|
#
|
221
|
-
def self.tika_command(type, include_ocr: false)
|
245
|
+
def self.tika_command(type, include_ocr: false, encoding: nil)
|
222
246
|
[
|
223
247
|
java_path,
|
224
248
|
'-Djava.awt.headless=true',
|
225
249
|
'-jar',
|
226
250
|
Henkei::JAR_PATH,
|
227
|
-
"--config=#{include_ocr ? Henkei::CONFIG_PATH : Henkei::CONFIG_WITHOUT_OCR_PATH}"
|
251
|
+
"--config=#{include_ocr ? Henkei::CONFIG_PATH : Henkei::CONFIG_WITHOUT_OCR_PATH}",
|
252
|
+
*("--encoding=#{encoding}" unless encoding.nil?)
|
228
253
|
] + switch_for_type(type)
|
229
254
|
end
|
230
255
|
private_class_method :tika_command
|
@@ -240,4 +265,21 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
240
265
|
}[type]
|
241
266
|
end
|
242
267
|
private_class_method :switch_for_type
|
268
|
+
|
269
|
+
# Internal helper for writing the input data to stdin when calling Tika
|
270
|
+
#
|
271
|
+
def self.write_data_to_stdin(data, stdin)
|
272
|
+
return unless data
|
273
|
+
|
274
|
+
begin
|
275
|
+
if data.respond_to? :readpartial
|
276
|
+
IO.copy_stream(data, stdin)
|
277
|
+
else
|
278
|
+
stdin.write data
|
279
|
+
end
|
280
|
+
rescue Errno::EPIPE
|
281
|
+
# Catch broken pipe.
|
282
|
+
end
|
283
|
+
end
|
284
|
+
private_class_method :write_data_to_stdin
|
243
285
|
end
|
data/spec/henkei_spec.rb
CHANGED
@@ -72,6 +72,26 @@ describe Henkei do
|
|
72
72
|
end
|
73
73
|
end
|
74
74
|
end
|
75
|
+
|
76
|
+
context 'when a valid `encoding` value is provided' do
|
77
|
+
let(:encoding) { 'UTF-32' }
|
78
|
+
|
79
|
+
it 'returns the parsed text in the specified encoding' do
|
80
|
+
text = described_class.read :text, data, encoding: encoding
|
81
|
+
|
82
|
+
expect(text.encoding.name).to eq encoding
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
context 'when an invalid `encoding` value is provided' do
|
87
|
+
let(:encoding) { 'Beef' }
|
88
|
+
|
89
|
+
it 'raises an error' do
|
90
|
+
expect do
|
91
|
+
described_class.read :text, data, encoding: encoding
|
92
|
+
end.to raise_error(ArgumentError, "unsupported encoding - #{encoding}")
|
93
|
+
end
|
94
|
+
end
|
75
95
|
end
|
76
96
|
|
77
97
|
describe '.new' do
|
metadata
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: henkei
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.9.2.1
|
4
|
+
version: 2.9.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Erol Fornoles
|
8
8
|
- Andrew Bromwich
|
9
|
-
autorequire:
|
9
|
+
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2024-
|
12
|
+
date: 2024-06-13 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: json
|
@@ -91,7 +91,7 @@ licenses:
|
|
91
91
|
metadata:
|
92
92
|
allowed_push_host: https://rubygems.org
|
93
93
|
rubygems_mfa_required: 'true'
|
94
|
-
post_install_message:
|
94
|
+
post_install_message:
|
95
95
|
rdoc_options: []
|
96
96
|
require_paths:
|
97
97
|
- lib
|
@@ -109,8 +109,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: '0'
|
111
111
|
requirements: []
|
112
|
-
rubygems_version: 3.
|
113
|
-
signing_key:
|
112
|
+
rubygems_version: 3.0.0
|
113
|
+
signing_key:
|
114
114
|
specification_version: 4
|
115
115
|
summary: Read text and metadata from files and documents (.doc, .docx, .pages, .odt,
|
116
116
|
.rtf, .pdf) using Apache Tika toolkit
|