henkei 1.24.1 → 2.2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +0 -1
- data/README.md +23 -0
- data/henkei.gemspec +1 -0
- data/jar/{tika-app-1.24.1.jar → tika-app-2.2.0.jar} +0 -0
- data/jar/tika-config-without-ocr.xml +9 -0
- data/jar/tika-config.xml +1 -0
- data/lib/henkei/version.rb +1 -1
- data/lib/henkei.rb +28 -84
- data/spec/henkei_spec.rb +47 -36
- metadata +22 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 64f2ec330c97bf77b16e2f7e14e08f90c405ae42c7aeddce733c0d889eeb4782
|
4
|
+
data.tar.gz: c3b3b91c569c7093bf22a5c751153426f1fdaa62da61f8c40f8b8cabc6ce072c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b9f2263c057cb9e958039930aa3c0244c31b5bcc5e636515a757e3b1c5d43a6b52c932a76a8c132af73707453df1e2652e6f02d9926276a75bceb4deb69ffa59
|
7
|
+
data.tar.gz: f8ea0d87ad2bc75213483824a30edba37507110fc721434f0a40477500dc4674d8ce8d1c2a47b06624249b48228bbe9c9a9f9c86bb19e5cb2dcc18a8dec23dbe
|
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -21,6 +21,15 @@ Here are some of the formats supported:
|
|
21
21
|
For the complete list of supported formats, please visit the Apache Tika
|
22
22
|
[Supported Document Formats](http://tika.apache.org/0.9/formats.html) page.
|
23
23
|
|
24
|
+
## Upgrading from v1.x to v2.x
|
25
|
+
|
26
|
+
Apache Tika v2.x brings with it some changes. One key change is that the Tika client and server applications have
|
27
|
+
been split up. To keep the gem size down Henkei will only include the client app. That is to say, each time you
|
28
|
+
call Henkei, a new Java process will be started, run your command, and then terminate.
|
29
|
+
|
30
|
+
Another change is the metadata keys. A lot of duplicate keys have been removed in favour of a more standards
|
31
|
+
based approach. A list of the old vs new key names can be found [here](https://cwiki.apache.org/confluence/display/TIKA/Migrating+to+Tika+2.0.0#MigratingtoTika2.0.0-Metadata).
|
32
|
+
|
24
33
|
## Usage
|
25
34
|
|
26
35
|
Text, metadata and MIME type information can be extracted by calling `Henkei.read` directly:
|
@@ -69,6 +78,20 @@ post '/:name/:filename' do
|
|
69
78
|
end
|
70
79
|
```
|
71
80
|
|
81
|
+
### Reading text from inside images (OCR)
|
82
|
+
|
83
|
+
You can enable OCR by passing the optional `include_ocr: true` argument when calling the `text` or `html` instance methods,
|
84
|
+
as well as the `read` class method. Note that, according to Tika, this will greatly increase processing time.
|
85
|
+
|
86
|
+
```ruby
|
87
|
+
henkei = Henkei.new 'sample.pages'
|
88
|
+
text_with_ocr = henkei.text(include_ocr: true)
|
89
|
+
html_with_ocr = henkei.html(include_ocr: true)
|
90
|
+
|
91
|
+
data = File.read 'sample.pages'
|
92
|
+
text_with_ocr = Henkei.read :text, data, include_ocr: true
|
93
|
+
```
|
94
|
+
|
72
95
|
### Reading metadata
|
73
96
|
|
74
97
|
Metadata is returned as a hash.
|
data/henkei.gemspec
CHANGED
@@ -26,6 +26,7 @@ Gem::Specification.new do |spec|
|
|
26
26
|
spec.add_runtime_dependency 'mini_mime', '>= 0.1.1', '< 2'
|
27
27
|
|
28
28
|
spec.add_development_dependency 'bundler', '~> 2.0'
|
29
|
+
spec.add_development_dependency 'nokogiri', '~> 1.12'
|
29
30
|
spec.add_development_dependency 'rails', '~> 5.0'
|
30
31
|
spec.add_development_dependency 'rake', '~> 12.3'
|
31
32
|
spec.add_development_dependency 'rspec', '~> 3.7'
|
Binary file
|
@@ -0,0 +1,9 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<properties>
|
3
|
+
<service-loader initializableProblemHandler="ignore"/>
|
4
|
+
<parsers>
|
5
|
+
<parser class="org.apache.tika.parser.DefaultParser">
|
6
|
+
<parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
|
7
|
+
</parser>
|
8
|
+
</parsers>
|
9
|
+
</properties>
|
data/jar/tika-config.xml
CHANGED
data/lib/henkei/version.rb
CHANGED
data/lib/henkei.rb
CHANGED
@@ -25,12 +25,9 @@ require 'open3'
|
|
25
25
|
# Read text and metadata from files and documents using Apache Tika toolkit
|
26
26
|
class Henkei # rubocop:disable Metrics/ClassLength
|
27
27
|
GEM_PATH = File.dirname(File.dirname(__FILE__))
|
28
|
-
JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-1.24.1.jar')
|
28
|
+
JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-2.2.0.jar')
|
29
29
|
CONFIG_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-config.xml')
|
30
|
-
|
31
|
-
|
32
|
-
@@server_port = nil
|
33
|
-
@@server_pid = nil
|
30
|
+
CONFIG_WITHOUT_OCR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-config-without-ocr.xml')
|
34
31
|
|
35
32
|
def self.mimetype(content_type)
|
36
33
|
if Henkei.configuration.mime_library == 'mime/types' && defined?(MIME::Types)
|
@@ -50,8 +47,8 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
50
47
|
# text = Henkei.read :text, data
|
51
48
|
# metadata = Henkei.read :metadata, data
|
52
49
|
#
|
53
|
-
def self.read(type, data)
|
54
|
-
result =
|
50
|
+
def self.read(type, data, include_ocr: false)
|
51
|
+
result = client_read(type, data, include_ocr: include_ocr)
|
55
52
|
|
56
53
|
case type
|
57
54
|
when :text then result
|
@@ -96,10 +93,14 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
96
93
|
# henkei = Henkei.new 'sample.pages'
|
97
94
|
# henkei.text
|
98
95
|
#
|
99
|
-
|
96
|
+
# Include OCR results from images (includes embedded images in pages/docx/pdf etc)
|
97
|
+
#
|
98
|
+
# henkei.text(include_ocr: true)
|
99
|
+
#
|
100
|
+
def text(include_ocr: false)
|
100
101
|
return @text if defined? @text
|
101
102
|
|
102
|
-
@text = Henkei.read :text, data
|
103
|
+
@text = Henkei.read :text, data, include_ocr: include_ocr
|
103
104
|
end
|
104
105
|
|
105
106
|
# Returns the text content of the Henkei document in HTML.
|
@@ -107,10 +108,14 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
107
108
|
# henkei = Henkei.new 'sample.pages'
|
108
109
|
# henkei.html
|
109
110
|
#
|
110
|
-
|
111
|
+
# Include OCR results from images (includes embedded images in pages/docx/pdf etc)
|
112
|
+
#
|
113
|
+
# henkei.html(include_ocr: true)
|
114
|
+
#
|
115
|
+
def html(include_ocr: false)
|
111
116
|
return @html if defined? @html
|
112
117
|
|
113
|
-
@html = Henkei.read :html, data
|
118
|
+
@html = Henkei.read :html, data, include_ocr: include_ocr
|
114
119
|
end
|
115
120
|
|
116
121
|
# Returns the metadata hash of the Henkei document.
|
@@ -144,9 +149,9 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
144
149
|
#
|
145
150
|
def creation_date
|
146
151
|
return @creation_date if defined? @creation_date
|
147
|
-
return unless metadata['
|
152
|
+
return unless metadata['dcterms:created']
|
148
153
|
|
149
|
-
@creation_date = Time.parse(metadata['
|
154
|
+
@creation_date = Time.parse(metadata['dcterms:created'])
|
150
155
|
end
|
151
156
|
|
152
157
|
# Returns +true+ if the Henkei document was specified using a file path.
|
@@ -196,44 +201,6 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
196
201
|
@data
|
197
202
|
end
|
198
203
|
|
199
|
-
# Returns pid of Tika server, started as a new spawned process.
|
200
|
-
#
|
201
|
-
# type :html, :text or :metadata
|
202
|
-
# custom_port e.g. 9293
|
203
|
-
#
|
204
|
-
# Henkei.server(:text, 9294)
|
205
|
-
#
|
206
|
-
def self.server(type, custom_port = nil)
|
207
|
-
@@server_port = custom_port || DEFAULT_SERVER_PORT
|
208
|
-
|
209
|
-
@@server_pid = Process.spawn(*tika_command(type, server: true))
|
210
|
-
sleep(2) # Give the server 2 seconds to spin up.
|
211
|
-
@@server_pid
|
212
|
-
end
|
213
|
-
|
214
|
-
# Kills server started by Henkei.server
|
215
|
-
#
|
216
|
-
# Always run this when you're done, or else Tika might run until you kill it manually
|
217
|
-
# You might try putting your extraction in a begin..rescue...ensure...end block and
|
218
|
-
# putting this method in the ensure block.
|
219
|
-
#
|
220
|
-
# Henkei.server(:text)
|
221
|
-
# reports = ["report1.docx", "report2.doc", "report3.pdf"]
|
222
|
-
# begin
|
223
|
-
# my_texts = reports.map{ |report_path| Henkei.new(report_path).text }
|
224
|
-
# rescue
|
225
|
-
# ensure
|
226
|
-
# Henkei.kill_server!
|
227
|
-
# end
|
228
|
-
#
|
229
|
-
def self.kill_server!
|
230
|
-
return unless @@server_pid
|
231
|
-
|
232
|
-
Process.kill('INT', @@server_pid)
|
233
|
-
@@server_pid = nil
|
234
|
-
@@server_port = nil
|
235
|
-
end
|
236
|
-
|
237
204
|
### Private class methods
|
238
205
|
|
239
206
|
# Provide the path to the Java binary
|
@@ -245,44 +212,21 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
245
212
|
|
246
213
|
# Internal helper for calling to Tika library directly
|
247
214
|
#
|
248
|
-
def self.client_read(type, data)
|
249
|
-
Open3.capture2(*tika_command(type), stdin_data: data, binmode: true).first
|
215
|
+
def self.client_read(type, data, include_ocr: false)
|
216
|
+
Open3.capture2(*tika_command(type, include_ocr: include_ocr), stdin_data: data, binmode: true).first
|
250
217
|
end
|
251
218
|
private_class_method :client_read
|
252
219
|
|
253
|
-
# Internal helper for calling to running Tika server
|
254
|
-
#
|
255
|
-
def self.server_read(data)
|
256
|
-
s = TCPSocket.new('localhost', @@server_port)
|
257
|
-
file = StringIO.new(data, 'r')
|
258
|
-
|
259
|
-
loop do
|
260
|
-
chunk = file.read(65_536)
|
261
|
-
break unless chunk
|
262
|
-
|
263
|
-
s.write(chunk)
|
264
|
-
end
|
265
|
-
|
266
|
-
# tell Tika that we're done sending data
|
267
|
-
s.shutdown(Socket::SHUT_WR)
|
268
|
-
|
269
|
-
resp = String.new ''
|
270
|
-
loop do
|
271
|
-
chunk = s.recv(65_536)
|
272
|
-
break if chunk.empty? || !chunk
|
273
|
-
|
274
|
-
resp << chunk
|
275
|
-
end
|
276
|
-
resp
|
277
|
-
end
|
278
|
-
private_class_method :server_read
|
279
|
-
|
280
220
|
# Internal helper for building the Java command to call Tika
|
281
221
|
#
|
282
|
-
def self.tika_command(type,
|
283
|
-
|
284
|
-
|
285
|
-
|
222
|
+
def self.tika_command(type, include_ocr: false)
|
223
|
+
[
|
224
|
+
java_path,
|
225
|
+
'-Djava.awt.headless=true',
|
226
|
+
'-jar',
|
227
|
+
Henkei::JAR_PATH,
|
228
|
+
"--config=#{include_ocr ? Henkei::CONFIG_PATH : Henkei::CONFIG_WITHOUT_OCR_PATH}"
|
229
|
+
] + switch_for_type(type)
|
286
230
|
end
|
287
231
|
private_class_method :tika_command
|
288
232
|
|
data/spec/henkei_spec.rb
CHANGED
@@ -2,10 +2,15 @@
|
|
2
2
|
|
3
3
|
require 'helper'
|
4
4
|
require 'henkei'
|
5
|
+
require 'nokogiri'
|
5
6
|
|
6
7
|
# Some of the tests have been known to fail in weird and wonderful ways when `rails` is included
|
7
8
|
require 'rails' if ENV['INCLUDE_RAILS'] == 'true'
|
8
9
|
|
10
|
+
def travis_ci?
|
11
|
+
ENV['CI'] == 'true' && ENV['TRAVIS'] == 'true'
|
12
|
+
end
|
13
|
+
|
9
14
|
describe Henkei do
|
10
15
|
let(:data) { File.read 'spec/samples/sample.docx' }
|
11
16
|
|
@@ -52,6 +57,23 @@ describe Henkei do
|
|
52
57
|
|
53
58
|
expect(text).to eq ''
|
54
59
|
end
|
60
|
+
|
61
|
+
unless travis_ci?
|
62
|
+
context 'when `include_ocr` is enabled' do
|
63
|
+
it 'returns parsed plain text in the image' do
|
64
|
+
text = Henkei.read :text, data, include_ocr: true
|
65
|
+
|
66
|
+
expect(text).to include <<~TEXT
|
67
|
+
West Side
|
68
|
+
|
69
|
+
Sea Island
|
70
|
+
PP
|
71
|
+
|
72
|
+
Richmond
|
73
|
+
TEXT
|
74
|
+
end
|
75
|
+
end
|
76
|
+
end
|
55
77
|
end
|
56
78
|
end
|
57
79
|
|
@@ -115,6 +137,7 @@ describe Henkei do
|
|
115
137
|
|
116
138
|
describe '.creation_date' do
|
117
139
|
let(:henkei) { Henkei.new 'spec/samples/sample.pages' }
|
140
|
+
|
118
141
|
it 'should return Time' do
|
119
142
|
expect(henkei.creation_date).to be_a Time
|
120
143
|
end
|
@@ -158,6 +181,30 @@ describe Henkei do
|
|
158
181
|
it '#mimetype returns `image/png`' do
|
159
182
|
expect(henkei.mimetype.content_type).to eq 'image/png'
|
160
183
|
end
|
184
|
+
|
185
|
+
unless travis_ci?
|
186
|
+
context 'when `include_ocr` is enabled' do
|
187
|
+
it '#text returns plain text of parsed text in the image' do
|
188
|
+
expect(henkei.text(include_ocr: true)).to include <<~TEXT
|
189
|
+
West Side
|
190
|
+
|
191
|
+
Sea Island
|
192
|
+
PP
|
193
|
+
|
194
|
+
Richmond
|
195
|
+
TEXT
|
196
|
+
end
|
197
|
+
|
198
|
+
it '#html returns HTML of parsed text in the image' do
|
199
|
+
expect(henkei.html(include_ocr: true)).to include '<meta name="tiff:ImageWidth" content="792"/>'
|
200
|
+
|
201
|
+
html_body = Nokogiri::HTML(henkei.html(include_ocr: true)).at_xpath('//body')
|
202
|
+
['Anmore', 'Coquitlam', 'West Side', 'Sea Island', 'Richmond', 'Steveston'].each do |location|
|
203
|
+
expect(html_body.text).to include location
|
204
|
+
end
|
205
|
+
end
|
206
|
+
end
|
207
|
+
end
|
161
208
|
end
|
162
209
|
end
|
163
210
|
|
@@ -198,40 +245,4 @@ describe Henkei do
|
|
198
245
|
expect(henkei.metadata['Content-Type']).to eq 'application/pdf'
|
199
246
|
end
|
200
247
|
end
|
201
|
-
|
202
|
-
context 'working as server mode' do
|
203
|
-
specify '#starts and kills server' do
|
204
|
-
begin
|
205
|
-
Henkei.server(:text)
|
206
|
-
expect(Henkei.class_variable_get(:@@server_pid)).not_to be_nil
|
207
|
-
expect(Henkei.class_variable_get(:@@server_port)).not_to be_nil
|
208
|
-
|
209
|
-
s = TCPSocket.new('localhost', Henkei.class_variable_get(:@@server_port))
|
210
|
-
expect(s).to be_a TCPSocket
|
211
|
-
s.close
|
212
|
-
ensure
|
213
|
-
port = Henkei.class_variable_get(:@@server_port)
|
214
|
-
Henkei.kill_server!
|
215
|
-
sleep 2
|
216
|
-
expect { TCPSocket.new('localhost', port) }.to raise_error Errno::ECONNREFUSED
|
217
|
-
end
|
218
|
-
end
|
219
|
-
|
220
|
-
specify '#runs samples through server mode' do
|
221
|
-
begin
|
222
|
-
Henkei.server(:text)
|
223
|
-
expect(Henkei.new('spec/samples/sample.pages').text).to(
|
224
|
-
include 'The quick brown fox jumped over the lazy cat.'
|
225
|
-
)
|
226
|
-
expect(Henkei.new('spec/samples/sample filename with spaces.pages').text).to(
|
227
|
-
include 'The quick brown fox jumped over the lazy cat.'
|
228
|
-
)
|
229
|
-
expect(Henkei.new('spec/samples/sample.docx').text).to(
|
230
|
-
include 'The quick brown fox jumped over the lazy cat.'
|
231
|
-
)
|
232
|
-
ensure
|
233
|
-
Henkei.kill_server!
|
234
|
-
end
|
235
|
-
end
|
236
|
-
end
|
237
248
|
end
|
metadata
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: henkei
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.24.1
|
4
|
+
version: 2.2.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Erol Fornoles
|
8
8
|
- Andrew Bromwich
|
9
|
-
autorequire:
|
9
|
+
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2021-
|
12
|
+
date: 2021-12-27 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: json
|
@@ -65,6 +65,20 @@ dependencies:
|
|
65
65
|
- - "~>"
|
66
66
|
- !ruby/object:Gem::Version
|
67
67
|
version: '2.0'
|
68
|
+
- !ruby/object:Gem::Dependency
|
69
|
+
name: nokogiri
|
70
|
+
requirement: !ruby/object:Gem::Requirement
|
71
|
+
requirements:
|
72
|
+
- - "~>"
|
73
|
+
- !ruby/object:Gem::Version
|
74
|
+
version: '1.12'
|
75
|
+
type: :development
|
76
|
+
prerelease: false
|
77
|
+
version_requirements: !ruby/object:Gem::Requirement
|
78
|
+
requirements:
|
79
|
+
- - "~>"
|
80
|
+
- !ruby/object:Gem::Version
|
81
|
+
version: '1.12'
|
68
82
|
- !ruby/object:Gem::Dependency
|
69
83
|
name: rails
|
70
84
|
requirement: !ruby/object:Gem::Requirement
|
@@ -155,7 +169,8 @@ files:
|
|
155
169
|
- Rakefile
|
156
170
|
- bin/console
|
157
171
|
- henkei.gemspec
|
158
|
-
- jar/tika-app-1.24.1.jar
|
172
|
+
- jar/tika-app-2.2.0.jar
|
173
|
+
- jar/tika-config-without-ocr.xml
|
159
174
|
- jar/tika-config.xml
|
160
175
|
- lib/henkei.rb
|
161
176
|
- lib/henkei/configuration.rb
|
@@ -172,7 +187,7 @@ homepage: http://github.com/abrom/henkei
|
|
172
187
|
licenses:
|
173
188
|
- MIT
|
174
189
|
metadata: {}
|
175
|
-
post_install_message:
|
190
|
+
post_install_message:
|
176
191
|
rdoc_options: []
|
177
192
|
require_paths:
|
178
193
|
- lib
|
@@ -190,8 +205,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
190
205
|
- !ruby/object:Gem::Version
|
191
206
|
version: '0'
|
192
207
|
requirements: []
|
193
|
-
rubygems_version: 3.0.
|
194
|
-
signing_key:
|
208
|
+
rubygems_version: 3.0.9
|
209
|
+
signing_key:
|
195
210
|
specification_version: 4
|
196
211
|
summary: Read text and metadata from files and documents (.doc, .docx, .pages, .odt,
|
197
212
|
.rtf, .pdf) using Apache Tika toolkit
|