henkei 1.27.1 → 2.2.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +23 -0
- data/henkei.gemspec +1 -0
- data/jar/{tika-app-1.27.jar → tika-app-2.2.0.jar} +0 -0
- data/jar/tika-config-without-ocr.xml +9 -0
- data/jar/tika-config.xml +1 -0
- data/lib/henkei/version.rb +1 -1
- data/lib/henkei.rb +28 -84
- data/spec/henkei_spec.rb +47 -36
- metadata +22 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 64f2ec330c97bf77b16e2f7e14e08f90c405ae42c7aeddce733c0d889eeb4782
|
4
|
+
data.tar.gz: c3b3b91c569c7093bf22a5c751153426f1fdaa62da61f8c40f8b8cabc6ce072c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b9f2263c057cb9e958039930aa3c0244c31b5bcc5e636515a757e3b1c5d43a6b52c932a76a8c132af73707453df1e2652e6f02d9926276a75bceb4deb69ffa59
|
7
|
+
data.tar.gz: f8ea0d87ad2bc75213483824a30edba37507110fc721434f0a40477500dc4674d8ce8d1c2a47b06624249b48228bbe9c9a9f9c86bb19e5cb2dcc18a8dec23dbe
|
data/README.md
CHANGED
@@ -21,6 +21,15 @@ Here are some of the formats supported:
|
|
21
21
|
For the complete list of supported formats, please visit the Apache Tika
|
22
22
|
[Supported Document Formats](http://tika.apache.org/0.9/formats.html) page.
|
23
23
|
|
24
|
+
## Upgrading from v1.x to v2.x
|
25
|
+
|
26
|
+
Apache Tika v2.x brings with it some changes. One key change is that the Tika client and server applications have
|
27
|
+
been split up. To keep the gem size down, Henkei will only include the client app. That is to say, each time you
|
28
|
+
call Henkei, a new Java process will be started to run your command and will then terminate.
|
29
|
+
|
30
|
+
Another change is the metadata keys. A lot of duplicate keys have been removed in favour of a more
|
31
|
+
standards-based approach. A list of the old vs new key names can be found [here](https://cwiki.apache.org/confluence/display/TIKA/Migrating+to+Tika+2.0.0#MigratingtoTika2.0.0-Metadata).
|
32
|
+
|
24
33
|
## Usage
|
25
34
|
|
26
35
|
Text, metadata and MIME type information can be extracted by calling `Henkei.read` directly:
|
@@ -69,6 +78,20 @@ post '/:name/:filename' do
|
|
69
78
|
end
|
70
79
|
```
|
71
80
|
|
81
|
+
### Reading text from inside images (OCR)
|
82
|
+
|
83
|
+
You can enable OCR by specifying the optional `include_ocr: true` when calling the `text` or `html` instance methods,
|
84
|
+
as well as the `read` class method. Note that Tika indicates this will greatly increase processing time.
|
85
|
+
|
86
|
+
```ruby
|
87
|
+
henkei = Henkei.new 'sample.pages'
|
88
|
+
text_with_ocr = henkei.text(include_ocr: true)
|
89
|
+
html_with_ocr = henkei.html(include_ocr: true)
|
90
|
+
|
91
|
+
data = File.read 'sample.pages'
|
92
|
+
text_with_ocr = Henkei.read :text, data, include_ocr: true
|
93
|
+
```
|
94
|
+
|
72
95
|
### Reading metadata
|
73
96
|
|
74
97
|
Metadata is returned as a hash.
|
data/henkei.gemspec
CHANGED
@@ -26,6 +26,7 @@ Gem::Specification.new do |spec|
|
|
26
26
|
spec.add_runtime_dependency 'mini_mime', '>= 0.1.1', '< 2'
|
27
27
|
|
28
28
|
spec.add_development_dependency 'bundler', '~> 2.0'
|
29
|
+
spec.add_development_dependency 'nokogiri', '~> 1.12'
|
29
30
|
spec.add_development_dependency 'rails', '~> 5.0'
|
30
31
|
spec.add_development_dependency 'rake', '~> 12.3'
|
31
32
|
spec.add_development_dependency 'rspec', '~> 3.7'
|
Binary file
|
@@ -0,0 +1,9 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<properties>
|
3
|
+
<service-loader initializableProblemHandler="ignore"/>
|
4
|
+
<parsers>
|
5
|
+
<parser class="org.apache.tika.parser.DefaultParser">
|
6
|
+
<parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
|
7
|
+
</parser>
|
8
|
+
</parsers>
|
9
|
+
</properties>
|
data/jar/tika-config.xml
CHANGED
data/lib/henkei/version.rb
CHANGED
data/lib/henkei.rb
CHANGED
@@ -25,12 +25,9 @@ require 'open3'
|
|
25
25
|
# Read text and metadata from files and documents using Apache Tika toolkit
|
26
26
|
class Henkei # rubocop:disable Metrics/ClassLength
|
27
27
|
GEM_PATH = File.dirname(File.dirname(__FILE__))
|
28
|
-
JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-
|
28
|
+
JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-2.2.0.jar')
|
29
29
|
CONFIG_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-config.xml')
|
30
|
-
|
31
|
-
|
32
|
-
@@server_port = nil
|
33
|
-
@@server_pid = nil
|
30
|
+
CONFIG_WITHOUT_OCR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-config-without-ocr.xml')
|
34
31
|
|
35
32
|
def self.mimetype(content_type)
|
36
33
|
if Henkei.configuration.mime_library == 'mime/types' && defined?(MIME::Types)
|
@@ -50,8 +47,8 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
50
47
|
# text = Henkei.read :text, data
|
51
48
|
# metadata = Henkei.read :metadata, data
|
52
49
|
#
|
53
|
-
def self.read(type, data)
|
54
|
-
result =
|
50
|
+
def self.read(type, data, include_ocr: false)
|
51
|
+
result = client_read(type, data, include_ocr: include_ocr)
|
55
52
|
|
56
53
|
case type
|
57
54
|
when :text then result
|
@@ -96,10 +93,14 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
96
93
|
# henkei = Henkei.new 'sample.pages'
|
97
94
|
# henkei.text
|
98
95
|
#
|
99
|
-
|
96
|
+
# Include OCR results from images (includes embedded images in pages/docx/pdf etc)
|
97
|
+
#
|
98
|
+
# henkei.text(include_ocr: true)
|
99
|
+
#
|
100
|
+
def text(include_ocr: false)
|
100
101
|
return @text if defined? @text
|
101
102
|
|
102
|
-
@text = Henkei.read :text, data
|
103
|
+
@text = Henkei.read :text, data, include_ocr: include_ocr
|
103
104
|
end
|
104
105
|
|
105
106
|
# Returns the text content of the Henkei document in HTML.
|
@@ -107,10 +108,14 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
107
108
|
# henkei = Henkei.new 'sample.pages'
|
108
109
|
# henkei.html
|
109
110
|
#
|
110
|
-
|
111
|
+
# Include OCR results from images (includes embedded images in pages/docx/pdf etc)
|
112
|
+
#
|
113
|
+
# henkei.html(include_ocr: true)
|
114
|
+
#
|
115
|
+
def html(include_ocr: false)
|
111
116
|
return @html if defined? @html
|
112
117
|
|
113
|
-
@html = Henkei.read :html, data
|
118
|
+
@html = Henkei.read :html, data, include_ocr: include_ocr
|
114
119
|
end
|
115
120
|
|
116
121
|
# Returns the metadata hash of the Henkei document.
|
@@ -144,9 +149,9 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
144
149
|
#
|
145
150
|
def creation_date
|
146
151
|
return @creation_date if defined? @creation_date
|
147
|
-
return unless metadata['
|
152
|
+
return unless metadata['dcterms:created']
|
148
153
|
|
149
|
-
@creation_date = Time.parse(metadata['
|
154
|
+
@creation_date = Time.parse(metadata['dcterms:created'])
|
150
155
|
end
|
151
156
|
|
152
157
|
# Returns +true+ if the Henkei document was specified using a file path.
|
@@ -196,44 +201,6 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
196
201
|
@data
|
197
202
|
end
|
198
203
|
|
199
|
-
# Returns pid of Tika server, started as a new spawned process.
|
200
|
-
#
|
201
|
-
# type :html, :text or :metadata
|
202
|
-
# custom_port e.g. 9293
|
203
|
-
#
|
204
|
-
# Henkei.server(:text, 9294)
|
205
|
-
#
|
206
|
-
def self.server(type, custom_port = nil)
|
207
|
-
@@server_port = custom_port || DEFAULT_SERVER_PORT
|
208
|
-
|
209
|
-
@@server_pid = Process.spawn(*tika_command(type, server: true))
|
210
|
-
sleep(2) # Give the server 2 seconds to spin up.
|
211
|
-
@@server_pid
|
212
|
-
end
|
213
|
-
|
214
|
-
# Kills server started by Henkei.server
|
215
|
-
#
|
216
|
-
# Always run this when you're done, or else Tika might run until you kill it manually
|
217
|
-
# You might try putting your extraction in a begin..rescue...ensure...end block and
|
218
|
-
# putting this method in the ensure block.
|
219
|
-
#
|
220
|
-
# Henkei.server(:text)
|
221
|
-
# reports = ["report1.docx", "report2.doc", "report3.pdf"]
|
222
|
-
# begin
|
223
|
-
# my_texts = reports.map{ |report_path| Henkei.new(report_path).text }
|
224
|
-
# rescue
|
225
|
-
# ensure
|
226
|
-
# Henkei.kill_server!
|
227
|
-
# end
|
228
|
-
#
|
229
|
-
def self.kill_server!
|
230
|
-
return unless @@server_pid
|
231
|
-
|
232
|
-
Process.kill('INT', @@server_pid)
|
233
|
-
@@server_pid = nil
|
234
|
-
@@server_port = nil
|
235
|
-
end
|
236
|
-
|
237
204
|
### Private class methods
|
238
205
|
|
239
206
|
# Provide the path to the Java binary
|
@@ -245,44 +212,21 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
245
212
|
|
246
213
|
# Internal helper for calling to Tika library directly
|
247
214
|
#
|
248
|
-
def self.client_read(type, data)
|
249
|
-
Open3.capture2(*tika_command(type), stdin_data: data, binmode: true).first
|
215
|
+
def self.client_read(type, data, include_ocr: false)
|
216
|
+
Open3.capture2(*tika_command(type, include_ocr: include_ocr), stdin_data: data, binmode: true).first
|
250
217
|
end
|
251
218
|
private_class_method :client_read
|
252
219
|
|
253
|
-
# Internal helper for calling to running Tika server
|
254
|
-
#
|
255
|
-
def self.server_read(data)
|
256
|
-
s = TCPSocket.new('localhost', @@server_port)
|
257
|
-
file = StringIO.new(data, 'r')
|
258
|
-
|
259
|
-
loop do
|
260
|
-
chunk = file.read(65_536)
|
261
|
-
break unless chunk
|
262
|
-
|
263
|
-
s.write(chunk)
|
264
|
-
end
|
265
|
-
|
266
|
-
# tell Tika that we're done sending data
|
267
|
-
s.shutdown(Socket::SHUT_WR)
|
268
|
-
|
269
|
-
resp = String.new ''
|
270
|
-
loop do
|
271
|
-
chunk = s.recv(65_536)
|
272
|
-
break if chunk.empty? || !chunk
|
273
|
-
|
274
|
-
resp << chunk
|
275
|
-
end
|
276
|
-
resp
|
277
|
-
end
|
278
|
-
private_class_method :server_read
|
279
|
-
|
280
220
|
# Internal helper for building the Java command to call Tika
|
281
221
|
#
|
282
|
-
def self.tika_command(type,
|
283
|
-
|
284
|
-
|
285
|
-
|
222
|
+
def self.tika_command(type, include_ocr: false)
|
223
|
+
[
|
224
|
+
java_path,
|
225
|
+
'-Djava.awt.headless=true',
|
226
|
+
'-jar',
|
227
|
+
Henkei::JAR_PATH,
|
228
|
+
"--config=#{include_ocr ? Henkei::CONFIG_PATH : Henkei::CONFIG_WITHOUT_OCR_PATH}"
|
229
|
+
] + switch_for_type(type)
|
286
230
|
end
|
287
231
|
private_class_method :tika_command
|
288
232
|
|
data/spec/henkei_spec.rb
CHANGED
@@ -2,10 +2,15 @@
|
|
2
2
|
|
3
3
|
require 'helper'
|
4
4
|
require 'henkei'
|
5
|
+
require 'nokogiri'
|
5
6
|
|
6
7
|
# Some of the tests have been known to fail in weird and wonderful ways when `rails` is included
|
7
8
|
require 'rails' if ENV['INCLUDE_RAILS'] == 'true'
|
8
9
|
|
10
|
+
def travis_ci?
|
11
|
+
ENV['CI'] == 'true' && ENV['TRAVIS'] == 'true'
|
12
|
+
end
|
13
|
+
|
9
14
|
describe Henkei do
|
10
15
|
let(:data) { File.read 'spec/samples/sample.docx' }
|
11
16
|
|
@@ -52,6 +57,23 @@ describe Henkei do
|
|
52
57
|
|
53
58
|
expect(text).to eq ''
|
54
59
|
end
|
60
|
+
|
61
|
+
unless travis_ci?
|
62
|
+
context 'when `include_ocr` is enabled' do
|
63
|
+
it 'returns parsed plain text in the image' do
|
64
|
+
text = Henkei.read :text, data, include_ocr: true
|
65
|
+
|
66
|
+
expect(text).to include <<~TEXT
|
67
|
+
West Side
|
68
|
+
|
69
|
+
Sea Island
|
70
|
+
PP
|
71
|
+
|
72
|
+
Richmond
|
73
|
+
TEXT
|
74
|
+
end
|
75
|
+
end
|
76
|
+
end
|
55
77
|
end
|
56
78
|
end
|
57
79
|
|
@@ -115,6 +137,7 @@ describe Henkei do
|
|
115
137
|
|
116
138
|
describe '.creation_date' do
|
117
139
|
let(:henkei) { Henkei.new 'spec/samples/sample.pages' }
|
140
|
+
|
118
141
|
it 'should return Time' do
|
119
142
|
expect(henkei.creation_date).to be_a Time
|
120
143
|
end
|
@@ -158,6 +181,30 @@ describe Henkei do
|
|
158
181
|
it '#mimetype returns `image/png`' do
|
159
182
|
expect(henkei.mimetype.content_type).to eq 'image/png'
|
160
183
|
end
|
184
|
+
|
185
|
+
unless travis_ci?
|
186
|
+
context 'when `include_ocr` is enabled' do
|
187
|
+
it '#text returns plain text of parsed text in the image' do
|
188
|
+
expect(henkei.text(include_ocr: true)).to include <<~TEXT
|
189
|
+
West Side
|
190
|
+
|
191
|
+
Sea Island
|
192
|
+
PP
|
193
|
+
|
194
|
+
Richmond
|
195
|
+
TEXT
|
196
|
+
end
|
197
|
+
|
198
|
+
it '#html returns HTML of parsed text in the image' do
|
199
|
+
expect(henkei.html(include_ocr: true)).to include '<meta name="tiff:ImageWidth" content="792"/>'
|
200
|
+
|
201
|
+
html_body = Nokogiri::HTML(henkei.html(include_ocr: true)).at_xpath('//body')
|
202
|
+
['Anmore', 'Coquitlam', 'West Side', 'Sea Island', 'Richmond', 'Steveston'].each do |location|
|
203
|
+
expect(html_body.text).to include location
|
204
|
+
end
|
205
|
+
end
|
206
|
+
end
|
207
|
+
end
|
161
208
|
end
|
162
209
|
end
|
163
210
|
|
@@ -198,40 +245,4 @@ describe Henkei do
|
|
198
245
|
expect(henkei.metadata['Content-Type']).to eq 'application/pdf'
|
199
246
|
end
|
200
247
|
end
|
201
|
-
|
202
|
-
context 'working as server mode' do
|
203
|
-
specify '#starts and kills server' do
|
204
|
-
begin
|
205
|
-
Henkei.server(:text)
|
206
|
-
expect(Henkei.class_variable_get(:@@server_pid)).not_to be_nil
|
207
|
-
expect(Henkei.class_variable_get(:@@server_port)).not_to be_nil
|
208
|
-
|
209
|
-
s = TCPSocket.new('localhost', Henkei.class_variable_get(:@@server_port))
|
210
|
-
expect(s).to be_a TCPSocket
|
211
|
-
s.close
|
212
|
-
ensure
|
213
|
-
port = Henkei.class_variable_get(:@@server_port)
|
214
|
-
Henkei.kill_server!
|
215
|
-
sleep 2
|
216
|
-
expect { TCPSocket.new('localhost', port) }.to raise_error Errno::ECONNREFUSED
|
217
|
-
end
|
218
|
-
end
|
219
|
-
|
220
|
-
specify '#runs samples through server mode' do
|
221
|
-
begin
|
222
|
-
Henkei.server(:text)
|
223
|
-
expect(Henkei.new('spec/samples/sample.pages').text).to(
|
224
|
-
include 'The quick brown fox jumped over the lazy cat.'
|
225
|
-
)
|
226
|
-
expect(Henkei.new('spec/samples/sample filename with spaces.pages').text).to(
|
227
|
-
include 'The quick brown fox jumped over the lazy cat.'
|
228
|
-
)
|
229
|
-
expect(Henkei.new('spec/samples/sample.docx').text).to(
|
230
|
-
include 'The quick brown fox jumped over the lazy cat.'
|
231
|
-
)
|
232
|
-
ensure
|
233
|
-
Henkei.kill_server!
|
234
|
-
end
|
235
|
-
end
|
236
|
-
end
|
237
248
|
end
|
metadata
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: henkei
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.2.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Erol Fornoles
|
8
8
|
- Andrew Bromwich
|
9
|
-
autorequire:
|
9
|
+
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2021-
|
12
|
+
date: 2021-12-27 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: json
|
@@ -65,6 +65,20 @@ dependencies:
|
|
65
65
|
- - "~>"
|
66
66
|
- !ruby/object:Gem::Version
|
67
67
|
version: '2.0'
|
68
|
+
- !ruby/object:Gem::Dependency
|
69
|
+
name: nokogiri
|
70
|
+
requirement: !ruby/object:Gem::Requirement
|
71
|
+
requirements:
|
72
|
+
- - "~>"
|
73
|
+
- !ruby/object:Gem::Version
|
74
|
+
version: '1.12'
|
75
|
+
type: :development
|
76
|
+
prerelease: false
|
77
|
+
version_requirements: !ruby/object:Gem::Requirement
|
78
|
+
requirements:
|
79
|
+
- - "~>"
|
80
|
+
- !ruby/object:Gem::Version
|
81
|
+
version: '1.12'
|
68
82
|
- !ruby/object:Gem::Dependency
|
69
83
|
name: rails
|
70
84
|
requirement: !ruby/object:Gem::Requirement
|
@@ -155,7 +169,8 @@ files:
|
|
155
169
|
- Rakefile
|
156
170
|
- bin/console
|
157
171
|
- henkei.gemspec
|
158
|
-
- jar/tika-app-
|
172
|
+
- jar/tika-app-2.2.0.jar
|
173
|
+
- jar/tika-config-without-ocr.xml
|
159
174
|
- jar/tika-config.xml
|
160
175
|
- lib/henkei.rb
|
161
176
|
- lib/henkei/configuration.rb
|
@@ -172,7 +187,7 @@ homepage: http://github.com/abrom/henkei
|
|
172
187
|
licenses:
|
173
188
|
- MIT
|
174
189
|
metadata: {}
|
175
|
-
post_install_message:
|
190
|
+
post_install_message:
|
176
191
|
rdoc_options: []
|
177
192
|
require_paths:
|
178
193
|
- lib
|
@@ -190,8 +205,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
190
205
|
- !ruby/object:Gem::Version
|
191
206
|
version: '0'
|
192
207
|
requirements: []
|
193
|
-
rubygems_version: 3.0.
|
194
|
-
signing_key:
|
208
|
+
rubygems_version: 3.0.9
|
209
|
+
signing_key:
|
195
210
|
specification_version: 4
|
196
211
|
summary: Read text and metadata from files and documents (.doc, .docx, .pages, .odt,
|
197
212
|
.rtf, .pdf) using Apache Tika toolkit
|