henkei 1.28.5.2 → 2.2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +0 -12
- data/.travis.yml +32 -0
- data/Gemfile +0 -9
- data/README.md +24 -1
- data/henkei.gemspec +11 -9
- data/jar/{tika-app-1.28.5.jar → tika-app-2.2.0.jar} +0 -0
- data/jar/tika-config-without-ocr.xml +9 -0
- data/lib/henkei/version.rb +1 -1
- data/lib/henkei.rb +33 -97
- data/spec/henkei_spec.rb +71 -57
- metadata +116 -12
- data/.github/workflows/test.yml +0 -37
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 64f2ec330c97bf77b16e2f7e14e08f90c405ae42c7aeddce733c0d889eeb4782
|
4
|
+
data.tar.gz: c3b3b91c569c7093bf22a5c751153426f1fdaa62da61f8c40f8b8cabc6ce072c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b9f2263c057cb9e958039930aa3c0244c31b5bcc5e636515a757e3b1c5d43a6b52c932a76a8c132af73707453df1e2652e6f02d9926276a75bceb4deb69ffa59
|
7
|
+
data.tar.gz: f8ea0d87ad2bc75213483824a30edba37507110fc721434f0a40477500dc4674d8ce8d1c2a47b06624249b48228bbe9c9a9f9c86bb19e5cb2dcc18a8dec23dbe
|
data/.rubocop.yml
CHANGED
@@ -1,11 +1,5 @@
|
|
1
|
-
require:
|
2
|
-
- rubocop-performance
|
3
|
-
- rubocop-rake
|
4
|
-
- rubocop-rspec
|
5
|
-
|
6
1
|
AllCops:
|
7
2
|
NewCops: enable
|
8
|
-
TargetRubyVersion: 3.0
|
9
3
|
|
10
4
|
Layout/EmptyLinesAroundAttributeAccessor:
|
11
5
|
Enabled: true
|
@@ -35,12 +29,6 @@ Metrics/BlockLength:
|
|
35
29
|
Metrics/MethodLength:
|
36
30
|
Max: 15
|
37
31
|
|
38
|
-
RSpec/ExampleLength:
|
39
|
-
Max: 12
|
40
|
-
|
41
|
-
RSpec/MultipleExpectations:
|
42
|
-
Max: 4
|
43
|
-
|
44
32
|
Style/ClassVars:
|
45
33
|
Enabled: false
|
46
34
|
|
data/.travis.yml
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
env:
|
2
|
+
global:
|
3
|
+
- CC_TEST_REPORTER_ID=bb96c1ff9dc66724c38fb4eb54486dd72dc88a7fd6e727c034b9cf8d747d069e
|
4
|
+
jobs:
|
5
|
+
- INCLUDE_RAILS=false
|
6
|
+
- INCLUDE_RAILS=true
|
7
|
+
|
8
|
+
language: ruby
|
9
|
+
rvm:
|
10
|
+
- 2.5
|
11
|
+
- 2.6
|
12
|
+
- 2.7
|
13
|
+
- 3.0
|
14
|
+
|
15
|
+
before_install:
|
16
|
+
- gem update bundler
|
17
|
+
|
18
|
+
install:
|
19
|
+
- bundle install --jobs=3 --retry=3
|
20
|
+
- gem install rubocop
|
21
|
+
|
22
|
+
before_script:
|
23
|
+
- curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter
|
24
|
+
- chmod +x ./cc-test-reporter
|
25
|
+
- ./cc-test-reporter before-build
|
26
|
+
|
27
|
+
script:
|
28
|
+
- bundle exec rubocop
|
29
|
+
- bundle exec rspec
|
30
|
+
|
31
|
+
after_script:
|
32
|
+
- ./cc-test-reporter after-build --exit-code $TRAVIS_TEST_RESULT
|
data/Gemfile
CHANGED
@@ -4,12 +4,3 @@ source 'https://rubygems.org'
|
|
4
4
|
|
5
5
|
# Specify your gem's dependencies in henkei.gemspec
|
6
6
|
gemspec
|
7
|
-
|
8
|
-
gem 'bundler', '~> 2.0'
|
9
|
-
gem 'rake', '~> 12.3'
|
10
|
-
gem 'rspec', '~> 3.7'
|
11
|
-
gem 'rubocop', '~> 1.26'
|
12
|
-
gem 'rubocop-performance', '~> 1.13'
|
13
|
-
gem 'rubocop-rake', '~> 0.6'
|
14
|
-
gem 'rubocop-rspec', '~> 2.9'
|
15
|
-
gem 'simplecov', '~> 0.15', '< 0.18'
|
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
[](https://travis-ci.org/abrom/henkei)
|
2
2
|
[](https://codeclimate.com/github/abrom/henkei/maintainability)
|
3
3
|
[](https://codeclimate.com/github/abrom/henkei/test_coverage)
|
4
4
|
[](#)
|
@@ -21,6 +21,15 @@ Here are some of the formats supported:
|
|
21
21
|
For the complete list of supported formats, please visit the Apache Tika
|
22
22
|
[Supported Document Formats](http://tika.apache.org/0.9/formats.html) page.
|
23
23
|
|
24
|
+
## Upgrading from v1.x to v2.x
|
25
|
+
|
26
|
+
Apache Tika v2.x brings with it some changes. One key change is that the Tika client and server applications have
|
27
|
+
been split up. To keep the gem size down Henkei will only include the client app. That is to say, each time you
|
28
|
+
call to Henkei, a new Java process will be started, run your command, then terminate.
|
29
|
+
|
30
|
+
Another change is the metadata keys. A lot of duplicate keys have been removed in favour of a more standards
|
31
|
+
based approach. A list of the old vs new key names can be found [here](https://cwiki.apache.org/confluence/display/TIKA/Migrating+to+Tika+2.0.0#MigratingtoTika2.0.0-Metadata)
|
32
|
+
|
24
33
|
## Usage
|
25
34
|
|
26
35
|
Text, metadata and MIME type information can be extracted by calling `Henkei.read` directly:
|
@@ -69,6 +78,20 @@ post '/:name/:filename' do
|
|
69
78
|
end
|
70
79
|
```
|
71
80
|
|
81
|
+
### Reading text from inside images (OCR)
|
82
|
+
|
83
|
+
You can enable OCR by specifying the optional `include_ocr: true` when calling to the `text` or `html` instance methods,
|
84
|
+
as well as the `read` class method. Note that Tika does indicate this will greatly increase processing time.
|
85
|
+
|
86
|
+
```ruby
|
87
|
+
henkei = Henkei.new 'sample.pages'
|
88
|
+
text_with_ocr = henkei.text(include_ocr: true)
|
89
|
+
html_with_ocr = henkei.html(include_ocr: true)
|
90
|
+
|
91
|
+
data = File.read 'sample.pages'
|
92
|
+
text_with_ocr = Henkei.read :text, data, include_ocr: true
|
93
|
+
```
|
94
|
+
|
72
95
|
### Reading metadata
|
73
96
|
|
74
97
|
Metadata is returned as a hash.
|
data/henkei.gemspec
CHANGED
@@ -13,21 +13,23 @@ Gem::Specification.new do |spec|
|
|
13
13
|
spec.description = 'Read text and metadata from files and documents using Apache Tika toolkit'
|
14
14
|
spec.summary = 'Read text and metadata from files and documents ' \
|
15
15
|
'(.doc, .docx, .pages, .odt, .rtf, .pdf) using Apache Tika toolkit'
|
16
|
-
spec.homepage = '
|
16
|
+
spec.homepage = 'http://github.com/abrom/henkei'
|
17
17
|
spec.license = 'MIT'
|
18
|
-
spec.required_ruby_version = ['>=
|
19
|
-
|
20
|
-
# Prevent pushing this gem to RubyGems.org by setting 'allowed_push_host', or
|
21
|
-
# delete this section to allow pushing this gem to any host.
|
22
|
-
raise 'RubyGems 2.0 or newer is required to protect against public gem pushes.' unless spec.respond_to?(:metadata)
|
23
|
-
|
24
|
-
spec.metadata['allowed_push_host'] = 'https://rubygems.org'
|
25
|
-
spec.metadata['rubygems_mfa_required'] = 'true'
|
18
|
+
spec.required_ruby_version = ['>= 2.4.0', '< 3.1.0']
|
26
19
|
|
27
20
|
spec.files = `git ls-files`.split("\n")
|
28
21
|
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
22
|
+
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
29
23
|
spec.require_paths = ['lib']
|
30
24
|
|
31
25
|
spec.add_runtime_dependency 'json', '>= 1.8', '< 3'
|
32
26
|
spec.add_runtime_dependency 'mini_mime', '>= 0.1.1', '< 2'
|
27
|
+
|
28
|
+
spec.add_development_dependency 'bundler', '~> 2.0'
|
29
|
+
spec.add_development_dependency 'nokogiri', '~> 1.12'
|
30
|
+
spec.add_development_dependency 'rails', '~> 5.0'
|
31
|
+
spec.add_development_dependency 'rake', '~> 12.3'
|
32
|
+
spec.add_development_dependency 'rspec', '~> 3.7'
|
33
|
+
spec.add_development_dependency 'rubocop', '~> 0.71'
|
34
|
+
spec.add_development_dependency 'simplecov', '~> 0.15'
|
33
35
|
end
|
Binary file
|
@@ -0,0 +1,9 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<properties>
|
3
|
+
<service-loader initializableProblemHandler="ignore"/>
|
4
|
+
<parsers>
|
5
|
+
<parser class="org.apache.tika.parser.DefaultParser">
|
6
|
+
<parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
|
7
|
+
</parser>
|
8
|
+
</parsers>
|
9
|
+
</properties>
|
data/lib/henkei/version.rb
CHANGED
data/lib/henkei.rb
CHANGED
@@ -25,17 +25,14 @@ require 'open3'
|
|
25
25
|
# Read text and metadata from files and documents using Apache Tika toolkit
|
26
26
|
class Henkei # rubocop:disable Metrics/ClassLength
|
27
27
|
GEM_PATH = File.dirname(File.dirname(__FILE__))
|
28
|
-
JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-
|
28
|
+
JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-2.2.0.jar')
|
29
29
|
CONFIG_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-config.xml')
|
30
|
-
|
31
|
-
|
32
|
-
@@server_port = nil
|
33
|
-
@@server_pid = nil
|
30
|
+
CONFIG_WITHOUT_OCR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-config-without-ocr.xml')
|
34
31
|
|
35
32
|
def self.mimetype(content_type)
|
36
33
|
if Henkei.configuration.mime_library == 'mime/types' && defined?(MIME::Types)
|
37
|
-
warn '[DEPRECATION] `mime/types` is deprecated. Please use `mini_mime` instead.
|
38
|
-
|
34
|
+
warn '[DEPRECATION] `mime/types` is deprecated. Please use `mini_mime` instead.'\
|
35
|
+
' Use Henkei.configure and assign "mini_mime" to `mime_library`.'
|
39
36
|
MIME::Types[content_type].first
|
40
37
|
else
|
41
38
|
MiniMime.lookup_by_content_type(content_type).tap do |object|
|
@@ -50,11 +47,12 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
50
47
|
# text = Henkei.read :text, data
|
51
48
|
# metadata = Henkei.read :metadata, data
|
52
49
|
#
|
53
|
-
def self.read(type, data)
|
54
|
-
result =
|
50
|
+
def self.read(type, data, include_ocr: false)
|
51
|
+
result = client_read(type, data, include_ocr: include_ocr)
|
55
52
|
|
56
53
|
case type
|
57
|
-
when :text
|
54
|
+
when :text then result
|
55
|
+
when :html then result
|
58
56
|
when :metadata then JSON.parse(result)
|
59
57
|
when :mimetype then Henkei.mimetype(JSON.parse(result)['Content-Type'])
|
60
58
|
end
|
@@ -78,7 +76,7 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
78
76
|
if input.is_a? String
|
79
77
|
if File.exist? input
|
80
78
|
@path = input
|
81
|
-
elsif input
|
79
|
+
elsif input =~ URI::DEFAULT_PARSER.make_regexp
|
82
80
|
@uri = URI.parse input
|
83
81
|
else
|
84
82
|
raise Errno::ENOENT, "missing file or invalid URI - #{input}"
|
@@ -95,10 +93,14 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
95
93
|
# henkei = Henkei.new 'sample.pages'
|
96
94
|
# henkei.text
|
97
95
|
#
|
98
|
-
|
96
|
+
# Include OCR results from images (includes embedded images in pages/docx/pdf etc)
|
97
|
+
#
|
98
|
+
# henkei.text(include_ocr: true)
|
99
|
+
#
|
100
|
+
def text(include_ocr: false)
|
99
101
|
return @text if defined? @text
|
100
102
|
|
101
|
-
@text = Henkei.read :text, data
|
103
|
+
@text = Henkei.read :text, data, include_ocr: include_ocr
|
102
104
|
end
|
103
105
|
|
104
106
|
# Returns the text content of the Henkei document in HTML.
|
@@ -106,10 +108,14 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
106
108
|
# henkei = Henkei.new 'sample.pages'
|
107
109
|
# henkei.html
|
108
110
|
#
|
109
|
-
|
111
|
+
# Include OCR results from images (includes embedded images in pages/docx/pdf etc)
|
112
|
+
#
|
113
|
+
# henkei.html(include_ocr: true)
|
114
|
+
#
|
115
|
+
def html(include_ocr: false)
|
110
116
|
return @html if defined? @html
|
111
117
|
|
112
|
-
@html = Henkei.read :html, data
|
118
|
+
@html = Henkei.read :html, data, include_ocr: include_ocr
|
113
119
|
end
|
114
120
|
|
115
121
|
# Returns the metadata hash of the Henkei document.
|
@@ -143,9 +149,9 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
143
149
|
#
|
144
150
|
def creation_date
|
145
151
|
return @creation_date if defined? @creation_date
|
146
|
-
return unless metadata['
|
152
|
+
return unless metadata['dcterms:created']
|
147
153
|
|
148
|
-
@creation_date = Time.parse(metadata['
|
154
|
+
@creation_date = Time.parse(metadata['dcterms:created'])
|
149
155
|
end
|
150
156
|
|
151
157
|
# Returns +true+ if the Henkei document was specified using a file path.
|
@@ -195,44 +201,6 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
195
201
|
@data
|
196
202
|
end
|
197
203
|
|
198
|
-
# Returns pid of Tika server, started as a new spawned process.
|
199
|
-
#
|
200
|
-
# type :html, :text or :metadata
|
201
|
-
# custom_port e.g. 9293
|
202
|
-
#
|
203
|
-
# Henkei.server(:text, 9294)
|
204
|
-
#
|
205
|
-
def self.server(type, custom_port = nil)
|
206
|
-
@@server_port = custom_port || DEFAULT_SERVER_PORT
|
207
|
-
|
208
|
-
@@server_pid = Process.spawn(*tika_command(type, server: true))
|
209
|
-
sleep(2) # Give the server 2 seconds to spin up.
|
210
|
-
@@server_pid
|
211
|
-
end
|
212
|
-
|
213
|
-
# Kills server started by Henkei.server
|
214
|
-
#
|
215
|
-
# Always run this when you're done, or else Tika might run until you kill it manually
|
216
|
-
# You might try putting your extraction in a begin..rescue...ensure...end block and
|
217
|
-
# putting this method in the ensure block.
|
218
|
-
#
|
219
|
-
# Henkei.server(:text)
|
220
|
-
# reports = ["report1.docx", "report2.doc", "report3.pdf"]
|
221
|
-
# begin
|
222
|
-
# my_texts = reports.map{ |report_path| Henkei.new(report_path).text }
|
223
|
-
# rescue
|
224
|
-
# ensure
|
225
|
-
# Henkei.kill_server!
|
226
|
-
# end
|
227
|
-
#
|
228
|
-
def self.kill_server!
|
229
|
-
return unless @@server_pid
|
230
|
-
|
231
|
-
Process.kill('INT', @@server_pid)
|
232
|
-
@@server_pid = nil
|
233
|
-
@@server_port = nil
|
234
|
-
end
|
235
|
-
|
236
204
|
### Private class methods
|
237
205
|
|
238
206
|
# Provide the path to the Java binary
|
@@ -244,44 +212,21 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
244
212
|
|
245
213
|
# Internal helper for calling to Tika library directly
|
246
214
|
#
|
247
|
-
def self.client_read(type, data)
|
248
|
-
|
215
|
+
def self.client_read(type, data, include_ocr: false)
|
216
|
+
Open3.capture2(*tika_command(type, include_ocr: include_ocr), stdin_data: data, binmode: true).first
|
249
217
|
end
|
250
218
|
private_class_method :client_read
|
251
219
|
|
252
|
-
# Internal helper for calling to running Tika server
|
253
|
-
#
|
254
|
-
def self.server_read(data)
|
255
|
-
s = TCPSocket.new('localhost', @@server_port)
|
256
|
-
file = StringIO.new(data, 'r')
|
257
|
-
|
258
|
-
loop do
|
259
|
-
chunk = file.read(65_536)
|
260
|
-
break unless chunk
|
261
|
-
|
262
|
-
s.write(chunk)
|
263
|
-
end
|
264
|
-
|
265
|
-
# tell Tika that we're done sending data
|
266
|
-
s.shutdown(Socket::SHUT_WR)
|
267
|
-
|
268
|
-
resp = +''
|
269
|
-
loop do
|
270
|
-
chunk = s.recv(65_536)
|
271
|
-
break if !chunk || chunk.empty?
|
272
|
-
|
273
|
-
resp << chunk
|
274
|
-
end
|
275
|
-
filter_response resp
|
276
|
-
end
|
277
|
-
private_class_method :server_read
|
278
|
-
|
279
220
|
# Internal helper for building the Java command to call Tika
|
280
221
|
#
|
281
|
-
def self.tika_command(type,
|
282
|
-
|
283
|
-
|
284
|
-
|
222
|
+
def self.tika_command(type, include_ocr: false)
|
223
|
+
[
|
224
|
+
java_path,
|
225
|
+
'-Djava.awt.headless=true',
|
226
|
+
'-jar',
|
227
|
+
Henkei::JAR_PATH,
|
228
|
+
"--config=#{include_ocr ? Henkei::CONFIG_PATH : Henkei::CONFIG_WITHOUT_OCR_PATH}"
|
229
|
+
] + switch_for_type(type)
|
285
230
|
end
|
286
231
|
private_class_method :tika_command
|
287
232
|
|
@@ -296,13 +241,4 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
296
241
|
}[type]
|
297
242
|
end
|
298
243
|
private_class_method :switch_for_type
|
299
|
-
|
300
|
-
# Internal helper to remove erroneous output
|
301
|
-
#
|
302
|
-
def self.filter_response(response)
|
303
|
-
response.delete_prefix(
|
304
|
-
"WARNING: sun.reflect.Reflection.getCallerClass is not supported. This will impact performance.\n"
|
305
|
-
)
|
306
|
-
end
|
307
|
-
private_class_method :filter_response
|
308
244
|
end
|
data/spec/henkei_spec.rb
CHANGED
@@ -2,10 +2,15 @@
|
|
2
2
|
|
3
3
|
require 'helper'
|
4
4
|
require 'henkei'
|
5
|
+
require 'nokogiri'
|
5
6
|
|
6
7
|
# Some of the tests have been known to fail in weird and wonderful ways when `rails` is included
|
7
8
|
require 'rails' if ENV['INCLUDE_RAILS'] == 'true'
|
8
9
|
|
10
|
+
def travis_ci?
|
11
|
+
ENV['CI'] == 'true' && ENV['TRAVIS'] == 'true'
|
12
|
+
end
|
13
|
+
|
9
14
|
describe Henkei do
|
10
15
|
let(:data) { File.read 'spec/samples/sample.docx' }
|
11
16
|
|
@@ -15,13 +20,13 @@ describe Henkei do
|
|
15
20
|
|
16
21
|
describe '.read' do
|
17
22
|
it 'reads text' do
|
18
|
-
text =
|
23
|
+
text = Henkei.read :text, data
|
19
24
|
|
20
25
|
expect(text).to include 'The quick brown fox jumped over the lazy cat.'
|
21
26
|
end
|
22
27
|
|
23
28
|
it 'reads metadata' do
|
24
|
-
metadata =
|
29
|
+
metadata = Henkei.read :metadata, data
|
25
30
|
|
26
31
|
expect(metadata['Content-Type']).to(
|
27
32
|
eq 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
|
@@ -30,13 +35,13 @@ describe Henkei do
|
|
30
35
|
|
31
36
|
it 'reads metadata values with colons as strings' do
|
32
37
|
data = File.read 'spec/samples/sample-metadata-values-with-colons.doc'
|
33
|
-
metadata =
|
38
|
+
metadata = Henkei.read :metadata, data
|
34
39
|
|
35
40
|
expect(metadata['dc:title']).to eq 'problem: test'
|
36
41
|
end
|
37
42
|
|
38
43
|
it 'reads mimetype' do
|
39
|
-
mimetype =
|
44
|
+
mimetype = Henkei.read :mimetype, data
|
40
45
|
|
41
46
|
expect(mimetype.content_type).to(
|
42
47
|
eq 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
|
@@ -48,20 +53,37 @@ describe Henkei do
|
|
48
53
|
let(:data) { File.read 'spec/samples/pipe-error.png' }
|
49
54
|
|
50
55
|
it 'returns an empty result' do
|
51
|
-
text =
|
56
|
+
text = Henkei.read :text, data
|
52
57
|
|
53
58
|
expect(text).to eq ''
|
54
59
|
end
|
60
|
+
|
61
|
+
unless travis_ci?
|
62
|
+
context 'when `include_ocr` is enabled' do
|
63
|
+
it 'returns parsed plain text in the image' do
|
64
|
+
text = Henkei.read :text, data, include_ocr: true
|
65
|
+
|
66
|
+
expect(text).to include <<~TEXT
|
67
|
+
West Side
|
68
|
+
|
69
|
+
Sea Island
|
70
|
+
PP
|
71
|
+
|
72
|
+
Richmond
|
73
|
+
TEXT
|
74
|
+
end
|
75
|
+
end
|
76
|
+
end
|
55
77
|
end
|
56
78
|
end
|
57
79
|
|
58
80
|
describe '.new' do
|
59
81
|
it 'requires parameters' do
|
60
|
-
expect {
|
82
|
+
expect { Henkei.new }.to raise_error ArgumentError
|
61
83
|
end
|
62
84
|
|
63
85
|
it 'accepts a root path' do
|
64
|
-
henkei =
|
86
|
+
henkei = Henkei.new 'spec/samples/sample.pages'
|
65
87
|
|
66
88
|
expect(henkei).to be_path
|
67
89
|
expect(henkei).not_to be_uri
|
@@ -69,7 +91,7 @@ describe Henkei do
|
|
69
91
|
end
|
70
92
|
|
71
93
|
it 'accepts a relative path' do
|
72
|
-
henkei =
|
94
|
+
henkei = Henkei.new 'spec/samples/sample.pages'
|
73
95
|
|
74
96
|
expect(henkei).to be_path
|
75
97
|
expect(henkei).not_to be_uri
|
@@ -77,7 +99,7 @@ describe Henkei do
|
|
77
99
|
end
|
78
100
|
|
79
101
|
it 'accepts a path with spaces' do
|
80
|
-
henkei =
|
102
|
+
henkei = Henkei.new 'spec/samples/sample filename with spaces.pages'
|
81
103
|
|
82
104
|
expect(henkei).to be_path
|
83
105
|
expect(henkei).not_to be_uri
|
@@ -85,7 +107,7 @@ describe Henkei do
|
|
85
107
|
end
|
86
108
|
|
87
109
|
it 'accepts a URI' do
|
88
|
-
henkei =
|
110
|
+
henkei = Henkei.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx'
|
89
111
|
|
90
112
|
expect(henkei).to be_uri
|
91
113
|
expect(henkei).not_to be_path
|
@@ -94,7 +116,7 @@ describe Henkei do
|
|
94
116
|
|
95
117
|
it 'accepts a stream or object that can be read' do
|
96
118
|
File.open 'spec/samples/sample.pages', 'r' do |file|
|
97
|
-
henkei =
|
119
|
+
henkei = Henkei.new file
|
98
120
|
|
99
121
|
expect(henkei).to be_stream
|
100
122
|
expect(henkei).not_to be_path
|
@@ -103,38 +125,38 @@ describe Henkei do
|
|
103
125
|
end
|
104
126
|
|
105
127
|
it 'refuses a path to a missing file' do
|
106
|
-
expect {
|
128
|
+
expect { Henkei.new 'test/sample/missing.pages' }.to raise_error Errno::ENOENT
|
107
129
|
end
|
108
130
|
|
109
131
|
it 'refuses other objects' do
|
110
132
|
[nil, 1, 1.1].each do |object|
|
111
|
-
expect {
|
133
|
+
expect { Henkei.new object }.to raise_error TypeError
|
112
134
|
end
|
113
135
|
end
|
114
136
|
end
|
115
137
|
|
116
138
|
describe '.creation_date' do
|
117
|
-
let(:henkei) {
|
139
|
+
let(:henkei) { Henkei.new 'spec/samples/sample.pages' }
|
118
140
|
|
119
|
-
it '
|
141
|
+
it 'should return Time' do
|
120
142
|
expect(henkei.creation_date).to be_a Time
|
121
143
|
end
|
122
144
|
end
|
123
145
|
|
124
146
|
describe '.java' do
|
125
147
|
specify 'with no specified JAVA_HOME' do
|
126
|
-
expect(
|
148
|
+
expect(Henkei.send(:java_path)).to eq 'java'
|
127
149
|
end
|
128
150
|
|
129
151
|
specify 'with a specified JAVA_HOME' do
|
130
152
|
ENV['JAVA_HOME'] = '/path/to/java/home'
|
131
153
|
|
132
|
-
expect(
|
154
|
+
expect(Henkei.send(:java_path)).to eq '/path/to/java/home/bin/java'
|
133
155
|
end
|
134
156
|
end
|
135
157
|
|
136
|
-
context '
|
137
|
-
let(:henkei) {
|
158
|
+
context 'initialized with a given path' do
|
159
|
+
let(:henkei) { Henkei.new 'spec/samples/sample.pages' }
|
138
160
|
|
139
161
|
specify '#text reads text' do
|
140
162
|
expect(henkei.text).to include 'The quick brown fox jumped over the lazy cat.'
|
@@ -145,7 +167,7 @@ describe Henkei do
|
|
145
167
|
end
|
146
168
|
|
147
169
|
context 'when passing in the `pipe-error.png` test file' do
|
148
|
-
let(:henkei) {
|
170
|
+
let(:henkei) { Henkei.new 'spec/samples/pipe-error.png' }
|
149
171
|
|
150
172
|
it '#text returns an empty result' do
|
151
173
|
expect(henkei.text).to eq ''
|
@@ -159,11 +181,35 @@ describe Henkei do
|
|
159
181
|
it '#mimetype returns `image/png`' do
|
160
182
|
expect(henkei.mimetype.content_type).to eq 'image/png'
|
161
183
|
end
|
184
|
+
|
185
|
+
unless travis_ci?
|
186
|
+
context 'when `include_ocr` is enabled' do
|
187
|
+
it '#text returns plain text of parsed text in the image' do
|
188
|
+
expect(henkei.text(include_ocr: true)).to include <<~TEXT
|
189
|
+
West Side
|
190
|
+
|
191
|
+
Sea Island
|
192
|
+
PP
|
193
|
+
|
194
|
+
Richmond
|
195
|
+
TEXT
|
196
|
+
end
|
197
|
+
|
198
|
+
it '#html returns HTML of parsed text in the image' do
|
199
|
+
expect(henkei.html(include_ocr: true)).to include '<meta name="tiff:ImageWidth" content="792"/>'
|
200
|
+
|
201
|
+
html_body = Nokogiri::HTML(henkei.html(include_ocr: true)).at_xpath('//body')
|
202
|
+
['Anmore', 'Coquitlam', 'West Side', 'Sea Island', 'Richmond', 'Steveston'].each do |location|
|
203
|
+
expect(html_body.text).to include location
|
204
|
+
end
|
205
|
+
end
|
206
|
+
end
|
207
|
+
end
|
162
208
|
end
|
163
209
|
end
|
164
210
|
|
165
|
-
context '
|
166
|
-
let(:henkei) {
|
211
|
+
context 'initialized with a given URI' do
|
212
|
+
let(:henkei) { Henkei.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx' }
|
167
213
|
|
168
214
|
specify '#text reads text' do
|
169
215
|
expect(henkei.text).to include 'Lorem ipsum dolor sit amet, consectetuer adipiscing elit.'
|
@@ -176,8 +222,8 @@ describe Henkei do
|
|
176
222
|
end
|
177
223
|
end
|
178
224
|
|
179
|
-
context '
|
180
|
-
let(:henkei) {
|
225
|
+
context 'initialized with a given stream' do
|
226
|
+
let(:henkei) { Henkei.new File.open('spec/samples/sample.pages', 'rb') }
|
181
227
|
|
182
228
|
specify '#text reads text' do
|
183
229
|
expect(henkei.text).to include 'The quick brown fox jumped over the lazy cat.'
|
@@ -189,7 +235,7 @@ describe Henkei do
|
|
189
235
|
end
|
190
236
|
|
191
237
|
context 'when source is a remote PDF' do
|
192
|
-
let(:henkei) {
|
238
|
+
let(:henkei) { Henkei.new 'https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf' }
|
193
239
|
|
194
240
|
specify '#text reads text' do
|
195
241
|
expect(henkei.text).to include 'Dummy PDF file'
|
@@ -199,36 +245,4 @@ describe Henkei do
|
|
199
245
|
expect(henkei.metadata['Content-Type']).to eq 'application/pdf'
|
200
246
|
end
|
201
247
|
end
|
202
|
-
|
203
|
-
context 'when working as server mode' do
|
204
|
-
specify '#starts and kills server' do
|
205
|
-
described_class.server(:text)
|
206
|
-
expect(described_class.class_variable_get(:@@server_pid)).not_to be_nil
|
207
|
-
expect(described_class.class_variable_get(:@@server_port)).not_to be_nil
|
208
|
-
|
209
|
-
s = TCPSocket.new('localhost', described_class.class_variable_get(:@@server_port))
|
210
|
-
expect(s).to be_a TCPSocket
|
211
|
-
s.close
|
212
|
-
ensure
|
213
|
-
port = described_class.class_variable_get(:@@server_port)
|
214
|
-
described_class.kill_server!
|
215
|
-
sleep 2
|
216
|
-
expect { TCPSocket.new('localhost', port) }.to raise_error Errno::ECONNREFUSED
|
217
|
-
end
|
218
|
-
|
219
|
-
specify '#runs samples through server mode' do
|
220
|
-
described_class.server(:text)
|
221
|
-
expect(described_class.new('spec/samples/sample.pages').text).to(
|
222
|
-
include 'The quick brown fox jumped over the lazy cat.'
|
223
|
-
)
|
224
|
-
expect(described_class.new('spec/samples/sample filename with spaces.pages').text).to(
|
225
|
-
include 'The quick brown fox jumped over the lazy cat.'
|
226
|
-
)
|
227
|
-
expect(described_class.new('spec/samples/sample.docx').text).to(
|
228
|
-
include 'The quick brown fox jumped over the lazy cat.'
|
229
|
-
)
|
230
|
-
ensure
|
231
|
-
described_class.kill_server!
|
232
|
-
end
|
233
|
-
end
|
234
248
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: henkei
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.2.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Erol Fornoles
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2021-12-27 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: json
|
@@ -51,6 +51,104 @@ dependencies:
|
|
51
51
|
- - "<"
|
52
52
|
- !ruby/object:Gem::Version
|
53
53
|
version: '2'
|
54
|
+
- !ruby/object:Gem::Dependency
|
55
|
+
name: bundler
|
56
|
+
requirement: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - "~>"
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: '2.0'
|
61
|
+
type: :development
|
62
|
+
prerelease: false
|
63
|
+
version_requirements: !ruby/object:Gem::Requirement
|
64
|
+
requirements:
|
65
|
+
- - "~>"
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
version: '2.0'
|
68
|
+
- !ruby/object:Gem::Dependency
|
69
|
+
name: nokogiri
|
70
|
+
requirement: !ruby/object:Gem::Requirement
|
71
|
+
requirements:
|
72
|
+
- - "~>"
|
73
|
+
- !ruby/object:Gem::Version
|
74
|
+
version: '1.12'
|
75
|
+
type: :development
|
76
|
+
prerelease: false
|
77
|
+
version_requirements: !ruby/object:Gem::Requirement
|
78
|
+
requirements:
|
79
|
+
- - "~>"
|
80
|
+
- !ruby/object:Gem::Version
|
81
|
+
version: '1.12'
|
82
|
+
- !ruby/object:Gem::Dependency
|
83
|
+
name: rails
|
84
|
+
requirement: !ruby/object:Gem::Requirement
|
85
|
+
requirements:
|
86
|
+
- - "~>"
|
87
|
+
- !ruby/object:Gem::Version
|
88
|
+
version: '5.0'
|
89
|
+
type: :development
|
90
|
+
prerelease: false
|
91
|
+
version_requirements: !ruby/object:Gem::Requirement
|
92
|
+
requirements:
|
93
|
+
- - "~>"
|
94
|
+
- !ruby/object:Gem::Version
|
95
|
+
version: '5.0'
|
96
|
+
- !ruby/object:Gem::Dependency
|
97
|
+
name: rake
|
98
|
+
requirement: !ruby/object:Gem::Requirement
|
99
|
+
requirements:
|
100
|
+
- - "~>"
|
101
|
+
- !ruby/object:Gem::Version
|
102
|
+
version: '12.3'
|
103
|
+
type: :development
|
104
|
+
prerelease: false
|
105
|
+
version_requirements: !ruby/object:Gem::Requirement
|
106
|
+
requirements:
|
107
|
+
- - "~>"
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '12.3'
|
110
|
+
- !ruby/object:Gem::Dependency
|
111
|
+
name: rspec
|
112
|
+
requirement: !ruby/object:Gem::Requirement
|
113
|
+
requirements:
|
114
|
+
- - "~>"
|
115
|
+
- !ruby/object:Gem::Version
|
116
|
+
version: '3.7'
|
117
|
+
type: :development
|
118
|
+
prerelease: false
|
119
|
+
version_requirements: !ruby/object:Gem::Requirement
|
120
|
+
requirements:
|
121
|
+
- - "~>"
|
122
|
+
- !ruby/object:Gem::Version
|
123
|
+
version: '3.7'
|
124
|
+
- !ruby/object:Gem::Dependency
|
125
|
+
name: rubocop
|
126
|
+
requirement: !ruby/object:Gem::Requirement
|
127
|
+
requirements:
|
128
|
+
- - "~>"
|
129
|
+
- !ruby/object:Gem::Version
|
130
|
+
version: '0.71'
|
131
|
+
type: :development
|
132
|
+
prerelease: false
|
133
|
+
version_requirements: !ruby/object:Gem::Requirement
|
134
|
+
requirements:
|
135
|
+
- - "~>"
|
136
|
+
- !ruby/object:Gem::Version
|
137
|
+
version: '0.71'
|
138
|
+
- !ruby/object:Gem::Dependency
|
139
|
+
name: simplecov
|
140
|
+
requirement: !ruby/object:Gem::Requirement
|
141
|
+
requirements:
|
142
|
+
- - "~>"
|
143
|
+
- !ruby/object:Gem::Version
|
144
|
+
version: '0.15'
|
145
|
+
type: :development
|
146
|
+
prerelease: false
|
147
|
+
version_requirements: !ruby/object:Gem::Requirement
|
148
|
+
requirements:
|
149
|
+
- - "~>"
|
150
|
+
- !ruby/object:Gem::Version
|
151
|
+
version: '0.15'
|
54
152
|
description: Read text and metadata from files and documents using Apache Tika toolkit
|
55
153
|
email:
|
56
154
|
- erol.fornoles@gmail.com
|
@@ -60,10 +158,10 @@ executables:
|
|
60
158
|
extensions: []
|
61
159
|
extra_rdoc_files: []
|
62
160
|
files:
|
63
|
-
- ".github/workflows/test.yml"
|
64
161
|
- ".gitignore"
|
65
162
|
- ".rspec"
|
66
163
|
- ".rubocop.yml"
|
164
|
+
- ".travis.yml"
|
67
165
|
- Gemfile
|
68
166
|
- LICENSE
|
69
167
|
- NOTICE.txt
|
@@ -71,7 +169,8 @@ files:
|
|
71
169
|
- Rakefile
|
72
170
|
- bin/console
|
73
171
|
- henkei.gemspec
|
74
|
-
- jar/tika-app-
|
172
|
+
- jar/tika-app-2.2.0.jar
|
173
|
+
- jar/tika-config-without-ocr.xml
|
75
174
|
- jar/tika-config.xml
|
76
175
|
- lib/henkei.rb
|
77
176
|
- lib/henkei/configuration.rb
|
@@ -84,12 +183,10 @@ files:
|
|
84
183
|
- spec/samples/sample-metadata-values-with-colons.doc
|
85
184
|
- spec/samples/sample.docx
|
86
185
|
- spec/samples/sample.pages
|
87
|
-
homepage:
|
186
|
+
homepage: http://github.com/abrom/henkei
|
88
187
|
licenses:
|
89
188
|
- MIT
|
90
|
-
metadata:
|
91
|
-
allowed_push_host: https://rubygems.org
|
92
|
-
rubygems_mfa_required: 'true'
|
189
|
+
metadata: {}
|
93
190
|
post_install_message:
|
94
191
|
rdoc_options: []
|
95
192
|
require_paths:
|
@@ -98,19 +195,26 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
98
195
|
requirements:
|
99
196
|
- - ">="
|
100
197
|
- !ruby/object:Gem::Version
|
101
|
-
version:
|
198
|
+
version: 2.4.0
|
102
199
|
- - "<"
|
103
200
|
- !ruby/object:Gem::Version
|
104
|
-
version: 3.
|
201
|
+
version: 3.1.0
|
105
202
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
106
203
|
requirements:
|
107
204
|
- - ">="
|
108
205
|
- !ruby/object:Gem::Version
|
109
206
|
version: '0'
|
110
207
|
requirements: []
|
111
|
-
rubygems_version: 3.
|
208
|
+
rubygems_version: 3.0.9
|
112
209
|
signing_key:
|
113
210
|
specification_version: 4
|
114
211
|
summary: Read text and metadata from files and documents (.doc, .docx, .pages, .odt,
|
115
212
|
.rtf, .pdf) using Apache Tika toolkit
|
116
|
-
test_files:
|
213
|
+
test_files:
|
214
|
+
- spec/helper.rb
|
215
|
+
- spec/henkei_spec.rb
|
216
|
+
- spec/samples/pipe-error.png
|
217
|
+
- spec/samples/sample filename with spaces.pages
|
218
|
+
- spec/samples/sample-metadata-values-with-colons.doc
|
219
|
+
- spec/samples/sample.docx
|
220
|
+
- spec/samples/sample.pages
|
data/.github/workflows/test.yml
DELETED
@@ -1,37 +0,0 @@
|
|
1
|
-
name: Test Henkei Ruby gem
|
2
|
-
|
3
|
-
on:
|
4
|
-
push:
|
5
|
-
branches: [1.x]
|
6
|
-
pull_request:
|
7
|
-
branches: [1.x]
|
8
|
-
|
9
|
-
env:
|
10
|
-
CI: true
|
11
|
-
|
12
|
-
jobs:
|
13
|
-
test:
|
14
|
-
runs-on: ubuntu-latest
|
15
|
-
strategy:
|
16
|
-
matrix:
|
17
|
-
ruby-version: ['3.0', '3.1', '3.2', '3.3']
|
18
|
-
|
19
|
-
steps:
|
20
|
-
- uses: actions/checkout@v3
|
21
|
-
|
22
|
-
- name: Set up Ruby
|
23
|
-
uses: ruby/setup-ruby@v1
|
24
|
-
with:
|
25
|
-
ruby-version: ${{ matrix.ruby-version }}
|
26
|
-
bundler-cache: true
|
27
|
-
|
28
|
-
- name: Lint code - Rubocop
|
29
|
-
run: bundle exec rubocop
|
30
|
-
|
31
|
-
- name: Run tests
|
32
|
-
run: bundle exec rspec
|
33
|
-
|
34
|
-
- name: Test & publish code coverage
|
35
|
-
uses: paambaati/codeclimate-action@v3.2.0
|
36
|
-
env:
|
37
|
-
CC_TEST_REPORTER_ID: bb96c1ff9dc66724c38fb4eb54486dd72dc88a7fd6e727c034b9cf8d747d069e
|