henkei 1.28.5.2 → 2.2.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +0 -12
- data/.travis.yml +32 -0
- data/Gemfile +0 -9
- data/README.md +24 -1
- data/henkei.gemspec +11 -9
- data/jar/{tika-app-1.28.5.jar → tika-app-2.2.0.jar} +0 -0
- data/jar/tika-config-without-ocr.xml +9 -0
- data/lib/henkei/version.rb +1 -1
- data/lib/henkei.rb +33 -97
- data/spec/henkei_spec.rb +71 -57
- metadata +116 -12
- data/.github/workflows/test.yml +0 -37
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 64f2ec330c97bf77b16e2f7e14e08f90c405ae42c7aeddce733c0d889eeb4782
|
4
|
+
data.tar.gz: c3b3b91c569c7093bf22a5c751153426f1fdaa62da61f8c40f8b8cabc6ce072c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b9f2263c057cb9e958039930aa3c0244c31b5bcc5e636515a757e3b1c5d43a6b52c932a76a8c132af73707453df1e2652e6f02d9926276a75bceb4deb69ffa59
|
7
|
+
data.tar.gz: f8ea0d87ad2bc75213483824a30edba37507110fc721434f0a40477500dc4674d8ce8d1c2a47b06624249b48228bbe9c9a9f9c86bb19e5cb2dcc18a8dec23dbe
|
data/.rubocop.yml
CHANGED
@@ -1,11 +1,5 @@
|
|
1
|
-
require:
|
2
|
-
- rubocop-performance
|
3
|
-
- rubocop-rake
|
4
|
-
- rubocop-rspec
|
5
|
-
|
6
1
|
AllCops:
|
7
2
|
NewCops: enable
|
8
|
-
TargetRubyVersion: 3.0
|
9
3
|
|
10
4
|
Layout/EmptyLinesAroundAttributeAccessor:
|
11
5
|
Enabled: true
|
@@ -35,12 +29,6 @@ Metrics/BlockLength:
|
|
35
29
|
Metrics/MethodLength:
|
36
30
|
Max: 15
|
37
31
|
|
38
|
-
RSpec/ExampleLength:
|
39
|
-
Max: 12
|
40
|
-
|
41
|
-
RSpec/MultipleExpectations:
|
42
|
-
Max: 4
|
43
|
-
|
44
32
|
Style/ClassVars:
|
45
33
|
Enabled: false
|
46
34
|
|
data/.travis.yml
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
env:
|
2
|
+
global:
|
3
|
+
- CC_TEST_REPORTER_ID=bb96c1ff9dc66724c38fb4eb54486dd72dc88a7fd6e727c034b9cf8d747d069e
|
4
|
+
jobs:
|
5
|
+
- INCLUDE_RAILS=false
|
6
|
+
- INCLUDE_RAILS=true
|
7
|
+
|
8
|
+
language: ruby
|
9
|
+
rvm:
|
10
|
+
- 2.5
|
11
|
+
- 2.6
|
12
|
+
- 2.7
|
13
|
+
- 3.0
|
14
|
+
|
15
|
+
before_install:
|
16
|
+
- gem update bundler
|
17
|
+
|
18
|
+
install:
|
19
|
+
- bundle install --jobs=3 --retry=3
|
20
|
+
- gem install rubocop
|
21
|
+
|
22
|
+
before_script:
|
23
|
+
- curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter
|
24
|
+
- chmod +x ./cc-test-reporter
|
25
|
+
- ./cc-test-reporter before-build
|
26
|
+
|
27
|
+
script:
|
28
|
+
- bundle exec rubocop
|
29
|
+
- bundle exec rspec
|
30
|
+
|
31
|
+
after_script:
|
32
|
+
- ./cc-test-reporter after-build --exit-code $TRAVIS_TEST_RESULT
|
data/Gemfile
CHANGED
@@ -4,12 +4,3 @@ source 'https://rubygems.org'
|
|
4
4
|
|
5
5
|
# Specify your gem's dependencies in henkei.gemspec
|
6
6
|
gemspec
|
7
|
-
|
8
|
-
gem 'bundler', '~> 2.0'
|
9
|
-
gem 'rake', '~> 12.3'
|
10
|
-
gem 'rspec', '~> 3.7'
|
11
|
-
gem 'rubocop', '~> 1.26'
|
12
|
-
gem 'rubocop-performance', '~> 1.13'
|
13
|
-
gem 'rubocop-rake', '~> 0.6'
|
14
|
-
gem 'rubocop-rspec', '~> 2.9'
|
15
|
-
gem 'simplecov', '~> 0.15', '< 0.18'
|
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
[![
|
1
|
+
[![Travis Build Status](http://img.shields.io/travis/abrom/henkei.svg?style=flat)](https://travis-ci.org/abrom/henkei)
|
2
2
|
[![Maintainability](https://api.codeclimate.com/v1/badges/d06e8c917cf7d8c07234/maintainability)](https://codeclimate.com/github/abrom/henkei/maintainability)
|
3
3
|
[![Test Coverage](https://api.codeclimate.com/v1/badges/d06e8c917cf7d8c07234/test_coverage)](https://codeclimate.com/github/abrom/henkei/test_coverage)
|
4
4
|
[![Gem Version](http://img.shields.io/gem/v/henkei.svg?style=flat)](#)
|
@@ -21,6 +21,15 @@ Here are some of the formats supported:
|
|
21
21
|
For the complete list of supported formats, please visit the Apache Tika
|
22
22
|
[Supported Document Formats](http://tika.apache.org/0.9/formats.html) page.
|
23
23
|
|
24
|
+
## Upgrading from v1.x to v2.x
|
25
|
+
|
26
|
+
Apache Tika v2.x brings with it some changes. One key change is that the Tika client and server applications have
|
27
|
+
been split up. To keep the gem size down, Henkei only includes the client app. That is to say, each time you
|
28
|
+
call Henkei, a new Java process is started, runs your command, and then terminates.
|
29
|
+
|
30
|
+
Another change is the metadata keys. A lot of duplicate keys have been removed in favour of a more standards
|
31
|
+
based approach. A list of the old vs new key names can be found [here](https://cwiki.apache.org/confluence/display/TIKA/Migrating+to+Tika+2.0.0#MigratingtoTika2.0.0-Metadata).
|
32
|
+
|
24
33
|
## Usage
|
25
34
|
|
26
35
|
Text, metadata and MIME type information can be extracted by calling `Henkei.read` directly:
|
@@ -69,6 +78,20 @@ post '/:name/:filename' do
|
|
69
78
|
end
|
70
79
|
```
|
71
80
|
|
81
|
+
### Reading text from inside images (OCR)
|
82
|
+
|
83
|
+
You can enable OCR by passing the optional `include_ocr: true` argument when calling the `text` or `html` instance methods,
|
84
|
+
as well as the `read` class method. Note that, according to Tika, this will greatly increase processing time.
|
85
|
+
|
86
|
+
```ruby
|
87
|
+
henkei = Henkei.new 'sample.pages'
|
88
|
+
text_with_ocr = henkei.text(include_ocr: true)
|
89
|
+
html_with_ocr = henkei.html(include_ocr: true)
|
90
|
+
|
91
|
+
data = File.read 'sample.pages'
|
92
|
+
text_with_ocr = Henkei.read :text, data, include_ocr: true
|
93
|
+
```
|
94
|
+
|
72
95
|
### Reading metadata
|
73
96
|
|
74
97
|
Metadata is returned as a hash.
|
data/henkei.gemspec
CHANGED
@@ -13,21 +13,23 @@ Gem::Specification.new do |spec|
|
|
13
13
|
spec.description = 'Read text and metadata from files and documents using Apache Tika toolkit'
|
14
14
|
spec.summary = 'Read text and metadata from files and documents ' \
|
15
15
|
'(.doc, .docx, .pages, .odt, .rtf, .pdf) using Apache Tika toolkit'
|
16
|
-
spec.homepage = '
|
16
|
+
spec.homepage = 'http://github.com/abrom/henkei'
|
17
17
|
spec.license = 'MIT'
|
18
|
-
spec.required_ruby_version = ['>=
|
19
|
-
|
20
|
-
# Prevent pushing this gem to RubyGems.org by setting 'allowed_push_host', or
|
21
|
-
# delete this section to allow pushing this gem to any host.
|
22
|
-
raise 'RubyGems 2.0 or newer is required to protect against public gem pushes.' unless spec.respond_to?(:metadata)
|
23
|
-
|
24
|
-
spec.metadata['allowed_push_host'] = 'https://rubygems.org'
|
25
|
-
spec.metadata['rubygems_mfa_required'] = 'true'
|
18
|
+
spec.required_ruby_version = ['>= 2.4.0', '< 3.1.0']
|
26
19
|
|
27
20
|
spec.files = `git ls-files`.split("\n")
|
28
21
|
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
22
|
+
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
29
23
|
spec.require_paths = ['lib']
|
30
24
|
|
31
25
|
spec.add_runtime_dependency 'json', '>= 1.8', '< 3'
|
32
26
|
spec.add_runtime_dependency 'mini_mime', '>= 0.1.1', '< 2'
|
27
|
+
|
28
|
+
spec.add_development_dependency 'bundler', '~> 2.0'
|
29
|
+
spec.add_development_dependency 'nokogiri', '~> 1.12'
|
30
|
+
spec.add_development_dependency 'rails', '~> 5.0'
|
31
|
+
spec.add_development_dependency 'rake', '~> 12.3'
|
32
|
+
spec.add_development_dependency 'rspec', '~> 3.7'
|
33
|
+
spec.add_development_dependency 'rubocop', '~> 0.71'
|
34
|
+
spec.add_development_dependency 'simplecov', '~> 0.15'
|
33
35
|
end
|
Binary file
|
@@ -0,0 +1,9 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<properties>
|
3
|
+
<service-loader initializableProblemHandler="ignore"/>
|
4
|
+
<parsers>
|
5
|
+
<parser class="org.apache.tika.parser.DefaultParser">
|
6
|
+
<parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
|
7
|
+
</parser>
|
8
|
+
</parsers>
|
9
|
+
</properties>
|
data/lib/henkei/version.rb
CHANGED
data/lib/henkei.rb
CHANGED
@@ -25,17 +25,14 @@ require 'open3'
|
|
25
25
|
# Read text and metadata from files and documents using Apache Tika toolkit
|
26
26
|
class Henkei # rubocop:disable Metrics/ClassLength
|
27
27
|
GEM_PATH = File.dirname(File.dirname(__FILE__))
|
28
|
-
JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-
|
28
|
+
JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-2.2.0.jar')
|
29
29
|
CONFIG_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-config.xml')
|
30
|
-
|
31
|
-
|
32
|
-
@@server_port = nil
|
33
|
-
@@server_pid = nil
|
30
|
+
CONFIG_WITHOUT_OCR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-config-without-ocr.xml')
|
34
31
|
|
35
32
|
def self.mimetype(content_type)
|
36
33
|
if Henkei.configuration.mime_library == 'mime/types' && defined?(MIME::Types)
|
37
|
-
warn '[DEPRECATION] `mime/types` is deprecated. Please use `mini_mime` instead.
|
38
|
-
|
34
|
+
warn '[DEPRECATION] `mime/types` is deprecated. Please use `mini_mime` instead.'\
|
35
|
+
' Use Henkei.configure and assign "mini_mime" to `mime_library`.'
|
39
36
|
MIME::Types[content_type].first
|
40
37
|
else
|
41
38
|
MiniMime.lookup_by_content_type(content_type).tap do |object|
|
@@ -50,11 +47,12 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
50
47
|
# text = Henkei.read :text, data
|
51
48
|
# metadata = Henkei.read :metadata, data
|
52
49
|
#
|
53
|
-
def self.read(type, data)
|
54
|
-
result =
|
50
|
+
def self.read(type, data, include_ocr: false)
|
51
|
+
result = client_read(type, data, include_ocr: include_ocr)
|
55
52
|
|
56
53
|
case type
|
57
|
-
when :text
|
54
|
+
when :text then result
|
55
|
+
when :html then result
|
58
56
|
when :metadata then JSON.parse(result)
|
59
57
|
when :mimetype then Henkei.mimetype(JSON.parse(result)['Content-Type'])
|
60
58
|
end
|
@@ -78,7 +76,7 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
78
76
|
if input.is_a? String
|
79
77
|
if File.exist? input
|
80
78
|
@path = input
|
81
|
-
elsif input
|
79
|
+
elsif input =~ URI::DEFAULT_PARSER.make_regexp
|
82
80
|
@uri = URI.parse input
|
83
81
|
else
|
84
82
|
raise Errno::ENOENT, "missing file or invalid URI - #{input}"
|
@@ -95,10 +93,14 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
95
93
|
# henkei = Henkei.new 'sample.pages'
|
96
94
|
# henkei.text
|
97
95
|
#
|
98
|
-
|
96
|
+
# Include OCR results from images (includes embedded images in pages/docx/pdf etc)
|
97
|
+
#
|
98
|
+
# henkei.text(include_ocr: true)
|
99
|
+
#
|
100
|
+
def text(include_ocr: false)
|
99
101
|
return @text if defined? @text
|
100
102
|
|
101
|
-
@text = Henkei.read :text, data
|
103
|
+
@text = Henkei.read :text, data, include_ocr: include_ocr
|
102
104
|
end
|
103
105
|
|
104
106
|
# Returns the text content of the Henkei document in HTML.
|
@@ -106,10 +108,14 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
106
108
|
# henkei = Henkei.new 'sample.pages'
|
107
109
|
# henkei.html
|
108
110
|
#
|
109
|
-
|
111
|
+
# Include OCR results from images (includes embedded images in pages/docx/pdf etc)
|
112
|
+
#
|
113
|
+
# henkei.html(include_ocr: true)
|
114
|
+
#
|
115
|
+
def html(include_ocr: false)
|
110
116
|
return @html if defined? @html
|
111
117
|
|
112
|
-
@html = Henkei.read :html, data
|
118
|
+
@html = Henkei.read :html, data, include_ocr: include_ocr
|
113
119
|
end
|
114
120
|
|
115
121
|
# Returns the metadata hash of the Henkei document.
|
@@ -143,9 +149,9 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
143
149
|
#
|
144
150
|
def creation_date
|
145
151
|
return @creation_date if defined? @creation_date
|
146
|
-
return unless metadata['
|
152
|
+
return unless metadata['dcterms:created']
|
147
153
|
|
148
|
-
@creation_date = Time.parse(metadata['
|
154
|
+
@creation_date = Time.parse(metadata['dcterms:created'])
|
149
155
|
end
|
150
156
|
|
151
157
|
# Returns +true+ if the Henkei document was specified using a file path.
|
@@ -195,44 +201,6 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
195
201
|
@data
|
196
202
|
end
|
197
203
|
|
198
|
-
# Returns pid of Tika server, started as a new spawned process.
|
199
|
-
#
|
200
|
-
# type :html, :text or :metadata
|
201
|
-
# custom_port e.g. 9293
|
202
|
-
#
|
203
|
-
# Henkei.server(:text, 9294)
|
204
|
-
#
|
205
|
-
def self.server(type, custom_port = nil)
|
206
|
-
@@server_port = custom_port || DEFAULT_SERVER_PORT
|
207
|
-
|
208
|
-
@@server_pid = Process.spawn(*tika_command(type, server: true))
|
209
|
-
sleep(2) # Give the server 2 seconds to spin up.
|
210
|
-
@@server_pid
|
211
|
-
end
|
212
|
-
|
213
|
-
# Kills server started by Henkei.server
|
214
|
-
#
|
215
|
-
# Always run this when you're done, or else Tika might run until you kill it manually
|
216
|
-
# You might try putting your extraction in a begin..rescue...ensure...end block and
|
217
|
-
# putting this method in the ensure block.
|
218
|
-
#
|
219
|
-
# Henkei.server(:text)
|
220
|
-
# reports = ["report1.docx", "report2.doc", "report3.pdf"]
|
221
|
-
# begin
|
222
|
-
# my_texts = reports.map{ |report_path| Henkei.new(report_path).text }
|
223
|
-
# rescue
|
224
|
-
# ensure
|
225
|
-
# Henkei.kill_server!
|
226
|
-
# end
|
227
|
-
#
|
228
|
-
def self.kill_server!
|
229
|
-
return unless @@server_pid
|
230
|
-
|
231
|
-
Process.kill('INT', @@server_pid)
|
232
|
-
@@server_pid = nil
|
233
|
-
@@server_port = nil
|
234
|
-
end
|
235
|
-
|
236
204
|
### Private class methods
|
237
205
|
|
238
206
|
# Provide the path to the Java binary
|
@@ -244,44 +212,21 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
244
212
|
|
245
213
|
# Internal helper for calling to Tika library directly
|
246
214
|
#
|
247
|
-
def self.client_read(type, data)
|
248
|
-
|
215
|
+
def self.client_read(type, data, include_ocr: false)
|
216
|
+
Open3.capture2(*tika_command(type, include_ocr: include_ocr), stdin_data: data, binmode: true).first
|
249
217
|
end
|
250
218
|
private_class_method :client_read
|
251
219
|
|
252
|
-
# Internal helper for calling to running Tika server
|
253
|
-
#
|
254
|
-
def self.server_read(data)
|
255
|
-
s = TCPSocket.new('localhost', @@server_port)
|
256
|
-
file = StringIO.new(data, 'r')
|
257
|
-
|
258
|
-
loop do
|
259
|
-
chunk = file.read(65_536)
|
260
|
-
break unless chunk
|
261
|
-
|
262
|
-
s.write(chunk)
|
263
|
-
end
|
264
|
-
|
265
|
-
# tell Tika that we're done sending data
|
266
|
-
s.shutdown(Socket::SHUT_WR)
|
267
|
-
|
268
|
-
resp = +''
|
269
|
-
loop do
|
270
|
-
chunk = s.recv(65_536)
|
271
|
-
break if !chunk || chunk.empty?
|
272
|
-
|
273
|
-
resp << chunk
|
274
|
-
end
|
275
|
-
filter_response resp
|
276
|
-
end
|
277
|
-
private_class_method :server_read
|
278
|
-
|
279
220
|
# Internal helper for building the Java command to call Tika
|
280
221
|
#
|
281
|
-
def self.tika_command(type,
|
282
|
-
|
283
|
-
|
284
|
-
|
222
|
+
def self.tika_command(type, include_ocr: false)
|
223
|
+
[
|
224
|
+
java_path,
|
225
|
+
'-Djava.awt.headless=true',
|
226
|
+
'-jar',
|
227
|
+
Henkei::JAR_PATH,
|
228
|
+
"--config=#{include_ocr ? Henkei::CONFIG_PATH : Henkei::CONFIG_WITHOUT_OCR_PATH}"
|
229
|
+
] + switch_for_type(type)
|
285
230
|
end
|
286
231
|
private_class_method :tika_command
|
287
232
|
|
@@ -296,13 +241,4 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
296
241
|
}[type]
|
297
242
|
end
|
298
243
|
private_class_method :switch_for_type
|
299
|
-
|
300
|
-
# Internal helper to remove erroneous output
|
301
|
-
#
|
302
|
-
def self.filter_response(response)
|
303
|
-
response.delete_prefix(
|
304
|
-
"WARNING: sun.reflect.Reflection.getCallerClass is not supported. This will impact performance.\n"
|
305
|
-
)
|
306
|
-
end
|
307
|
-
private_class_method :filter_response
|
308
244
|
end
|
data/spec/henkei_spec.rb
CHANGED
@@ -2,10 +2,15 @@
|
|
2
2
|
|
3
3
|
require 'helper'
|
4
4
|
require 'henkei'
|
5
|
+
require 'nokogiri'
|
5
6
|
|
6
7
|
# Some of the tests have been known to fail in weird and wonderful ways when `rails` is included
|
7
8
|
require 'rails' if ENV['INCLUDE_RAILS'] == 'true'
|
8
9
|
|
10
|
+
def travis_ci?
|
11
|
+
ENV['CI'] == 'true' && ENV['TRAVIS'] == 'true'
|
12
|
+
end
|
13
|
+
|
9
14
|
describe Henkei do
|
10
15
|
let(:data) { File.read 'spec/samples/sample.docx' }
|
11
16
|
|
@@ -15,13 +20,13 @@ describe Henkei do
|
|
15
20
|
|
16
21
|
describe '.read' do
|
17
22
|
it 'reads text' do
|
18
|
-
text =
|
23
|
+
text = Henkei.read :text, data
|
19
24
|
|
20
25
|
expect(text).to include 'The quick brown fox jumped over the lazy cat.'
|
21
26
|
end
|
22
27
|
|
23
28
|
it 'reads metadata' do
|
24
|
-
metadata =
|
29
|
+
metadata = Henkei.read :metadata, data
|
25
30
|
|
26
31
|
expect(metadata['Content-Type']).to(
|
27
32
|
eq 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
|
@@ -30,13 +35,13 @@ describe Henkei do
|
|
30
35
|
|
31
36
|
it 'reads metadata values with colons as strings' do
|
32
37
|
data = File.read 'spec/samples/sample-metadata-values-with-colons.doc'
|
33
|
-
metadata =
|
38
|
+
metadata = Henkei.read :metadata, data
|
34
39
|
|
35
40
|
expect(metadata['dc:title']).to eq 'problem: test'
|
36
41
|
end
|
37
42
|
|
38
43
|
it 'reads mimetype' do
|
39
|
-
mimetype =
|
44
|
+
mimetype = Henkei.read :mimetype, data
|
40
45
|
|
41
46
|
expect(mimetype.content_type).to(
|
42
47
|
eq 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
|
@@ -48,20 +53,37 @@ describe Henkei do
|
|
48
53
|
let(:data) { File.read 'spec/samples/pipe-error.png' }
|
49
54
|
|
50
55
|
it 'returns an empty result' do
|
51
|
-
text =
|
56
|
+
text = Henkei.read :text, data
|
52
57
|
|
53
58
|
expect(text).to eq ''
|
54
59
|
end
|
60
|
+
|
61
|
+
unless travis_ci?
|
62
|
+
context 'when `include_ocr` is enabled' do
|
63
|
+
it 'returns parsed plain text in the image' do
|
64
|
+
text = Henkei.read :text, data, include_ocr: true
|
65
|
+
|
66
|
+
expect(text).to include <<~TEXT
|
67
|
+
West Side
|
68
|
+
|
69
|
+
Sea Island
|
70
|
+
PP
|
71
|
+
|
72
|
+
Richmond
|
73
|
+
TEXT
|
74
|
+
end
|
75
|
+
end
|
76
|
+
end
|
55
77
|
end
|
56
78
|
end
|
57
79
|
|
58
80
|
describe '.new' do
|
59
81
|
it 'requires parameters' do
|
60
|
-
expect {
|
82
|
+
expect { Henkei.new }.to raise_error ArgumentError
|
61
83
|
end
|
62
84
|
|
63
85
|
it 'accepts a root path' do
|
64
|
-
henkei =
|
86
|
+
henkei = Henkei.new 'spec/samples/sample.pages'
|
65
87
|
|
66
88
|
expect(henkei).to be_path
|
67
89
|
expect(henkei).not_to be_uri
|
@@ -69,7 +91,7 @@ describe Henkei do
|
|
69
91
|
end
|
70
92
|
|
71
93
|
it 'accepts a relative path' do
|
72
|
-
henkei =
|
94
|
+
henkei = Henkei.new 'spec/samples/sample.pages'
|
73
95
|
|
74
96
|
expect(henkei).to be_path
|
75
97
|
expect(henkei).not_to be_uri
|
@@ -77,7 +99,7 @@ describe Henkei do
|
|
77
99
|
end
|
78
100
|
|
79
101
|
it 'accepts a path with spaces' do
|
80
|
-
henkei =
|
102
|
+
henkei = Henkei.new 'spec/samples/sample filename with spaces.pages'
|
81
103
|
|
82
104
|
expect(henkei).to be_path
|
83
105
|
expect(henkei).not_to be_uri
|
@@ -85,7 +107,7 @@ describe Henkei do
|
|
85
107
|
end
|
86
108
|
|
87
109
|
it 'accepts a URI' do
|
88
|
-
henkei =
|
110
|
+
henkei = Henkei.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx'
|
89
111
|
|
90
112
|
expect(henkei).to be_uri
|
91
113
|
expect(henkei).not_to be_path
|
@@ -94,7 +116,7 @@ describe Henkei do
|
|
94
116
|
|
95
117
|
it 'accepts a stream or object that can be read' do
|
96
118
|
File.open 'spec/samples/sample.pages', 'r' do |file|
|
97
|
-
henkei =
|
119
|
+
henkei = Henkei.new file
|
98
120
|
|
99
121
|
expect(henkei).to be_stream
|
100
122
|
expect(henkei).not_to be_path
|
@@ -103,38 +125,38 @@ describe Henkei do
|
|
103
125
|
end
|
104
126
|
|
105
127
|
it 'refuses a path to a missing file' do
|
106
|
-
expect {
|
128
|
+
expect { Henkei.new 'test/sample/missing.pages' }.to raise_error Errno::ENOENT
|
107
129
|
end
|
108
130
|
|
109
131
|
it 'refuses other objects' do
|
110
132
|
[nil, 1, 1.1].each do |object|
|
111
|
-
expect {
|
133
|
+
expect { Henkei.new object }.to raise_error TypeError
|
112
134
|
end
|
113
135
|
end
|
114
136
|
end
|
115
137
|
|
116
138
|
describe '.creation_date' do
|
117
|
-
let(:henkei) {
|
139
|
+
let(:henkei) { Henkei.new 'spec/samples/sample.pages' }
|
118
140
|
|
119
|
-
it '
|
141
|
+
it 'should return Time' do
|
120
142
|
expect(henkei.creation_date).to be_a Time
|
121
143
|
end
|
122
144
|
end
|
123
145
|
|
124
146
|
describe '.java' do
|
125
147
|
specify 'with no specified JAVA_HOME' do
|
126
|
-
expect(
|
148
|
+
expect(Henkei.send(:java_path)).to eq 'java'
|
127
149
|
end
|
128
150
|
|
129
151
|
specify 'with a specified JAVA_HOME' do
|
130
152
|
ENV['JAVA_HOME'] = '/path/to/java/home'
|
131
153
|
|
132
|
-
expect(
|
154
|
+
expect(Henkei.send(:java_path)).to eq '/path/to/java/home/bin/java'
|
133
155
|
end
|
134
156
|
end
|
135
157
|
|
136
|
-
context '
|
137
|
-
let(:henkei) {
|
158
|
+
context 'initialized with a given path' do
|
159
|
+
let(:henkei) { Henkei.new 'spec/samples/sample.pages' }
|
138
160
|
|
139
161
|
specify '#text reads text' do
|
140
162
|
expect(henkei.text).to include 'The quick brown fox jumped over the lazy cat.'
|
@@ -145,7 +167,7 @@ describe Henkei do
|
|
145
167
|
end
|
146
168
|
|
147
169
|
context 'when passing in the `pipe-error.png` test file' do
|
148
|
-
let(:henkei) {
|
170
|
+
let(:henkei) { Henkei.new 'spec/samples/pipe-error.png' }
|
149
171
|
|
150
172
|
it '#text returns an empty result' do
|
151
173
|
expect(henkei.text).to eq ''
|
@@ -159,11 +181,35 @@ describe Henkei do
|
|
159
181
|
it '#mimetype returns `image/png`' do
|
160
182
|
expect(henkei.mimetype.content_type).to eq 'image/png'
|
161
183
|
end
|
184
|
+
|
185
|
+
unless travis_ci?
|
186
|
+
context 'when `include_ocr` is enabled' do
|
187
|
+
it '#text returns plain text of parsed text in the image' do
|
188
|
+
expect(henkei.text(include_ocr: true)).to include <<~TEXT
|
189
|
+
West Side
|
190
|
+
|
191
|
+
Sea Island
|
192
|
+
PP
|
193
|
+
|
194
|
+
Richmond
|
195
|
+
TEXT
|
196
|
+
end
|
197
|
+
|
198
|
+
it '#html returns HTML of parsed text in the image' do
|
199
|
+
expect(henkei.html(include_ocr: true)).to include '<meta name="tiff:ImageWidth" content="792"/>'
|
200
|
+
|
201
|
+
html_body = Nokogiri::HTML(henkei.html(include_ocr: true)).at_xpath('//body')
|
202
|
+
['Anmore', 'Coquitlam', 'West Side', 'Sea Island', 'Richmond', 'Steveston'].each do |location|
|
203
|
+
expect(html_body.text).to include location
|
204
|
+
end
|
205
|
+
end
|
206
|
+
end
|
207
|
+
end
|
162
208
|
end
|
163
209
|
end
|
164
210
|
|
165
|
-
context '
|
166
|
-
let(:henkei) {
|
211
|
+
context 'initialized with a given URI' do
|
212
|
+
let(:henkei) { Henkei.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx' }
|
167
213
|
|
168
214
|
specify '#text reads text' do
|
169
215
|
expect(henkei.text).to include 'Lorem ipsum dolor sit amet, consectetuer adipiscing elit.'
|
@@ -176,8 +222,8 @@ describe Henkei do
|
|
176
222
|
end
|
177
223
|
end
|
178
224
|
|
179
|
-
context '
|
180
|
-
let(:henkei) {
|
225
|
+
context 'initialized with a given stream' do
|
226
|
+
let(:henkei) { Henkei.new File.open('spec/samples/sample.pages', 'rb') }
|
181
227
|
|
182
228
|
specify '#text reads text' do
|
183
229
|
expect(henkei.text).to include 'The quick brown fox jumped over the lazy cat.'
|
@@ -189,7 +235,7 @@ describe Henkei do
|
|
189
235
|
end
|
190
236
|
|
191
237
|
context 'when source is a remote PDF' do
|
192
|
-
let(:henkei) {
|
238
|
+
let(:henkei) { Henkei.new 'https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf' }
|
193
239
|
|
194
240
|
specify '#text reads text' do
|
195
241
|
expect(henkei.text).to include 'Dummy PDF file'
|
@@ -199,36 +245,4 @@ describe Henkei do
|
|
199
245
|
expect(henkei.metadata['Content-Type']).to eq 'application/pdf'
|
200
246
|
end
|
201
247
|
end
|
202
|
-
|
203
|
-
context 'when working as server mode' do
|
204
|
-
specify '#starts and kills server' do
|
205
|
-
described_class.server(:text)
|
206
|
-
expect(described_class.class_variable_get(:@@server_pid)).not_to be_nil
|
207
|
-
expect(described_class.class_variable_get(:@@server_port)).not_to be_nil
|
208
|
-
|
209
|
-
s = TCPSocket.new('localhost', described_class.class_variable_get(:@@server_port))
|
210
|
-
expect(s).to be_a TCPSocket
|
211
|
-
s.close
|
212
|
-
ensure
|
213
|
-
port = described_class.class_variable_get(:@@server_port)
|
214
|
-
described_class.kill_server!
|
215
|
-
sleep 2
|
216
|
-
expect { TCPSocket.new('localhost', port) }.to raise_error Errno::ECONNREFUSED
|
217
|
-
end
|
218
|
-
|
219
|
-
specify '#runs samples through server mode' do
|
220
|
-
described_class.server(:text)
|
221
|
-
expect(described_class.new('spec/samples/sample.pages').text).to(
|
222
|
-
include 'The quick brown fox jumped over the lazy cat.'
|
223
|
-
)
|
224
|
-
expect(described_class.new('spec/samples/sample filename with spaces.pages').text).to(
|
225
|
-
include 'The quick brown fox jumped over the lazy cat.'
|
226
|
-
)
|
227
|
-
expect(described_class.new('spec/samples/sample.docx').text).to(
|
228
|
-
include 'The quick brown fox jumped over the lazy cat.'
|
229
|
-
)
|
230
|
-
ensure
|
231
|
-
described_class.kill_server!
|
232
|
-
end
|
233
|
-
end
|
234
248
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: henkei
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.2.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Erol Fornoles
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2021-12-27 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: json
|
@@ -51,6 +51,104 @@ dependencies:
|
|
51
51
|
- - "<"
|
52
52
|
- !ruby/object:Gem::Version
|
53
53
|
version: '2'
|
54
|
+
- !ruby/object:Gem::Dependency
|
55
|
+
name: bundler
|
56
|
+
requirement: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - "~>"
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: '2.0'
|
61
|
+
type: :development
|
62
|
+
prerelease: false
|
63
|
+
version_requirements: !ruby/object:Gem::Requirement
|
64
|
+
requirements:
|
65
|
+
- - "~>"
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
version: '2.0'
|
68
|
+
- !ruby/object:Gem::Dependency
|
69
|
+
name: nokogiri
|
70
|
+
requirement: !ruby/object:Gem::Requirement
|
71
|
+
requirements:
|
72
|
+
- - "~>"
|
73
|
+
- !ruby/object:Gem::Version
|
74
|
+
version: '1.12'
|
75
|
+
type: :development
|
76
|
+
prerelease: false
|
77
|
+
version_requirements: !ruby/object:Gem::Requirement
|
78
|
+
requirements:
|
79
|
+
- - "~>"
|
80
|
+
- !ruby/object:Gem::Version
|
81
|
+
version: '1.12'
|
82
|
+
- !ruby/object:Gem::Dependency
|
83
|
+
name: rails
|
84
|
+
requirement: !ruby/object:Gem::Requirement
|
85
|
+
requirements:
|
86
|
+
- - "~>"
|
87
|
+
- !ruby/object:Gem::Version
|
88
|
+
version: '5.0'
|
89
|
+
type: :development
|
90
|
+
prerelease: false
|
91
|
+
version_requirements: !ruby/object:Gem::Requirement
|
92
|
+
requirements:
|
93
|
+
- - "~>"
|
94
|
+
- !ruby/object:Gem::Version
|
95
|
+
version: '5.0'
|
96
|
+
- !ruby/object:Gem::Dependency
|
97
|
+
name: rake
|
98
|
+
requirement: !ruby/object:Gem::Requirement
|
99
|
+
requirements:
|
100
|
+
- - "~>"
|
101
|
+
- !ruby/object:Gem::Version
|
102
|
+
version: '12.3'
|
103
|
+
type: :development
|
104
|
+
prerelease: false
|
105
|
+
version_requirements: !ruby/object:Gem::Requirement
|
106
|
+
requirements:
|
107
|
+
- - "~>"
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '12.3'
|
110
|
+
- !ruby/object:Gem::Dependency
|
111
|
+
name: rspec
|
112
|
+
requirement: !ruby/object:Gem::Requirement
|
113
|
+
requirements:
|
114
|
+
- - "~>"
|
115
|
+
- !ruby/object:Gem::Version
|
116
|
+
version: '3.7'
|
117
|
+
type: :development
|
118
|
+
prerelease: false
|
119
|
+
version_requirements: !ruby/object:Gem::Requirement
|
120
|
+
requirements:
|
121
|
+
- - "~>"
|
122
|
+
- !ruby/object:Gem::Version
|
123
|
+
version: '3.7'
|
124
|
+
- !ruby/object:Gem::Dependency
|
125
|
+
name: rubocop
|
126
|
+
requirement: !ruby/object:Gem::Requirement
|
127
|
+
requirements:
|
128
|
+
- - "~>"
|
129
|
+
- !ruby/object:Gem::Version
|
130
|
+
version: '0.71'
|
131
|
+
type: :development
|
132
|
+
prerelease: false
|
133
|
+
version_requirements: !ruby/object:Gem::Requirement
|
134
|
+
requirements:
|
135
|
+
- - "~>"
|
136
|
+
- !ruby/object:Gem::Version
|
137
|
+
version: '0.71'
|
138
|
+
- !ruby/object:Gem::Dependency
|
139
|
+
name: simplecov
|
140
|
+
requirement: !ruby/object:Gem::Requirement
|
141
|
+
requirements:
|
142
|
+
- - "~>"
|
143
|
+
- !ruby/object:Gem::Version
|
144
|
+
version: '0.15'
|
145
|
+
type: :development
|
146
|
+
prerelease: false
|
147
|
+
version_requirements: !ruby/object:Gem::Requirement
|
148
|
+
requirements:
|
149
|
+
- - "~>"
|
150
|
+
- !ruby/object:Gem::Version
|
151
|
+
version: '0.15'
|
54
152
|
description: Read text and metadata from files and documents using Apache Tika toolkit
|
55
153
|
email:
|
56
154
|
- erol.fornoles@gmail.com
|
@@ -60,10 +158,10 @@ executables:
|
|
60
158
|
extensions: []
|
61
159
|
extra_rdoc_files: []
|
62
160
|
files:
|
63
|
-
- ".github/workflows/test.yml"
|
64
161
|
- ".gitignore"
|
65
162
|
- ".rspec"
|
66
163
|
- ".rubocop.yml"
|
164
|
+
- ".travis.yml"
|
67
165
|
- Gemfile
|
68
166
|
- LICENSE
|
69
167
|
- NOTICE.txt
|
@@ -71,7 +169,8 @@ files:
|
|
71
169
|
- Rakefile
|
72
170
|
- bin/console
|
73
171
|
- henkei.gemspec
|
74
|
-
- jar/tika-app-
|
172
|
+
- jar/tika-app-2.2.0.jar
|
173
|
+
- jar/tika-config-without-ocr.xml
|
75
174
|
- jar/tika-config.xml
|
76
175
|
- lib/henkei.rb
|
77
176
|
- lib/henkei/configuration.rb
|
@@ -84,12 +183,10 @@ files:
|
|
84
183
|
- spec/samples/sample-metadata-values-with-colons.doc
|
85
184
|
- spec/samples/sample.docx
|
86
185
|
- spec/samples/sample.pages
|
87
|
-
homepage:
|
186
|
+
homepage: http://github.com/abrom/henkei
|
88
187
|
licenses:
|
89
188
|
- MIT
|
90
|
-
metadata:
|
91
|
-
allowed_push_host: https://rubygems.org
|
92
|
-
rubygems_mfa_required: 'true'
|
189
|
+
metadata: {}
|
93
190
|
post_install_message:
|
94
191
|
rdoc_options: []
|
95
192
|
require_paths:
|
@@ -98,19 +195,26 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
98
195
|
requirements:
|
99
196
|
- - ">="
|
100
197
|
- !ruby/object:Gem::Version
|
101
|
-
version:
|
198
|
+
version: 2.4.0
|
102
199
|
- - "<"
|
103
200
|
- !ruby/object:Gem::Version
|
104
|
-
version: 3.
|
201
|
+
version: 3.1.0
|
105
202
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
106
203
|
requirements:
|
107
204
|
- - ">="
|
108
205
|
- !ruby/object:Gem::Version
|
109
206
|
version: '0'
|
110
207
|
requirements: []
|
111
|
-
rubygems_version: 3.
|
208
|
+
rubygems_version: 3.0.9
|
112
209
|
signing_key:
|
113
210
|
specification_version: 4
|
114
211
|
summary: Read text and metadata from files and documents (.doc, .docx, .pages, .odt,
|
115
212
|
.rtf, .pdf) using Apache Tika toolkit
|
116
|
-
test_files:
|
213
|
+
test_files:
|
214
|
+
- spec/helper.rb
|
215
|
+
- spec/henkei_spec.rb
|
216
|
+
- spec/samples/pipe-error.png
|
217
|
+
- spec/samples/sample filename with spaces.pages
|
218
|
+
- spec/samples/sample-metadata-values-with-colons.doc
|
219
|
+
- spec/samples/sample.docx
|
220
|
+
- spec/samples/sample.pages
|
data/.github/workflows/test.yml
DELETED
@@ -1,37 +0,0 @@
|
|
1
|
-
name: Test Henkei Ruby gem
|
2
|
-
|
3
|
-
on:
|
4
|
-
push:
|
5
|
-
branches: [1.x]
|
6
|
-
pull_request:
|
7
|
-
branches: [1.x]
|
8
|
-
|
9
|
-
env:
|
10
|
-
CI: true
|
11
|
-
|
12
|
-
jobs:
|
13
|
-
test:
|
14
|
-
runs-on: ubuntu-latest
|
15
|
-
strategy:
|
16
|
-
matrix:
|
17
|
-
ruby-version: ['3.0', '3.1', '3.2', '3.3']
|
18
|
-
|
19
|
-
steps:
|
20
|
-
- uses: actions/checkout@v3
|
21
|
-
|
22
|
-
- name: Set up Ruby
|
23
|
-
uses: ruby/setup-ruby@v1
|
24
|
-
with:
|
25
|
-
ruby-version: ${{ matrix.ruby-version }}
|
26
|
-
bundler-cache: true
|
27
|
-
|
28
|
-
- name: Lint code - Rubocop
|
29
|
-
run: bundle exec rubocop
|
30
|
-
|
31
|
-
- name: Run tests
|
32
|
-
run: bundle exec rspec
|
33
|
-
|
34
|
-
- name: Test & publish code coverage
|
35
|
-
uses: paambaati/codeclimate-action@v3.2.0
|
36
|
-
env:
|
37
|
-
CC_TEST_REPORTER_ID: bb96c1ff9dc66724c38fb4eb54486dd72dc88a7fd6e727c034b9cf8d747d069e
|