henkei 1.28.3.1 → 2.2.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +0 -1
- data/.travis.yml +32 -0
- data/README.md +24 -1
- data/henkei.gemspec +7 -16
- data/jar/{tika-app-1.28.3.jar → tika-app-2.2.0.jar} +0 -0
- data/jar/tika-config-without-ocr.xml +9 -0
- data/lib/henkei/version.rb +1 -1
- data/lib/henkei.rb +31 -96
- data/spec/henkei_spec.rb +47 -32
- metadata +33 -75
- data/.github/workflows/test.yml +0 -37
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 64f2ec330c97bf77b16e2f7e14e08f90c405ae42c7aeddce733c0d889eeb4782
|
4
|
+
data.tar.gz: c3b3b91c569c7093bf22a5c751153426f1fdaa62da61f8c40f8b8cabc6ce072c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b9f2263c057cb9e958039930aa3c0244c31b5bcc5e636515a757e3b1c5d43a6b52c932a76a8c132af73707453df1e2652e6f02d9926276a75bceb4deb69ffa59
|
7
|
+
data.tar.gz: f8ea0d87ad2bc75213483824a30edba37507110fc721434f0a40477500dc4674d8ce8d1c2a47b06624249b48228bbe9c9a9f9c86bb19e5cb2dcc18a8dec23dbe
|
data/.rubocop.yml
CHANGED
data/.travis.yml
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
env:
|
2
|
+
global:
|
3
|
+
- CC_TEST_REPORTER_ID=bb96c1ff9dc66724c38fb4eb54486dd72dc88a7fd6e727c034b9cf8d747d069e
|
4
|
+
jobs:
|
5
|
+
- INCLUDE_RAILS=false
|
6
|
+
- INCLUDE_RAILS=true
|
7
|
+
|
8
|
+
language: ruby
|
9
|
+
rvm:
|
10
|
+
- 2.5
|
11
|
+
- 2.6
|
12
|
+
- 2.7
|
13
|
+
- 3.0
|
14
|
+
|
15
|
+
before_install:
|
16
|
+
- gem update bundler
|
17
|
+
|
18
|
+
install:
|
19
|
+
- bundle install --jobs=3 --retry=3
|
20
|
+
- gem install rubocop
|
21
|
+
|
22
|
+
before_script:
|
23
|
+
- curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter
|
24
|
+
- chmod +x ./cc-test-reporter
|
25
|
+
- ./cc-test-reporter before-build
|
26
|
+
|
27
|
+
script:
|
28
|
+
- bundle exec rubocop
|
29
|
+
- bundle exec rspec
|
30
|
+
|
31
|
+
after_script:
|
32
|
+
- ./cc-test-reporter after-build --exit-code $TRAVIS_TEST_RESULT
|
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
[![
|
1
|
+
[![Travis Build Status](http://img.shields.io/travis/abrom/henkei.svg?style=flat)](https://travis-ci.org/abrom/henkei)
|
2
2
|
[![Maintainability](https://api.codeclimate.com/v1/badges/d06e8c917cf7d8c07234/maintainability)](https://codeclimate.com/github/abrom/henkei/maintainability)
|
3
3
|
[![Test Coverage](https://api.codeclimate.com/v1/badges/d06e8c917cf7d8c07234/test_coverage)](https://codeclimate.com/github/abrom/henkei/test_coverage)
|
4
4
|
[![Gem Version](http://img.shields.io/gem/v/henkei.svg?style=flat)](#)
|
@@ -21,6 +21,15 @@ Here are some of the formats supported:
|
|
21
21
|
For the complete list of supported formats, please visit the Apache Tika
|
22
22
|
[Supported Document Formats](http://tika.apache.org/0.9/formats.html) page.
|
23
23
|
|
24
|
+
## Upgrading from v1.x to v2.x
|
25
|
+
|
26
|
+
Apache Tika v2.x brings with it some changes. One key change is that the Tika client and server applications have
|
27
|
+
been split up. To keep the gem size down Henkei will only include the client app. That is to say, each time you
|
28
|
+
call Henkei, a new Java process is started, runs your command, and then terminates.
|
29
|
+
|
30
|
+
Another change is the metadata keys. A lot of duplicate keys have been removed in favour of a more standards
|
31
|
+
based approach. A list of the old vs new key names can be found [here](https://cwiki.apache.org/confluence/display/TIKA/Migrating+to+Tika+2.0.0#MigratingtoTika2.0.0-Metadata).
|
32
|
+
|
24
33
|
## Usage
|
25
34
|
|
26
35
|
Text, metadata and MIME type information can be extracted by calling `Henkei.read` directly:
|
@@ -69,6 +78,20 @@ post '/:name/:filename' do
|
|
69
78
|
end
|
70
79
|
```
|
71
80
|
|
81
|
+
### Reading text from inside images (OCR)
|
82
|
+
|
83
|
+
You can enable OCR by passing the optional `include_ocr: true` argument when calling the `text` or `html` instance methods,
|
84
|
+
as well as the `read` class method. Note that, according to Tika, this will greatly increase processing time.
|
85
|
+
|
86
|
+
```ruby
|
87
|
+
henkei = Henkei.new 'sample.pages'
|
88
|
+
text_with_ocr = henkei.text(include_ocr: true)
|
89
|
+
html_with_ocr = henkei.html(include_ocr: true)
|
90
|
+
|
91
|
+
data = File.read 'sample.pages'
|
92
|
+
text_with_ocr = Henkei.read :text, data, include_ocr: true
|
93
|
+
```
|
94
|
+
|
72
95
|
### Reading metadata
|
73
96
|
|
74
97
|
Metadata is returned as a hash.
|
data/henkei.gemspec
CHANGED
@@ -5,7 +5,7 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
5
5
|
|
6
6
|
require 'henkei/version'
|
7
7
|
|
8
|
-
Gem::Specification.new do |spec|
|
8
|
+
Gem::Specification.new do |spec|
|
9
9
|
spec.name = 'henkei'
|
10
10
|
spec.version = Henkei::VERSION
|
11
11
|
spec.authors = ['Erol Fornoles', 'Andrew Bromwich']
|
@@ -13,32 +13,23 @@ Gem::Specification.new do |spec| # rubocop:disable Metrics/BlockLength
|
|
13
13
|
spec.description = 'Read text and metadata from files and documents using Apache Tika toolkit'
|
14
14
|
spec.summary = 'Read text and metadata from files and documents ' \
|
15
15
|
'(.doc, .docx, .pages, .odt, .rtf, .pdf) using Apache Tika toolkit'
|
16
|
-
spec.homepage = '
|
16
|
+
spec.homepage = 'http://github.com/abrom/henkei'
|
17
17
|
spec.license = 'MIT'
|
18
|
-
spec.required_ruby_version = ['>= 2.
|
19
|
-
|
20
|
-
# Prevent pushing this gem to RubyGems.org by setting 'allowed_push_host', or
|
21
|
-
# delete this section to allow pushing this gem to any host.
|
22
|
-
raise 'RubyGems 2.0 or newer is required to protect against public gem pushes.' unless spec.respond_to?(:metadata)
|
23
|
-
|
24
|
-
spec.metadata['allowed_push_host'] = 'https://rubygems.org'
|
25
|
-
spec.metadata['rubygems_mfa_required'] = 'true'
|
18
|
+
spec.required_ruby_version = ['>= 2.4.0', '< 3.1.0']
|
26
19
|
|
27
20
|
spec.files = `git ls-files`.split("\n")
|
28
21
|
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
22
|
+
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
29
23
|
spec.require_paths = ['lib']
|
30
24
|
|
31
25
|
spec.add_runtime_dependency 'json', '>= 1.8', '< 3'
|
32
26
|
spec.add_runtime_dependency 'mini_mime', '>= 0.1.1', '< 2'
|
33
27
|
|
34
28
|
spec.add_development_dependency 'bundler', '~> 2.0'
|
29
|
+
spec.add_development_dependency 'nokogiri', '~> 1.12'
|
35
30
|
spec.add_development_dependency 'rails', '~> 5.0'
|
36
31
|
spec.add_development_dependency 'rake', '~> 12.3'
|
37
32
|
spec.add_development_dependency 'rspec', '~> 3.7'
|
38
|
-
spec.add_development_dependency 'rubocop', '~>
|
39
|
-
spec.add_development_dependency '
|
40
|
-
spec.add_development_dependency 'rubocop-rails', '~> 2.14'
|
41
|
-
spec.add_development_dependency 'rubocop-rake', '~> 0.6'
|
42
|
-
spec.add_development_dependency 'rubocop-rspec', '~> 2.9'
|
43
|
-
spec.add_development_dependency 'simplecov', '~> 0.15', '< 0.18'
|
33
|
+
spec.add_development_dependency 'rubocop', '~> 0.71'
|
34
|
+
spec.add_development_dependency 'simplecov', '~> 0.15'
|
44
35
|
end
|
Binary file
|
@@ -0,0 +1,9 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<properties>
|
3
|
+
<service-loader initializableProblemHandler="ignore"/>
|
4
|
+
<parsers>
|
5
|
+
<parser class="org.apache.tika.parser.DefaultParser">
|
6
|
+
<parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
|
7
|
+
</parser>
|
8
|
+
</parsers>
|
9
|
+
</properties>
|
data/lib/henkei/version.rb
CHANGED
data/lib/henkei.rb
CHANGED
@@ -25,17 +25,14 @@ require 'open3'
|
|
25
25
|
# Read text and metadata from files and documents using Apache Tika toolkit
|
26
26
|
class Henkei # rubocop:disable Metrics/ClassLength
|
27
27
|
GEM_PATH = File.dirname(File.dirname(__FILE__))
|
28
|
-
JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-
|
28
|
+
JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-2.2.0.jar')
|
29
29
|
CONFIG_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-config.xml')
|
30
|
-
|
31
|
-
|
32
|
-
@@server_port = nil
|
33
|
-
@@server_pid = nil
|
30
|
+
CONFIG_WITHOUT_OCR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-config-without-ocr.xml')
|
34
31
|
|
35
32
|
def self.mimetype(content_type)
|
36
33
|
if Henkei.configuration.mime_library == 'mime/types' && defined?(MIME::Types)
|
37
34
|
warn '[DEPRECATION] `mime/types` is deprecated. Please use `mini_mime` instead.'\
|
38
|
-
|
35
|
+
' Use Henkei.configure and assign "mini_mime" to `mime_library`.'
|
39
36
|
MIME::Types[content_type].first
|
40
37
|
else
|
41
38
|
MiniMime.lookup_by_content_type(content_type).tap do |object|
|
@@ -50,11 +47,12 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
50
47
|
# text = Henkei.read :text, data
|
51
48
|
# metadata = Henkei.read :metadata, data
|
52
49
|
#
|
53
|
-
def self.read(type, data)
|
54
|
-
result =
|
50
|
+
def self.read(type, data, include_ocr: false)
|
51
|
+
result = client_read(type, data, include_ocr: include_ocr)
|
55
52
|
|
56
53
|
case type
|
57
|
-
when :text
|
54
|
+
when :text then result
|
55
|
+
when :html then result
|
58
56
|
when :metadata then JSON.parse(result)
|
59
57
|
when :mimetype then Henkei.mimetype(JSON.parse(result)['Content-Type'])
|
60
58
|
end
|
@@ -95,10 +93,14 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
95
93
|
# henkei = Henkei.new 'sample.pages'
|
96
94
|
# henkei.text
|
97
95
|
#
|
98
|
-
|
96
|
+
# Include OCR results from images (includes embedded images in pages/docx/pdf etc)
|
97
|
+
#
|
98
|
+
# henkei.text(include_ocr: true)
|
99
|
+
#
|
100
|
+
def text(include_ocr: false)
|
99
101
|
return @text if defined? @text
|
100
102
|
|
101
|
-
@text = Henkei.read :text, data
|
103
|
+
@text = Henkei.read :text, data, include_ocr: include_ocr
|
102
104
|
end
|
103
105
|
|
104
106
|
# Returns the text content of the Henkei document in HTML.
|
@@ -106,10 +108,14 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
106
108
|
# henkei = Henkei.new 'sample.pages'
|
107
109
|
# henkei.html
|
108
110
|
#
|
109
|
-
|
111
|
+
# Include OCR results from images (includes embedded images in pages/docx/pdf etc)
|
112
|
+
#
|
113
|
+
# henkei.html(include_ocr: true)
|
114
|
+
#
|
115
|
+
def html(include_ocr: false)
|
110
116
|
return @html if defined? @html
|
111
117
|
|
112
|
-
@html = Henkei.read :html, data
|
118
|
+
@html = Henkei.read :html, data, include_ocr: include_ocr
|
113
119
|
end
|
114
120
|
|
115
121
|
# Returns the metadata hash of the Henkei document.
|
@@ -143,9 +149,9 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
143
149
|
#
|
144
150
|
def creation_date
|
145
151
|
return @creation_date if defined? @creation_date
|
146
|
-
return unless metadata['
|
152
|
+
return unless metadata['dcterms:created']
|
147
153
|
|
148
|
-
@creation_date = Time.parse(metadata['
|
154
|
+
@creation_date = Time.parse(metadata['dcterms:created'])
|
149
155
|
end
|
150
156
|
|
151
157
|
# Returns +true+ if the Henkei document was specified using a file path.
|
@@ -195,44 +201,6 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
195
201
|
@data
|
196
202
|
end
|
197
203
|
|
198
|
-
# Returns pid of Tika server, started as a new spawned process.
|
199
|
-
#
|
200
|
-
# type :html, :text or :metadata
|
201
|
-
# custom_port e.g. 9293
|
202
|
-
#
|
203
|
-
# Henkei.server(:text, 9294)
|
204
|
-
#
|
205
|
-
def self.server(type, custom_port = nil)
|
206
|
-
@@server_port = custom_port || DEFAULT_SERVER_PORT
|
207
|
-
|
208
|
-
@@server_pid = Process.spawn(*tika_command(type, server: true))
|
209
|
-
sleep(2) # Give the server 2 seconds to spin up.
|
210
|
-
@@server_pid
|
211
|
-
end
|
212
|
-
|
213
|
-
# Kills server started by Henkei.server
|
214
|
-
#
|
215
|
-
# Always run this when you're done, or else Tika might run until you kill it manually
|
216
|
-
# You might try putting your extraction in a begin..rescue...ensure...end block and
|
217
|
-
# putting this method in the ensure block.
|
218
|
-
#
|
219
|
-
# Henkei.server(:text)
|
220
|
-
# reports = ["report1.docx", "report2.doc", "report3.pdf"]
|
221
|
-
# begin
|
222
|
-
# my_texts = reports.map{ |report_path| Henkei.new(report_path).text }
|
223
|
-
# rescue
|
224
|
-
# ensure
|
225
|
-
# Henkei.kill_server!
|
226
|
-
# end
|
227
|
-
#
|
228
|
-
def self.kill_server!
|
229
|
-
return unless @@server_pid
|
230
|
-
|
231
|
-
Process.kill('INT', @@server_pid)
|
232
|
-
@@server_pid = nil
|
233
|
-
@@server_port = nil
|
234
|
-
end
|
235
|
-
|
236
204
|
### Private class methods
|
237
205
|
|
238
206
|
# Provide the path to the Java binary
|
@@ -244,44 +212,21 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
244
212
|
|
245
213
|
# Internal helper for calling to Tika library directly
|
246
214
|
#
|
247
|
-
def self.client_read(type, data)
|
248
|
-
|
215
|
+
def self.client_read(type, data, include_ocr: false)
|
216
|
+
Open3.capture2(*tika_command(type, include_ocr: include_ocr), stdin_data: data, binmode: true).first
|
249
217
|
end
|
250
218
|
private_class_method :client_read
|
251
219
|
|
252
|
-
# Internal helper for calling to running Tika server
|
253
|
-
#
|
254
|
-
def self.server_read(data)
|
255
|
-
s = TCPSocket.new('localhost', @@server_port)
|
256
|
-
file = StringIO.new(data, 'r')
|
257
|
-
|
258
|
-
loop do
|
259
|
-
chunk = file.read(65_536)
|
260
|
-
break unless chunk
|
261
|
-
|
262
|
-
s.write(chunk)
|
263
|
-
end
|
264
|
-
|
265
|
-
# tell Tika that we're done sending data
|
266
|
-
s.shutdown(Socket::SHUT_WR)
|
267
|
-
|
268
|
-
resp = String.new ''
|
269
|
-
loop do
|
270
|
-
chunk = s.recv(65_536)
|
271
|
-
break if chunk.empty? || !chunk
|
272
|
-
|
273
|
-
resp << chunk
|
274
|
-
end
|
275
|
-
filter_response resp
|
276
|
-
end
|
277
|
-
private_class_method :server_read
|
278
|
-
|
279
220
|
# Internal helper for building the Java command to call Tika
|
280
221
|
#
|
281
|
-
def self.tika_command(type,
|
282
|
-
|
283
|
-
|
284
|
-
|
222
|
+
def self.tika_command(type, include_ocr: false)
|
223
|
+
[
|
224
|
+
java_path,
|
225
|
+
'-Djava.awt.headless=true',
|
226
|
+
'-jar',
|
227
|
+
Henkei::JAR_PATH,
|
228
|
+
"--config=#{include_ocr ? Henkei::CONFIG_PATH : Henkei::CONFIG_WITHOUT_OCR_PATH}"
|
229
|
+
] + switch_for_type(type)
|
285
230
|
end
|
286
231
|
private_class_method :tika_command
|
287
232
|
|
@@ -296,14 +241,4 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
296
241
|
}[type]
|
297
242
|
end
|
298
243
|
private_class_method :switch_for_type
|
299
|
-
|
300
|
-
# Internal helper to remove erroneous output
|
301
|
-
#
|
302
|
-
def self.filter_response(response)
|
303
|
-
response.gsub(
|
304
|
-
/\AWARNING: sun\.reflect\.Reflection\.getCallerClass is not supported\. This will impact performance\.\n/,
|
305
|
-
''
|
306
|
-
)
|
307
|
-
end
|
308
|
-
private_class_method :filter_response
|
309
244
|
end
|
data/spec/henkei_spec.rb
CHANGED
@@ -2,10 +2,15 @@
|
|
2
2
|
|
3
3
|
require 'helper'
|
4
4
|
require 'henkei'
|
5
|
+
require 'nokogiri'
|
5
6
|
|
6
7
|
# Some of the tests have been known to fail in weird and wonderful ways when `rails` is included
|
7
8
|
require 'rails' if ENV['INCLUDE_RAILS'] == 'true'
|
8
9
|
|
10
|
+
def travis_ci?
|
11
|
+
ENV['CI'] == 'true' && ENV['TRAVIS'] == 'true'
|
12
|
+
end
|
13
|
+
|
9
14
|
describe Henkei do
|
10
15
|
let(:data) { File.read 'spec/samples/sample.docx' }
|
11
16
|
|
@@ -52,6 +57,23 @@ describe Henkei do
|
|
52
57
|
|
53
58
|
expect(text).to eq ''
|
54
59
|
end
|
60
|
+
|
61
|
+
unless travis_ci?
|
62
|
+
context 'when `include_ocr` is enabled' do
|
63
|
+
it 'returns parsed plain text in the image' do
|
64
|
+
text = Henkei.read :text, data, include_ocr: true
|
65
|
+
|
66
|
+
expect(text).to include <<~TEXT
|
67
|
+
West Side
|
68
|
+
|
69
|
+
Sea Island
|
70
|
+
PP
|
71
|
+
|
72
|
+
Richmond
|
73
|
+
TEXT
|
74
|
+
end
|
75
|
+
end
|
76
|
+
end
|
55
77
|
end
|
56
78
|
end
|
57
79
|
|
@@ -115,6 +137,7 @@ describe Henkei do
|
|
115
137
|
|
116
138
|
describe '.creation_date' do
|
117
139
|
let(:henkei) { Henkei.new 'spec/samples/sample.pages' }
|
140
|
+
|
118
141
|
it 'should return Time' do
|
119
142
|
expect(henkei.creation_date).to be_a Time
|
120
143
|
end
|
@@ -158,6 +181,30 @@ describe Henkei do
|
|
158
181
|
it '#mimetype returns `image/png`' do
|
159
182
|
expect(henkei.mimetype.content_type).to eq 'image/png'
|
160
183
|
end
|
184
|
+
|
185
|
+
unless travis_ci?
|
186
|
+
context 'when `include_ocr` is enabled' do
|
187
|
+
it '#text returns plain text of parsed text in the image' do
|
188
|
+
expect(henkei.text(include_ocr: true)).to include <<~TEXT
|
189
|
+
West Side
|
190
|
+
|
191
|
+
Sea Island
|
192
|
+
PP
|
193
|
+
|
194
|
+
Richmond
|
195
|
+
TEXT
|
196
|
+
end
|
197
|
+
|
198
|
+
it '#html returns HTML of parsed text in the image' do
|
199
|
+
expect(henkei.html(include_ocr: true)).to include '<meta name="tiff:ImageWidth" content="792"/>'
|
200
|
+
|
201
|
+
html_body = Nokogiri::HTML(henkei.html(include_ocr: true)).at_xpath('//body')
|
202
|
+
['Anmore', 'Coquitlam', 'West Side', 'Sea Island', 'Richmond', 'Steveston'].each do |location|
|
203
|
+
expect(html_body.text).to include location
|
204
|
+
end
|
205
|
+
end
|
206
|
+
end
|
207
|
+
end
|
161
208
|
end
|
162
209
|
end
|
163
210
|
|
@@ -198,36 +245,4 @@ describe Henkei do
|
|
198
245
|
expect(henkei.metadata['Content-Type']).to eq 'application/pdf'
|
199
246
|
end
|
200
247
|
end
|
201
|
-
|
202
|
-
context 'working as server mode' do
|
203
|
-
specify '#starts and kills server' do
|
204
|
-
Henkei.server(:text)
|
205
|
-
expect(Henkei.class_variable_get(:@@server_pid)).not_to be_nil
|
206
|
-
expect(Henkei.class_variable_get(:@@server_port)).not_to be_nil
|
207
|
-
|
208
|
-
s = TCPSocket.new('localhost', Henkei.class_variable_get(:@@server_port))
|
209
|
-
expect(s).to be_a TCPSocket
|
210
|
-
s.close
|
211
|
-
ensure
|
212
|
-
port = Henkei.class_variable_get(:@@server_port)
|
213
|
-
Henkei.kill_server!
|
214
|
-
sleep 2
|
215
|
-
expect { TCPSocket.new('localhost', port) }.to raise_error Errno::ECONNREFUSED
|
216
|
-
end
|
217
|
-
|
218
|
-
specify '#runs samples through server mode' do
|
219
|
-
Henkei.server(:text)
|
220
|
-
expect(Henkei.new('spec/samples/sample.pages').text).to(
|
221
|
-
include 'The quick brown fox jumped over the lazy cat.'
|
222
|
-
)
|
223
|
-
expect(Henkei.new('spec/samples/sample filename with spaces.pages').text).to(
|
224
|
-
include 'The quick brown fox jumped over the lazy cat.'
|
225
|
-
)
|
226
|
-
expect(Henkei.new('spec/samples/sample.docx').text).to(
|
227
|
-
include 'The quick brown fox jumped over the lazy cat.'
|
228
|
-
)
|
229
|
-
ensure
|
230
|
-
Henkei.kill_server!
|
231
|
-
end
|
232
|
-
end
|
233
248
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: henkei
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.2.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Erol Fornoles
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2021-12-27 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: json
|
@@ -66,117 +66,75 @@ dependencies:
|
|
66
66
|
- !ruby/object:Gem::Version
|
67
67
|
version: '2.0'
|
68
68
|
- !ruby/object:Gem::Dependency
|
69
|
-
name:
|
70
|
-
requirement: !ruby/object:Gem::Requirement
|
71
|
-
requirements:
|
72
|
-
- - "~>"
|
73
|
-
- !ruby/object:Gem::Version
|
74
|
-
version: '5.0'
|
75
|
-
type: :development
|
76
|
-
prerelease: false
|
77
|
-
version_requirements: !ruby/object:Gem::Requirement
|
78
|
-
requirements:
|
79
|
-
- - "~>"
|
80
|
-
- !ruby/object:Gem::Version
|
81
|
-
version: '5.0'
|
82
|
-
- !ruby/object:Gem::Dependency
|
83
|
-
name: rake
|
84
|
-
requirement: !ruby/object:Gem::Requirement
|
85
|
-
requirements:
|
86
|
-
- - "~>"
|
87
|
-
- !ruby/object:Gem::Version
|
88
|
-
version: '12.3'
|
89
|
-
type: :development
|
90
|
-
prerelease: false
|
91
|
-
version_requirements: !ruby/object:Gem::Requirement
|
92
|
-
requirements:
|
93
|
-
- - "~>"
|
94
|
-
- !ruby/object:Gem::Version
|
95
|
-
version: '12.3'
|
96
|
-
- !ruby/object:Gem::Dependency
|
97
|
-
name: rspec
|
98
|
-
requirement: !ruby/object:Gem::Requirement
|
99
|
-
requirements:
|
100
|
-
- - "~>"
|
101
|
-
- !ruby/object:Gem::Version
|
102
|
-
version: '3.7'
|
103
|
-
type: :development
|
104
|
-
prerelease: false
|
105
|
-
version_requirements: !ruby/object:Gem::Requirement
|
106
|
-
requirements:
|
107
|
-
- - "~>"
|
108
|
-
- !ruby/object:Gem::Version
|
109
|
-
version: '3.7'
|
110
|
-
- !ruby/object:Gem::Dependency
|
111
|
-
name: rubocop
|
69
|
+
name: nokogiri
|
112
70
|
requirement: !ruby/object:Gem::Requirement
|
113
71
|
requirements:
|
114
72
|
- - "~>"
|
115
73
|
- !ruby/object:Gem::Version
|
116
|
-
version: '1.
|
74
|
+
version: '1.12'
|
117
75
|
type: :development
|
118
76
|
prerelease: false
|
119
77
|
version_requirements: !ruby/object:Gem::Requirement
|
120
78
|
requirements:
|
121
79
|
- - "~>"
|
122
80
|
- !ruby/object:Gem::Version
|
123
|
-
version: '1.
|
81
|
+
version: '1.12'
|
124
82
|
- !ruby/object:Gem::Dependency
|
125
|
-
name:
|
83
|
+
name: rails
|
126
84
|
requirement: !ruby/object:Gem::Requirement
|
127
85
|
requirements:
|
128
86
|
- - "~>"
|
129
87
|
- !ruby/object:Gem::Version
|
130
|
-
version: '
|
88
|
+
version: '5.0'
|
131
89
|
type: :development
|
132
90
|
prerelease: false
|
133
91
|
version_requirements: !ruby/object:Gem::Requirement
|
134
92
|
requirements:
|
135
93
|
- - "~>"
|
136
94
|
- !ruby/object:Gem::Version
|
137
|
-
version: '
|
95
|
+
version: '5.0'
|
138
96
|
- !ruby/object:Gem::Dependency
|
139
|
-
name:
|
97
|
+
name: rake
|
140
98
|
requirement: !ruby/object:Gem::Requirement
|
141
99
|
requirements:
|
142
100
|
- - "~>"
|
143
101
|
- !ruby/object:Gem::Version
|
144
|
-
version: '
|
102
|
+
version: '12.3'
|
145
103
|
type: :development
|
146
104
|
prerelease: false
|
147
105
|
version_requirements: !ruby/object:Gem::Requirement
|
148
106
|
requirements:
|
149
107
|
- - "~>"
|
150
108
|
- !ruby/object:Gem::Version
|
151
|
-
version: '
|
109
|
+
version: '12.3'
|
152
110
|
- !ruby/object:Gem::Dependency
|
153
|
-
name:
|
111
|
+
name: rspec
|
154
112
|
requirement: !ruby/object:Gem::Requirement
|
155
113
|
requirements:
|
156
114
|
- - "~>"
|
157
115
|
- !ruby/object:Gem::Version
|
158
|
-
version: '
|
116
|
+
version: '3.7'
|
159
117
|
type: :development
|
160
118
|
prerelease: false
|
161
119
|
version_requirements: !ruby/object:Gem::Requirement
|
162
120
|
requirements:
|
163
121
|
- - "~>"
|
164
122
|
- !ruby/object:Gem::Version
|
165
|
-
version: '
|
123
|
+
version: '3.7'
|
166
124
|
- !ruby/object:Gem::Dependency
|
167
|
-
name: rubocop
|
125
|
+
name: rubocop
|
168
126
|
requirement: !ruby/object:Gem::Requirement
|
169
127
|
requirements:
|
170
128
|
- - "~>"
|
171
129
|
- !ruby/object:Gem::Version
|
172
|
-
version: '
|
130
|
+
version: '0.71'
|
173
131
|
type: :development
|
174
132
|
prerelease: false
|
175
133
|
version_requirements: !ruby/object:Gem::Requirement
|
176
134
|
requirements:
|
177
135
|
- - "~>"
|
178
136
|
- !ruby/object:Gem::Version
|
179
|
-
version: '
|
137
|
+
version: '0.71'
|
180
138
|
- !ruby/object:Gem::Dependency
|
181
139
|
name: simplecov
|
182
140
|
requirement: !ruby/object:Gem::Requirement
|
@@ -184,9 +142,6 @@ dependencies:
|
|
184
142
|
- - "~>"
|
185
143
|
- !ruby/object:Gem::Version
|
186
144
|
version: '0.15'
|
187
|
-
- - "<"
|
188
|
-
- !ruby/object:Gem::Version
|
189
|
-
version: '0.18'
|
190
145
|
type: :development
|
191
146
|
prerelease: false
|
192
147
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -194,9 +149,6 @@ dependencies:
|
|
194
149
|
- - "~>"
|
195
150
|
- !ruby/object:Gem::Version
|
196
151
|
version: '0.15'
|
197
|
-
- - "<"
|
198
|
-
- !ruby/object:Gem::Version
|
199
|
-
version: '0.18'
|
200
152
|
description: Read text and metadata from files and documents using Apache Tika toolkit
|
201
153
|
email:
|
202
154
|
- erol.fornoles@gmail.com
|
@@ -206,10 +158,10 @@ executables:
|
|
206
158
|
extensions: []
|
207
159
|
extra_rdoc_files: []
|
208
160
|
files:
|
209
|
-
- ".github/workflows/test.yml"
|
210
161
|
- ".gitignore"
|
211
162
|
- ".rspec"
|
212
163
|
- ".rubocop.yml"
|
164
|
+
- ".travis.yml"
|
213
165
|
- Gemfile
|
214
166
|
- LICENSE
|
215
167
|
- NOTICE.txt
|
@@ -217,7 +169,8 @@ files:
|
|
217
169
|
- Rakefile
|
218
170
|
- bin/console
|
219
171
|
- henkei.gemspec
|
220
|
-
- jar/tika-app-
|
172
|
+
- jar/tika-app-2.2.0.jar
|
173
|
+
- jar/tika-config-without-ocr.xml
|
221
174
|
- jar/tika-config.xml
|
222
175
|
- lib/henkei.rb
|
223
176
|
- lib/henkei/configuration.rb
|
@@ -230,12 +183,10 @@ files:
|
|
230
183
|
- spec/samples/sample-metadata-values-with-colons.doc
|
231
184
|
- spec/samples/sample.docx
|
232
185
|
- spec/samples/sample.pages
|
233
|
-
homepage:
|
186
|
+
homepage: http://github.com/abrom/henkei
|
234
187
|
licenses:
|
235
188
|
- MIT
|
236
|
-
metadata:
|
237
|
-
allowed_push_host: https://rubygems.org
|
238
|
-
rubygems_mfa_required: 'true'
|
189
|
+
metadata: {}
|
239
190
|
post_install_message:
|
240
191
|
rdoc_options: []
|
241
192
|
require_paths:
|
@@ -244,19 +195,26 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
244
195
|
requirements:
|
245
196
|
- - ">="
|
246
197
|
- !ruby/object:Gem::Version
|
247
|
-
version: 2.
|
198
|
+
version: 2.4.0
|
248
199
|
- - "<"
|
249
200
|
- !ruby/object:Gem::Version
|
250
|
-
version: 3.
|
201
|
+
version: 3.1.0
|
251
202
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
252
203
|
requirements:
|
253
204
|
- - ">="
|
254
205
|
- !ruby/object:Gem::Version
|
255
206
|
version: '0'
|
256
207
|
requirements: []
|
257
|
-
rubygems_version: 3.
|
208
|
+
rubygems_version: 3.0.9
|
258
209
|
signing_key:
|
259
210
|
specification_version: 4
|
260
211
|
summary: Read text and metadata from files and documents (.doc, .docx, .pages, .odt,
|
261
212
|
.rtf, .pdf) using Apache Tika toolkit
|
262
|
-
test_files:
|
213
|
+
test_files:
|
214
|
+
- spec/helper.rb
|
215
|
+
- spec/henkei_spec.rb
|
216
|
+
- spec/samples/pipe-error.png
|
217
|
+
- spec/samples/sample filename with spaces.pages
|
218
|
+
- spec/samples/sample-metadata-values-with-colons.doc
|
219
|
+
- spec/samples/sample.docx
|
220
|
+
- spec/samples/sample.pages
|
data/.github/workflows/test.yml
DELETED
@@ -1,37 +0,0 @@
|
|
1
|
-
name: Test Henkei Ruby gem
|
2
|
-
|
3
|
-
on:
|
4
|
-
push:
|
5
|
-
branches: [1.x]
|
6
|
-
pull_request:
|
7
|
-
branches: [1.x]
|
8
|
-
|
9
|
-
env:
|
10
|
-
CI: true
|
11
|
-
|
12
|
-
jobs:
|
13
|
-
test:
|
14
|
-
runs-on: ubuntu-latest
|
15
|
-
strategy:
|
16
|
-
matrix:
|
17
|
-
ruby-version: ['2.6', '2.7', '3.0', '3.1']
|
18
|
-
|
19
|
-
steps:
|
20
|
-
- uses: actions/checkout@v2
|
21
|
-
|
22
|
-
- name: Set up Ruby
|
23
|
-
uses: ruby/setup-ruby@v1
|
24
|
-
with:
|
25
|
-
ruby-version: ${{ matrix.ruby-version }}
|
26
|
-
bundler-cache: true
|
27
|
-
|
28
|
-
- name: Lint code - Rubocop
|
29
|
-
run: bundle exec rubocop
|
30
|
-
|
31
|
-
- name: Run tests
|
32
|
-
run: bundle exec rspec
|
33
|
-
|
34
|
-
- name: Test & publish code coverage
|
35
|
-
uses: paambaati/codeclimate-action@v3.0.0
|
36
|
-
env:
|
37
|
-
CC_TEST_REPORTER_ID: bb96c1ff9dc66724c38fb4eb54486dd72dc88a7fd6e727c034b9cf8d747d069e
|