henkei 1.21.0 → 1.23.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +45 -3
- data/.travis.yml +6 -2
- data/bin/console +8 -0
- data/henkei.gemspec +3 -1
- data/jar/{tika-app-1.21.jar → tika-app-1.23.jar} +0 -0
- data/lib/henkei.rb +41 -24
- data/lib/henkei/configuration.rb +21 -0
- data/lib/henkei/version.rb +1 -1
- data/spec/henkei_spec.rb +43 -1
- data/spec/samples/pipe-error.png +0 -0
- metadata +33 -11
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7ee68ad858a48fb82526d230d1d49788be9eee8e0d27562276fd6b18c11e4923
|
4
|
+
data.tar.gz: ebedf682b5ef4f979d02eb6cec23d4bdbdf581b958873dc80469fb0bfa15560d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 724b7788968a98cf2120912eccfd49b761b25b01666f20f58334eedec44537d9b2238006df3f3fee26f048e54d36213d168152d6809b25466d41ae055566add6
|
7
|
+
data.tar.gz: e867f325855b63232d86ab1208c6c35c4b71ad620fab893451f33c1590eaaf3622e72ea5ea71ce60d7486a3bffac9c4f54829cc30683cbe8af20b3c969b4745f
|
data/.rubocop.yml
CHANGED
@@ -1,10 +1,31 @@
|
|
1
|
+
AllCops:
|
2
|
+
NewCops: enable
|
3
|
+
|
4
|
+
Layout/EmptyLinesAroundAttributeAccessor:
|
5
|
+
Enabled: true
|
6
|
+
|
7
|
+
Layout/LineLength:
|
8
|
+
Max: 120
|
9
|
+
|
10
|
+
Layout/SpaceAroundMethodCallOperator:
|
11
|
+
Enabled: true
|
12
|
+
|
13
|
+
Lint/DeprecatedOpenSSLConstant:
|
14
|
+
Enabled: true
|
15
|
+
|
16
|
+
Lint/MixedRegexpCaptureTypes:
|
17
|
+
Enabled: true
|
18
|
+
|
19
|
+
Lint/RaiseException:
|
20
|
+
Enabled: true
|
21
|
+
|
22
|
+
Lint/StructNewOverride:
|
23
|
+
Enabled: true
|
24
|
+
|
1
25
|
Metrics/BlockLength:
|
2
26
|
Exclude:
|
3
27
|
- 'spec/**/*'
|
4
28
|
|
5
|
-
Metrics/LineLength:
|
6
|
-
Max: 120
|
7
|
-
|
8
29
|
Metrics/MethodLength:
|
9
30
|
Max: 15
|
10
31
|
|
@@ -13,3 +34,24 @@ Style/ClassVars:
|
|
13
34
|
|
14
35
|
Style/DoubleNegation:
|
15
36
|
Enabled: false
|
37
|
+
|
38
|
+
Style/ExponentialNotation:
|
39
|
+
Enabled: true
|
40
|
+
|
41
|
+
Style/HashEachMethods:
|
42
|
+
Enabled: true
|
43
|
+
|
44
|
+
Style/HashTransformKeys:
|
45
|
+
Enabled: true
|
46
|
+
|
47
|
+
Style/HashTransformValues:
|
48
|
+
Enabled: true
|
49
|
+
|
50
|
+
Style/RedundantRegexpCharacterClass:
|
51
|
+
Enabled: true
|
52
|
+
|
53
|
+
Style/RedundantRegexpEscape:
|
54
|
+
Enabled: true
|
55
|
+
|
56
|
+
Style/SlicingWithRange:
|
57
|
+
Enabled: true
|
data/.travis.yml
CHANGED
@@ -1,13 +1,17 @@
|
|
1
1
|
env:
|
2
2
|
global:
|
3
3
|
- CC_TEST_REPORTER_ID=bb96c1ff9dc66724c38fb4eb54486dd72dc88a7fd6e727c034b9cf8d747d069e
|
4
|
+
jobs:
|
5
|
+
- INCLUDE_RAILS=false
|
6
|
+
- INCLUDE_RAILS=true
|
4
7
|
|
5
8
|
language: ruby
|
6
9
|
rvm:
|
7
|
-
- 2.3
|
8
10
|
- 2.4
|
9
11
|
- 2.5
|
10
12
|
- 2.6
|
13
|
+
- 2.7
|
14
|
+
- 3.0
|
11
15
|
|
12
16
|
before_install:
|
13
17
|
- gem update bundler
|
@@ -22,7 +26,7 @@ before_script:
|
|
22
26
|
- ./cc-test-reporter before-build
|
23
27
|
|
24
28
|
script:
|
25
|
-
- rubocop
|
29
|
+
- bundle exec rubocop
|
26
30
|
- bundle exec rspec
|
27
31
|
|
28
32
|
after_script:
|
data/bin/console
ADDED
data/henkei.gemspec
CHANGED
@@ -15,6 +15,7 @@ Gem::Specification.new do |spec|
|
|
15
15
|
'(.doc, .docx, .pages, .odt, .rtf, .pdf) using Apache Tika toolkit'
|
16
16
|
spec.homepage = 'http://github.com/abrom/henkei'
|
17
17
|
spec.license = 'MIT'
|
18
|
+
spec.required_ruby_version = ['>= 2.4.0', '< 3.1.0']
|
18
19
|
|
19
20
|
spec.files = `git ls-files`.split("\n")
|
20
21
|
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
@@ -22,9 +23,10 @@ Gem::Specification.new do |spec|
|
|
22
23
|
spec.require_paths = ['lib']
|
23
24
|
|
24
25
|
spec.add_runtime_dependency 'json', '>= 1.8', '< 3'
|
25
|
-
spec.add_runtime_dependency '
|
26
|
+
spec.add_runtime_dependency 'mini_mime', '>= 0.1.1', '< 2'
|
26
27
|
|
27
28
|
spec.add_development_dependency 'bundler', '~> 2.0'
|
29
|
+
spec.add_development_dependency 'rails', '~> 5.0'
|
28
30
|
spec.add_development_dependency 'rake', '~> 12.3'
|
29
31
|
spec.add_development_dependency 'rspec', '~> 3.7'
|
30
32
|
spec.add_development_dependency 'rubocop', '~> 0.71'
|
Binary file
|
data/lib/henkei.rb
CHANGED
@@ -2,25 +2,48 @@
|
|
2
2
|
|
3
3
|
require 'henkei/version'
|
4
4
|
require 'henkei/yomu'
|
5
|
+
require 'henkei/configuration'
|
5
6
|
|
6
7
|
require 'net/http'
|
7
|
-
require 'mime/types'
|
8
|
+
require 'mini_mime'
|
9
|
+
|
10
|
+
# require 'mime/types' if available
|
11
|
+
begin
|
12
|
+
require 'mime/types'
|
13
|
+
rescue LoadError
|
14
|
+
nil
|
15
|
+
end
|
16
|
+
|
8
17
|
require 'time'
|
9
18
|
require 'json'
|
10
19
|
|
11
20
|
require 'socket'
|
12
21
|
require 'stringio'
|
13
22
|
|
23
|
+
require 'open3'
|
24
|
+
|
14
25
|
# Read text and metadata from files and documents using Apache Tika toolkit
|
15
26
|
class Henkei # rubocop:disable Metrics/ClassLength
|
16
27
|
GEM_PATH = File.dirname(File.dirname(__FILE__))
|
17
|
-
JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-1.21.jar')
|
28
|
+
JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-1.23.jar')
|
18
29
|
CONFIG_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-config.xml')
|
19
30
|
DEFAULT_SERVER_PORT = 9293 # an arbitrary, but perfectly cromulent, port
|
20
31
|
|
21
32
|
@@server_port = nil
|
22
33
|
@@server_pid = nil
|
23
34
|
|
35
|
+
def self.mimetype(content_type)
|
36
|
+
if Henkei.configuration.mime_library == 'mime/types' && defined?(MIME::Types)
|
37
|
+
warn '[DEPRECATION] `mime/types` is deprecated. Please use `mini_mime` instead.'\
|
38
|
+
' Use Henkei.configure and assign "mini_mime" to `mime_library`.'
|
39
|
+
MIME::Types[content_type].first
|
40
|
+
else
|
41
|
+
MiniMime.lookup_by_content_type(content_type).tap do |object|
|
42
|
+
object.define_singleton_method(:extensions) { [extension] }
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
24
47
|
# Read text or metadata from a data buffer.
|
25
48
|
#
|
26
49
|
# data = File.read 'sample.pages'
|
@@ -34,7 +57,7 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
34
57
|
when :text then result
|
35
58
|
when :html then result
|
36
59
|
when :metadata then JSON.parse(result)
|
37
|
-
when :mimetype then
|
60
|
+
when :mimetype then Henkei.mimetype(JSON.parse(result)['Content-Type'])
|
38
61
|
end
|
39
62
|
end
|
40
63
|
|
@@ -110,9 +133,8 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
110
133
|
def mimetype
|
111
134
|
return @mimetype if defined? @mimetype
|
112
135
|
|
113
|
-
|
114
|
-
|
115
|
-
@mimetype = MIME::Types[type].first
|
136
|
+
content_type = metadata['Content-Type'].is_a?(Array) ? metadata['Content-Type'].first : metadata['Content-Type']
|
137
|
+
@mimetype = Henkei.mimetype(content_type)
|
116
138
|
end
|
117
139
|
|
118
140
|
# Returns +true+ if the Henkei document was specified using a file path.
|
@@ -184,7 +206,7 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
184
206
|
def self.server(type, custom_port = nil)
|
185
207
|
@@server_port = custom_port || DEFAULT_SERVER_PORT
|
186
208
|
|
187
|
-
@@server_pid = Process.spawn
|
209
|
+
@@server_pid = Process.spawn(*tika_command(type, server: true))
|
188
210
|
sleep(2) # Give the server 2 seconds to spin up.
|
189
211
|
@@server_pid
|
190
212
|
end
|
@@ -217,18 +239,14 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
217
239
|
# Provide the path to the Java binary
|
218
240
|
#
|
219
241
|
def self.java_path
|
220
|
-
ENV['JAVA_HOME'] ? ENV['JAVA_HOME']
|
242
|
+
ENV['JAVA_HOME'] ? "#{ENV['JAVA_HOME']}/bin/java" : 'java'
|
221
243
|
end
|
222
244
|
private_class_method :java_path
|
223
245
|
|
224
246
|
# Internal helper for calling to Tika library directly
|
225
247
|
#
|
226
248
|
def self.client_read(type, data)
|
227
|
-
|
228
|
-
io.write data
|
229
|
-
io.close_write
|
230
|
-
io.read
|
231
|
-
end
|
249
|
+
Open3.capture2(*tika_command(type), stdin_data: data, binmode: true).first
|
232
250
|
end
|
233
251
|
private_class_method :client_read
|
234
252
|
|
@@ -261,23 +279,22 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
261
279
|
|
262
280
|
# Internal helper for building the Java command to call Tika
|
263
281
|
#
|
264
|
-
def self.tika_command(type, server
|
265
|
-
command = [
|
266
|
-
command
|
267
|
-
command
|
268
|
-
command.join ' '
|
282
|
+
def self.tika_command(type, server: false)
|
283
|
+
command = [java_path, '-Djava.awt.headless=true', '-jar', Henkei::JAR_PATH, "--config=#{Henkei::CONFIG_PATH}"]
|
284
|
+
command += ['--server', '--port', @@server_port.to_s] if server
|
285
|
+
command + switch_for_type(type)
|
269
286
|
end
|
270
287
|
private_class_method :tika_command
|
271
288
|
|
272
289
|
# Internal helper for building the Java command to call Tika
|
273
290
|
#
|
274
291
|
def self.switch_for_type(type)
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
292
|
+
{
|
293
|
+
text: ['-t'],
|
294
|
+
html: ['-h'],
|
295
|
+
metadata: %w[-m -j],
|
296
|
+
mimetype: %w[-m -j]
|
297
|
+
}[type]
|
281
298
|
end
|
282
299
|
private_class_method :switch_for_type
|
283
300
|
end
|
data/lib/henkei/configuration.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Henkei monkey patch for configuration support
|
4
|
+
class Henkei
|
5
|
+
def self.configuration
|
6
|
+
@configuration ||= Configuration.new
|
7
|
+
end
|
8
|
+
|
9
|
+
def self.configure
|
10
|
+
yield(configuration)
|
11
|
+
end
|
12
|
+
|
13
|
+
# Handle Henkei configuration
|
14
|
+
class Configuration
|
15
|
+
attr_accessor :mime_library
|
16
|
+
|
17
|
+
def initialize
|
18
|
+
@mime_library = 'mime/types'
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
data/lib/henkei/version.rb
CHANGED
data/spec/henkei_spec.rb
CHANGED
@@ -1,8 +1,11 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require 'helper
|
3
|
+
require 'helper'
|
4
4
|
require 'henkei'
|
5
5
|
|
6
|
+
# Some of the tests have been known to fail in weird and wonderful ways when `rails` is included
|
7
|
+
require 'rails' if ENV['INCLUDE_RAILS'] == 'true'
|
8
|
+
|
6
9
|
describe Henkei do
|
7
10
|
let(:data) { File.read 'spec/samples/sample.docx' }
|
8
11
|
|
@@ -40,6 +43,16 @@ describe Henkei do
|
|
40
43
|
)
|
41
44
|
expect(mimetype.extensions).to include 'docx'
|
42
45
|
end
|
46
|
+
|
47
|
+
context 'when passing in the `pipe-error.png` test file' do
|
48
|
+
let(:data) { File.read 'spec/samples/pipe-error.png' }
|
49
|
+
|
50
|
+
it 'returns an empty result' do
|
51
|
+
text = Henkei.read :text, data
|
52
|
+
|
53
|
+
expect(text).to eq ''
|
54
|
+
end
|
55
|
+
end
|
43
56
|
end
|
44
57
|
|
45
58
|
describe '.new' do
|
@@ -129,6 +142,23 @@ describe Henkei do
|
|
129
142
|
specify '#metadata reads metadata' do
|
130
143
|
expect(henkei.metadata['Content-Type']).to eq %w[application/vnd.apple.pages application/vnd.apple.pages]
|
131
144
|
end
|
145
|
+
|
146
|
+
context 'when passing in the `pipe-error.png` test file' do
|
147
|
+
let(:henkei) { Henkei.new 'spec/samples/pipe-error.png' }
|
148
|
+
|
149
|
+
it '#text returns an empty result' do
|
150
|
+
expect(henkei.text).to eq ''
|
151
|
+
end
|
152
|
+
|
153
|
+
it '#html returns an empty body' do
|
154
|
+
expect(henkei.html).to include '<body/>'
|
155
|
+
expect(henkei.html).to include '<meta name="tiff:ImageWidth" content="792"/>'
|
156
|
+
end
|
157
|
+
|
158
|
+
it '#mimetype returns `image/png`' do
|
159
|
+
expect(henkei.mimetype.content_type).to eq 'image/png'
|
160
|
+
end
|
161
|
+
end
|
132
162
|
end
|
133
163
|
|
134
164
|
context 'initialized with a given URI' do
|
@@ -157,6 +187,18 @@ describe Henkei do
|
|
157
187
|
end
|
158
188
|
end
|
159
189
|
|
190
|
+
context 'when source is a remote PDF' do
|
191
|
+
let(:henkei) { Henkei.new 'https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf' }
|
192
|
+
|
193
|
+
specify '#text reads text' do
|
194
|
+
expect(henkei.text).to include 'Dummy PDF file'
|
195
|
+
end
|
196
|
+
|
197
|
+
specify '#metadata reads metadata' do
|
198
|
+
expect(henkei.metadata['Content-Type']).to eq 'application/pdf'
|
199
|
+
end
|
200
|
+
end
|
201
|
+
|
160
202
|
context 'working as server mode' do
|
161
203
|
specify '#starts and kills server' do
|
162
204
|
begin
|
data/spec/samples/pipe-error.png
ADDED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: henkei
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.21.0
|
4
|
+
version: 1.23.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Erol Fornoles
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2021-02-02 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: json
|
@@ -32,25 +32,25 @@ dependencies:
|
|
32
32
|
- !ruby/object:Gem::Version
|
33
33
|
version: '3'
|
34
34
|
- !ruby/object:Gem::Dependency
|
35
|
-
name:
|
35
|
+
name: mini_mime
|
36
36
|
requirement: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version:
|
40
|
+
version: 0.1.1
|
41
41
|
- - "<"
|
42
42
|
- !ruby/object:Gem::Version
|
43
|
-
version: '
|
43
|
+
version: '2'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
46
|
version_requirements: !ruby/object:Gem::Requirement
|
47
47
|
requirements:
|
48
48
|
- - ">="
|
49
49
|
- !ruby/object:Gem::Version
|
50
|
-
version:
|
50
|
+
version: 0.1.1
|
51
51
|
- - "<"
|
52
52
|
- !ruby/object:Gem::Version
|
53
|
-
version: '
|
53
|
+
version: '2'
|
54
54
|
- !ruby/object:Gem::Dependency
|
55
55
|
name: bundler
|
56
56
|
requirement: !ruby/object:Gem::Requirement
|
@@ -65,6 +65,20 @@ dependencies:
|
|
65
65
|
- - "~>"
|
66
66
|
- !ruby/object:Gem::Version
|
67
67
|
version: '2.0'
|
68
|
+
- !ruby/object:Gem::Dependency
|
69
|
+
name: rails
|
70
|
+
requirement: !ruby/object:Gem::Requirement
|
71
|
+
requirements:
|
72
|
+
- - "~>"
|
73
|
+
- !ruby/object:Gem::Version
|
74
|
+
version: '5.0'
|
75
|
+
type: :development
|
76
|
+
prerelease: false
|
77
|
+
version_requirements: !ruby/object:Gem::Requirement
|
78
|
+
requirements:
|
79
|
+
- - "~>"
|
80
|
+
- !ruby/object:Gem::Version
|
81
|
+
version: '5.0'
|
68
82
|
- !ruby/object:Gem::Dependency
|
69
83
|
name: rake
|
70
84
|
requirement: !ruby/object:Gem::Requirement
|
@@ -125,7 +139,8 @@ description: Read text and metadata from files and documents using Apache Tika t
|
|
125
139
|
email:
|
126
140
|
- erol.fornoles@gmail.com
|
127
141
|
- a.bromwich@gmail.com
|
128
|
-
executables:
|
142
|
+
executables:
|
143
|
+
- console
|
129
144
|
extensions: []
|
130
145
|
extra_rdoc_files: []
|
131
146
|
files:
|
@@ -138,14 +153,17 @@ files:
|
|
138
153
|
- NOTICE.txt
|
139
154
|
- README.md
|
140
155
|
- Rakefile
|
156
|
+
- bin/console
|
141
157
|
- henkei.gemspec
|
142
|
-
- jar/tika-app-1.21.jar
|
158
|
+
- jar/tika-app-1.23.jar
|
143
159
|
- jar/tika-config.xml
|
144
160
|
- lib/henkei.rb
|
161
|
+
- lib/henkei/configuration.rb
|
145
162
|
- lib/henkei/version.rb
|
146
163
|
- lib/henkei/yomu.rb
|
147
164
|
- spec/helper.rb
|
148
165
|
- spec/henkei_spec.rb
|
166
|
+
- spec/samples/pipe-error.png
|
149
167
|
- spec/samples/sample filename with spaces.pages
|
150
168
|
- spec/samples/sample-metadata-values-with-colons.doc
|
151
169
|
- spec/samples/sample.docx
|
@@ -162,14 +180,17 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
162
180
|
requirements:
|
163
181
|
- - ">="
|
164
182
|
- !ruby/object:Gem::Version
|
165
|
-
version:
|
183
|
+
version: 2.4.0
|
184
|
+
- - "<"
|
185
|
+
- !ruby/object:Gem::Version
|
186
|
+
version: 3.1.0
|
166
187
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
167
188
|
requirements:
|
168
189
|
- - ">="
|
169
190
|
- !ruby/object:Gem::Version
|
170
191
|
version: '0'
|
171
192
|
requirements: []
|
172
|
-
rubygems_version: 3.0.
|
193
|
+
rubygems_version: 3.0.6
|
173
194
|
signing_key:
|
174
195
|
specification_version: 4
|
175
196
|
summary: Read text and metadata from files and documents (.doc, .docx, .pages, .odt,
|
@@ -177,6 +198,7 @@ summary: Read text and metadata from files and documents (.doc, .docx, .pages, .
|
|
177
198
|
test_files:
|
178
199
|
- spec/helper.rb
|
179
200
|
- spec/henkei_spec.rb
|
201
|
+
- spec/samples/pipe-error.png
|
180
202
|
- spec/samples/sample filename with spaces.pages
|
181
203
|
- spec/samples/sample-metadata-values-with-colons.doc
|
182
204
|
- spec/samples/sample.docx
|