henkei 1.21.0 → 1.23.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dcd47b09b6fe957b09566d905fa0f1800baa3bd385bf5ecb57996f5d5d49d531
4
- data.tar.gz: 961c762ff76e4be1356f5d3ac8af80385a2f811d4099afe98b461228b6bce374
3
+ metadata.gz: 7ee68ad858a48fb82526d230d1d49788be9eee8e0d27562276fd6b18c11e4923
4
+ data.tar.gz: ebedf682b5ef4f979d02eb6cec23d4bdbdf581b958873dc80469fb0bfa15560d
5
5
  SHA512:
6
- metadata.gz: 7017ca74a03708360c2e89decec78eff6e681f61819398374ed85a456674fedcf6e5eedf83008d224686c4e716d7e9228b2f54521862e44bbd4d98f50729a76d
7
- data.tar.gz: 6b2f718bbea1af08e0ee4ed96ed84b8d0bd423ab0596f8d9d09e7a32d11a242188edd7f48c6df46ce52b4e28964c1ca40642318ef4f48b0ffc5aea999f844371
6
+ metadata.gz: 724b7788968a98cf2120912eccfd49b761b25b01666f20f58334eedec44537d9b2238006df3f3fee26f048e54d36213d168152d6809b25466d41ae055566add6
7
+ data.tar.gz: e867f325855b63232d86ab1208c6c35c4b71ad620fab893451f33c1590eaaf3622e72ea5ea71ce60d7486a3bffac9c4f54829cc30683cbe8af20b3c969b4745f
data/.rubocop.yml CHANGED
@@ -1,10 +1,31 @@
1
+ AllCops:
2
+ NewCops: enable
3
+
4
+ Layout/EmptyLinesAroundAttributeAccessor:
5
+ Enabled: true
6
+
7
+ Layout/LineLength:
8
+ Max: 120
9
+
10
+ Layout/SpaceAroundMethodCallOperator:
11
+ Enabled: true
12
+
13
+ Lint/DeprecatedOpenSSLConstant:
14
+ Enabled: true
15
+
16
+ Lint/MixedRegexpCaptureTypes:
17
+ Enabled: true
18
+
19
+ Lint/RaiseException:
20
+ Enabled: true
21
+
22
+ Lint/StructNewOverride:
23
+ Enabled: true
24
+
1
25
  Metrics/BlockLength:
2
26
  Exclude:
3
27
  - 'spec/**/*'
4
28
 
5
- Metrics/LineLength:
6
- Max: 120
7
-
8
29
  Metrics/MethodLength:
9
30
  Max: 15
10
31
 
@@ -13,3 +34,24 @@ Style/ClassVars:
13
34
 
14
35
  Style/DoubleNegation:
15
36
  Enabled: false
37
+
38
+ Style/ExponentialNotation:
39
+ Enabled: true
40
+
41
+ Style/HashEachMethods:
42
+ Enabled: true
43
+
44
+ Style/HashTransformKeys:
45
+ Enabled: true
46
+
47
+ Style/HashTransformValues:
48
+ Enabled: true
49
+
50
+ Style/RedundantRegexpCharacterClass:
51
+ Enabled: true
52
+
53
+ Style/RedundantRegexpEscape:
54
+ Enabled: true
55
+
56
+ Style/SlicingWithRange:
57
+ Enabled: true
data/.travis.yml CHANGED
@@ -1,13 +1,17 @@
1
1
  env:
2
2
  global:
3
3
  - CC_TEST_REPORTER_ID=bb96c1ff9dc66724c38fb4eb54486dd72dc88a7fd6e727c034b9cf8d747d069e
4
+ jobs:
5
+ - INCLUDE_RAILS=false
6
+ - INCLUDE_RAILS=true
4
7
 
5
8
  language: ruby
6
9
  rvm:
7
- - 2.3
8
10
  - 2.4
9
11
  - 2.5
10
12
  - 2.6
13
+ - 2.7
14
+ - 3.0
11
15
 
12
16
  before_install:
13
17
  - gem update bundler
@@ -22,7 +26,7 @@ before_script:
22
26
  - ./cc-test-reporter before-build
23
27
 
24
28
  script:
25
- - rubocop
29
+ - bundle exec rubocop
26
30
  - bundle exec rspec
27
31
 
28
32
  after_script:
data/bin/console ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require 'bundler/setup'
5
+ require 'henkei'
6
+
7
+ require 'irb'
8
+ IRB.start
data/henkei.gemspec CHANGED
@@ -15,6 +15,7 @@ Gem::Specification.new do |spec|
15
15
  '(.doc, .docx, .pages, .odt, .rtf, .pdf) using Apache Tika toolkit'
16
16
  spec.homepage = 'http://github.com/abrom/henkei'
17
17
  spec.license = 'MIT'
18
+ spec.required_ruby_version = ['>= 2.4.0', '< 3.1.0']
18
19
 
19
20
  spec.files = `git ls-files`.split("\n")
20
21
  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
@@ -22,9 +23,10 @@ Gem::Specification.new do |spec|
22
23
  spec.require_paths = ['lib']
23
24
 
24
25
  spec.add_runtime_dependency 'json', '>= 1.8', '< 3'
25
- spec.add_runtime_dependency 'mime-types', '>= 1.23', '< 4'
26
+ spec.add_runtime_dependency 'mini_mime', '>= 0.1.1', '< 2'
26
27
 
27
28
  spec.add_development_dependency 'bundler', '~> 2.0'
29
+ spec.add_development_dependency 'rails', '~> 5.0'
28
30
  spec.add_development_dependency 'rake', '~> 12.3'
29
31
  spec.add_development_dependency 'rspec', '~> 3.7'
30
32
  spec.add_development_dependency 'rubocop', '~> 0.71'
data/lib/henkei.rb CHANGED
@@ -2,25 +2,48 @@
2
2
 
3
3
  require 'henkei/version'
4
4
  require 'henkei/yomu'
5
+ require 'henkei/configuration'
5
6
 
6
7
  require 'net/http'
7
- require 'mime/types'
8
+ require 'mini_mime'
9
+
10
+ # require 'mime/types' if available
11
+ begin
12
+ require 'mime/types'
13
+ rescue LoadError
14
+ nil
15
+ end
16
+
8
17
  require 'time'
9
18
  require 'json'
10
19
 
11
20
  require 'socket'
12
21
  require 'stringio'
13
22
 
23
+ require 'open3'
24
+
14
25
  # Read text and metadata from files and documents using Apache Tika toolkit
15
26
  class Henkei # rubocop:disable Metrics/ClassLength
16
27
  GEM_PATH = File.dirname(File.dirname(__FILE__))
17
- JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-1.21.jar')
28
+ JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-1.23.jar')
18
29
  CONFIG_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-config.xml')
19
30
  DEFAULT_SERVER_PORT = 9293 # an arbitrary, but perfectly cromulent, port
20
31
 
21
32
  @@server_port = nil
22
33
  @@server_pid = nil
23
34
 
35
+ def self.mimetype(content_type)
36
+ if Henkei.configuration.mime_library == 'mime/types' && defined?(MIME::Types)
37
+ warn '[DEPRECATION] `mime/types` is deprecated. Please use `mini_mime` instead.'\
38
+ ' Use Henkei.configure and assign "mini_mime" to `mime_library`.'
39
+ MIME::Types[content_type].first
40
+ else
41
+ MiniMime.lookup_by_content_type(content_type).tap do |object|
42
+ object.define_singleton_method(:extensions) { [extension] }
43
+ end
44
+ end
45
+ end
46
+
24
47
  # Read text or metadata from a data buffer.
25
48
  #
26
49
  # data = File.read 'sample.pages'
@@ -34,7 +57,7 @@ class Henkei # rubocop:disable Metrics/ClassLength
34
57
  when :text then result
35
58
  when :html then result
36
59
  when :metadata then JSON.parse(result)
37
- when :mimetype then MIME::Types[JSON.parse(result)['Content-Type']].first
60
+ when :mimetype then Henkei.mimetype(JSON.parse(result)['Content-Type'])
38
61
  end
39
62
  end
40
63
 
@@ -110,9 +133,8 @@ class Henkei # rubocop:disable Metrics/ClassLength
110
133
  def mimetype
111
134
  return @mimetype if defined? @mimetype
112
135
 
113
- type = metadata['Content-Type'].is_a?(Array) ? metadata['Content-Type'].first : metadata['Content-Type']
114
-
115
- @mimetype = MIME::Types[type].first
136
+ content_type = metadata['Content-Type'].is_a?(Array) ? metadata['Content-Type'].first : metadata['Content-Type']
137
+ @mimetype = Henkei.mimetype(content_type)
116
138
  end
117
139
 
118
140
  # Returns +true+ if the Henkei document was specified using a file path.
@@ -184,7 +206,7 @@ class Henkei # rubocop:disable Metrics/ClassLength
184
206
  def self.server(type, custom_port = nil)
185
207
  @@server_port = custom_port || DEFAULT_SERVER_PORT
186
208
 
187
- @@server_pid = Process.spawn tika_command(type, true)
209
+ @@server_pid = Process.spawn(*tika_command(type, server: true))
188
210
  sleep(2) # Give the server 2 seconds to spin up.
189
211
  @@server_pid
190
212
  end
@@ -217,18 +239,14 @@ class Henkei # rubocop:disable Metrics/ClassLength
217
239
  # Provide the path to the Java binary
218
240
  #
219
241
  def self.java_path
220
- ENV['JAVA_HOME'] ? ENV['JAVA_HOME'] + '/bin/java' : 'java'
242
+ ENV['JAVA_HOME'] ? "#{ENV['JAVA_HOME']}/bin/java" : 'java'
221
243
  end
222
244
  private_class_method :java_path
223
245
 
224
246
  # Internal helper for calling to Tika library directly
225
247
  #
226
248
  def self.client_read(type, data)
227
- IO.popen tika_command(type), 'r+' do |io|
228
- io.write data
229
- io.close_write
230
- io.read
231
- end
249
+ Open3.capture2(*tika_command(type), stdin_data: data, binmode: true).first
232
250
  end
233
251
  private_class_method :client_read
234
252
 
@@ -261,23 +279,22 @@ class Henkei # rubocop:disable Metrics/ClassLength
261
279
 
262
280
  # Internal helper for building the Java command to call Tika
263
281
  #
264
- def self.tika_command(type, server = false)
265
- command = ["#{java_path} -Djava.awt.headless=true -jar #{Henkei::JAR_PATH} --config=#{Henkei::CONFIG_PATH}"]
266
- command << "--server --port #{@@server_port}" if server
267
- command << switch_for_type(type)
268
- command.join ' '
282
+ def self.tika_command(type, server: false)
283
+ command = [java_path, '-Djava.awt.headless=true', '-jar', Henkei::JAR_PATH, "--config=#{Henkei::CONFIG_PATH}"]
284
+ command += ['--server', '--port', @@server_port.to_s] if server
285
+ command + switch_for_type(type)
269
286
  end
270
287
  private_class_method :tika_command
271
288
 
272
289
  # Internal helper for mapping a read type to its Tika command-line switches
273
290
  #
274
291
  def self.switch_for_type(type)
275
- case type
276
- when :text then '-t'
277
- when :html then '-h'
278
- when :metadata then '-m -j'
279
- when :mimetype then '-m -j'
280
- end
292
+ {
293
+ text: ['-t'],
294
+ html: ['-h'],
295
+ metadata: %w[-m -j],
296
+ mimetype: %w[-m -j]
297
+ }[type]
281
298
  end
282
299
  private_class_method :switch_for_type
283
300
  end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Henkei monkey patch for configuration support
4
+ class Henkei
5
+ def self.configuration
6
+ @configuration ||= Configuration.new
7
+ end
8
+
9
+ def self.configure
10
+ yield(configuration)
11
+ end
12
+
13
+ # Handle Henkei configuration
14
+ class Configuration
15
+ attr_accessor :mime_library
16
+
17
+ def initialize
18
+ @mime_library = 'mime/types'
19
+ end
20
+ end
21
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  class Henkei
4
- VERSION = '1.21.0'
4
+ VERSION = '1.23.3'
5
5
  end
data/spec/henkei_spec.rb CHANGED
@@ -1,8 +1,11 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'helper.rb'
3
+ require 'helper'
4
4
  require 'henkei'
5
5
 
6
+ # Some of the tests have been known to fail in weird and wonderful ways when `rails` is included
7
+ require 'rails' if ENV['INCLUDE_RAILS'] == 'true'
8
+
6
9
  describe Henkei do
7
10
  let(:data) { File.read 'spec/samples/sample.docx' }
8
11
 
@@ -40,6 +43,16 @@ describe Henkei do
40
43
  )
41
44
  expect(mimetype.extensions).to include 'docx'
42
45
  end
46
+
47
+ context 'when passing in the `pipe-error.png` test file' do
48
+ let(:data) { File.read 'spec/samples/pipe-error.png' }
49
+
50
+ it 'returns an empty result' do
51
+ text = Henkei.read :text, data
52
+
53
+ expect(text).to eq ''
54
+ end
55
+ end
43
56
  end
44
57
 
45
58
  describe '.new' do
@@ -129,6 +142,23 @@ describe Henkei do
129
142
  specify '#metadata reads metadata' do
130
143
  expect(henkei.metadata['Content-Type']).to eq %w[application/vnd.apple.pages application/vnd.apple.pages]
131
144
  end
145
+
146
+ context 'when passing in the `pipe-error.png` test file' do
147
+ let(:henkei) { Henkei.new 'spec/samples/pipe-error.png' }
148
+
149
+ it '#text returns an empty result' do
150
+ expect(henkei.text).to eq ''
151
+ end
152
+
153
+ it '#html returns an empty body' do
154
+ expect(henkei.html).to include '<body/>'
155
+ expect(henkei.html).to include '<meta name="tiff:ImageWidth" content="792"/>'
156
+ end
157
+
158
+ it '#mimetype returns `image/png`' do
159
+ expect(henkei.mimetype.content_type).to eq 'image/png'
160
+ end
161
+ end
132
162
  end
133
163
 
134
164
  context 'initialized with a given URI' do
@@ -157,6 +187,18 @@ describe Henkei do
157
187
  end
158
188
  end
159
189
 
190
+ context 'when source is a remote PDF' do
191
+ let(:henkei) { Henkei.new 'https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf' }
192
+
193
+ specify '#text reads text' do
194
+ expect(henkei.text).to include 'Dummy PDF file'
195
+ end
196
+
197
+ specify '#metadata reads metadata' do
198
+ expect(henkei.metadata['Content-Type']).to eq 'application/pdf'
199
+ end
200
+ end
201
+
160
202
  context 'working as server mode' do
161
203
  specify '#starts and kills server' do
162
204
  begin
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: henkei
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.21.0
4
+ version: 1.23.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Erol Fornoles
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2019-06-20 00:00:00.000000000 Z
12
+ date: 2021-02-02 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: json
@@ -32,25 +32,25 @@ dependencies:
32
32
  - !ruby/object:Gem::Version
33
33
  version: '3'
34
34
  - !ruby/object:Gem::Dependency
35
- name: mime-types
35
+ name: mini_mime
36
36
  requirement: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - ">="
39
39
  - !ruby/object:Gem::Version
40
- version: '1.23'
40
+ version: 0.1.1
41
41
  - - "<"
42
42
  - !ruby/object:Gem::Version
43
- version: '4'
43
+ version: '2'
44
44
  type: :runtime
45
45
  prerelease: false
46
46
  version_requirements: !ruby/object:Gem::Requirement
47
47
  requirements:
48
48
  - - ">="
49
49
  - !ruby/object:Gem::Version
50
- version: '1.23'
50
+ version: 0.1.1
51
51
  - - "<"
52
52
  - !ruby/object:Gem::Version
53
- version: '4'
53
+ version: '2'
54
54
  - !ruby/object:Gem::Dependency
55
55
  name: bundler
56
56
  requirement: !ruby/object:Gem::Requirement
@@ -65,6 +65,20 @@ dependencies:
65
65
  - - "~>"
66
66
  - !ruby/object:Gem::Version
67
67
  version: '2.0'
68
+ - !ruby/object:Gem::Dependency
69
+ name: rails
70
+ requirement: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - "~>"
73
+ - !ruby/object:Gem::Version
74
+ version: '5.0'
75
+ type: :development
76
+ prerelease: false
77
+ version_requirements: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - "~>"
80
+ - !ruby/object:Gem::Version
81
+ version: '5.0'
68
82
  - !ruby/object:Gem::Dependency
69
83
  name: rake
70
84
  requirement: !ruby/object:Gem::Requirement
@@ -125,7 +139,8 @@ description: Read text and metadata from files and documents using Apache Tika t
125
139
  email:
126
140
  - erol.fornoles@gmail.com
127
141
  - a.bromwich@gmail.com
128
- executables: []
142
+ executables:
143
+ - console
129
144
  extensions: []
130
145
  extra_rdoc_files: []
131
146
  files:
@@ -138,14 +153,17 @@ files:
138
153
  - NOTICE.txt
139
154
  - README.md
140
155
  - Rakefile
156
+ - bin/console
141
157
  - henkei.gemspec
142
- - jar/tika-app-1.21.jar
158
+ - jar/tika-app-1.23.jar
143
159
  - jar/tika-config.xml
144
160
  - lib/henkei.rb
161
+ - lib/henkei/configuration.rb
145
162
  - lib/henkei/version.rb
146
163
  - lib/henkei/yomu.rb
147
164
  - spec/helper.rb
148
165
  - spec/henkei_spec.rb
166
+ - spec/samples/pipe-error.png
149
167
  - spec/samples/sample filename with spaces.pages
150
168
  - spec/samples/sample-metadata-values-with-colons.doc
151
169
  - spec/samples/sample.docx
@@ -162,14 +180,17 @@ required_ruby_version: !ruby/object:Gem::Requirement
162
180
  requirements:
163
181
  - - ">="
164
182
  - !ruby/object:Gem::Version
165
- version: '0'
183
+ version: 2.4.0
184
+ - - "<"
185
+ - !ruby/object:Gem::Version
186
+ version: 3.1.0
166
187
  required_rubygems_version: !ruby/object:Gem::Requirement
167
188
  requirements:
168
189
  - - ">="
169
190
  - !ruby/object:Gem::Version
170
191
  version: '0'
171
192
  requirements: []
172
- rubygems_version: 3.0.3
193
+ rubygems_version: 3.0.6
173
194
  signing_key:
174
195
  specification_version: 4
175
196
  summary: Read text and metadata from files and documents (.doc, .docx, .pages, .odt,
@@ -177,6 +198,7 @@ summary: Read text and metadata from files and documents (.doc, .docx, .pages, .
177
198
  test_files:
178
199
  - spec/helper.rb
179
200
  - spec/henkei_spec.rb
201
+ - spec/samples/pipe-error.png
180
202
  - spec/samples/sample filename with spaces.pages
181
203
  - spec/samples/sample-metadata-values-with-colons.doc
182
204
  - spec/samples/sample.docx