henkei 1.23.0 → 1.25.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f9ebc4be4691020c72328134a33a9ffe6b4fb79a939ddc9ce833c56551abb86d
4
- data.tar.gz: 17402ba43e9840b59090a82f1cd39e79e594ca3d36c763b958f9243174990f8e
3
+ metadata.gz: fd17a3239a6521a21c21ef943bfd476396ea6cca4d69636eaafec2303d2ab11d
4
+ data.tar.gz: 601055f7fffbf29539a9092ce577814b3f1be75480abd0578b1f4940f1a58ae9
5
5
  SHA512:
6
- metadata.gz: 74dcf4d6f2ce5f99b77b3c1fdd34a271220c58e8aae167b40cde35eef2166570d3c4de7b94f91d98158fe3cc384ec7a7688cf98812e378607e31f8d24e06420f
7
- data.tar.gz: aa210ee582d56592932684216eb93cd3f91ea7ba95e3b1d4bc672ed09bdc9605008e7f6f21472ca291c574cb1050b18eb6bdb47aa5caa03341ae393cbb0b9939
6
+ metadata.gz: 44bfa5b4d0bec122127f64a3a6ec745d6efcb56b132b48cd9e768e8ed76ab97729d724a44317c631cccc8edf2e8e21d4f3703d7da6dbc611258a100f6fb7fad6
7
+ data.tar.gz: bb537b0c7d75c3a9de2433b35acda5e25e8b4a8f72ee55ccf880f91ba70a039057c135f367ec08de6ef08cda5cac6ca65bcc4400f4966d6c86cb90df0edc2eb5
data/.rubocop.yml CHANGED
@@ -1,6 +1,27 @@
1
+ AllCops:
2
+ NewCops: enable
3
+
4
+ Layout/EmptyLinesAroundAttributeAccessor:
5
+ Enabled: true
6
+
1
7
  Layout/LineLength:
2
8
  Max: 120
3
9
 
10
+ Layout/SpaceAroundMethodCallOperator:
11
+ Enabled: true
12
+
13
+ Lint/DeprecatedOpenSSLConstant:
14
+ Enabled: true
15
+
16
+ Lint/MixedRegexpCaptureTypes:
17
+ Enabled: true
18
+
19
+ Lint/RaiseException:
20
+ Enabled: true
21
+
22
+ Lint/StructNewOverride:
23
+ Enabled: true
24
+
4
25
  Metrics/BlockLength:
5
26
  Exclude:
6
27
  - 'spec/**/*'
@@ -13,3 +34,24 @@ Style/ClassVars:
13
34
 
14
35
  Style/DoubleNegation:
15
36
  Enabled: false
37
+
38
+ Style/ExponentialNotation:
39
+ Enabled: true
40
+
41
+ Style/HashEachMethods:
42
+ Enabled: true
43
+
44
+ Style/HashTransformKeys:
45
+ Enabled: true
46
+
47
+ Style/HashTransformValues:
48
+ Enabled: true
49
+
50
+ Style/RedundantRegexpCharacterClass:
51
+ Enabled: true
52
+
53
+ Style/RedundantRegexpEscape:
54
+ Enabled: true
55
+
56
+ Style/SlicingWithRange:
57
+ Enabled: true
data/.travis.yml CHANGED
@@ -1,13 +1,17 @@
1
1
  env:
2
2
  global:
3
3
  - CC_TEST_REPORTER_ID=bb96c1ff9dc66724c38fb4eb54486dd72dc88a7fd6e727c034b9cf8d747d069e
4
+ jobs:
5
+ - INCLUDE_RAILS=false
6
+ - INCLUDE_RAILS=true
4
7
 
5
8
  language: ruby
6
9
  rvm:
7
- - 2.3
8
10
  - 2.4
9
11
  - 2.5
10
12
  - 2.6
13
+ - 2.7
14
+ - 3.0
11
15
 
12
16
  before_install:
13
17
  - gem update bundler
@@ -22,7 +26,7 @@ before_script:
22
26
  - ./cc-test-reporter before-build
23
27
 
24
28
  script:
25
- - rubocop
29
+ - bundle exec rubocop
26
30
  - bundle exec rspec
27
31
 
28
32
  after_script:
data/henkei.gemspec CHANGED
@@ -15,6 +15,7 @@ Gem::Specification.new do |spec|
15
15
  '(.doc, .docx, .pages, .odt, .rtf, .pdf) using Apache Tika toolkit'
16
16
  spec.homepage = 'http://github.com/abrom/henkei'
17
17
  spec.license = 'MIT'
18
+ spec.required_ruby_version = ['>= 2.4.0', '< 3.1.0']
18
19
 
19
20
  spec.files = `git ls-files`.split("\n")
20
21
  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
@@ -22,9 +23,10 @@ Gem::Specification.new do |spec|
22
23
  spec.require_paths = ['lib']
23
24
 
24
25
  spec.add_runtime_dependency 'json', '>= 1.8', '< 3'
25
- spec.add_runtime_dependency 'mime-types', '>= 1.23', '< 4'
26
+ spec.add_runtime_dependency 'mini_mime', '>= 0.1.1', '< 2'
26
27
 
27
28
  spec.add_development_dependency 'bundler', '~> 2.0'
29
+ spec.add_development_dependency 'rails', '~> 5.0'
28
30
  spec.add_development_dependency 'rake', '~> 12.3'
29
31
  spec.add_development_dependency 'rspec', '~> 3.7'
30
32
  spec.add_development_dependency 'rubocop', '~> 0.71'
data/lib/henkei.rb CHANGED
@@ -2,9 +2,18 @@
2
2
 
3
3
  require 'henkei/version'
4
4
  require 'henkei/yomu'
5
+ require 'henkei/configuration'
5
6
 
6
7
  require 'net/http'
7
- require 'mime/types'
8
+ require 'mini_mime'
9
+
10
+ # require 'mime/types' if available
11
+ begin
12
+ require 'mime/types'
13
+ rescue LoadError
14
+ nil
15
+ end
16
+
8
17
  require 'time'
9
18
  require 'json'
10
19
 
@@ -16,13 +25,25 @@ require 'open3'
16
25
  # Read text and metadata from files and documents using Apache Tika toolkit
17
26
  class Henkei # rubocop:disable Metrics/ClassLength
18
27
  GEM_PATH = File.dirname(File.dirname(__FILE__))
19
- JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-1.23.jar')
28
+ JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-1.25.jar')
20
29
  CONFIG_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-config.xml')
21
30
  DEFAULT_SERVER_PORT = 9293 # an arbitrary, but perfectly cromulent, port
22
31
 
23
32
  @@server_port = nil
24
33
  @@server_pid = nil
25
34
 
35
+ def self.mimetype(content_type)
36
+ if Henkei.configuration.mime_library == 'mime/types' && defined?(MIME::Types)
37
+ warn '[DEPRECATION] `mime/types` is deprecated. Please use `mini_mime` instead.'\
38
+ ' Use Henkei.configure and assign "mini_mime" to `mime_library`.'
39
+ MIME::Types[content_type].first
40
+ else
41
+ MiniMime.lookup_by_content_type(content_type).tap do |object|
42
+ object.define_singleton_method(:extensions) { [extension] }
43
+ end
44
+ end
45
+ end
46
+
26
47
  # Read text or metadata from a data buffer.
27
48
  #
28
49
  # data = File.read 'sample.pages'
@@ -36,7 +57,7 @@ class Henkei # rubocop:disable Metrics/ClassLength
36
57
  when :text then result
37
58
  when :html then result
38
59
  when :metadata then JSON.parse(result)
39
- when :mimetype then MIME::Types[JSON.parse(result)['Content-Type']].first
60
+ when :mimetype then Henkei.mimetype(JSON.parse(result)['Content-Type'])
40
61
  end
41
62
  end
42
63
 
@@ -112,9 +133,8 @@ class Henkei # rubocop:disable Metrics/ClassLength
112
133
  def mimetype
113
134
  return @mimetype if defined? @mimetype
114
135
 
115
- type = metadata['Content-Type'].is_a?(Array) ? metadata['Content-Type'].first : metadata['Content-Type']
116
-
117
- @mimetype = MIME::Types[type].first
136
+ content_type = metadata['Content-Type'].is_a?(Array) ? metadata['Content-Type'].first : metadata['Content-Type']
137
+ @mimetype = Henkei.mimetype(content_type)
118
138
  end
119
139
 
120
140
  # Returns +true+ if the Henkei document was specified using a file path.
@@ -186,7 +206,7 @@ class Henkei # rubocop:disable Metrics/ClassLength
186
206
  def self.server(type, custom_port = nil)
187
207
  @@server_port = custom_port || DEFAULT_SERVER_PORT
188
208
 
189
- @@server_pid = Process.spawn tika_command(type, true)
209
+ @@server_pid = Process.spawn(*tika_command(type, server: true))
190
210
  sleep(2) # Give the server 2 seconds to spin up.
191
211
  @@server_pid
192
212
  end
@@ -219,14 +239,14 @@ class Henkei # rubocop:disable Metrics/ClassLength
219
239
  # Provide the path to the Java binary
220
240
  #
221
241
  def self.java_path
222
- ENV['JAVA_HOME'] ? ENV['JAVA_HOME'] + '/bin/java' : 'java'
242
+ ENV['JAVA_HOME'] ? "#{ENV['JAVA_HOME']}/bin/java" : 'java'
223
243
  end
224
244
  private_class_method :java_path
225
245
 
226
246
  # Internal helper for calling to Tika library directly
227
247
  #
228
248
  def self.client_read(type, data)
229
- Open3.capture2(tika_command(type), stdin_data: data).first
249
+ Open3.capture2(*tika_command(type), stdin_data: data, binmode: true).first
230
250
  end
231
251
  private_class_method :client_read
232
252
 
@@ -259,23 +279,22 @@ class Henkei # rubocop:disable Metrics/ClassLength
259
279
 
260
280
  # Internal helper for building the Java command to call Tika
261
281
  #
262
- def self.tika_command(type, server = false)
263
- command = ["#{java_path} -Djava.awt.headless=true -jar #{Henkei::JAR_PATH} --config=#{Henkei::CONFIG_PATH}"]
264
- command << "--server --port #{@@server_port}" if server
265
- command << switch_for_type(type)
266
- command.join ' '
282
+ def self.tika_command(type, server: false)
283
+ command = [java_path, '-Djava.awt.headless=true', '-jar', Henkei::JAR_PATH, "--config=#{Henkei::CONFIG_PATH}"]
284
+ command += ['--server', '--port', @@server_port.to_s] if server
285
+ command + switch_for_type(type)
267
286
  end
268
287
  private_class_method :tika_command
269
288
 
270
289
  # Internal helper for building the Java command to call Tika
271
290
  #
272
291
  def self.switch_for_type(type)
273
- case type
274
- when :text then '-t'
275
- when :html then '-h'
276
- when :metadata then '-m -j'
277
- when :mimetype then '-m -j'
278
- end
292
+ {
293
+ text: ['-t'],
294
+ html: ['-h'],
295
+ metadata: %w[-m -j],
296
+ mimetype: %w[-m -j]
297
+ }[type]
279
298
  end
280
299
  private_class_method :switch_for_type
281
300
  end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Henkei monkey patch for configuration support
4
+ class Henkei
5
+ def self.configuration
6
+ @configuration ||= Configuration.new
7
+ end
8
+
9
+ def self.configure
10
+ yield(configuration)
11
+ end
12
+
13
+ # Handle Henkei configuration
14
+ class Configuration
15
+ attr_accessor :mime_library
16
+
17
+ def initialize
18
+ @mime_library = 'mime/types'
19
+ end
20
+ end
21
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  class Henkei
4
- VERSION = '1.23.0'
4
+ VERSION = '1.25.1'
5
5
  end
data/spec/henkei_spec.rb CHANGED
@@ -1,8 +1,11 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'helper.rb'
3
+ require 'helper'
4
4
  require 'henkei'
5
5
 
6
+ # Some of the tests have been known to fail in weird and wonderful ways when `rails` is included
7
+ require 'rails' if ENV['INCLUDE_RAILS'] == 'true'
8
+
6
9
  describe Henkei do
7
10
  let(:data) { File.read 'spec/samples/sample.docx' }
8
11
 
@@ -152,7 +155,7 @@ describe Henkei do
152
155
  expect(henkei.html).to include '<meta name="tiff:ImageWidth" content="792"/>'
153
156
  end
154
157
 
155
- it '#mimetype returns an empty result' do
158
+ it '#mimetype returns `image/png`' do
156
159
  expect(henkei.mimetype.content_type).to eq 'image/png'
157
160
  end
158
161
  end
@@ -184,6 +187,18 @@ describe Henkei do
184
187
  end
185
188
  end
186
189
 
190
+ context 'when source is a remote PDF' do
191
+ let(:henkei) { Henkei.new 'https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf' }
192
+
193
+ specify '#text reads text' do
194
+ expect(henkei.text).to include 'Dummy PDF file'
195
+ end
196
+
197
+ specify '#metadata reads metadata' do
198
+ expect(henkei.metadata['Content-Type']).to eq 'application/pdf'
199
+ end
200
+ end
201
+
187
202
  context 'working as server mode' do
188
203
  specify '#starts and kills server' do
189
204
  begin
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: henkei
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.23.0
4
+ version: 1.25.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Erol Fornoles
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2019-12-26 00:00:00.000000000 Z
12
+ date: 2021-02-02 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: json
@@ -32,25 +32,25 @@ dependencies:
32
32
  - !ruby/object:Gem::Version
33
33
  version: '3'
34
34
  - !ruby/object:Gem::Dependency
35
- name: mime-types
35
+ name: mini_mime
36
36
  requirement: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - ">="
39
39
  - !ruby/object:Gem::Version
40
- version: '1.23'
40
+ version: 0.1.1
41
41
  - - "<"
42
42
  - !ruby/object:Gem::Version
43
- version: '4'
43
+ version: '2'
44
44
  type: :runtime
45
45
  prerelease: false
46
46
  version_requirements: !ruby/object:Gem::Requirement
47
47
  requirements:
48
48
  - - ">="
49
49
  - !ruby/object:Gem::Version
50
- version: '1.23'
50
+ version: 0.1.1
51
51
  - - "<"
52
52
  - !ruby/object:Gem::Version
53
- version: '4'
53
+ version: '2'
54
54
  - !ruby/object:Gem::Dependency
55
55
  name: bundler
56
56
  requirement: !ruby/object:Gem::Requirement
@@ -65,6 +65,20 @@ dependencies:
65
65
  - - "~>"
66
66
  - !ruby/object:Gem::Version
67
67
  version: '2.0'
68
+ - !ruby/object:Gem::Dependency
69
+ name: rails
70
+ requirement: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - "~>"
73
+ - !ruby/object:Gem::Version
74
+ version: '5.0'
75
+ type: :development
76
+ prerelease: false
77
+ version_requirements: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - "~>"
80
+ - !ruby/object:Gem::Version
81
+ version: '5.0'
68
82
  - !ruby/object:Gem::Dependency
69
83
  name: rake
70
84
  requirement: !ruby/object:Gem::Requirement
@@ -141,9 +155,10 @@ files:
141
155
  - Rakefile
142
156
  - bin/console
143
157
  - henkei.gemspec
144
- - jar/tika-app-1.23.jar
158
+ - jar/tika-app-1.25.jar
145
159
  - jar/tika-config.xml
146
160
  - lib/henkei.rb
161
+ - lib/henkei/configuration.rb
147
162
  - lib/henkei/version.rb
148
163
  - lib/henkei/yomu.rb
149
164
  - spec/helper.rb
@@ -165,7 +180,10 @@ required_ruby_version: !ruby/object:Gem::Requirement
165
180
  requirements:
166
181
  - - ">="
167
182
  - !ruby/object:Gem::Version
168
- version: '0'
183
+ version: 2.4.0
184
+ - - "<"
185
+ - !ruby/object:Gem::Version
186
+ version: 3.1.0
169
187
  required_rubygems_version: !ruby/object:Gem::Requirement
170
188
  requirements:
171
189
  - - ">="