henkei 1.20.0 → 1.23.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ca7e4a2903bba6a17a310d55d42ae8647bfa6857e27820394b4745d11a36393a
4
- data.tar.gz: e78d60f55d368455ab7fa4007240961d95e98232df41b267d057535bfb9d9f63
3
+ metadata.gz: 97b9b37da4e96281569a16e469d23a00c41cd35bbd15e5a7c1d1025ea2862f46
4
+ data.tar.gz: 45a7c3ae645fc417ae6064a7237d87ae7e56c3c20392f83cad42f9fdafd569e8
5
5
  SHA512:
6
- metadata.gz: bf5b4fa6bfaa408f0572e6a034f01a8d5bdcb629208ab17f143fc7dffee936cd012a2d839852c01d39a8006d890a6158ed7c798120e8d41b91f0b00850082056
7
- data.tar.gz: 46461c2c92c1798f97fe8f1942d31eb0ca82867015fe8b7de186140da98e0d4377860852c15d61e0533d7515db2cbccfd687027a8a5887d6a677dbeb79d776f2
6
+ metadata.gz: c323d5f5c2056bedce26e05b40d70eecd4f4fff6f8ae24362dfc80952e788f33454b8a7be6a375c54001694e6a2fbe2d7dfb60392dc7324b865d056e99637d78
7
+ data.tar.gz: fe9c6ee2a7033d55d8ccc36a3d3a1cc78fd459654314096fa087c8373544deaa291952b0303db11f362a7f758028acf11692f7b25ca132d7cb4c6603a10a5a9d
data/.rubocop.yml CHANGED
@@ -1,10 +1,31 @@
1
+ AllCops:
2
+ NewCops: enable
3
+
4
+ Layout/EmptyLinesAroundAttributeAccessor:
5
+ Enabled: true
6
+
7
+ Layout/LineLength:
8
+ Max: 120
9
+
10
+ Layout/SpaceAroundMethodCallOperator:
11
+ Enabled: true
12
+
13
+ Lint/DeprecatedOpenSSLConstant:
14
+ Enabled: true
15
+
16
+ Lint/MixedRegexpCaptureTypes:
17
+ Enabled: true
18
+
19
+ Lint/RaiseException:
20
+ Enabled: true
21
+
22
+ Lint/StructNewOverride:
23
+ Enabled: true
24
+
1
25
  Metrics/BlockLength:
2
26
  Exclude:
3
27
  - 'spec/**/*'
4
28
 
5
- Metrics/LineLength:
6
- Max: 120
7
-
8
29
  Metrics/MethodLength:
9
30
  Max: 15
10
31
 
@@ -13,3 +34,24 @@ Style/ClassVars:
13
34
 
14
35
  Style/DoubleNegation:
15
36
  Enabled: false
37
+
38
+ Style/ExponentialNotation:
39
+ Enabled: true
40
+
41
+ Style/HashEachMethods:
42
+ Enabled: true
43
+
44
+ Style/HashTransformKeys:
45
+ Enabled: true
46
+
47
+ Style/HashTransformValues:
48
+ Enabled: true
49
+
50
+ Style/RedundantRegexpCharacterClass:
51
+ Enabled: true
52
+
53
+ Style/RedundantRegexpEscape:
54
+ Enabled: true
55
+
56
+ Style/SlicingWithRange:
57
+ Enabled: true
data/.travis.yml CHANGED
@@ -1,13 +1,17 @@
1
1
  env:
2
2
  global:
3
3
  - CC_TEST_REPORTER_ID=bb96c1ff9dc66724c38fb4eb54486dd72dc88a7fd6e727c034b9cf8d747d069e
4
+ jobs:
5
+ - INCLUDE_RAILS=false
6
+ - INCLUDE_RAILS=true
4
7
 
5
8
  language: ruby
6
9
  rvm:
7
- - 2.2
8
- - 2.3
9
10
  - 2.4
10
11
  - 2.5
12
+ - 2.6
13
+ - 2.7
14
+ - 3.0
11
15
 
12
16
  before_install:
13
17
  - gem update bundler
@@ -22,7 +26,7 @@ before_script:
22
26
  - ./cc-test-reporter before-build
23
27
 
24
28
  script:
25
- - rubocop
29
+ - bundle exec rubocop
26
30
  - bundle exec rspec
27
31
 
28
32
  after_script:
data/Gemfile CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  source 'https://rubygems.org'
2
4
 
3
5
  # Specify your gem's dependencies in henkei.gemspec
data/Rakefile CHANGED
@@ -1,4 +1,5 @@
1
1
  #!/usr/bin/env rake
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'bundler/gem_tasks'
4
5
  require 'rspec/core/rake_task'
data/bin/console ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require 'bundler/setup'
5
+ require 'henkei'
6
+
7
+ require 'irb'
8
+ IRB.start
data/henkei.gemspec CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  lib = File.expand_path('lib', __dir__)
2
4
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
5
 
@@ -13,6 +15,7 @@ Gem::Specification.new do |spec|
13
15
  '(.doc, .docx, .pages, .odt, .rtf, .pdf) using Apache Tika toolkit'
14
16
  spec.homepage = 'http://github.com/abrom/henkei'
15
17
  spec.license = 'MIT'
18
+ spec.required_ruby_version = ['>= 2.4.0', '< 3.1.0']
16
19
 
17
20
  spec.files = `git ls-files`.split("\n")
18
21
  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
@@ -20,11 +23,12 @@ Gem::Specification.new do |spec|
20
23
  spec.require_paths = ['lib']
21
24
 
22
25
  spec.add_runtime_dependency 'json', '>= 1.8', '< 3'
23
- spec.add_runtime_dependency 'mime-types', '>= 1.23', '< 4'
26
+ spec.add_runtime_dependency 'mini_mime', '>= 0.1.1', '< 1'
24
27
 
25
- spec.add_development_dependency 'bundler', '~> 1.3'
28
+ spec.add_development_dependency 'bundler', '~> 2.0'
29
+ spec.add_development_dependency 'rails', '~> 5.0'
26
30
  spec.add_development_dependency 'rake', '~> 12.3'
27
31
  spec.add_development_dependency 'rspec', '~> 3.7'
28
- spec.add_development_dependency 'rubocop', '~> 0.53'
32
+ spec.add_development_dependency 'rubocop', '~> 0.71'
29
33
  spec.add_development_dependency 'simplecov', '~> 0.15'
30
34
  end
data/lib/henkei.rb CHANGED
@@ -1,24 +1,49 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'henkei/version'
2
4
  require 'henkei/yomu'
5
+ require 'henkei/configuration'
3
6
 
4
7
  require 'net/http'
5
- require 'mime/types'
8
+ require 'mini_mime'
9
+
10
+ # require 'mime/types' if available
11
+ begin
12
+ require 'mime/types'
13
+ rescue LoadError
14
+ nil
15
+ end
16
+
6
17
  require 'time'
7
18
  require 'json'
8
19
 
9
20
  require 'socket'
10
21
  require 'stringio'
11
22
 
23
+ require 'open3'
24
+
12
25
  # Read text and metadata from files and documents using Apache Tika toolkit
13
26
  class Henkei # rubocop:disable Metrics/ClassLength
14
27
  GEM_PATH = File.dirname(File.dirname(__FILE__))
15
- JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-1.20.jar')
28
+ JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-1.23.jar')
16
29
  CONFIG_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-config.xml')
17
30
  DEFAULT_SERVER_PORT = 9293 # an arbitrary, but perfectly cromulent, port
18
31
 
19
32
  @@server_port = nil
20
33
  @@server_pid = nil
21
34
 
35
+ def self.mimetype(content_type)
36
+ if Henkei.configuration.mime_library == 'mime/types' && defined?(MIME::Types)
37
+ warn '[DEPRECATION] `mime/types` is deprecated. Please use `mini_mime` instead.'\
38
+ ' Use Henkei.configure and assign "mini_mime" to `mime_library`.'
39
+ MIME::Types[content_type].first
40
+ else
41
+ MiniMime.lookup_by_content_type(content_type).tap do |object|
42
+ object.define_singleton_method(:extensions) { [extension] }
43
+ end
44
+ end
45
+ end
46
+
22
47
  # Read text or metadata from a data buffer.
23
48
  #
24
49
  # data = File.read 'sample.pages'
@@ -32,7 +57,7 @@ class Henkei # rubocop:disable Metrics/ClassLength
32
57
  when :text then result
33
58
  when :html then result
34
59
  when :metadata then JSON.parse(result)
35
- when :mimetype then MIME::Types[JSON.parse(result)['Content-Type']].first
60
+ when :mimetype then Henkei.mimetype(JSON.parse(result)['Content-Type'])
36
61
  end
37
62
  end
38
63
 
@@ -108,9 +133,8 @@ class Henkei # rubocop:disable Metrics/ClassLength
108
133
  def mimetype
109
134
  return @mimetype if defined? @mimetype
110
135
 
111
- type = metadata['Content-Type'].is_a?(Array) ? metadata['Content-Type'].first : metadata['Content-Type']
112
-
113
- @mimetype = MIME::Types[type].first
136
+ content_type = metadata['Content-Type'].is_a?(Array) ? metadata['Content-Type'].first : metadata['Content-Type']
137
+ @mimetype = Henkei.mimetype(content_type)
114
138
  end
115
139
 
116
140
  # Returns +true+ if the Henkei document was specified using a file path.
@@ -182,7 +206,7 @@ class Henkei # rubocop:disable Metrics/ClassLength
182
206
  def self.server(type, custom_port = nil)
183
207
  @@server_port = custom_port || DEFAULT_SERVER_PORT
184
208
 
185
- @@server_pid = Process.spawn tika_command(type, true)
209
+ @@server_pid = Process.spawn(*tika_command(type, server: true))
186
210
  sleep(2) # Give the server 2 seconds to spin up.
187
211
  @@server_pid
188
212
  end
@@ -215,18 +239,14 @@ class Henkei # rubocop:disable Metrics/ClassLength
215
239
  # Provide the path to the Java binary
216
240
  #
217
241
  def self.java_path
218
- ENV['JAVA_HOME'] ? ENV['JAVA_HOME'] + '/bin/java' : 'java'
242
+ ENV['JAVA_HOME'] ? "#{ENV['JAVA_HOME']}/bin/java" : 'java'
219
243
  end
220
244
  private_class_method :java_path
221
245
 
222
246
  # Internal helper for calling to Tika library directly
223
247
  #
224
248
  def self.client_read(type, data)
225
- IO.popen tika_command(type), 'r+' do |io|
226
- io.write data
227
- io.close_write
228
- io.read
229
- end
249
+ Open3.capture2(*tika_command(type), stdin_data: data, binmode: true).first
230
250
  end
231
251
  private_class_method :client_read
232
252
 
@@ -246,7 +266,7 @@ class Henkei # rubocop:disable Metrics/ClassLength
246
266
  # tell Tika that we're done sending data
247
267
  s.shutdown(Socket::SHUT_WR)
248
268
 
249
- resp = ''
269
+ resp = String.new ''
250
270
  loop do
251
271
  chunk = s.recv(65_536)
252
272
  break if chunk.empty? || !chunk
@@ -259,23 +279,22 @@ class Henkei # rubocop:disable Metrics/ClassLength
259
279
 
260
280
  # Internal helper for building the Java command to call Tika
261
281
  #
262
- def self.tika_command(type, server = false)
263
- command = ["#{java_path} -Djava.awt.headless=true -jar #{Henkei::JAR_PATH} --config=#{Henkei::CONFIG_PATH}"]
264
- command << "--server --port #{@@server_port}" if server
265
- command << switch_for_type(type)
266
- command.join ' '
282
+ def self.tika_command(type, server: false)
283
+ command = [java_path, '-Djava.awt.headless=true', '-jar', Henkei::JAR_PATH, "--config=#{Henkei::CONFIG_PATH}"]
284
+ command += ['--server', '--port', @@server_port.to_s] if server
285
+ command + switch_for_type(type)
267
286
  end
268
287
  private_class_method :tika_command
269
288
 
270
289
  # Internal helper for building the Java command to call Tika
271
290
  #
272
291
  def self.switch_for_type(type)
273
- case type
274
- when :text then '-t'
275
- when :html then '-h'
276
- when :metadata then '-m -j'
277
- when :mimetype then '-m -j'
278
- end
292
+ {
293
+ text: ['-t'],
294
+ html: ['-h'],
295
+ metadata: %w[-m -j],
296
+ mimetype: %w[-m -j]
297
+ }[type]
279
298
  end
280
299
  private_class_method :switch_for_type
281
300
  end
data/lib/henkei/configuration.rb ADDED
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Henkei monkey patch for configuration support
4
+ class Henkei
5
+ def self.configuration
6
+ @configuration ||= Configuration.new
7
+ end
8
+
9
+ def self.configure
10
+ yield(configuration)
11
+ end
12
+
13
+ # Handle Henkei configuration
14
+ class Configuration
15
+ attr_accessor :mime_library
16
+
17
+ def initialize
18
+ @mime_library = 'mime/types'
19
+ end
20
+ end
21
+ end
data/lib/henkei/version.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class Henkei
2
- VERSION = '1.20.0'.freeze
4
+ VERSION = '1.23.2'
3
5
  end
data/lib/henkei/yomu.rb CHANGED
@@ -1 +1,3 @@
1
+ # frozen_string_literal: true
2
+
1
3
  Yomu = Henkei
data/spec/helper.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'simplecov'
2
4
  SimpleCov.start
3
5
 
data/spec/henkei_spec.rb CHANGED
@@ -1,6 +1,11 @@
1
- require 'helper.rb'
1
+ # frozen_string_literal: true
2
+
3
+ require 'helper'
2
4
  require 'henkei'
3
5
 
6
+ # Some of the tests have been known to fail in weird and wonderful ways when `rails` is included
7
+ require 'rails' if ENV['INCLUDE_RAILS'] == 'true'
8
+
4
9
  describe Henkei do
5
10
  let(:data) { File.read 'spec/samples/sample.docx' }
6
11
 
@@ -38,6 +43,16 @@ describe Henkei do
38
43
  )
39
44
  expect(mimetype.extensions).to include 'docx'
40
45
  end
46
+
47
+ context 'when passing in the `pipe-error.png` test file' do
48
+ let(:data) { File.read 'spec/samples/pipe-error.png' }
49
+
50
+ it 'returns an empty result' do
51
+ text = Henkei.read :text, data
52
+
53
+ expect(text).to eq ''
54
+ end
55
+ end
41
56
  end
42
57
 
43
58
  describe '.new' do
@@ -127,6 +142,23 @@ describe Henkei do
127
142
  specify '#metadata reads metadata' do
128
143
  expect(henkei.metadata['Content-Type']).to eq %w[application/vnd.apple.pages application/vnd.apple.pages]
129
144
  end
145
+
146
+ context 'when passing in the `pipe-error.png` test file' do
147
+ let(:henkei) { Henkei.new 'spec/samples/pipe-error.png' }
148
+
149
+ it '#text returns an empty result' do
150
+ expect(henkei.text).to eq ''
151
+ end
152
+
153
+ it '#html returns an empty body' do
154
+ expect(henkei.html).to include '<body/>'
155
+ expect(henkei.html).to include '<meta name="tiff:ImageWidth" content="792"/>'
156
+ end
157
+
158
+ it '#mimetype returns `image/png`' do
159
+ expect(henkei.mimetype.content_type).to eq 'image/png'
160
+ end
161
+ end
130
162
  end
131
163
 
132
164
  context 'initialized with a given URI' do
@@ -155,6 +187,18 @@ describe Henkei do
155
187
  end
156
188
  end
157
189
 
190
+ context 'when source is a remote PDF' do
191
+ let(:henkei) { Henkei.new 'https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf' }
192
+
193
+ specify '#text reads text' do
194
+ expect(henkei.text).to include 'Dummy PDF file'
195
+ end
196
+
197
+ specify '#metadata reads metadata' do
198
+ expect(henkei.metadata['Content-Type']).to eq 'application/pdf'
199
+ end
200
+ end
201
+
158
202
  context 'working as server mode' do
159
203
  specify '#starts and kills server' do
160
204
  begin
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: henkei
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.20.0
4
+ version: 1.23.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Erol Fornoles
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2018-12-23 00:00:00.000000000 Z
12
+ date: 2021-02-02 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: json
@@ -32,39 +32,53 @@ dependencies:
32
32
  - !ruby/object:Gem::Version
33
33
  version: '3'
34
34
  - !ruby/object:Gem::Dependency
35
- name: mime-types
35
+ name: mini_mime
36
36
  requirement: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - ">="
39
39
  - !ruby/object:Gem::Version
40
- version: '1.23'
40
+ version: 0.1.1
41
41
  - - "<"
42
42
  - !ruby/object:Gem::Version
43
- version: '4'
43
+ version: '1'
44
44
  type: :runtime
45
45
  prerelease: false
46
46
  version_requirements: !ruby/object:Gem::Requirement
47
47
  requirements:
48
48
  - - ">="
49
49
  - !ruby/object:Gem::Version
50
- version: '1.23'
50
+ version: 0.1.1
51
51
  - - "<"
52
52
  - !ruby/object:Gem::Version
53
- version: '4'
53
+ version: '1'
54
54
  - !ruby/object:Gem::Dependency
55
55
  name: bundler
56
56
  requirement: !ruby/object:Gem::Requirement
57
57
  requirements:
58
58
  - - "~>"
59
59
  - !ruby/object:Gem::Version
60
- version: '1.3'
60
+ version: '2.0'
61
61
  type: :development
62
62
  prerelease: false
63
63
  version_requirements: !ruby/object:Gem::Requirement
64
64
  requirements:
65
65
  - - "~>"
66
66
  - !ruby/object:Gem::Version
67
- version: '1.3'
67
+ version: '2.0'
68
+ - !ruby/object:Gem::Dependency
69
+ name: rails
70
+ requirement: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - "~>"
73
+ - !ruby/object:Gem::Version
74
+ version: '5.0'
75
+ type: :development
76
+ prerelease: false
77
+ version_requirements: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - "~>"
80
+ - !ruby/object:Gem::Version
81
+ version: '5.0'
68
82
  - !ruby/object:Gem::Dependency
69
83
  name: rake
70
84
  requirement: !ruby/object:Gem::Requirement
@@ -99,14 +113,14 @@ dependencies:
99
113
  requirements:
100
114
  - - "~>"
101
115
  - !ruby/object:Gem::Version
102
- version: '0.53'
116
+ version: '0.71'
103
117
  type: :development
104
118
  prerelease: false
105
119
  version_requirements: !ruby/object:Gem::Requirement
106
120
  requirements:
107
121
  - - "~>"
108
122
  - !ruby/object:Gem::Version
109
- version: '0.53'
123
+ version: '0.71'
110
124
  - !ruby/object:Gem::Dependency
111
125
  name: simplecov
112
126
  requirement: !ruby/object:Gem::Requirement
@@ -125,7 +139,8 @@ description: Read text and metadata from files and documents using Apache Tika t
125
139
  email:
126
140
  - erol.fornoles@gmail.com
127
141
  - a.bromwich@gmail.com
128
- executables: []
142
+ executables:
143
+ - console
129
144
  extensions: []
130
145
  extra_rdoc_files: []
131
146
  files:
@@ -138,14 +153,17 @@ files:
138
153
  - NOTICE.txt
139
154
  - README.md
140
155
  - Rakefile
156
+ - bin/console
141
157
  - henkei.gemspec
142
- - jar/tika-app-1.20.jar
158
+ - jar/tika-app-1.23.jar
143
159
  - jar/tika-config.xml
144
160
  - lib/henkei.rb
161
+ - lib/henkei/configuration.rb
145
162
  - lib/henkei/version.rb
146
163
  - lib/henkei/yomu.rb
147
164
  - spec/helper.rb
148
165
  - spec/henkei_spec.rb
166
+ - spec/samples/pipe-error.png
149
167
  - spec/samples/sample filename with spaces.pages
150
168
  - spec/samples/sample-metadata-values-with-colons.doc
151
169
  - spec/samples/sample.docx
@@ -162,15 +180,17 @@ required_ruby_version: !ruby/object:Gem::Requirement
162
180
  requirements:
163
181
  - - ">="
164
182
  - !ruby/object:Gem::Version
165
- version: '0'
183
+ version: 2.4.0
184
+ - - "<"
185
+ - !ruby/object:Gem::Version
186
+ version: 3.1.0
166
187
  required_rubygems_version: !ruby/object:Gem::Requirement
167
188
  requirements:
168
189
  - - ">="
169
190
  - !ruby/object:Gem::Version
170
191
  version: '0'
171
192
  requirements: []
172
- rubyforge_project:
173
- rubygems_version: 2.7.6
193
+ rubygems_version: 3.0.6
174
194
  signing_key:
175
195
  specification_version: 4
176
196
  summary: Read text and metadata from files and documents (.doc, .docx, .pages, .odt,
@@ -178,6 +198,7 @@ summary: Read text and metadata from files and documents (.doc, .docx, .pages, .
178
198
  test_files:
179
199
  - spec/helper.rb
180
200
  - spec/henkei_spec.rb
201
+ - spec/samples/pipe-error.png
181
202
  - spec/samples/sample filename with spaces.pages
182
203
  - spec/samples/sample-metadata-values-with-colons.doc
183
204
  - spec/samples/sample.docx