henkei 1.20.0 → 1.23.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ca7e4a2903bba6a17a310d55d42ae8647bfa6857e27820394b4745d11a36393a
4
- data.tar.gz: e78d60f55d368455ab7fa4007240961d95e98232df41b267d057535bfb9d9f63
3
+ metadata.gz: 97b9b37da4e96281569a16e469d23a00c41cd35bbd15e5a7c1d1025ea2862f46
4
+ data.tar.gz: 45a7c3ae645fc417ae6064a7237d87ae7e56c3c20392f83cad42f9fdafd569e8
5
5
  SHA512:
6
- metadata.gz: bf5b4fa6bfaa408f0572e6a034f01a8d5bdcb629208ab17f143fc7dffee936cd012a2d839852c01d39a8006d890a6158ed7c798120e8d41b91f0b00850082056
7
- data.tar.gz: 46461c2c92c1798f97fe8f1942d31eb0ca82867015fe8b7de186140da98e0d4377860852c15d61e0533d7515db2cbccfd687027a8a5887d6a677dbeb79d776f2
6
+ metadata.gz: c323d5f5c2056bedce26e05b40d70eecd4f4fff6f8ae24362dfc80952e788f33454b8a7be6a375c54001694e6a2fbe2d7dfb60392dc7324b865d056e99637d78
7
+ data.tar.gz: fe9c6ee2a7033d55d8ccc36a3d3a1cc78fd459654314096fa087c8373544deaa291952b0303db11f362a7f758028acf11692f7b25ca132d7cb4c6603a10a5a9d
data/.rubocop.yml CHANGED
@@ -1,10 +1,31 @@
1
+ AllCops:
2
+ NewCops: enable
3
+
4
+ Layout/EmptyLinesAroundAttributeAccessor:
5
+ Enabled: true
6
+
7
+ Layout/LineLength:
8
+ Max: 120
9
+
10
+ Layout/SpaceAroundMethodCallOperator:
11
+ Enabled: true
12
+
13
+ Lint/DeprecatedOpenSSLConstant:
14
+ Enabled: true
15
+
16
+ Lint/MixedRegexpCaptureTypes:
17
+ Enabled: true
18
+
19
+ Lint/RaiseException:
20
+ Enabled: true
21
+
22
+ Lint/StructNewOverride:
23
+ Enabled: true
24
+
1
25
  Metrics/BlockLength:
2
26
  Exclude:
3
27
  - 'spec/**/*'
4
28
 
5
- Metrics/LineLength:
6
- Max: 120
7
-
8
29
  Metrics/MethodLength:
9
30
  Max: 15
10
31
 
@@ -13,3 +34,24 @@ Style/ClassVars:
13
34
 
14
35
  Style/DoubleNegation:
15
36
  Enabled: false
37
+
38
+ Style/ExponentialNotation:
39
+ Enabled: true
40
+
41
+ Style/HashEachMethods:
42
+ Enabled: true
43
+
44
+ Style/HashTransformKeys:
45
+ Enabled: true
46
+
47
+ Style/HashTransformValues:
48
+ Enabled: true
49
+
50
+ Style/RedundantRegexpCharacterClass:
51
+ Enabled: true
52
+
53
+ Style/RedundantRegexpEscape:
54
+ Enabled: true
55
+
56
+ Style/SlicingWithRange:
57
+ Enabled: true
data/.travis.yml CHANGED
@@ -1,13 +1,17 @@
1
1
  env:
2
2
  global:
3
3
  - CC_TEST_REPORTER_ID=bb96c1ff9dc66724c38fb4eb54486dd72dc88a7fd6e727c034b9cf8d747d069e
4
+ jobs:
5
+ - INCLUDE_RAILS=false
6
+ - INCLUDE_RAILS=true
4
7
 
5
8
  language: ruby
6
9
  rvm:
7
- - 2.2
8
- - 2.3
9
10
  - 2.4
10
11
  - 2.5
12
+ - 2.6
13
+ - 2.7
14
+ - 3.0
11
15
 
12
16
  before_install:
13
17
  - gem update bundler
@@ -22,7 +26,7 @@ before_script:
22
26
  - ./cc-test-reporter before-build
23
27
 
24
28
  script:
25
- - rubocop
29
+ - bundle exec rubocop
26
30
  - bundle exec rspec
27
31
 
28
32
  after_script:
data/Gemfile CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  source 'https://rubygems.org'
2
4
 
3
5
  # Specify your gem's dependencies in henkei.gemspec
data/Rakefile CHANGED
@@ -1,4 +1,5 @@
1
1
  #!/usr/bin/env rake
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'bundler/gem_tasks'
4
5
  require 'rspec/core/rake_task'
data/bin/console ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require 'bundler/setup'
5
+ require 'henkei'
6
+
7
+ require 'irb'
8
+ IRB.start
data/henkei.gemspec CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  lib = File.expand_path('lib', __dir__)
2
4
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
5
 
@@ -13,6 +15,7 @@ Gem::Specification.new do |spec|
13
15
  '(.doc, .docx, .pages, .odt, .rtf, .pdf) using Apache Tika toolkit'
14
16
  spec.homepage = 'http://github.com/abrom/henkei'
15
17
  spec.license = 'MIT'
18
+ spec.required_ruby_version = ['>= 2.4.0', '< 3.1.0']
16
19
 
17
20
  spec.files = `git ls-files`.split("\n")
18
21
  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
@@ -20,11 +23,12 @@ Gem::Specification.new do |spec|
20
23
  spec.require_paths = ['lib']
21
24
 
22
25
  spec.add_runtime_dependency 'json', '>= 1.8', '< 3'
23
- spec.add_runtime_dependency 'mime-types', '>= 1.23', '< 4'
26
+ spec.add_runtime_dependency 'mini_mime', '>= 0.1.1', '< 1'
24
27
 
25
- spec.add_development_dependency 'bundler', '~> 1.3'
28
+ spec.add_development_dependency 'bundler', '~> 2.0'
29
+ spec.add_development_dependency 'rails', '~> 5.0'
26
30
  spec.add_development_dependency 'rake', '~> 12.3'
27
31
  spec.add_development_dependency 'rspec', '~> 3.7'
28
- spec.add_development_dependency 'rubocop', '~> 0.53'
32
+ spec.add_development_dependency 'rubocop', '~> 0.71'
29
33
  spec.add_development_dependency 'simplecov', '~> 0.15'
30
34
  end
data/lib/henkei.rb CHANGED
@@ -1,24 +1,49 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'henkei/version'
2
4
  require 'henkei/yomu'
5
+ require 'henkei/configuration'
3
6
 
4
7
  require 'net/http'
5
- require 'mime/types'
8
+ require 'mini_mime'
9
+
10
+ # require 'mime/types' if available
11
+ begin
12
+ require 'mime/types'
13
+ rescue LoadError
14
+ nil
15
+ end
16
+
6
17
  require 'time'
7
18
  require 'json'
8
19
 
9
20
  require 'socket'
10
21
  require 'stringio'
11
22
 
23
+ require 'open3'
24
+
12
25
  # Read text and metadata from files and documents using Apache Tika toolkit
13
26
  class Henkei # rubocop:disable Metrics/ClassLength
14
27
  GEM_PATH = File.dirname(File.dirname(__FILE__))
15
- JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-1.20.jar')
28
+ JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-1.23.jar')
16
29
  CONFIG_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-config.xml')
17
30
  DEFAULT_SERVER_PORT = 9293 # an arbitrary, but perfectly cromulent, port
18
31
 
19
32
  @@server_port = nil
20
33
  @@server_pid = nil
21
34
 
35
+ def self.mimetype(content_type)
36
+ if Henkei.configuration.mime_library == 'mime/types' && defined?(MIME::Types)
37
+ warn '[DEPRECATION] `mime/types` is deprecated. Please use `mini_mime` instead.'\
38
+ ' Use Henkei.configure and assign "mini_mime" to `mime_library`.'
39
+ MIME::Types[content_type].first
40
+ else
41
+ MiniMime.lookup_by_content_type(content_type).tap do |object|
42
+ object.define_singleton_method(:extensions) { [extension] }
43
+ end
44
+ end
45
+ end
46
+
22
47
  # Read text or metadata from a data buffer.
23
48
  #
24
49
  # data = File.read 'sample.pages'
@@ -32,7 +57,7 @@ class Henkei # rubocop:disable Metrics/ClassLength
32
57
  when :text then result
33
58
  when :html then result
34
59
  when :metadata then JSON.parse(result)
35
- when :mimetype then MIME::Types[JSON.parse(result)['Content-Type']].first
60
+ when :mimetype then Henkei.mimetype(JSON.parse(result)['Content-Type'])
36
61
  end
37
62
  end
38
63
 
@@ -108,9 +133,8 @@ class Henkei # rubocop:disable Metrics/ClassLength
108
133
  def mimetype
109
134
  return @mimetype if defined? @mimetype
110
135
 
111
- type = metadata['Content-Type'].is_a?(Array) ? metadata['Content-Type'].first : metadata['Content-Type']
112
-
113
- @mimetype = MIME::Types[type].first
136
+ content_type = metadata['Content-Type'].is_a?(Array) ? metadata['Content-Type'].first : metadata['Content-Type']
137
+ @mimetype = Henkei.mimetype(content_type)
114
138
  end
115
139
 
116
140
  # Returns +true+ if the Henkei document was specified using a file path.
@@ -182,7 +206,7 @@ class Henkei # rubocop:disable Metrics/ClassLength
182
206
  def self.server(type, custom_port = nil)
183
207
  @@server_port = custom_port || DEFAULT_SERVER_PORT
184
208
 
185
- @@server_pid = Process.spawn tika_command(type, true)
209
+ @@server_pid = Process.spawn(*tika_command(type, server: true))
186
210
  sleep(2) # Give the server 2 seconds to spin up.
187
211
  @@server_pid
188
212
  end
@@ -215,18 +239,14 @@ class Henkei # rubocop:disable Metrics/ClassLength
215
239
  # Provide the path to the Java binary
216
240
  #
217
241
  def self.java_path
218
- ENV['JAVA_HOME'] ? ENV['JAVA_HOME'] + '/bin/java' : 'java'
242
+ ENV['JAVA_HOME'] ? "#{ENV['JAVA_HOME']}/bin/java" : 'java'
219
243
  end
220
244
  private_class_method :java_path
221
245
 
222
246
  # Internal helper for calling to Tika library directly
223
247
  #
224
248
  def self.client_read(type, data)
225
- IO.popen tika_command(type), 'r+' do |io|
226
- io.write data
227
- io.close_write
228
- io.read
229
- end
249
+ Open3.capture2(*tika_command(type), stdin_data: data, binmode: true).first
230
250
  end
231
251
  private_class_method :client_read
232
252
 
@@ -246,7 +266,7 @@ class Henkei # rubocop:disable Metrics/ClassLength
246
266
  # tell Tika that we're done sending data
247
267
  s.shutdown(Socket::SHUT_WR)
248
268
 
249
- resp = ''
269
+ resp = String.new ''
250
270
  loop do
251
271
  chunk = s.recv(65_536)
252
272
  break if chunk.empty? || !chunk
@@ -259,23 +279,22 @@ class Henkei # rubocop:disable Metrics/ClassLength
259
279
 
260
280
  # Internal helper for building the Java command to call Tika
261
281
  #
262
- def self.tika_command(type, server = false)
263
- command = ["#{java_path} -Djava.awt.headless=true -jar #{Henkei::JAR_PATH} --config=#{Henkei::CONFIG_PATH}"]
264
- command << "--server --port #{@@server_port}" if server
265
- command << switch_for_type(type)
266
- command.join ' '
282
+ def self.tika_command(type, server: false)
283
+ command = [java_path, '-Djava.awt.headless=true', '-jar', Henkei::JAR_PATH, "--config=#{Henkei::CONFIG_PATH}"]
284
+ command += ['--server', '--port', @@server_port.to_s] if server
285
+ command + switch_for_type(type)
267
286
  end
268
287
  private_class_method :tika_command
269
288
 
270
289
  # Internal helper for building the Java command to call Tika
271
290
  #
272
291
  def self.switch_for_type(type)
273
- case type
274
- when :text then '-t'
275
- when :html then '-h'
276
- when :metadata then '-m -j'
277
- when :mimetype then '-m -j'
278
- end
292
+ {
293
+ text: ['-t'],
294
+ html: ['-h'],
295
+ metadata: %w[-m -j],
296
+ mimetype: %w[-m -j]
297
+ }[type]
279
298
  end
280
299
  private_class_method :switch_for_type
281
300
  end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Henkei monkey patch for configuration support
4
+ class Henkei
5
+ def self.configuration
6
+ @configuration ||= Configuration.new
7
+ end
8
+
9
+ def self.configure
10
+ yield(configuration)
11
+ end
12
+
13
+ # Handle Henkei configuration
14
+ class Configuration
15
+ attr_accessor :mime_library
16
+
17
+ def initialize
18
+ @mime_library = 'mime/types'
19
+ end
20
+ end
21
+ end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class Henkei
2
- VERSION = '1.20.0'.freeze
4
+ VERSION = '1.23.2'
3
5
  end
data/lib/henkei/yomu.rb CHANGED
@@ -1 +1,3 @@
1
+ # frozen_string_literal: true
2
+
1
3
  Yomu = Henkei
data/spec/helper.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'simplecov'
2
4
  SimpleCov.start
3
5
 
data/spec/henkei_spec.rb CHANGED
@@ -1,6 +1,11 @@
1
- require 'helper.rb'
1
+ # frozen_string_literal: true
2
+
3
+ require 'helper'
2
4
  require 'henkei'
3
5
 
6
+ # Some of the tests have been known to fail in weird and wonderful ways when `rails` is included
7
+ require 'rails' if ENV['INCLUDE_RAILS'] == 'true'
8
+
4
9
  describe Henkei do
5
10
  let(:data) { File.read 'spec/samples/sample.docx' }
6
11
 
@@ -38,6 +43,16 @@ describe Henkei do
38
43
  )
39
44
  expect(mimetype.extensions).to include 'docx'
40
45
  end
46
+
47
+ context 'when passing in the `pipe-error.png` test file' do
48
+ let(:data) { File.read 'spec/samples/pipe-error.png' }
49
+
50
+ it 'returns an empty result' do
51
+ text = Henkei.read :text, data
52
+
53
+ expect(text).to eq ''
54
+ end
55
+ end
41
56
  end
42
57
 
43
58
  describe '.new' do
@@ -127,6 +142,23 @@ describe Henkei do
127
142
  specify '#metadata reads metadata' do
128
143
  expect(henkei.metadata['Content-Type']).to eq %w[application/vnd.apple.pages application/vnd.apple.pages]
129
144
  end
145
+
146
+ context 'when passing in the `pipe-error.png` test file' do
147
+ let(:henkei) { Henkei.new 'spec/samples/pipe-error.png' }
148
+
149
+ it '#text returns an empty result' do
150
+ expect(henkei.text).to eq ''
151
+ end
152
+
153
+ it '#html returns an empty body' do
154
+ expect(henkei.html).to include '<body/>'
155
+ expect(henkei.html).to include '<meta name="tiff:ImageWidth" content="792"/>'
156
+ end
157
+
158
+ it '#mimetype returns `image/png`' do
159
+ expect(henkei.mimetype.content_type).to eq 'image/png'
160
+ end
161
+ end
130
162
  end
131
163
 
132
164
  context 'initialized with a given URI' do
@@ -155,6 +187,18 @@ describe Henkei do
155
187
  end
156
188
  end
157
189
 
190
+ context 'when source is a remote PDF' do
191
+ let(:henkei) { Henkei.new 'https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf' }
192
+
193
+ specify '#text reads text' do
194
+ expect(henkei.text).to include 'Dummy PDF file'
195
+ end
196
+
197
+ specify '#metadata reads metadata' do
198
+ expect(henkei.metadata['Content-Type']).to eq 'application/pdf'
199
+ end
200
+ end
201
+
158
202
  context 'working as server mode' do
159
203
  specify '#starts and kills server' do
160
204
  begin
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: henkei
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.20.0
4
+ version: 1.23.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Erol Fornoles
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2018-12-23 00:00:00.000000000 Z
12
+ date: 2021-02-02 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: json
@@ -32,39 +32,53 @@ dependencies:
32
32
  - !ruby/object:Gem::Version
33
33
  version: '3'
34
34
  - !ruby/object:Gem::Dependency
35
- name: mime-types
35
+ name: mini_mime
36
36
  requirement: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - ">="
39
39
  - !ruby/object:Gem::Version
40
- version: '1.23'
40
+ version: 0.1.1
41
41
  - - "<"
42
42
  - !ruby/object:Gem::Version
43
- version: '4'
43
+ version: '1'
44
44
  type: :runtime
45
45
  prerelease: false
46
46
  version_requirements: !ruby/object:Gem::Requirement
47
47
  requirements:
48
48
  - - ">="
49
49
  - !ruby/object:Gem::Version
50
- version: '1.23'
50
+ version: 0.1.1
51
51
  - - "<"
52
52
  - !ruby/object:Gem::Version
53
- version: '4'
53
+ version: '1'
54
54
  - !ruby/object:Gem::Dependency
55
55
  name: bundler
56
56
  requirement: !ruby/object:Gem::Requirement
57
57
  requirements:
58
58
  - - "~>"
59
59
  - !ruby/object:Gem::Version
60
- version: '1.3'
60
+ version: '2.0'
61
61
  type: :development
62
62
  prerelease: false
63
63
  version_requirements: !ruby/object:Gem::Requirement
64
64
  requirements:
65
65
  - - "~>"
66
66
  - !ruby/object:Gem::Version
67
- version: '1.3'
67
+ version: '2.0'
68
+ - !ruby/object:Gem::Dependency
69
+ name: rails
70
+ requirement: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - "~>"
73
+ - !ruby/object:Gem::Version
74
+ version: '5.0'
75
+ type: :development
76
+ prerelease: false
77
+ version_requirements: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - "~>"
80
+ - !ruby/object:Gem::Version
81
+ version: '5.0'
68
82
  - !ruby/object:Gem::Dependency
69
83
  name: rake
70
84
  requirement: !ruby/object:Gem::Requirement
@@ -99,14 +113,14 @@ dependencies:
99
113
  requirements:
100
114
  - - "~>"
101
115
  - !ruby/object:Gem::Version
102
- version: '0.53'
116
+ version: '0.71'
103
117
  type: :development
104
118
  prerelease: false
105
119
  version_requirements: !ruby/object:Gem::Requirement
106
120
  requirements:
107
121
  - - "~>"
108
122
  - !ruby/object:Gem::Version
109
- version: '0.53'
123
+ version: '0.71'
110
124
  - !ruby/object:Gem::Dependency
111
125
  name: simplecov
112
126
  requirement: !ruby/object:Gem::Requirement
@@ -125,7 +139,8 @@ description: Read text and metadata from files and documents using Apache Tika t
125
139
  email:
126
140
  - erol.fornoles@gmail.com
127
141
  - a.bromwich@gmail.com
128
- executables: []
142
+ executables:
143
+ - console
129
144
  extensions: []
130
145
  extra_rdoc_files: []
131
146
  files:
@@ -138,14 +153,17 @@ files:
138
153
  - NOTICE.txt
139
154
  - README.md
140
155
  - Rakefile
156
+ - bin/console
141
157
  - henkei.gemspec
142
- - jar/tika-app-1.20.jar
158
+ - jar/tika-app-1.23.jar
143
159
  - jar/tika-config.xml
144
160
  - lib/henkei.rb
161
+ - lib/henkei/configuration.rb
145
162
  - lib/henkei/version.rb
146
163
  - lib/henkei/yomu.rb
147
164
  - spec/helper.rb
148
165
  - spec/henkei_spec.rb
166
+ - spec/samples/pipe-error.png
149
167
  - spec/samples/sample filename with spaces.pages
150
168
  - spec/samples/sample-metadata-values-with-colons.doc
151
169
  - spec/samples/sample.docx
@@ -162,15 +180,17 @@ required_ruby_version: !ruby/object:Gem::Requirement
162
180
  requirements:
163
181
  - - ">="
164
182
  - !ruby/object:Gem::Version
165
- version: '0'
183
+ version: 2.4.0
184
+ - - "<"
185
+ - !ruby/object:Gem::Version
186
+ version: 3.1.0
166
187
  required_rubygems_version: !ruby/object:Gem::Requirement
167
188
  requirements:
168
189
  - - ">="
169
190
  - !ruby/object:Gem::Version
170
191
  version: '0'
171
192
  requirements: []
172
- rubyforge_project:
173
- rubygems_version: 2.7.6
193
+ rubygems_version: 3.0.6
174
194
  signing_key:
175
195
  specification_version: 4
176
196
  summary: Read text and metadata from files and documents (.doc, .docx, .pages, .odt,
@@ -178,6 +198,7 @@ summary: Read text and metadata from files and documents (.doc, .docx, .pages, .
178
198
  test_files:
179
199
  - spec/helper.rb
180
200
  - spec/henkei_spec.rb
201
+ - spec/samples/pipe-error.png
181
202
  - spec/samples/sample filename with spaces.pages
182
203
  - spec/samples/sample-metadata-values-with-colons.doc
183
204
  - spec/samples/sample.docx