henkei 2.4.0.1 → 2.4.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 52de171e9cba852d1551459674a12adfca8fd6166cb5e5707f3bc6a7cec9415c
4
- data.tar.gz: '08807feea85b577c37153c290331c8f9c4441c2eef6c2600d630948a27b9ba5e'
3
+ metadata.gz: e5f5bee2529b8b7ea72cef8962f1bf4ce87d7988e9c76a2481a72e555d949490
4
+ data.tar.gz: c54a7262d038b9c32d667f44111b912e0d58ad8c058314539e3b68588acfbc81
5
5
  SHA512:
6
- metadata.gz: 6481f5588edeb5cf7e806cd9326636d14e936bee95064de49017614e999f295f9683b3cbe6346dbb4d0611753288d7b628435514fe13ff0b039723de55262db1
7
- data.tar.gz: 27a33e20e068708563324db99798abf56a81b25386899e41cc4b5e15097df11e7bc61f69b1705a5a0d537ac3488c608e28c6337e6bc8d4fe6af3b2aba6e19416
6
+ metadata.gz: 483996169fb05e873aec30fe2d30c0e80f1b35e34ff1442c01a79a42b3374c4262233ed4e71e3cdd552fa7d8e12124609de0a762c05d478e4337e910091bd3c8
7
+ data.tar.gz: 11d71da34a976c8fe7bc8ddd57161c4ee2407a852b2ea1ddb1ee5e8ef4013976ce094b97bf9b02842331b1bf3e6f574c2ef24b83a9106b2448ccf9fad2e0980f
@@ -14,10 +14,10 @@ jobs:
14
14
  runs-on: ubuntu-latest
15
15
  strategy:
16
16
  matrix:
17
- ruby-version: ['2.6', '2.7', '3.0', '3.1']
17
+ ruby-version: ['2.7', '3.0', '3.1', '3.2']
18
18
 
19
19
  steps:
20
- - uses: actions/checkout@v2
20
+ - uses: actions/checkout@v3
21
21
 
22
22
  - name: Set up Ruby
23
23
  uses: ruby/setup-ruby@v1
@@ -32,6 +32,6 @@ jobs:
32
32
  run: bundle exec rspec
33
33
 
34
34
  - name: Test & publish code coverage
35
- uses: paambaati/codeclimate-action@v3.0.0
35
+ uses: paambaati/codeclimate-action@v3.2.0
36
36
  env:
37
37
  CC_TEST_REPORTER_ID: bb96c1ff9dc66724c38fb4eb54486dd72dc88a7fd6e727c034b9cf8d747d069e
data/.rubocop.yml CHANGED
@@ -1,6 +1,10 @@
1
+ require:
2
+ - rubocop-rake
3
+ - rubocop-rspec
4
+
1
5
  AllCops:
2
6
  NewCops: enable
3
- TargetRubyVersion: 2.6
7
+ TargetRubyVersion: 2.7
4
8
 
5
9
  Layout/EmptyLinesAroundAttributeAccessor:
6
10
  Enabled: true
@@ -30,6 +34,12 @@ Metrics/BlockLength:
30
34
  Metrics/MethodLength:
31
35
  Max: 15
32
36
 
37
+ RSpec/ExampleLength:
38
+ Max: 10
39
+
40
+ RSpec/MultipleExpectations:
41
+ Max: 3
42
+
33
43
  Style/ClassVars:
34
44
  Enabled: false
35
45
 
data/henkei.gemspec CHANGED
@@ -5,7 +5,7 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
5
5
 
6
6
  require 'henkei/version'
7
7
 
8
- Gem::Specification.new do |spec| # rubocop:disable Metrics/BlockLength
8
+ Gem::Specification.new do |spec|
9
9
  spec.name = 'henkei'
10
10
  spec.version = Henkei::VERSION
11
11
  spec.authors = ['Erol Fornoles', 'Andrew Bromwich']
@@ -15,7 +15,7 @@ Gem::Specification.new do |spec| # rubocop:disable Metrics/BlockLength
15
15
  '(.doc, .docx, .pages, .odt, .rtf, .pdf) using Apache Tika toolkit'
16
16
  spec.homepage = 'https://github.com/abrom/henkei'
17
17
  spec.license = 'MIT'
18
- spec.required_ruby_version = ['>= 2.6.0', '< 3.2.0']
18
+ spec.required_ruby_version = ['>= 2.7.0', '< 3.3.0']
19
19
 
20
20
  # Prevent pushing this gem to RubyGems.org by setting 'allowed_push_host', or
21
21
  # delete this section to allow pushing this gem to any host.
@@ -38,7 +38,6 @@ Gem::Specification.new do |spec| # rubocop:disable Metrics/BlockLength
38
38
  spec.add_development_dependency 'rspec', '~> 3.7'
39
39
  spec.add_development_dependency 'rubocop', '~> 1.26'
40
40
  spec.add_development_dependency 'rubocop-performance', '~> 1.13'
41
- spec.add_development_dependency 'rubocop-rails', '~> 2.14'
42
41
  spec.add_development_dependency 'rubocop-rake', '~> 0.6'
43
42
  spec.add_development_dependency 'rubocop-rspec', '~> 2.9'
44
43
  spec.add_development_dependency 'simplecov', '~> 0.15', '< 0.18'
data/lib/henkei/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  class Henkei
4
- VERSION = '2.4.0.1'
4
+ VERSION = '2.4.1.1'
5
5
  end
data/lib/henkei.rb CHANGED
@@ -25,14 +25,14 @@ require 'open3'
25
25
  # Read text and metadata from files and documents using Apache Tika toolkit
26
26
  class Henkei # rubocop:disable Metrics/ClassLength
27
27
  GEM_PATH = File.dirname(File.dirname(__FILE__))
28
- JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-2.4.0.jar')
28
+ JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-2.4.1.jar')
29
29
  CONFIG_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-config.xml')
30
30
  CONFIG_WITHOUT_OCR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-config-without-ocr.xml')
31
31
 
32
32
  def self.mimetype(content_type)
33
33
  if Henkei.configuration.mime_library == 'mime/types' && defined?(MIME::Types)
34
- warn '[DEPRECATION] `mime/types` is deprecated. Please use `mini_mime` instead.'\
35
- ' Use Henkei.configure and assign "mini_mime" to `mime_library`.'
34
+ warn '[DEPRECATION] `mime/types` is deprecated. Please use `mini_mime` instead. ' \
35
+ 'Use Henkei.configure and assign "mini_mime" to `mime_library`.'
36
36
  MIME::Types[content_type].first
37
37
  else
38
38
  MiniMime.lookup_by_content_type(content_type).tap do |object|
data/spec/henkei_spec.rb CHANGED
@@ -20,13 +20,13 @@ describe Henkei do
20
20
 
21
21
  describe '.read' do
22
22
  it 'reads text' do
23
- text = Henkei.read :text, data
23
+ text = described_class.read :text, data
24
24
 
25
25
  expect(text).to include 'The quick brown fox jumped over the lazy cat.'
26
26
  end
27
27
 
28
28
  it 'reads metadata' do
29
- metadata = Henkei.read :metadata, data
29
+ metadata = described_class.read :metadata, data
30
30
 
31
31
  expect(metadata['Content-Type']).to(
32
32
  eq 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
@@ -35,13 +35,13 @@ describe Henkei do
35
35
 
36
36
  it 'reads metadata values with colons as strings' do
37
37
  data = File.read 'spec/samples/sample-metadata-values-with-colons.doc'
38
- metadata = Henkei.read :metadata, data
38
+ metadata = described_class.read :metadata, data
39
39
 
40
40
  expect(metadata['dc:title']).to eq 'problem: test'
41
41
  end
42
42
 
43
43
  it 'reads mimetype' do
44
- mimetype = Henkei.read :mimetype, data
44
+ mimetype = described_class.read :mimetype, data
45
45
 
46
46
  expect(mimetype.content_type).to(
47
47
  eq 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
@@ -53,7 +53,7 @@ describe Henkei do
53
53
  let(:data) { File.read 'spec/samples/pipe-error.png' }
54
54
 
55
55
  it 'returns an empty result' do
56
- text = Henkei.read :text, data
56
+ text = described_class.read :text, data
57
57
 
58
58
  expect(text).to eq ''
59
59
  end
@@ -61,15 +61,12 @@ describe Henkei do
61
61
  unless ci?
62
62
  context 'when `include_ocr` is enabled' do
63
63
  it 'returns parsed plain text in the image' do
64
- text = Henkei.read :text, data, include_ocr: true
64
+ text = described_class.read :text, data, include_ocr: true
65
65
 
66
66
  expect(text).to include <<~TEXT
67
67
  West Side
68
68
 
69
69
  Sea Island
70
- PP
71
-
72
- Richmond
73
70
  TEXT
74
71
  end
75
72
  end
@@ -79,11 +76,11 @@ describe Henkei do
79
76
 
80
77
  describe '.new' do
81
78
  it 'requires parameters' do
82
- expect { Henkei.new }.to raise_error ArgumentError
79
+ expect { described_class.new }.to raise_error ArgumentError
83
80
  end
84
81
 
85
82
  it 'accepts a root path' do
86
- henkei = Henkei.new 'spec/samples/sample.pages'
83
+ henkei = described_class.new File.join(Henkei::GEM_PATH, 'spec/samples/sample.pages')
87
84
 
88
85
  expect(henkei).to be_path
89
86
  expect(henkei).not_to be_uri
@@ -91,7 +88,7 @@ describe Henkei do
91
88
  end
92
89
 
93
90
  it 'accepts a relative path' do
94
- henkei = Henkei.new 'spec/samples/sample.pages'
91
+ henkei = described_class.new 'spec/samples/sample.pages'
95
92
 
96
93
  expect(henkei).to be_path
97
94
  expect(henkei).not_to be_uri
@@ -99,7 +96,7 @@ describe Henkei do
99
96
  end
100
97
 
101
98
  it 'accepts a path with spaces' do
102
- henkei = Henkei.new 'spec/samples/sample filename with spaces.pages'
99
+ henkei = described_class.new 'spec/samples/sample filename with spaces.pages'
103
100
 
104
101
  expect(henkei).to be_path
105
102
  expect(henkei).not_to be_uri
@@ -107,7 +104,7 @@ describe Henkei do
107
104
  end
108
105
 
109
106
  it 'accepts a URI' do
110
- henkei = Henkei.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx'
107
+ henkei = described_class.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx'
111
108
 
112
109
  expect(henkei).to be_uri
113
110
  expect(henkei).not_to be_path
@@ -116,7 +113,7 @@ describe Henkei do
116
113
 
117
114
  it 'accepts a stream or object that can be read' do
118
115
  File.open 'spec/samples/sample.pages', 'r' do |file|
119
- henkei = Henkei.new file
116
+ henkei = described_class.new file
120
117
 
121
118
  expect(henkei).to be_stream
122
119
  expect(henkei).not_to be_path
@@ -125,38 +122,38 @@ describe Henkei do
125
122
  end
126
123
 
127
124
  it 'refuses a path to a missing file' do
128
- expect { Henkei.new 'test/sample/missing.pages' }.to raise_error Errno::ENOENT
125
+ expect { described_class.new 'test/sample/missing.pages' }.to raise_error Errno::ENOENT
129
126
  end
130
127
 
131
128
  it 'refuses other objects' do
132
129
  [nil, 1, 1.1].each do |object|
133
- expect { Henkei.new object }.to raise_error TypeError
130
+ expect { described_class.new object }.to raise_error TypeError
134
131
  end
135
132
  end
136
133
  end
137
134
 
138
135
  describe '.creation_date' do
139
- let(:henkei) { Henkei.new 'spec/samples/sample.pages' }
136
+ let(:henkei) { described_class.new 'spec/samples/sample.pages' }
140
137
 
141
- it 'should return Time' do
138
+ it 'returns a Time' do
142
139
  expect(henkei.creation_date).to be_a Time
143
140
  end
144
141
  end
145
142
 
146
143
  describe '.java' do
147
144
  specify 'with no specified JAVA_HOME' do
148
- expect(Henkei.send(:java_path)).to eq 'java'
145
+ expect(described_class.send(:java_path)).to eq 'java'
149
146
  end
150
147
 
151
148
  specify 'with a specified JAVA_HOME' do
152
149
  ENV['JAVA_HOME'] = '/path/to/java/home'
153
150
 
154
- expect(Henkei.send(:java_path)).to eq '/path/to/java/home/bin/java'
151
+ expect(described_class.send(:java_path)).to eq '/path/to/java/home/bin/java'
155
152
  end
156
153
  end
157
154
 
158
- context 'initialized with a given path' do
159
- let(:henkei) { Henkei.new 'spec/samples/sample.pages' }
155
+ context 'when initialized with a given path' do
156
+ let(:henkei) { described_class.new 'spec/samples/sample.pages' }
160
157
 
161
158
  specify '#text reads text' do
162
159
  expect(henkei.text).to include 'The quick brown fox jumped over the lazy cat.'
@@ -167,7 +164,7 @@ describe Henkei do
167
164
  end
168
165
 
169
166
  context 'when passing in the `pipe-error.png` test file' do
170
- let(:henkei) { Henkei.new 'spec/samples/pipe-error.png' }
167
+ let(:henkei) { described_class.new 'spec/samples/pipe-error.png' }
171
168
 
172
169
  it '#text returns an empty result' do
173
170
  expect(henkei.text).to eq ''
@@ -189,9 +186,6 @@ describe Henkei do
189
186
  West Side
190
187
 
191
188
  Sea Island
192
- PP
193
-
194
- Richmond
195
189
  TEXT
196
190
  end
197
191
 
@@ -199,7 +193,7 @@ describe Henkei do
199
193
  expect(henkei.html(include_ocr: true)).to include '<meta name="tiff:ImageWidth" content="792"/>'
200
194
 
201
195
  html_body = Nokogiri::HTML(henkei.html(include_ocr: true)).at_xpath('//body')
202
- ['Anmore', 'Coquitlam', 'West Side', 'Sea Island', 'Richmond', 'Steveston'].each do |location|
196
+ ['West Side', 'Sea Island', 'Richmond', 'Steveston'].each do |location|
203
197
  expect(html_body.text).to include location
204
198
  end
205
199
  end
@@ -208,8 +202,8 @@ describe Henkei do
208
202
  end
209
203
  end
210
204
 
211
- context 'initialized with a given URI' do
212
- let(:henkei) { Henkei.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx' }
205
+ context 'when initialized with a given URI' do
206
+ let(:henkei) { described_class.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx' }
213
207
 
214
208
  specify '#text reads text' do
215
209
  expect(henkei.text).to include 'Lorem ipsum dolor sit amet, consectetuer adipiscing elit.'
@@ -222,8 +216,8 @@ describe Henkei do
222
216
  end
223
217
  end
224
218
 
225
- context 'initialized with a given stream' do
226
- let(:henkei) { Henkei.new File.open('spec/samples/sample.pages', 'rb') }
219
+ context 'when initialized with a given stream' do
220
+ let(:henkei) { described_class.new File.open('spec/samples/sample.pages', 'rb') }
227
221
 
228
222
  specify '#text reads text' do
229
223
  expect(henkei.text).to include 'The quick brown fox jumped over the lazy cat.'
@@ -235,7 +229,7 @@ describe Henkei do
235
229
  end
236
230
 
237
231
  context 'when source is a remote PDF' do
238
- let(:henkei) { Henkei.new 'https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf' }
232
+ let(:henkei) { described_class.new 'https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf' }
239
233
 
240
234
  specify '#text reads text' do
241
235
  expect(henkei.text).to include 'Dummy PDF file'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: henkei
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.4.0.1
4
+ version: 2.4.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Erol Fornoles
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2022-05-28 00:00:00.000000000 Z
12
+ date: 2023-01-22 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: json
@@ -149,20 +149,6 @@ dependencies:
149
149
  - - "~>"
150
150
  - !ruby/object:Gem::Version
151
151
  version: '1.13'
152
- - !ruby/object:Gem::Dependency
153
- name: rubocop-rails
154
- requirement: !ruby/object:Gem::Requirement
155
- requirements:
156
- - - "~>"
157
- - !ruby/object:Gem::Version
158
- version: '2.14'
159
- type: :development
160
- prerelease: false
161
- version_requirements: !ruby/object:Gem::Requirement
162
- requirements:
163
- - - "~>"
164
- - !ruby/object:Gem::Version
165
- version: '2.14'
166
152
  - !ruby/object:Gem::Dependency
167
153
  name: rubocop-rake
168
154
  requirement: !ruby/object:Gem::Requirement
@@ -231,7 +217,7 @@ files:
231
217
  - Rakefile
232
218
  - bin/console
233
219
  - henkei.gemspec
234
- - jar/tika-app-2.4.0.jar
220
+ - jar/tika-app-2.4.1.jar
235
221
  - jar/tika-config-without-ocr.xml
236
222
  - jar/tika-config.xml
237
223
  - lib/henkei.rb
@@ -259,17 +245,17 @@ required_ruby_version: !ruby/object:Gem::Requirement
259
245
  requirements:
260
246
  - - ">="
261
247
  - !ruby/object:Gem::Version
262
- version: 2.6.0
248
+ version: 2.7.0
263
249
  - - "<"
264
250
  - !ruby/object:Gem::Version
265
- version: 3.2.0
251
+ version: 3.3.0
266
252
  required_rubygems_version: !ruby/object:Gem::Requirement
267
253
  requirements:
268
254
  - - ">="
269
255
  - !ruby/object:Gem::Version
270
256
  version: '0'
271
257
  requirements: []
272
- rubygems_version: 3.2.3
258
+ rubygems_version: 3.4.1
273
259
  signing_key:
274
260
  specification_version: 4
275
261
  summary: Read text and metadata from files and documents (.doc, .docx, .pages, .odt,