henkei 2.4.0.1 → 2.4.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/test.yml +3 -3
- data/.rubocop.yml +11 -1
- data/henkei.gemspec +2 -3
- data/lib/henkei/version.rb +1 -1
- data/lib/henkei.rb +2 -2
- data/spec/henkei_spec.rb +27 -33
- metadata +5 -19
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a039368aaffaee95c3d48c5b56fc30b81babafc7b996986e56c788bb12c20bc0
|
4
|
+
data.tar.gz: d31e08f66e605ea99209911edebbb62fce65e75c328043a9ed3e53f1eac4d80d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1daffbde6948f1e8d9c8003c8ab60a3f86051c307b04a0aa3a654bdf05efd830c3148e17a555fee4cd4f52d85ed80eb6071e4df54754e5458b7e1caa1a9b2474
|
7
|
+
data.tar.gz: a230d09cde52cbedbc0b67ed025a7666119b5c66f7611cb730b1558aba5b77f5f50cc878e17f8d11a14d9840d0b379aeedb00d141fbde924ec30f3fe42414747
|
data/.github/workflows/test.yml
CHANGED
@@ -14,10 +14,10 @@ jobs:
|
|
14
14
|
runs-on: ubuntu-latest
|
15
15
|
strategy:
|
16
16
|
matrix:
|
17
|
-
ruby-version: ['2.
|
17
|
+
ruby-version: ['2.7', '3.0', '3.1', '3.2']
|
18
18
|
|
19
19
|
steps:
|
20
|
-
- uses: actions/checkout@
|
20
|
+
- uses: actions/checkout@v3
|
21
21
|
|
22
22
|
- name: Set up Ruby
|
23
23
|
uses: ruby/setup-ruby@v1
|
@@ -32,6 +32,6 @@ jobs:
|
|
32
32
|
run: bundle exec rspec
|
33
33
|
|
34
34
|
- name: Test & publish code coverage
|
35
|
-
uses: paambaati/codeclimate-action@v3.
|
35
|
+
uses: paambaati/codeclimate-action@v3.2.0
|
36
36
|
env:
|
37
37
|
CC_TEST_REPORTER_ID: bb96c1ff9dc66724c38fb4eb54486dd72dc88a7fd6e727c034b9cf8d747d069e
|
data/.rubocop.yml
CHANGED
@@ -1,6 +1,10 @@
|
|
1
|
+
require:
|
2
|
+
- rubocop-rake
|
3
|
+
- rubocop-rspec
|
4
|
+
|
1
5
|
AllCops:
|
2
6
|
NewCops: enable
|
3
|
-
TargetRubyVersion: 2.
|
7
|
+
TargetRubyVersion: 2.7
|
4
8
|
|
5
9
|
Layout/EmptyLinesAroundAttributeAccessor:
|
6
10
|
Enabled: true
|
@@ -30,6 +34,12 @@ Metrics/BlockLength:
|
|
30
34
|
Metrics/MethodLength:
|
31
35
|
Max: 15
|
32
36
|
|
37
|
+
RSpec/ExampleLength:
|
38
|
+
Max: 10
|
39
|
+
|
40
|
+
RSpec/MultipleExpectations:
|
41
|
+
Max: 3
|
42
|
+
|
33
43
|
Style/ClassVars:
|
34
44
|
Enabled: false
|
35
45
|
|
data/henkei.gemspec
CHANGED
@@ -5,7 +5,7 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
5
5
|
|
6
6
|
require 'henkei/version'
|
7
7
|
|
8
|
-
Gem::Specification.new do |spec| # rubocop:disable Metrics/BlockLength
|
8
|
+
Gem::Specification.new do |spec|
|
9
9
|
spec.name = 'henkei'
|
10
10
|
spec.version = Henkei::VERSION
|
11
11
|
spec.authors = ['Erol Fornoles', 'Andrew Bromwich']
|
@@ -15,7 +15,7 @@ Gem::Specification.new do |spec| # rubocop:disable Metrics/BlockLength
|
|
15
15
|
'(.doc, .docx, .pages, .odt, .rtf, .pdf) using Apache Tika toolkit'
|
16
16
|
spec.homepage = 'https://github.com/abrom/henkei'
|
17
17
|
spec.license = 'MIT'
|
18
|
-
spec.required_ruby_version = ['>= 2.
|
18
|
+
spec.required_ruby_version = ['>= 2.7.0', '< 3.3.0']
|
19
19
|
|
20
20
|
# Prevent pushing this gem to RubyGems.org by setting 'allowed_push_host', or
|
21
21
|
# delete this section to allow pushing this gem to any host.
|
@@ -38,7 +38,6 @@ Gem::Specification.new do |spec| # rubocop:disable Metrics/BlockLength
|
|
38
38
|
spec.add_development_dependency 'rspec', '~> 3.7'
|
39
39
|
spec.add_development_dependency 'rubocop', '~> 1.26'
|
40
40
|
spec.add_development_dependency 'rubocop-performance', '~> 1.13'
|
41
|
-
spec.add_development_dependency 'rubocop-rails', '~> 2.14'
|
42
41
|
spec.add_development_dependency 'rubocop-rake', '~> 0.6'
|
43
42
|
spec.add_development_dependency 'rubocop-rspec', '~> 2.9'
|
44
43
|
spec.add_development_dependency 'simplecov', '~> 0.15', '< 0.18'
|
data/lib/henkei/version.rb
CHANGED
data/lib/henkei.rb
CHANGED
@@ -31,8 +31,8 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
31
31
|
|
32
32
|
def self.mimetype(content_type)
|
33
33
|
if Henkei.configuration.mime_library == 'mime/types' && defined?(MIME::Types)
|
34
|
-
warn '[DEPRECATION] `mime/types` is deprecated. Please use `mini_mime` instead.'\
|
35
|
-
'
|
34
|
+
warn '[DEPRECATION] `mime/types` is deprecated. Please use `mini_mime` instead. ' \
|
35
|
+
'Use Henkei.configure and assign "mini_mime" to `mime_library`.'
|
36
36
|
MIME::Types[content_type].first
|
37
37
|
else
|
38
38
|
MiniMime.lookup_by_content_type(content_type).tap do |object|
|
data/spec/henkei_spec.rb
CHANGED
@@ -20,13 +20,13 @@ describe Henkei do
|
|
20
20
|
|
21
21
|
describe '.read' do
|
22
22
|
it 'reads text' do
|
23
|
-
text =
|
23
|
+
text = described_class.read :text, data
|
24
24
|
|
25
25
|
expect(text).to include 'The quick brown fox jumped over the lazy cat.'
|
26
26
|
end
|
27
27
|
|
28
28
|
it 'reads metadata' do
|
29
|
-
metadata =
|
29
|
+
metadata = described_class.read :metadata, data
|
30
30
|
|
31
31
|
expect(metadata['Content-Type']).to(
|
32
32
|
eq 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
|
@@ -35,13 +35,13 @@ describe Henkei do
|
|
35
35
|
|
36
36
|
it 'reads metadata values with colons as strings' do
|
37
37
|
data = File.read 'spec/samples/sample-metadata-values-with-colons.doc'
|
38
|
-
metadata =
|
38
|
+
metadata = described_class.read :metadata, data
|
39
39
|
|
40
40
|
expect(metadata['dc:title']).to eq 'problem: test'
|
41
41
|
end
|
42
42
|
|
43
43
|
it 'reads mimetype' do
|
44
|
-
mimetype =
|
44
|
+
mimetype = described_class.read :mimetype, data
|
45
45
|
|
46
46
|
expect(mimetype.content_type).to(
|
47
47
|
eq 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
|
@@ -53,7 +53,7 @@ describe Henkei do
|
|
53
53
|
let(:data) { File.read 'spec/samples/pipe-error.png' }
|
54
54
|
|
55
55
|
it 'returns an empty result' do
|
56
|
-
text =
|
56
|
+
text = described_class.read :text, data
|
57
57
|
|
58
58
|
expect(text).to eq ''
|
59
59
|
end
|
@@ -61,15 +61,12 @@ describe Henkei do
|
|
61
61
|
unless ci?
|
62
62
|
context 'when `include_ocr` is enabled' do
|
63
63
|
it 'returns parsed plain text in the image' do
|
64
|
-
text =
|
64
|
+
text = described_class.read :text, data, include_ocr: true
|
65
65
|
|
66
66
|
expect(text).to include <<~TEXT
|
67
67
|
West Side
|
68
68
|
|
69
69
|
Sea Island
|
70
|
-
PP
|
71
|
-
|
72
|
-
Richmond
|
73
70
|
TEXT
|
74
71
|
end
|
75
72
|
end
|
@@ -79,11 +76,11 @@ describe Henkei do
|
|
79
76
|
|
80
77
|
describe '.new' do
|
81
78
|
it 'requires parameters' do
|
82
|
-
expect {
|
79
|
+
expect { described_class.new }.to raise_error ArgumentError
|
83
80
|
end
|
84
81
|
|
85
82
|
it 'accepts a root path' do
|
86
|
-
henkei =
|
83
|
+
henkei = described_class.new File.join(Henkei::GEM_PATH, 'spec/samples/sample.pages')
|
87
84
|
|
88
85
|
expect(henkei).to be_path
|
89
86
|
expect(henkei).not_to be_uri
|
@@ -91,7 +88,7 @@ describe Henkei do
|
|
91
88
|
end
|
92
89
|
|
93
90
|
it 'accepts a relative path' do
|
94
|
-
henkei =
|
91
|
+
henkei = described_class.new 'spec/samples/sample.pages'
|
95
92
|
|
96
93
|
expect(henkei).to be_path
|
97
94
|
expect(henkei).not_to be_uri
|
@@ -99,7 +96,7 @@ describe Henkei do
|
|
99
96
|
end
|
100
97
|
|
101
98
|
it 'accepts a path with spaces' do
|
102
|
-
henkei =
|
99
|
+
henkei = described_class.new 'spec/samples/sample filename with spaces.pages'
|
103
100
|
|
104
101
|
expect(henkei).to be_path
|
105
102
|
expect(henkei).not_to be_uri
|
@@ -107,7 +104,7 @@ describe Henkei do
|
|
107
104
|
end
|
108
105
|
|
109
106
|
it 'accepts a URI' do
|
110
|
-
henkei =
|
107
|
+
henkei = described_class.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx'
|
111
108
|
|
112
109
|
expect(henkei).to be_uri
|
113
110
|
expect(henkei).not_to be_path
|
@@ -116,7 +113,7 @@ describe Henkei do
|
|
116
113
|
|
117
114
|
it 'accepts a stream or object that can be read' do
|
118
115
|
File.open 'spec/samples/sample.pages', 'r' do |file|
|
119
|
-
henkei =
|
116
|
+
henkei = described_class.new file
|
120
117
|
|
121
118
|
expect(henkei).to be_stream
|
122
119
|
expect(henkei).not_to be_path
|
@@ -125,38 +122,38 @@ describe Henkei do
|
|
125
122
|
end
|
126
123
|
|
127
124
|
it 'refuses a path to a missing file' do
|
128
|
-
expect {
|
125
|
+
expect { described_class.new 'test/sample/missing.pages' }.to raise_error Errno::ENOENT
|
129
126
|
end
|
130
127
|
|
131
128
|
it 'refuses other objects' do
|
132
129
|
[nil, 1, 1.1].each do |object|
|
133
|
-
expect {
|
130
|
+
expect { described_class.new object }.to raise_error TypeError
|
134
131
|
end
|
135
132
|
end
|
136
133
|
end
|
137
134
|
|
138
135
|
describe '.creation_date' do
|
139
|
-
let(:henkei) {
|
136
|
+
let(:henkei) { described_class.new 'spec/samples/sample.pages' }
|
140
137
|
|
141
|
-
it '
|
138
|
+
it 'returns a Time' do
|
142
139
|
expect(henkei.creation_date).to be_a Time
|
143
140
|
end
|
144
141
|
end
|
145
142
|
|
146
143
|
describe '.java' do
|
147
144
|
specify 'with no specified JAVA_HOME' do
|
148
|
-
expect(
|
145
|
+
expect(described_class.send(:java_path)).to eq 'java'
|
149
146
|
end
|
150
147
|
|
151
148
|
specify 'with a specified JAVA_HOME' do
|
152
149
|
ENV['JAVA_HOME'] = '/path/to/java/home'
|
153
150
|
|
154
|
-
expect(
|
151
|
+
expect(described_class.send(:java_path)).to eq '/path/to/java/home/bin/java'
|
155
152
|
end
|
156
153
|
end
|
157
154
|
|
158
|
-
context 'initialized with a given path' do
|
159
|
-
let(:henkei) {
|
155
|
+
context 'when initialized with a given path' do
|
156
|
+
let(:henkei) { described_class.new 'spec/samples/sample.pages' }
|
160
157
|
|
161
158
|
specify '#text reads text' do
|
162
159
|
expect(henkei.text).to include 'The quick brown fox jumped over the lazy cat.'
|
@@ -167,7 +164,7 @@ describe Henkei do
|
|
167
164
|
end
|
168
165
|
|
169
166
|
context 'when passing in the `pipe-error.png` test file' do
|
170
|
-
let(:henkei) {
|
167
|
+
let(:henkei) { described_class.new 'spec/samples/pipe-error.png' }
|
171
168
|
|
172
169
|
it '#text returns an empty result' do
|
173
170
|
expect(henkei.text).to eq ''
|
@@ -189,9 +186,6 @@ describe Henkei do
|
|
189
186
|
West Side
|
190
187
|
|
191
188
|
Sea Island
|
192
|
-
PP
|
193
|
-
|
194
|
-
Richmond
|
195
189
|
TEXT
|
196
190
|
end
|
197
191
|
|
@@ -199,7 +193,7 @@ describe Henkei do
|
|
199
193
|
expect(henkei.html(include_ocr: true)).to include '<meta name="tiff:ImageWidth" content="792"/>'
|
200
194
|
|
201
195
|
html_body = Nokogiri::HTML(henkei.html(include_ocr: true)).at_xpath('//body')
|
202
|
-
['
|
196
|
+
['West Side', 'Sea Island', 'Richmond', 'Steveston'].each do |location|
|
203
197
|
expect(html_body.text).to include location
|
204
198
|
end
|
205
199
|
end
|
@@ -208,8 +202,8 @@ describe Henkei do
|
|
208
202
|
end
|
209
203
|
end
|
210
204
|
|
211
|
-
context 'initialized with a given URI' do
|
212
|
-
let(:henkei) {
|
205
|
+
context 'when initialized with a given URI' do
|
206
|
+
let(:henkei) { described_class.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx' }
|
213
207
|
|
214
208
|
specify '#text reads text' do
|
215
209
|
expect(henkei.text).to include 'Lorem ipsum dolor sit amet, consectetuer adipiscing elit.'
|
@@ -222,8 +216,8 @@ describe Henkei do
|
|
222
216
|
end
|
223
217
|
end
|
224
218
|
|
225
|
-
context 'initialized with a given stream' do
|
226
|
-
let(:henkei) {
|
219
|
+
context 'when initialized with a given stream' do
|
220
|
+
let(:henkei) { described_class.new File.open('spec/samples/sample.pages', 'rb') }
|
227
221
|
|
228
222
|
specify '#text reads text' do
|
229
223
|
expect(henkei.text).to include 'The quick brown fox jumped over the lazy cat.'
|
@@ -235,7 +229,7 @@ describe Henkei do
|
|
235
229
|
end
|
236
230
|
|
237
231
|
context 'when source is a remote PDF' do
|
238
|
-
let(:henkei) {
|
232
|
+
let(:henkei) { described_class.new 'https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf' }
|
239
233
|
|
240
234
|
specify '#text reads text' do
|
241
235
|
expect(henkei.text).to include 'Dummy PDF file'
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: henkei
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.4.0.1
|
4
|
+
version: 2.4.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Erol Fornoles
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2023-01-22 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: json
|
@@ -149,20 +149,6 @@ dependencies:
|
|
149
149
|
- - "~>"
|
150
150
|
- !ruby/object:Gem::Version
|
151
151
|
version: '1.13'
|
152
|
-
- !ruby/object:Gem::Dependency
|
153
|
-
name: rubocop-rails
|
154
|
-
requirement: !ruby/object:Gem::Requirement
|
155
|
-
requirements:
|
156
|
-
- - "~>"
|
157
|
-
- !ruby/object:Gem::Version
|
158
|
-
version: '2.14'
|
159
|
-
type: :development
|
160
|
-
prerelease: false
|
161
|
-
version_requirements: !ruby/object:Gem::Requirement
|
162
|
-
requirements:
|
163
|
-
- - "~>"
|
164
|
-
- !ruby/object:Gem::Version
|
165
|
-
version: '2.14'
|
166
152
|
- !ruby/object:Gem::Dependency
|
167
153
|
name: rubocop-rake
|
168
154
|
requirement: !ruby/object:Gem::Requirement
|
@@ -259,17 +245,17 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
259
245
|
requirements:
|
260
246
|
- - ">="
|
261
247
|
- !ruby/object:Gem::Version
|
262
|
-
version: 2.
|
248
|
+
version: 2.7.0
|
263
249
|
- - "<"
|
264
250
|
- !ruby/object:Gem::Version
|
265
|
-
version: 3.
|
251
|
+
version: 3.3.0
|
266
252
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
267
253
|
requirements:
|
268
254
|
- - ">="
|
269
255
|
- !ruby/object:Gem::Version
|
270
256
|
version: '0'
|
271
257
|
requirements: []
|
272
|
-
rubygems_version: 3.
|
258
|
+
rubygems_version: 3.4.1
|
273
259
|
signing_key:
|
274
260
|
specification_version: 4
|
275
261
|
summary: Read text and metadata from files and documents (.doc, .docx, .pages, .odt,
|