henkei 2.4.0.1 → 2.4.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/test.yml +3 -3
- data/.rubocop.yml +11 -1
- data/henkei.gemspec +2 -3
- data/lib/henkei/version.rb +1 -1
- data/lib/henkei.rb +2 -2
- data/spec/henkei_spec.rb +27 -33
- metadata +5 -19
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a039368aaffaee95c3d48c5b56fc30b81babafc7b996986e56c788bb12c20bc0
|
4
|
+
data.tar.gz: d31e08f66e605ea99209911edebbb62fce65e75c328043a9ed3e53f1eac4d80d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1daffbde6948f1e8d9c8003c8ab60a3f86051c307b04a0aa3a654bdf05efd830c3148e17a555fee4cd4f52d85ed80eb6071e4df54754e5458b7e1caa1a9b2474
|
7
|
+
data.tar.gz: a230d09cde52cbedbc0b67ed025a7666119b5c66f7611cb730b1558aba5b77f5f50cc878e17f8d11a14d9840d0b379aeedb00d141fbde924ec30f3fe42414747
|
data/.github/workflows/test.yml
CHANGED
@@ -14,10 +14,10 @@ jobs:
|
|
14
14
|
runs-on: ubuntu-latest
|
15
15
|
strategy:
|
16
16
|
matrix:
|
17
|
-
ruby-version: ['2.
|
17
|
+
ruby-version: ['2.7', '3.0', '3.1', '3.2']
|
18
18
|
|
19
19
|
steps:
|
20
|
-
- uses: actions/checkout@
|
20
|
+
- uses: actions/checkout@v3
|
21
21
|
|
22
22
|
- name: Set up Ruby
|
23
23
|
uses: ruby/setup-ruby@v1
|
@@ -32,6 +32,6 @@ jobs:
|
|
32
32
|
run: bundle exec rspec
|
33
33
|
|
34
34
|
- name: Test & publish code coverage
|
35
|
-
uses: paambaati/codeclimate-action@v3.
|
35
|
+
uses: paambaati/codeclimate-action@v3.2.0
|
36
36
|
env:
|
37
37
|
CC_TEST_REPORTER_ID: bb96c1ff9dc66724c38fb4eb54486dd72dc88a7fd6e727c034b9cf8d747d069e
|
data/.rubocop.yml
CHANGED
@@ -1,6 +1,10 @@
|
|
1
|
+
require:
|
2
|
+
- rubocop-rake
|
3
|
+
- rubocop-rspec
|
4
|
+
|
1
5
|
AllCops:
|
2
6
|
NewCops: enable
|
3
|
-
TargetRubyVersion: 2.
|
7
|
+
TargetRubyVersion: 2.7
|
4
8
|
|
5
9
|
Layout/EmptyLinesAroundAttributeAccessor:
|
6
10
|
Enabled: true
|
@@ -30,6 +34,12 @@ Metrics/BlockLength:
|
|
30
34
|
Metrics/MethodLength:
|
31
35
|
Max: 15
|
32
36
|
|
37
|
+
RSpec/ExampleLength:
|
38
|
+
Max: 10
|
39
|
+
|
40
|
+
RSpec/MultipleExpectations:
|
41
|
+
Max: 3
|
42
|
+
|
33
43
|
Style/ClassVars:
|
34
44
|
Enabled: false
|
35
45
|
|
data/henkei.gemspec
CHANGED
@@ -5,7 +5,7 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
5
5
|
|
6
6
|
require 'henkei/version'
|
7
7
|
|
8
|
-
Gem::Specification.new do |spec|
|
8
|
+
Gem::Specification.new do |spec|
|
9
9
|
spec.name = 'henkei'
|
10
10
|
spec.version = Henkei::VERSION
|
11
11
|
spec.authors = ['Erol Fornoles', 'Andrew Bromwich']
|
@@ -15,7 +15,7 @@ Gem::Specification.new do |spec| # rubocop:disable Metrics/BlockLength
|
|
15
15
|
'(.doc, .docx, .pages, .odt, .rtf, .pdf) using Apache Tika toolkit'
|
16
16
|
spec.homepage = 'https://github.com/abrom/henkei'
|
17
17
|
spec.license = 'MIT'
|
18
|
-
spec.required_ruby_version = ['>= 2.
|
18
|
+
spec.required_ruby_version = ['>= 2.7.0', '< 3.3.0']
|
19
19
|
|
20
20
|
# Prevent pushing this gem to RubyGems.org by setting 'allowed_push_host', or
|
21
21
|
# delete this section to allow pushing this gem to any host.
|
@@ -38,7 +38,6 @@ Gem::Specification.new do |spec| # rubocop:disable Metrics/BlockLength
|
|
38
38
|
spec.add_development_dependency 'rspec', '~> 3.7'
|
39
39
|
spec.add_development_dependency 'rubocop', '~> 1.26'
|
40
40
|
spec.add_development_dependency 'rubocop-performance', '~> 1.13'
|
41
|
-
spec.add_development_dependency 'rubocop-rails', '~> 2.14'
|
42
41
|
spec.add_development_dependency 'rubocop-rake', '~> 0.6'
|
43
42
|
spec.add_development_dependency 'rubocop-rspec', '~> 2.9'
|
44
43
|
spec.add_development_dependency 'simplecov', '~> 0.15', '< 0.18'
|
data/lib/henkei/version.rb
CHANGED
data/lib/henkei.rb
CHANGED
@@ -31,8 +31,8 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
31
31
|
|
32
32
|
def self.mimetype(content_type)
|
33
33
|
if Henkei.configuration.mime_library == 'mime/types' && defined?(MIME::Types)
|
34
|
-
warn '[DEPRECATION] `mime/types` is deprecated. Please use `mini_mime` instead.'\
|
35
|
-
'
|
34
|
+
warn '[DEPRECATION] `mime/types` is deprecated. Please use `mini_mime` instead. ' \
|
35
|
+
'Use Henkei.configure and assign "mini_mime" to `mime_library`.'
|
36
36
|
MIME::Types[content_type].first
|
37
37
|
else
|
38
38
|
MiniMime.lookup_by_content_type(content_type).tap do |object|
|
data/spec/henkei_spec.rb
CHANGED
@@ -20,13 +20,13 @@ describe Henkei do
|
|
20
20
|
|
21
21
|
describe '.read' do
|
22
22
|
it 'reads text' do
|
23
|
-
text =
|
23
|
+
text = described_class.read :text, data
|
24
24
|
|
25
25
|
expect(text).to include 'The quick brown fox jumped over the lazy cat.'
|
26
26
|
end
|
27
27
|
|
28
28
|
it 'reads metadata' do
|
29
|
-
metadata =
|
29
|
+
metadata = described_class.read :metadata, data
|
30
30
|
|
31
31
|
expect(metadata['Content-Type']).to(
|
32
32
|
eq 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
|
@@ -35,13 +35,13 @@ describe Henkei do
|
|
35
35
|
|
36
36
|
it 'reads metadata values with colons as strings' do
|
37
37
|
data = File.read 'spec/samples/sample-metadata-values-with-colons.doc'
|
38
|
-
metadata =
|
38
|
+
metadata = described_class.read :metadata, data
|
39
39
|
|
40
40
|
expect(metadata['dc:title']).to eq 'problem: test'
|
41
41
|
end
|
42
42
|
|
43
43
|
it 'reads mimetype' do
|
44
|
-
mimetype =
|
44
|
+
mimetype = described_class.read :mimetype, data
|
45
45
|
|
46
46
|
expect(mimetype.content_type).to(
|
47
47
|
eq 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
|
@@ -53,7 +53,7 @@ describe Henkei do
|
|
53
53
|
let(:data) { File.read 'spec/samples/pipe-error.png' }
|
54
54
|
|
55
55
|
it 'returns an empty result' do
|
56
|
-
text =
|
56
|
+
text = described_class.read :text, data
|
57
57
|
|
58
58
|
expect(text).to eq ''
|
59
59
|
end
|
@@ -61,15 +61,12 @@ describe Henkei do
|
|
61
61
|
unless ci?
|
62
62
|
context 'when `include_ocr` is enabled' do
|
63
63
|
it 'returns parsed plain text in the image' do
|
64
|
-
text =
|
64
|
+
text = described_class.read :text, data, include_ocr: true
|
65
65
|
|
66
66
|
expect(text).to include <<~TEXT
|
67
67
|
West Side
|
68
68
|
|
69
69
|
Sea Island
|
70
|
-
PP
|
71
|
-
|
72
|
-
Richmond
|
73
70
|
TEXT
|
74
71
|
end
|
75
72
|
end
|
@@ -79,11 +76,11 @@ describe Henkei do
|
|
79
76
|
|
80
77
|
describe '.new' do
|
81
78
|
it 'requires parameters' do
|
82
|
-
expect {
|
79
|
+
expect { described_class.new }.to raise_error ArgumentError
|
83
80
|
end
|
84
81
|
|
85
82
|
it 'accepts a root path' do
|
86
|
-
henkei =
|
83
|
+
henkei = described_class.new File.join(Henkei::GEM_PATH, 'spec/samples/sample.pages')
|
87
84
|
|
88
85
|
expect(henkei).to be_path
|
89
86
|
expect(henkei).not_to be_uri
|
@@ -91,7 +88,7 @@ describe Henkei do
|
|
91
88
|
end
|
92
89
|
|
93
90
|
it 'accepts a relative path' do
|
94
|
-
henkei =
|
91
|
+
henkei = described_class.new 'spec/samples/sample.pages'
|
95
92
|
|
96
93
|
expect(henkei).to be_path
|
97
94
|
expect(henkei).not_to be_uri
|
@@ -99,7 +96,7 @@ describe Henkei do
|
|
99
96
|
end
|
100
97
|
|
101
98
|
it 'accepts a path with spaces' do
|
102
|
-
henkei =
|
99
|
+
henkei = described_class.new 'spec/samples/sample filename with spaces.pages'
|
103
100
|
|
104
101
|
expect(henkei).to be_path
|
105
102
|
expect(henkei).not_to be_uri
|
@@ -107,7 +104,7 @@ describe Henkei do
|
|
107
104
|
end
|
108
105
|
|
109
106
|
it 'accepts a URI' do
|
110
|
-
henkei =
|
107
|
+
henkei = described_class.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx'
|
111
108
|
|
112
109
|
expect(henkei).to be_uri
|
113
110
|
expect(henkei).not_to be_path
|
@@ -116,7 +113,7 @@ describe Henkei do
|
|
116
113
|
|
117
114
|
it 'accepts a stream or object that can be read' do
|
118
115
|
File.open 'spec/samples/sample.pages', 'r' do |file|
|
119
|
-
henkei =
|
116
|
+
henkei = described_class.new file
|
120
117
|
|
121
118
|
expect(henkei).to be_stream
|
122
119
|
expect(henkei).not_to be_path
|
@@ -125,38 +122,38 @@ describe Henkei do
|
|
125
122
|
end
|
126
123
|
|
127
124
|
it 'refuses a path to a missing file' do
|
128
|
-
expect {
|
125
|
+
expect { described_class.new 'test/sample/missing.pages' }.to raise_error Errno::ENOENT
|
129
126
|
end
|
130
127
|
|
131
128
|
it 'refuses other objects' do
|
132
129
|
[nil, 1, 1.1].each do |object|
|
133
|
-
expect {
|
130
|
+
expect { described_class.new object }.to raise_error TypeError
|
134
131
|
end
|
135
132
|
end
|
136
133
|
end
|
137
134
|
|
138
135
|
describe '.creation_date' do
|
139
|
-
let(:henkei) {
|
136
|
+
let(:henkei) { described_class.new 'spec/samples/sample.pages' }
|
140
137
|
|
141
|
-
it '
|
138
|
+
it 'returns a Time' do
|
142
139
|
expect(henkei.creation_date).to be_a Time
|
143
140
|
end
|
144
141
|
end
|
145
142
|
|
146
143
|
describe '.java' do
|
147
144
|
specify 'with no specified JAVA_HOME' do
|
148
|
-
expect(
|
145
|
+
expect(described_class.send(:java_path)).to eq 'java'
|
149
146
|
end
|
150
147
|
|
151
148
|
specify 'with a specified JAVA_HOME' do
|
152
149
|
ENV['JAVA_HOME'] = '/path/to/java/home'
|
153
150
|
|
154
|
-
expect(
|
151
|
+
expect(described_class.send(:java_path)).to eq '/path/to/java/home/bin/java'
|
155
152
|
end
|
156
153
|
end
|
157
154
|
|
158
|
-
context 'initialized with a given path' do
|
159
|
-
let(:henkei) {
|
155
|
+
context 'when initialized with a given path' do
|
156
|
+
let(:henkei) { described_class.new 'spec/samples/sample.pages' }
|
160
157
|
|
161
158
|
specify '#text reads text' do
|
162
159
|
expect(henkei.text).to include 'The quick brown fox jumped over the lazy cat.'
|
@@ -167,7 +164,7 @@ describe Henkei do
|
|
167
164
|
end
|
168
165
|
|
169
166
|
context 'when passing in the `pipe-error.png` test file' do
|
170
|
-
let(:henkei) {
|
167
|
+
let(:henkei) { described_class.new 'spec/samples/pipe-error.png' }
|
171
168
|
|
172
169
|
it '#text returns an empty result' do
|
173
170
|
expect(henkei.text).to eq ''
|
@@ -189,9 +186,6 @@ describe Henkei do
|
|
189
186
|
West Side
|
190
187
|
|
191
188
|
Sea Island
|
192
|
-
PP
|
193
|
-
|
194
|
-
Richmond
|
195
189
|
TEXT
|
196
190
|
end
|
197
191
|
|
@@ -199,7 +193,7 @@ describe Henkei do
|
|
199
193
|
expect(henkei.html(include_ocr: true)).to include '<meta name="tiff:ImageWidth" content="792"/>'
|
200
194
|
|
201
195
|
html_body = Nokogiri::HTML(henkei.html(include_ocr: true)).at_xpath('//body')
|
202
|
-
['
|
196
|
+
['West Side', 'Sea Island', 'Richmond', 'Steveston'].each do |location|
|
203
197
|
expect(html_body.text).to include location
|
204
198
|
end
|
205
199
|
end
|
@@ -208,8 +202,8 @@ describe Henkei do
|
|
208
202
|
end
|
209
203
|
end
|
210
204
|
|
211
|
-
context 'initialized with a given URI' do
|
212
|
-
let(:henkei) {
|
205
|
+
context 'when initialized with a given URI' do
|
206
|
+
let(:henkei) { described_class.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx' }
|
213
207
|
|
214
208
|
specify '#text reads text' do
|
215
209
|
expect(henkei.text).to include 'Lorem ipsum dolor sit amet, consectetuer adipiscing elit.'
|
@@ -222,8 +216,8 @@ describe Henkei do
|
|
222
216
|
end
|
223
217
|
end
|
224
218
|
|
225
|
-
context 'initialized with a given stream' do
|
226
|
-
let(:henkei) {
|
219
|
+
context 'when initialized with a given stream' do
|
220
|
+
let(:henkei) { described_class.new File.open('spec/samples/sample.pages', 'rb') }
|
227
221
|
|
228
222
|
specify '#text reads text' do
|
229
223
|
expect(henkei.text).to include 'The quick brown fox jumped over the lazy cat.'
|
@@ -235,7 +229,7 @@ describe Henkei do
|
|
235
229
|
end
|
236
230
|
|
237
231
|
context 'when source is a remote PDF' do
|
238
|
-
let(:henkei) {
|
232
|
+
let(:henkei) { described_class.new 'https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf' }
|
239
233
|
|
240
234
|
specify '#text reads text' do
|
241
235
|
expect(henkei.text).to include 'Dummy PDF file'
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: henkei
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.4.0.
|
4
|
+
version: 2.4.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Erol Fornoles
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2023-01-22 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: json
|
@@ -149,20 +149,6 @@ dependencies:
|
|
149
149
|
- - "~>"
|
150
150
|
- !ruby/object:Gem::Version
|
151
151
|
version: '1.13'
|
152
|
-
- !ruby/object:Gem::Dependency
|
153
|
-
name: rubocop-rails
|
154
|
-
requirement: !ruby/object:Gem::Requirement
|
155
|
-
requirements:
|
156
|
-
- - "~>"
|
157
|
-
- !ruby/object:Gem::Version
|
158
|
-
version: '2.14'
|
159
|
-
type: :development
|
160
|
-
prerelease: false
|
161
|
-
version_requirements: !ruby/object:Gem::Requirement
|
162
|
-
requirements:
|
163
|
-
- - "~>"
|
164
|
-
- !ruby/object:Gem::Version
|
165
|
-
version: '2.14'
|
166
152
|
- !ruby/object:Gem::Dependency
|
167
153
|
name: rubocop-rake
|
168
154
|
requirement: !ruby/object:Gem::Requirement
|
@@ -259,17 +245,17 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
259
245
|
requirements:
|
260
246
|
- - ">="
|
261
247
|
- !ruby/object:Gem::Version
|
262
|
-
version: 2.
|
248
|
+
version: 2.7.0
|
263
249
|
- - "<"
|
264
250
|
- !ruby/object:Gem::Version
|
265
|
-
version: 3.
|
251
|
+
version: 3.3.0
|
266
252
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
267
253
|
requirements:
|
268
254
|
- - ">="
|
269
255
|
- !ruby/object:Gem::Version
|
270
256
|
version: '0'
|
271
257
|
requirements: []
|
272
|
-
rubygems_version: 3.
|
258
|
+
rubygems_version: 3.4.1
|
273
259
|
signing_key:
|
274
260
|
specification_version: 4
|
275
261
|
summary: Read text and metadata from files and documents (.doc, .docx, .pages, .odt,
|