henkei 1.28.3.1 → 1.28.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/test.yml +3 -3
- data/.rubocop.yml +12 -1
- data/henkei.gemspec +2 -3
- data/lib/henkei/version.rb +1 -1
- data/lib/henkei.rb +6 -7
- data/spec/henkei_spec.rb +38 -37
- metadata +5 -19
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: d6f63ac4ab328389684f60a90d9e5bfecbf14e74d9c9b6685abce2e984257ef7
|
|
4
|
+
data.tar.gz: 3f952baeb1a1c089ae4bccaa51d502909e6ed71f5bb30343ade57a51fab8a688
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 955576e745930cc52480ae2ebd25d93d74b7334868605b6a65a46832702693e8e7028a2c6f8ce548395a61bb31f6d0da40bf9118c23af0689988cb09f24a9166
|
|
7
|
+
data.tar.gz: 476c59eb877e5e03cf316a83126ff39b5e2e86cec0424c33d04cf324bc1dd2ffa52323b436747f1fd206680da6efe8e43e2794e8ca271b0a4854187f82728df8
|
data/.github/workflows/test.yml
CHANGED
|
@@ -14,10 +14,10 @@ jobs:
|
|
|
14
14
|
runs-on: ubuntu-latest
|
|
15
15
|
strategy:
|
|
16
16
|
matrix:
|
|
17
|
-
ruby-version: ['2.
|
|
17
|
+
ruby-version: ['2.7', '3.0', '3.1', '3.2']
|
|
18
18
|
|
|
19
19
|
steps:
|
|
20
|
-
- uses: actions/checkout@
|
|
20
|
+
- uses: actions/checkout@v3
|
|
21
21
|
|
|
22
22
|
- name: Set up Ruby
|
|
23
23
|
uses: ruby/setup-ruby@v1
|
|
@@ -32,6 +32,6 @@ jobs:
|
|
|
32
32
|
run: bundle exec rspec
|
|
33
33
|
|
|
34
34
|
- name: Test & publish code coverage
|
|
35
|
-
uses: paambaati/codeclimate-action@v3.
|
|
35
|
+
uses: paambaati/codeclimate-action@v3.2.0
|
|
36
36
|
env:
|
|
37
37
|
CC_TEST_REPORTER_ID: bb96c1ff9dc66724c38fb4eb54486dd72dc88a7fd6e727c034b9cf8d747d069e
|
data/.rubocop.yml
CHANGED
|
@@ -1,6 +1,11 @@
|
|
|
1
|
+
require:
|
|
2
|
+
- rubocop-performance
|
|
3
|
+
- rubocop-rake
|
|
4
|
+
- rubocop-rspec
|
|
5
|
+
|
|
1
6
|
AllCops:
|
|
2
7
|
NewCops: enable
|
|
3
|
-
TargetRubyVersion: 2.
|
|
8
|
+
TargetRubyVersion: 2.7
|
|
4
9
|
|
|
5
10
|
Layout/EmptyLinesAroundAttributeAccessor:
|
|
6
11
|
Enabled: true
|
|
@@ -30,6 +35,12 @@ Metrics/BlockLength:
|
|
|
30
35
|
Metrics/MethodLength:
|
|
31
36
|
Max: 15
|
|
32
37
|
|
|
38
|
+
RSpec/ExampleLength:
|
|
39
|
+
Max: 12
|
|
40
|
+
|
|
41
|
+
RSpec/MultipleExpectations:
|
|
42
|
+
Max: 4
|
|
43
|
+
|
|
33
44
|
Style/ClassVars:
|
|
34
45
|
Enabled: false
|
|
35
46
|
|
data/henkei.gemspec
CHANGED
|
@@ -5,7 +5,7 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
|
5
5
|
|
|
6
6
|
require 'henkei/version'
|
|
7
7
|
|
|
8
|
-
Gem::Specification.new do |spec|
|
|
8
|
+
Gem::Specification.new do |spec|
|
|
9
9
|
spec.name = 'henkei'
|
|
10
10
|
spec.version = Henkei::VERSION
|
|
11
11
|
spec.authors = ['Erol Fornoles', 'Andrew Bromwich']
|
|
@@ -15,7 +15,7 @@ Gem::Specification.new do |spec| # rubocop:disable Metrics/BlockLength
|
|
|
15
15
|
'(.doc, .docx, .pages, .odt, .rtf, .pdf) using Apache Tika toolkit'
|
|
16
16
|
spec.homepage = 'https://github.com/abrom/henkei'
|
|
17
17
|
spec.license = 'MIT'
|
|
18
|
-
spec.required_ruby_version = ['>= 2.
|
|
18
|
+
spec.required_ruby_version = ['>= 2.7.0', '< 3.3.0']
|
|
19
19
|
|
|
20
20
|
# Prevent pushing this gem to RubyGems.org by setting 'allowed_push_host', or
|
|
21
21
|
# delete this section to allow pushing this gem to any host.
|
|
@@ -37,7 +37,6 @@ Gem::Specification.new do |spec| # rubocop:disable Metrics/BlockLength
|
|
|
37
37
|
spec.add_development_dependency 'rspec', '~> 3.7'
|
|
38
38
|
spec.add_development_dependency 'rubocop', '~> 1.26'
|
|
39
39
|
spec.add_development_dependency 'rubocop-performance', '~> 1.13'
|
|
40
|
-
spec.add_development_dependency 'rubocop-rails', '~> 2.14'
|
|
41
40
|
spec.add_development_dependency 'rubocop-rake', '~> 0.6'
|
|
42
41
|
spec.add_development_dependency 'rubocop-rspec', '~> 2.9'
|
|
43
42
|
spec.add_development_dependency 'simplecov', '~> 0.15', '< 0.18'
|
data/lib/henkei/version.rb
CHANGED
data/lib/henkei.rb
CHANGED
|
@@ -34,8 +34,8 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
|
34
34
|
|
|
35
35
|
def self.mimetype(content_type)
|
|
36
36
|
if Henkei.configuration.mime_library == 'mime/types' && defined?(MIME::Types)
|
|
37
|
-
warn '[DEPRECATION] `mime/types` is deprecated. Please use `mini_mime` instead.'\
|
|
38
|
-
'
|
|
37
|
+
warn '[DEPRECATION] `mime/types` is deprecated. Please use `mini_mime` instead. ' \
|
|
38
|
+
'Use Henkei.configure and assign "mini_mime" to `mime_library`.'
|
|
39
39
|
MIME::Types[content_type].first
|
|
40
40
|
else
|
|
41
41
|
MiniMime.lookup_by_content_type(content_type).tap do |object|
|
|
@@ -78,7 +78,7 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
|
78
78
|
if input.is_a? String
|
|
79
79
|
if File.exist? input
|
|
80
80
|
@path = input
|
|
81
|
-
elsif input
|
|
81
|
+
elsif input&.match?(URI::DEFAULT_PARSER.make_regexp)
|
|
82
82
|
@uri = URI.parse input
|
|
83
83
|
else
|
|
84
84
|
raise Errno::ENOENT, "missing file or invalid URI - #{input}"
|
|
@@ -265,7 +265,7 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
|
265
265
|
# tell Tika that we're done sending data
|
|
266
266
|
s.shutdown(Socket::SHUT_WR)
|
|
267
267
|
|
|
268
|
-
resp =
|
|
268
|
+
resp = +''
|
|
269
269
|
loop do
|
|
270
270
|
chunk = s.recv(65_536)
|
|
271
271
|
break if chunk.empty? || !chunk
|
|
@@ -300,9 +300,8 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
|
300
300
|
# Internal helper to remove erroneous output
|
|
301
301
|
#
|
|
302
302
|
def self.filter_response(response)
|
|
303
|
-
response.
|
|
304
|
-
|
|
305
|
-
''
|
|
303
|
+
response.delete_prefix(
|
|
304
|
+
"WARNING: sun.reflect.Reflection.getCallerClass is not supported. This will impact performance.\n"
|
|
306
305
|
)
|
|
307
306
|
end
|
|
308
307
|
private_class_method :filter_response
|
data/spec/henkei_spec.rb
CHANGED
|
@@ -15,13 +15,13 @@ describe Henkei do
|
|
|
15
15
|
|
|
16
16
|
describe '.read' do
|
|
17
17
|
it 'reads text' do
|
|
18
|
-
text =
|
|
18
|
+
text = described_class.read :text, data
|
|
19
19
|
|
|
20
20
|
expect(text).to include 'The quick brown fox jumped over the lazy cat.'
|
|
21
21
|
end
|
|
22
22
|
|
|
23
23
|
it 'reads metadata' do
|
|
24
|
-
metadata =
|
|
24
|
+
metadata = described_class.read :metadata, data
|
|
25
25
|
|
|
26
26
|
expect(metadata['Content-Type']).to(
|
|
27
27
|
eq 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
|
|
@@ -30,13 +30,13 @@ describe Henkei do
|
|
|
30
30
|
|
|
31
31
|
it 'reads metadata values with colons as strings' do
|
|
32
32
|
data = File.read 'spec/samples/sample-metadata-values-with-colons.doc'
|
|
33
|
-
metadata =
|
|
33
|
+
metadata = described_class.read :metadata, data
|
|
34
34
|
|
|
35
35
|
expect(metadata['dc:title']).to eq 'problem: test'
|
|
36
36
|
end
|
|
37
37
|
|
|
38
38
|
it 'reads mimetype' do
|
|
39
|
-
mimetype =
|
|
39
|
+
mimetype = described_class.read :mimetype, data
|
|
40
40
|
|
|
41
41
|
expect(mimetype.content_type).to(
|
|
42
42
|
eq 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
|
|
@@ -48,7 +48,7 @@ describe Henkei do
|
|
|
48
48
|
let(:data) { File.read 'spec/samples/pipe-error.png' }
|
|
49
49
|
|
|
50
50
|
it 'returns an empty result' do
|
|
51
|
-
text =
|
|
51
|
+
text = described_class.read :text, data
|
|
52
52
|
|
|
53
53
|
expect(text).to eq ''
|
|
54
54
|
end
|
|
@@ -57,11 +57,11 @@ describe Henkei do
|
|
|
57
57
|
|
|
58
58
|
describe '.new' do
|
|
59
59
|
it 'requires parameters' do
|
|
60
|
-
expect {
|
|
60
|
+
expect { described_class.new }.to raise_error ArgumentError
|
|
61
61
|
end
|
|
62
62
|
|
|
63
63
|
it 'accepts a root path' do
|
|
64
|
-
henkei =
|
|
64
|
+
henkei = described_class.new File.join(Henkei::GEM_PATH, 'spec/samples/sample.pages')
|
|
65
65
|
|
|
66
66
|
expect(henkei).to be_path
|
|
67
67
|
expect(henkei).not_to be_uri
|
|
@@ -69,7 +69,7 @@ describe Henkei do
|
|
|
69
69
|
end
|
|
70
70
|
|
|
71
71
|
it 'accepts a relative path' do
|
|
72
|
-
henkei =
|
|
72
|
+
henkei = described_class.new 'spec/samples/sample.pages'
|
|
73
73
|
|
|
74
74
|
expect(henkei).to be_path
|
|
75
75
|
expect(henkei).not_to be_uri
|
|
@@ -77,7 +77,7 @@ describe Henkei do
|
|
|
77
77
|
end
|
|
78
78
|
|
|
79
79
|
it 'accepts a path with spaces' do
|
|
80
|
-
henkei =
|
|
80
|
+
henkei = described_class.new 'spec/samples/sample filename with spaces.pages'
|
|
81
81
|
|
|
82
82
|
expect(henkei).to be_path
|
|
83
83
|
expect(henkei).not_to be_uri
|
|
@@ -85,7 +85,7 @@ describe Henkei do
|
|
|
85
85
|
end
|
|
86
86
|
|
|
87
87
|
it 'accepts a URI' do
|
|
88
|
-
henkei =
|
|
88
|
+
henkei = described_class.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx'
|
|
89
89
|
|
|
90
90
|
expect(henkei).to be_uri
|
|
91
91
|
expect(henkei).not_to be_path
|
|
@@ -94,7 +94,7 @@ describe Henkei do
|
|
|
94
94
|
|
|
95
95
|
it 'accepts a stream or object that can be read' do
|
|
96
96
|
File.open 'spec/samples/sample.pages', 'r' do |file|
|
|
97
|
-
henkei =
|
|
97
|
+
henkei = described_class.new file
|
|
98
98
|
|
|
99
99
|
expect(henkei).to be_stream
|
|
100
100
|
expect(henkei).not_to be_path
|
|
@@ -103,37 +103,38 @@ describe Henkei do
|
|
|
103
103
|
end
|
|
104
104
|
|
|
105
105
|
it 'refuses a path to a missing file' do
|
|
106
|
-
expect {
|
|
106
|
+
expect { described_class.new 'test/sample/missing.pages' }.to raise_error Errno::ENOENT
|
|
107
107
|
end
|
|
108
108
|
|
|
109
109
|
it 'refuses other objects' do
|
|
110
110
|
[nil, 1, 1.1].each do |object|
|
|
111
|
-
expect {
|
|
111
|
+
expect { described_class.new object }.to raise_error TypeError
|
|
112
112
|
end
|
|
113
113
|
end
|
|
114
114
|
end
|
|
115
115
|
|
|
116
116
|
describe '.creation_date' do
|
|
117
|
-
let(:henkei) {
|
|
118
|
-
|
|
117
|
+
let(:henkei) { described_class.new 'spec/samples/sample.pages' }
|
|
118
|
+
|
|
119
|
+
it 'returns Time' do
|
|
119
120
|
expect(henkei.creation_date).to be_a Time
|
|
120
121
|
end
|
|
121
122
|
end
|
|
122
123
|
|
|
123
124
|
describe '.java' do
|
|
124
125
|
specify 'with no specified JAVA_HOME' do
|
|
125
|
-
expect(
|
|
126
|
+
expect(described_class.send(:java_path)).to eq 'java'
|
|
126
127
|
end
|
|
127
128
|
|
|
128
129
|
specify 'with a specified JAVA_HOME' do
|
|
129
130
|
ENV['JAVA_HOME'] = '/path/to/java/home'
|
|
130
131
|
|
|
131
|
-
expect(
|
|
132
|
+
expect(described_class.send(:java_path)).to eq '/path/to/java/home/bin/java'
|
|
132
133
|
end
|
|
133
134
|
end
|
|
134
135
|
|
|
135
|
-
context 'initialized with a given path' do
|
|
136
|
-
let(:henkei) {
|
|
136
|
+
context 'when initialized with a given path' do
|
|
137
|
+
let(:henkei) { described_class.new 'spec/samples/sample.pages' }
|
|
137
138
|
|
|
138
139
|
specify '#text reads text' do
|
|
139
140
|
expect(henkei.text).to include 'The quick brown fox jumped over the lazy cat.'
|
|
@@ -144,7 +145,7 @@ describe Henkei do
|
|
|
144
145
|
end
|
|
145
146
|
|
|
146
147
|
context 'when passing in the `pipe-error.png` test file' do
|
|
147
|
-
let(:henkei) {
|
|
148
|
+
let(:henkei) { described_class.new 'spec/samples/pipe-error.png' }
|
|
148
149
|
|
|
149
150
|
it '#text returns an empty result' do
|
|
150
151
|
expect(henkei.text).to eq ''
|
|
@@ -161,8 +162,8 @@ describe Henkei do
|
|
|
161
162
|
end
|
|
162
163
|
end
|
|
163
164
|
|
|
164
|
-
context 'initialized with a given URI' do
|
|
165
|
-
let(:henkei) {
|
|
165
|
+
context 'when initialized with a given URI' do
|
|
166
|
+
let(:henkei) { described_class.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx' }
|
|
166
167
|
|
|
167
168
|
specify '#text reads text' do
|
|
168
169
|
expect(henkei.text).to include 'Lorem ipsum dolor sit amet, consectetuer adipiscing elit.'
|
|
@@ -175,8 +176,8 @@ describe Henkei do
|
|
|
175
176
|
end
|
|
176
177
|
end
|
|
177
178
|
|
|
178
|
-
context 'initialized with a given stream' do
|
|
179
|
-
let(:henkei) {
|
|
179
|
+
context 'when initialized with a given stream' do
|
|
180
|
+
let(:henkei) { described_class.new File.open('spec/samples/sample.pages', 'rb') }
|
|
180
181
|
|
|
181
182
|
specify '#text reads text' do
|
|
182
183
|
expect(henkei.text).to include 'The quick brown fox jumped over the lazy cat.'
|
|
@@ -188,7 +189,7 @@ describe Henkei do
|
|
|
188
189
|
end
|
|
189
190
|
|
|
190
191
|
context 'when source is a remote PDF' do
|
|
191
|
-
let(:henkei) {
|
|
192
|
+
let(:henkei) { described_class.new 'https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf' }
|
|
192
193
|
|
|
193
194
|
specify '#text reads text' do
|
|
194
195
|
expect(henkei.text).to include 'Dummy PDF file'
|
|
@@ -199,35 +200,35 @@ describe Henkei do
|
|
|
199
200
|
end
|
|
200
201
|
end
|
|
201
202
|
|
|
202
|
-
context 'working as server mode' do
|
|
203
|
+
context 'when working as server mode' do
|
|
203
204
|
specify '#starts and kills server' do
|
|
204
|
-
|
|
205
|
-
expect(
|
|
206
|
-
expect(
|
|
205
|
+
described_class.server(:text)
|
|
206
|
+
expect(described_class.class_variable_get(:@@server_pid)).not_to be_nil
|
|
207
|
+
expect(described_class.class_variable_get(:@@server_port)).not_to be_nil
|
|
207
208
|
|
|
208
|
-
s = TCPSocket.new('localhost',
|
|
209
|
+
s = TCPSocket.new('localhost', described_class.class_variable_get(:@@server_port))
|
|
209
210
|
expect(s).to be_a TCPSocket
|
|
210
211
|
s.close
|
|
211
212
|
ensure
|
|
212
|
-
port =
|
|
213
|
-
|
|
213
|
+
port = described_class.class_variable_get(:@@server_port)
|
|
214
|
+
described_class.kill_server!
|
|
214
215
|
sleep 2
|
|
215
216
|
expect { TCPSocket.new('localhost', port) }.to raise_error Errno::ECONNREFUSED
|
|
216
217
|
end
|
|
217
218
|
|
|
218
219
|
specify '#runs samples through server mode' do
|
|
219
|
-
|
|
220
|
-
expect(
|
|
220
|
+
described_class.server(:text)
|
|
221
|
+
expect(described_class.new('spec/samples/sample.pages').text).to(
|
|
221
222
|
include 'The quick brown fox jumped over the lazy cat.'
|
|
222
223
|
)
|
|
223
|
-
expect(
|
|
224
|
+
expect(described_class.new('spec/samples/sample filename with spaces.pages').text).to(
|
|
224
225
|
include 'The quick brown fox jumped over the lazy cat.'
|
|
225
226
|
)
|
|
226
|
-
expect(
|
|
227
|
+
expect(described_class.new('spec/samples/sample.docx').text).to(
|
|
227
228
|
include 'The quick brown fox jumped over the lazy cat.'
|
|
228
229
|
)
|
|
229
230
|
ensure
|
|
230
|
-
|
|
231
|
+
described_class.kill_server!
|
|
231
232
|
end
|
|
232
233
|
end
|
|
233
234
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: henkei
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.28.3.
|
|
4
|
+
version: 1.28.3.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Erol Fornoles
|
|
@@ -9,7 +9,7 @@ authors:
|
|
|
9
9
|
autorequire:
|
|
10
10
|
bindir: bin
|
|
11
11
|
cert_chain: []
|
|
12
|
-
date:
|
|
12
|
+
date: 2023-01-22 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
|
14
14
|
- !ruby/object:Gem::Dependency
|
|
15
15
|
name: json
|
|
@@ -135,20 +135,6 @@ dependencies:
|
|
|
135
135
|
- - "~>"
|
|
136
136
|
- !ruby/object:Gem::Version
|
|
137
137
|
version: '1.13'
|
|
138
|
-
- !ruby/object:Gem::Dependency
|
|
139
|
-
name: rubocop-rails
|
|
140
|
-
requirement: !ruby/object:Gem::Requirement
|
|
141
|
-
requirements:
|
|
142
|
-
- - "~>"
|
|
143
|
-
- !ruby/object:Gem::Version
|
|
144
|
-
version: '2.14'
|
|
145
|
-
type: :development
|
|
146
|
-
prerelease: false
|
|
147
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
148
|
-
requirements:
|
|
149
|
-
- - "~>"
|
|
150
|
-
- !ruby/object:Gem::Version
|
|
151
|
-
version: '2.14'
|
|
152
138
|
- !ruby/object:Gem::Dependency
|
|
153
139
|
name: rubocop-rake
|
|
154
140
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -244,17 +230,17 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
244
230
|
requirements:
|
|
245
231
|
- - ">="
|
|
246
232
|
- !ruby/object:Gem::Version
|
|
247
|
-
version: 2.
|
|
233
|
+
version: 2.7.0
|
|
248
234
|
- - "<"
|
|
249
235
|
- !ruby/object:Gem::Version
|
|
250
|
-
version: 3.
|
|
236
|
+
version: 3.3.0
|
|
251
237
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
252
238
|
requirements:
|
|
253
239
|
- - ">="
|
|
254
240
|
- !ruby/object:Gem::Version
|
|
255
241
|
version: '0'
|
|
256
242
|
requirements: []
|
|
257
|
-
rubygems_version: 3.
|
|
243
|
+
rubygems_version: 3.4.1
|
|
258
244
|
signing_key:
|
|
259
245
|
specification_version: 4
|
|
260
246
|
summary: Read text and metadata from files and documents (.doc, .docx, .pages, .odt,
|