henkei 1.22.0 → 1.23.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +3 -3
- data/bin/console +8 -0
- data/jar/{tika-app-1.22.jar → tika-app-1.23.jar} +0 -0
- data/lib/henkei.rb +4 -6
- data/lib/henkei/version.rb +1 -1
- data/spec/henkei_spec.rb +27 -0
- data/spec/samples/pipe-error.png +0 -0
- metadata +9 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f9ebc4be4691020c72328134a33a9ffe6b4fb79a939ddc9ce833c56551abb86d
|
4
|
+
data.tar.gz: 17402ba43e9840b59090a82f1cd39e79e594ca3d36c763b958f9243174990f8e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 74dcf4d6f2ce5f99b77b3c1fdd34a271220c58e8aae167b40cde35eef2166570d3c4de7b94f91d98158fe3cc384ec7a7688cf98812e378607e31f8d24e06420f
|
7
|
+
data.tar.gz: aa210ee582d56592932684216eb93cd3f91ea7ba95e3b1d4bc672ed09bdc9605008e7f6f21472ca291c574cb1050b18eb6bdb47aa5caa03341ae393cbb0b9939
|
data/.rubocop.yml
CHANGED
data/bin/console
ADDED
Binary file
|
data/lib/henkei.rb
CHANGED
@@ -11,10 +11,12 @@ require 'json'
|
|
11
11
|
require 'socket'
|
12
12
|
require 'stringio'
|
13
13
|
|
14
|
+
require 'open3'
|
15
|
+
|
14
16
|
# Read text and metadata from files and documents using Apache Tika toolkit
|
15
17
|
class Henkei # rubocop:disable Metrics/ClassLength
|
16
18
|
GEM_PATH = File.dirname(File.dirname(__FILE__))
|
17
|
-
JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-1.22.jar')
|
19
|
+
JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-1.23.jar')
|
18
20
|
CONFIG_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-config.xml')
|
19
21
|
DEFAULT_SERVER_PORT = 9293 # an arbitrary, but perfectly cromulent, port
|
20
22
|
|
@@ -224,11 +226,7 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
224
226
|
# Internal helper for calling to Tika library directly
|
225
227
|
#
|
226
228
|
def self.client_read(type, data)
|
227
|
-
|
228
|
-
io.write data
|
229
|
-
io.close_write
|
230
|
-
io.read
|
231
|
-
end
|
229
|
+
Open3.capture2(tika_command(type), stdin_data: data).first
|
232
230
|
end
|
233
231
|
private_class_method :client_read
|
234
232
|
|
data/lib/henkei/version.rb
CHANGED
data/spec/henkei_spec.rb
CHANGED
@@ -40,6 +40,16 @@ describe Henkei do
|
|
40
40
|
)
|
41
41
|
expect(mimetype.extensions).to include 'docx'
|
42
42
|
end
|
43
|
+
|
44
|
+
context 'when passing in the `pipe-error.png` test file' do
|
45
|
+
let(:data) { File.read 'spec/samples/pipe-error.png' }
|
46
|
+
|
47
|
+
it 'returns an empty result' do
|
48
|
+
text = Henkei.read :text, data
|
49
|
+
|
50
|
+
expect(text).to eq ''
|
51
|
+
end
|
52
|
+
end
|
43
53
|
end
|
44
54
|
|
45
55
|
describe '.new' do
|
@@ -129,6 +139,23 @@ describe Henkei do
|
|
129
139
|
specify '#metadata reads metadata' do
|
130
140
|
expect(henkei.metadata['Content-Type']).to eq %w[application/vnd.apple.pages application/vnd.apple.pages]
|
131
141
|
end
|
142
|
+
|
143
|
+
context 'when passing in the `pipe-error.png` test file' do
|
144
|
+
let(:henkei) { Henkei.new 'spec/samples/pipe-error.png' }
|
145
|
+
|
146
|
+
it '#text returns an empty result' do
|
147
|
+
expect(henkei.text).to eq ''
|
148
|
+
end
|
149
|
+
|
150
|
+
it '#html returns an empty body' do
|
151
|
+
expect(henkei.html).to include '<body/>'
|
152
|
+
expect(henkei.html).to include '<meta name="tiff:ImageWidth" content="792"/>'
|
153
|
+
end
|
154
|
+
|
155
|
+
it '#mimetype returns an empty result' do
|
156
|
+
expect(henkei.mimetype.content_type).to eq 'image/png'
|
157
|
+
end
|
158
|
+
end
|
132
159
|
end
|
133
160
|
|
134
161
|
context 'initialized with a given URI' do
|
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: henkei
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.22.0
|
4
|
+
version: 1.23.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Erol Fornoles
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2019-
|
12
|
+
date: 2019-12-26 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: json
|
@@ -125,7 +125,8 @@ description: Read text and metadata from files and documents using Apache Tika t
|
|
125
125
|
email:
|
126
126
|
- erol.fornoles@gmail.com
|
127
127
|
- a.bromwich@gmail.com
|
128
|
-
executables:
|
128
|
+
executables:
|
129
|
+
- console
|
129
130
|
extensions: []
|
130
131
|
extra_rdoc_files: []
|
131
132
|
files:
|
@@ -138,14 +139,16 @@ files:
|
|
138
139
|
- NOTICE.txt
|
139
140
|
- README.md
|
140
141
|
- Rakefile
|
142
|
+
- bin/console
|
141
143
|
- henkei.gemspec
|
142
|
-
- jar/tika-app-1.22.jar
|
144
|
+
- jar/tika-app-1.23.jar
|
143
145
|
- jar/tika-config.xml
|
144
146
|
- lib/henkei.rb
|
145
147
|
- lib/henkei/version.rb
|
146
148
|
- lib/henkei/yomu.rb
|
147
149
|
- spec/helper.rb
|
148
150
|
- spec/henkei_spec.rb
|
151
|
+
- spec/samples/pipe-error.png
|
149
152
|
- spec/samples/sample filename with spaces.pages
|
150
153
|
- spec/samples/sample-metadata-values-with-colons.doc
|
151
154
|
- spec/samples/sample.docx
|
@@ -169,8 +172,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
169
172
|
- !ruby/object:Gem::Version
|
170
173
|
version: '0'
|
171
174
|
requirements: []
|
172
|
-
|
173
|
-
rubygems_version: 2.7.6.2
|
175
|
+
rubygems_version: 3.0.6
|
174
176
|
signing_key:
|
175
177
|
specification_version: 4
|
176
178
|
summary: Read text and metadata from files and documents (.doc, .docx, .pages, .odt,
|
@@ -178,6 +180,7 @@ summary: Read text and metadata from files and documents (.doc, .docx, .pages, .
|
|
178
180
|
test_files:
|
179
181
|
- spec/helper.rb
|
180
182
|
- spec/henkei_spec.rb
|
183
|
+
- spec/samples/pipe-error.png
|
181
184
|
- spec/samples/sample filename with spaces.pages
|
182
185
|
- spec/samples/sample-metadata-values-with-colons.doc
|
183
186
|
- spec/samples/sample.docx
|