henkei 1.27.1 → 1.28.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/test.yml +37 -0
- data/.rubocop.yml +1 -0
- data/README.md +1 -1
- data/henkei.gemspec +16 -6
- data/jar/{tika-app-1.27.jar → tika-app-1.28.jar} +0 -0
- data/jar/tika-config.xml +1 -0
- data/lib/henkei/version.rb +1 -1
- data/lib/henkei.rb +15 -6
- data/spec/henkei_spec.rb +24 -28
- metadata +79 -22
- data/.travis.yml +0 -32
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9e77d196ca581d7e2d12b1045710115fd9ab6ef903b4a8142d473591956f526f
|
4
|
+
data.tar.gz: fb66905068f383d12a104c128b0ae7458964ab89e3076f497c6b5d0855d2c532
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ab2f691265c8b721608da07c47cf185838e969a60d631e0bd2a8a50b201caaa71743c0b30d2252f1fd08e0d7f0ef3a2e6cbfedeeeba74c6c869bf4a0ac0292dd
|
7
|
+
data.tar.gz: ac5bb11ebe786907c207ff1fe8115486deba3f361f42b179104cf119a714c6d778746f412a670df96e9ef7a6cf897fc15f1ddea1690d33a8646a42dcb0d375a2
|
@@ -0,0 +1,37 @@
|
|
1
|
+
name: Test Henkei Ruby gem
|
2
|
+
|
3
|
+
on:
|
4
|
+
push:
|
5
|
+
branches: [1.x]
|
6
|
+
pull_request:
|
7
|
+
branches: [1.x]
|
8
|
+
|
9
|
+
env:
|
10
|
+
CI: true
|
11
|
+
|
12
|
+
jobs:
|
13
|
+
test:
|
14
|
+
runs-on: ubuntu-latest
|
15
|
+
strategy:
|
16
|
+
matrix:
|
17
|
+
ruby-version: ['2.6', '2.7', '3.0', '3.1']
|
18
|
+
|
19
|
+
steps:
|
20
|
+
- uses: actions/checkout@v2
|
21
|
+
|
22
|
+
- name: Set up Ruby
|
23
|
+
uses: ruby/setup-ruby@v1
|
24
|
+
with:
|
25
|
+
ruby-version: ${{ matrix.ruby-version }}
|
26
|
+
bundler-cache: true
|
27
|
+
|
28
|
+
- name: Lint code - Rubocop
|
29
|
+
run: bundle exec rubocop
|
30
|
+
|
31
|
+
- name: Run tests
|
32
|
+
run: bundle exec rspec
|
33
|
+
|
34
|
+
- name: Test & publish code coverage
|
35
|
+
uses: paambaati/codeclimate-action@v3.0.0
|
36
|
+
env:
|
37
|
+
CC_TEST_REPORTER_ID: bb96c1ff9dc66724c38fb4eb54486dd72dc88a7fd6e727c034b9cf8d747d069e
|
data/.rubocop.yml
CHANGED
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
[![
|
1
|
+
[![Github Build Status](https://github.com/abrom/henkei/actions/workflows/test.yml/badge.svg)](https://github.com/abrom/henkei/actions/workflows/test.yml)
|
2
2
|
[![Maintainability](https://api.codeclimate.com/v1/badges/d06e8c917cf7d8c07234/maintainability)](https://codeclimate.com/github/abrom/henkei/maintainability)
|
3
3
|
[![Test Coverage](https://api.codeclimate.com/v1/badges/d06e8c917cf7d8c07234/test_coverage)](https://codeclimate.com/github/abrom/henkei/test_coverage)
|
4
4
|
[![Gem Version](http://img.shields.io/gem/v/henkei.svg?style=flat)](#)
|
data/henkei.gemspec
CHANGED
@@ -5,7 +5,7 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
5
5
|
|
6
6
|
require 'henkei/version'
|
7
7
|
|
8
|
-
Gem::Specification.new do |spec|
|
8
|
+
Gem::Specification.new do |spec| # rubocop:disable Metrics/BlockLength
|
9
9
|
spec.name = 'henkei'
|
10
10
|
spec.version = Henkei::VERSION
|
11
11
|
spec.authors = ['Erol Fornoles', 'Andrew Bromwich']
|
@@ -13,13 +13,19 @@ Gem::Specification.new do |spec|
|
|
13
13
|
spec.description = 'Read text and metadata from files and documents using Apache Tika toolkit'
|
14
14
|
spec.summary = 'Read text and metadata from files and documents ' \
|
15
15
|
'(.doc, .docx, .pages, .odt, .rtf, .pdf) using Apache Tika toolkit'
|
16
|
-
spec.homepage = '
|
16
|
+
spec.homepage = 'https://github.com/abrom/henkei'
|
17
17
|
spec.license = 'MIT'
|
18
|
-
spec.required_ruby_version = ['>= 2.
|
18
|
+
spec.required_ruby_version = ['>= 2.6.0', '< 3.2.0']
|
19
|
+
|
20
|
+
# Prevent pushing this gem to RubyGems.org by setting 'allowed_push_host', or
|
21
|
+
# delete this section to allow pushing this gem to any host.
|
22
|
+
raise 'RubyGems 2.0 or newer is required to protect against public gem pushes.' unless spec.respond_to?(:metadata)
|
23
|
+
|
24
|
+
spec.metadata['allowed_push_host'] = 'https://rubygems.org'
|
25
|
+
spec.metadata['rubygems_mfa_required'] = 'true'
|
19
26
|
|
20
27
|
spec.files = `git ls-files`.split("\n")
|
21
28
|
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
22
|
-
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
23
29
|
spec.require_paths = ['lib']
|
24
30
|
|
25
31
|
spec.add_runtime_dependency 'json', '>= 1.8', '< 3'
|
@@ -29,6 +35,10 @@ Gem::Specification.new do |spec|
|
|
29
35
|
spec.add_development_dependency 'rails', '~> 5.0'
|
30
36
|
spec.add_development_dependency 'rake', '~> 12.3'
|
31
37
|
spec.add_development_dependency 'rspec', '~> 3.7'
|
32
|
-
spec.add_development_dependency 'rubocop', '~>
|
33
|
-
spec.add_development_dependency 'simplecov', '~> 0.15'
|
38
|
+
spec.add_development_dependency 'rubocop', '~> 1.26'
|
39
|
+
spec.add_development_dependency 'rubocop-performance', '~> 1.13'
|
40
|
+
spec.add_development_dependency 'rubocop-rails', '~> 2.14'
|
41
|
+
spec.add_development_dependency 'rubocop-rake', '~> 0.6'
|
42
|
+
spec.add_development_dependency 'rubocop-rspec', '~> 2.9'
|
43
|
+
spec.add_development_dependency 'simplecov', '~> 0.15', '< 0.18'
|
34
44
|
end
|
Binary file
|
data/jar/tika-config.xml
CHANGED
data/lib/henkei/version.rb
CHANGED
data/lib/henkei.rb
CHANGED
@@ -25,7 +25,7 @@ require 'open3'
|
|
25
25
|
# Read text and metadata from files and documents using Apache Tika toolkit
|
26
26
|
class Henkei # rubocop:disable Metrics/ClassLength
|
27
27
|
GEM_PATH = File.dirname(File.dirname(__FILE__))
|
28
|
-
JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-1.27.jar')
|
28
|
+
JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-1.28.jar')
|
29
29
|
CONFIG_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-config.xml')
|
30
30
|
DEFAULT_SERVER_PORT = 9293 # an arbitrary, but perfectly cromulent, port
|
31
31
|
|
@@ -35,7 +35,7 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
35
35
|
def self.mimetype(content_type)
|
36
36
|
if Henkei.configuration.mime_library == 'mime/types' && defined?(MIME::Types)
|
37
37
|
warn '[DEPRECATION] `mime/types` is deprecated. Please use `mini_mime` instead.'\
|
38
|
-
|
38
|
+
' Use Henkei.configure and assign "mini_mime" to `mime_library`.'
|
39
39
|
MIME::Types[content_type].first
|
40
40
|
else
|
41
41
|
MiniMime.lookup_by_content_type(content_type).tap do |object|
|
@@ -54,8 +54,7 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
54
54
|
result = @@server_pid ? server_read(data) : client_read(type, data)
|
55
55
|
|
56
56
|
case type
|
57
|
-
when :text then result
|
58
|
-
when :html then result
|
57
|
+
when :text, :html then result
|
59
58
|
when :metadata then JSON.parse(result)
|
60
59
|
when :mimetype then Henkei.mimetype(JSON.parse(result)['Content-Type'])
|
61
60
|
end
|
@@ -246,7 +245,7 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
246
245
|
# Internal helper for calling to Tika library directly
|
247
246
|
#
|
248
247
|
def self.client_read(type, data)
|
249
|
-
Open3.capture2(*tika_command(type), stdin_data: data, binmode: true).first
|
248
|
+
filter_response Open3.capture2(*tika_command(type), stdin_data: data, binmode: true).first
|
250
249
|
end
|
251
250
|
private_class_method :client_read
|
252
251
|
|
@@ -273,7 +272,7 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
273
272
|
|
274
273
|
resp << chunk
|
275
274
|
end
|
276
|
-
resp
|
275
|
+
filter_response resp
|
277
276
|
end
|
278
277
|
private_class_method :server_read
|
279
278
|
|
@@ -297,4 +296,14 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
297
296
|
}[type]
|
298
297
|
end
|
299
298
|
private_class_method :switch_for_type
|
299
|
+
|
300
|
+
# Internal helper to remove erroneous output
|
301
|
+
#
|
302
|
+
def self.filter_response(response)
|
303
|
+
response.gsub(
|
304
|
+
/\AWARNING: sun\.reflect\.Reflection\.getCallerClass is not supported\. This will impact performance\.\n/,
|
305
|
+
''
|
306
|
+
)
|
307
|
+
end
|
308
|
+
private_class_method :filter_response
|
300
309
|
end
|
data/spec/henkei_spec.rb
CHANGED
@@ -201,37 +201,33 @@ describe Henkei do
|
|
201
201
|
|
202
202
|
context 'working as server mode' do
|
203
203
|
specify '#starts and kills server' do
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
expect { TCPSocket.new('localhost', port) }.to raise_error Errno::ECONNREFUSED
|
217
|
-
end
|
204
|
+
Henkei.server(:text)
|
205
|
+
expect(Henkei.class_variable_get(:@@server_pid)).not_to be_nil
|
206
|
+
expect(Henkei.class_variable_get(:@@server_port)).not_to be_nil
|
207
|
+
|
208
|
+
s = TCPSocket.new('localhost', Henkei.class_variable_get(:@@server_port))
|
209
|
+
expect(s).to be_a TCPSocket
|
210
|
+
s.close
|
211
|
+
ensure
|
212
|
+
port = Henkei.class_variable_get(:@@server_port)
|
213
|
+
Henkei.kill_server!
|
214
|
+
sleep 2
|
215
|
+
expect { TCPSocket.new('localhost', port) }.to raise_error Errno::ECONNREFUSED
|
218
216
|
end
|
219
217
|
|
220
218
|
specify '#runs samples through server mode' do
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
Henkei.kill_server!
|
234
|
-
end
|
219
|
+
Henkei.server(:text)
|
220
|
+
expect(Henkei.new('spec/samples/sample.pages').text).to(
|
221
|
+
include 'The quick brown fox jumped over the lazy cat.'
|
222
|
+
)
|
223
|
+
expect(Henkei.new('spec/samples/sample filename with spaces.pages').text).to(
|
224
|
+
include 'The quick brown fox jumped over the lazy cat.'
|
225
|
+
)
|
226
|
+
expect(Henkei.new('spec/samples/sample.docx').text).to(
|
227
|
+
include 'The quick brown fox jumped over the lazy cat.'
|
228
|
+
)
|
229
|
+
ensure
|
230
|
+
Henkei.kill_server!
|
235
231
|
end
|
236
232
|
end
|
237
233
|
end
|
metadata
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: henkei
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.27.1
|
4
|
+
version: 1.28.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Erol Fornoles
|
8
8
|
- Andrew Bromwich
|
9
|
-
autorequire:
|
9
|
+
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2022-05-28 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: json
|
@@ -113,14 +113,70 @@ dependencies:
|
|
113
113
|
requirements:
|
114
114
|
- - "~>"
|
115
115
|
- !ruby/object:Gem::Version
|
116
|
-
version: '
|
116
|
+
version: '1.26'
|
117
117
|
type: :development
|
118
118
|
prerelease: false
|
119
119
|
version_requirements: !ruby/object:Gem::Requirement
|
120
120
|
requirements:
|
121
121
|
- - "~>"
|
122
122
|
- !ruby/object:Gem::Version
|
123
|
-
version: '
|
123
|
+
version: '1.26'
|
124
|
+
- !ruby/object:Gem::Dependency
|
125
|
+
name: rubocop-performance
|
126
|
+
requirement: !ruby/object:Gem::Requirement
|
127
|
+
requirements:
|
128
|
+
- - "~>"
|
129
|
+
- !ruby/object:Gem::Version
|
130
|
+
version: '1.13'
|
131
|
+
type: :development
|
132
|
+
prerelease: false
|
133
|
+
version_requirements: !ruby/object:Gem::Requirement
|
134
|
+
requirements:
|
135
|
+
- - "~>"
|
136
|
+
- !ruby/object:Gem::Version
|
137
|
+
version: '1.13'
|
138
|
+
- !ruby/object:Gem::Dependency
|
139
|
+
name: rubocop-rails
|
140
|
+
requirement: !ruby/object:Gem::Requirement
|
141
|
+
requirements:
|
142
|
+
- - "~>"
|
143
|
+
- !ruby/object:Gem::Version
|
144
|
+
version: '2.14'
|
145
|
+
type: :development
|
146
|
+
prerelease: false
|
147
|
+
version_requirements: !ruby/object:Gem::Requirement
|
148
|
+
requirements:
|
149
|
+
- - "~>"
|
150
|
+
- !ruby/object:Gem::Version
|
151
|
+
version: '2.14'
|
152
|
+
- !ruby/object:Gem::Dependency
|
153
|
+
name: rubocop-rake
|
154
|
+
requirement: !ruby/object:Gem::Requirement
|
155
|
+
requirements:
|
156
|
+
- - "~>"
|
157
|
+
- !ruby/object:Gem::Version
|
158
|
+
version: '0.6'
|
159
|
+
type: :development
|
160
|
+
prerelease: false
|
161
|
+
version_requirements: !ruby/object:Gem::Requirement
|
162
|
+
requirements:
|
163
|
+
- - "~>"
|
164
|
+
- !ruby/object:Gem::Version
|
165
|
+
version: '0.6'
|
166
|
+
- !ruby/object:Gem::Dependency
|
167
|
+
name: rubocop-rspec
|
168
|
+
requirement: !ruby/object:Gem::Requirement
|
169
|
+
requirements:
|
170
|
+
- - "~>"
|
171
|
+
- !ruby/object:Gem::Version
|
172
|
+
version: '2.9'
|
173
|
+
type: :development
|
174
|
+
prerelease: false
|
175
|
+
version_requirements: !ruby/object:Gem::Requirement
|
176
|
+
requirements:
|
177
|
+
- - "~>"
|
178
|
+
- !ruby/object:Gem::Version
|
179
|
+
version: '2.9'
|
124
180
|
- !ruby/object:Gem::Dependency
|
125
181
|
name: simplecov
|
126
182
|
requirement: !ruby/object:Gem::Requirement
|
@@ -128,6 +184,9 @@ dependencies:
|
|
128
184
|
- - "~>"
|
129
185
|
- !ruby/object:Gem::Version
|
130
186
|
version: '0.15'
|
187
|
+
- - "<"
|
188
|
+
- !ruby/object:Gem::Version
|
189
|
+
version: '0.18'
|
131
190
|
type: :development
|
132
191
|
prerelease: false
|
133
192
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -135,6 +194,9 @@ dependencies:
|
|
135
194
|
- - "~>"
|
136
195
|
- !ruby/object:Gem::Version
|
137
196
|
version: '0.15'
|
197
|
+
- - "<"
|
198
|
+
- !ruby/object:Gem::Version
|
199
|
+
version: '0.18'
|
138
200
|
description: Read text and metadata from files and documents using Apache Tika toolkit
|
139
201
|
email:
|
140
202
|
- erol.fornoles@gmail.com
|
@@ -144,10 +206,10 @@ executables:
|
|
144
206
|
extensions: []
|
145
207
|
extra_rdoc_files: []
|
146
208
|
files:
|
209
|
+
- ".github/workflows/test.yml"
|
147
210
|
- ".gitignore"
|
148
211
|
- ".rspec"
|
149
212
|
- ".rubocop.yml"
|
150
|
-
- ".travis.yml"
|
151
213
|
- Gemfile
|
152
214
|
- LICENSE
|
153
215
|
- NOTICE.txt
|
@@ -155,7 +217,7 @@ files:
|
|
155
217
|
- Rakefile
|
156
218
|
- bin/console
|
157
219
|
- henkei.gemspec
|
158
|
-
- jar/tika-app-1.27.jar
|
220
|
+
- jar/tika-app-1.28.jar
|
159
221
|
- jar/tika-config.xml
|
160
222
|
- lib/henkei.rb
|
161
223
|
- lib/henkei/configuration.rb
|
@@ -168,11 +230,13 @@ files:
|
|
168
230
|
- spec/samples/sample-metadata-values-with-colons.doc
|
169
231
|
- spec/samples/sample.docx
|
170
232
|
- spec/samples/sample.pages
|
171
|
-
homepage:
|
233
|
+
homepage: https://github.com/abrom/henkei
|
172
234
|
licenses:
|
173
235
|
- MIT
|
174
|
-
metadata:
|
175
|
-
|
236
|
+
metadata:
|
237
|
+
allowed_push_host: https://rubygems.org
|
238
|
+
rubygems_mfa_required: 'true'
|
239
|
+
post_install_message:
|
176
240
|
rdoc_options: []
|
177
241
|
require_paths:
|
178
242
|
- lib
|
@@ -180,26 +244,19 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
180
244
|
requirements:
|
181
245
|
- - ">="
|
182
246
|
- !ruby/object:Gem::Version
|
183
|
-
version: 2.
|
247
|
+
version: 2.6.0
|
184
248
|
- - "<"
|
185
249
|
- !ruby/object:Gem::Version
|
186
|
-
version: 3.
|
250
|
+
version: 3.2.0
|
187
251
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
188
252
|
requirements:
|
189
253
|
- - ">="
|
190
254
|
- !ruby/object:Gem::Version
|
191
255
|
version: '0'
|
192
256
|
requirements: []
|
193
|
-
rubygems_version: 3.
|
194
|
-
signing_key:
|
257
|
+
rubygems_version: 3.2.3
|
258
|
+
signing_key:
|
195
259
|
specification_version: 4
|
196
260
|
summary: Read text and metadata from files and documents (.doc, .docx, .pages, .odt,
|
197
261
|
.rtf, .pdf) using Apache Tika toolkit
|
198
|
-
test_files:
|
199
|
-
- spec/helper.rb
|
200
|
-
- spec/henkei_spec.rb
|
201
|
-
- spec/samples/pipe-error.png
|
202
|
-
- spec/samples/sample filename with spaces.pages
|
203
|
-
- spec/samples/sample-metadata-values-with-colons.doc
|
204
|
-
- spec/samples/sample.docx
|
205
|
-
- spec/samples/sample.pages
|
262
|
+
test_files: []
|
data/.travis.yml
DELETED
@@ -1,32 +0,0 @@
|
|
1
|
-
env:
|
2
|
-
global:
|
3
|
-
- CC_TEST_REPORTER_ID=bb96c1ff9dc66724c38fb4eb54486dd72dc88a7fd6e727c034b9cf8d747d069e
|
4
|
-
jobs:
|
5
|
-
- INCLUDE_RAILS=false
|
6
|
-
- INCLUDE_RAILS=true
|
7
|
-
|
8
|
-
language: ruby
|
9
|
-
rvm:
|
10
|
-
- 2.5
|
11
|
-
- 2.6
|
12
|
-
- 2.7
|
13
|
-
- 3.0
|
14
|
-
|
15
|
-
before_install:
|
16
|
-
- gem update bundler
|
17
|
-
|
18
|
-
install:
|
19
|
-
- bundle install --jobs=3 --retry=3
|
20
|
-
- gem install rubocop
|
21
|
-
|
22
|
-
before_script:
|
23
|
-
- curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter
|
24
|
-
- chmod +x ./cc-test-reporter
|
25
|
-
- ./cc-test-reporter before-build
|
26
|
-
|
27
|
-
script:
|
28
|
-
- bundle exec rubocop
|
29
|
-
- bundle exec rspec
|
30
|
-
|
31
|
-
after_script:
|
32
|
-
- ./cc-test-reporter after-build --exit-code $TRAVIS_TEST_RESULT
|