henkei 2.3.0.1 → 2.4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/test.yml +37 -0
- data/README.md +1 -1
- data/henkei.gemspec +15 -5
- data/jar/{tika-app-2.3.0.jar → tika-app-2.4.0.jar} +0 -0
- data/lib/henkei/version.rb +1 -1
- data/lib/henkei.rb +3 -4
- data/spec/henkei_spec.rb +8 -8
- metadata +74 -17
- data/.travis.yml +0 -32
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 52de171e9cba852d1551459674a12adfca8fd6166cb5e5707f3bc6a7cec9415c
|
4
|
+
data.tar.gz: '08807feea85b577c37153c290331c8f9c4441c2eef6c2600d630948a27b9ba5e'
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6481f5588edeb5cf7e806cd9326636d14e936bee95064de49017614e999f295f9683b3cbe6346dbb4d0611753288d7b628435514fe13ff0b039723de55262db1
|
7
|
+
data.tar.gz: 27a33e20e068708563324db99798abf56a81b25386899e41cc4b5e15097df11e7bc61f69b1705a5a0d537ac3488c608e28c6337e6bc8d4fe6af3b2aba6e19416
|
@@ -0,0 +1,37 @@
|
|
1
|
+
name: Test Henkei Ruby gem
|
2
|
+
|
3
|
+
on:
|
4
|
+
push:
|
5
|
+
branches: [main]
|
6
|
+
pull_request:
|
7
|
+
branches: [main]
|
8
|
+
|
9
|
+
env:
|
10
|
+
CI: true
|
11
|
+
|
12
|
+
jobs:
|
13
|
+
test:
|
14
|
+
runs-on: ubuntu-latest
|
15
|
+
strategy:
|
16
|
+
matrix:
|
17
|
+
ruby-version: ['2.6', '2.7', '3.0', '3.1']
|
18
|
+
|
19
|
+
steps:
|
20
|
+
- uses: actions/checkout@v2
|
21
|
+
|
22
|
+
- name: Set up Ruby
|
23
|
+
uses: ruby/setup-ruby@v1
|
24
|
+
with:
|
25
|
+
ruby-version: ${{ matrix.ruby-version }}
|
26
|
+
bundler-cache: true
|
27
|
+
|
28
|
+
- name: Lint code - Rubocop
|
29
|
+
run: bundle exec rubocop
|
30
|
+
|
31
|
+
- name: Run tests
|
32
|
+
run: bundle exec rspec
|
33
|
+
|
34
|
+
- name: Test & publish code coverage
|
35
|
+
uses: paambaati/codeclimate-action@v3.0.0
|
36
|
+
env:
|
37
|
+
CC_TEST_REPORTER_ID: bb96c1ff9dc66724c38fb4eb54486dd72dc88a7fd6e727c034b9cf8d747d069e
|
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
[](https://github.com/abrom/henkei/actions/workflows/test.yml)
|
2
2
|
[](https://codeclimate.com/github/abrom/henkei/maintainability)
|
3
3
|
[](https://codeclimate.com/github/abrom/henkei/test_coverage)
|
4
4
|
[](#)
|
data/henkei.gemspec
CHANGED
@@ -5,7 +5,7 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
5
5
|
|
6
6
|
require 'henkei/version'
|
7
7
|
|
8
|
-
Gem::Specification.new do |spec|
|
8
|
+
Gem::Specification.new do |spec| # rubocop:disable Metrics/BlockLength
|
9
9
|
spec.name = 'henkei'
|
10
10
|
spec.version = Henkei::VERSION
|
11
11
|
spec.authors = ['Erol Fornoles', 'Andrew Bromwich']
|
@@ -13,13 +13,19 @@ Gem::Specification.new do |spec|
|
|
13
13
|
spec.description = 'Read text and metadata from files and documents using Apache Tika toolkit'
|
14
14
|
spec.summary = 'Read text and metadata from files and documents ' \
|
15
15
|
'(.doc, .docx, .pages, .odt, .rtf, .pdf) using Apache Tika toolkit'
|
16
|
-
spec.homepage = '
|
16
|
+
spec.homepage = 'https://github.com/abrom/henkei'
|
17
17
|
spec.license = 'MIT'
|
18
18
|
spec.required_ruby_version = ['>= 2.6.0', '< 3.2.0']
|
19
19
|
|
20
|
+
# Prevent pushing this gem to RubyGems.org by setting 'allowed_push_host', or
|
21
|
+
# delete this section to allow pushing this gem to any host.
|
22
|
+
raise 'RubyGems 2.0 or newer is required to protect against public gem pushes.' unless spec.respond_to?(:metadata)
|
23
|
+
|
24
|
+
spec.metadata['allowed_push_host'] = 'https://rubygems.org'
|
25
|
+
spec.metadata['rubygems_mfa_required'] = 'true'
|
26
|
+
|
20
27
|
spec.files = `git ls-files`.split("\n")
|
21
28
|
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
22
|
-
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
23
29
|
spec.require_paths = ['lib']
|
24
30
|
|
25
31
|
spec.add_runtime_dependency 'json', '>= 1.8', '< 3'
|
@@ -30,6 +36,10 @@ Gem::Specification.new do |spec|
|
|
30
36
|
spec.add_development_dependency 'rails', '~> 5.0'
|
31
37
|
spec.add_development_dependency 'rake', '~> 12.3'
|
32
38
|
spec.add_development_dependency 'rspec', '~> 3.7'
|
33
|
-
spec.add_development_dependency 'rubocop', '~>
|
34
|
-
spec.add_development_dependency '
|
39
|
+
spec.add_development_dependency 'rubocop', '~> 1.26'
|
40
|
+
spec.add_development_dependency 'rubocop-performance', '~> 1.13'
|
41
|
+
spec.add_development_dependency 'rubocop-rails', '~> 2.14'
|
42
|
+
spec.add_development_dependency 'rubocop-rake', '~> 0.6'
|
43
|
+
spec.add_development_dependency 'rubocop-rspec', '~> 2.9'
|
44
|
+
spec.add_development_dependency 'simplecov', '~> 0.15', '< 0.18'
|
35
45
|
end
|
Binary file
|
data/lib/henkei/version.rb
CHANGED
data/lib/henkei.rb
CHANGED
@@ -25,14 +25,14 @@ require 'open3'
|
|
25
25
|
# Read text and metadata from files and documents using Apache Tika toolkit
|
26
26
|
class Henkei # rubocop:disable Metrics/ClassLength
|
27
27
|
GEM_PATH = File.dirname(File.dirname(__FILE__))
|
28
|
-
JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-2.3.0.jar')
|
28
|
+
JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-2.4.0.jar')
|
29
29
|
CONFIG_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-config.xml')
|
30
30
|
CONFIG_WITHOUT_OCR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-config-without-ocr.xml')
|
31
31
|
|
32
32
|
def self.mimetype(content_type)
|
33
33
|
if Henkei.configuration.mime_library == 'mime/types' && defined?(MIME::Types)
|
34
34
|
warn '[DEPRECATION] `mime/types` is deprecated. Please use `mini_mime` instead.'\
|
35
|
-
|
35
|
+
' Use Henkei.configure and assign "mini_mime" to `mime_library`.'
|
36
36
|
MIME::Types[content_type].first
|
37
37
|
else
|
38
38
|
MiniMime.lookup_by_content_type(content_type).tap do |object|
|
@@ -51,8 +51,7 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
51
51
|
result = client_read(type, data, include_ocr: include_ocr)
|
52
52
|
|
53
53
|
case type
|
54
|
-
when :text then result
|
55
|
-
when :html then result
|
54
|
+
when :text, :html then result
|
56
55
|
when :metadata then JSON.parse(result)
|
57
56
|
when :mimetype then Henkei.mimetype(JSON.parse(result)['Content-Type'])
|
58
57
|
end
|
data/spec/henkei_spec.rb
CHANGED
@@ -7,8 +7,8 @@ require 'nokogiri'
|
|
7
7
|
# Some of the tests have been known to fail in weird and wonderful ways when `rails` is included
|
8
8
|
require 'rails' if ENV['INCLUDE_RAILS'] == 'true'
|
9
9
|
|
10
|
-
def
|
11
|
-
ENV['CI'] == 'true'
|
10
|
+
def ci?
|
11
|
+
ENV['CI'] == 'true'
|
12
12
|
end
|
13
13
|
|
14
14
|
describe Henkei do
|
@@ -58,17 +58,17 @@ describe Henkei do
|
|
58
58
|
expect(text).to eq ''
|
59
59
|
end
|
60
60
|
|
61
|
-
unless
|
61
|
+
unless ci?
|
62
62
|
context 'when `include_ocr` is enabled' do
|
63
63
|
it 'returns parsed plain text in the image' do
|
64
64
|
text = Henkei.read :text, data, include_ocr: true
|
65
65
|
|
66
66
|
expect(text).to include <<~TEXT
|
67
67
|
West Side
|
68
|
-
|
68
|
+
|
69
69
|
Sea Island
|
70
70
|
PP
|
71
|
-
|
71
|
+
|
72
72
|
Richmond
|
73
73
|
TEXT
|
74
74
|
end
|
@@ -182,15 +182,15 @@ describe Henkei do
|
|
182
182
|
expect(henkei.mimetype.content_type).to eq 'image/png'
|
183
183
|
end
|
184
184
|
|
185
|
-
unless
|
185
|
+
unless ci?
|
186
186
|
context 'when `include_ocr` is enabled' do
|
187
187
|
it '#text returns plain text of parsed text in the image' do
|
188
188
|
expect(henkei.text(include_ocr: true)).to include <<~TEXT
|
189
189
|
West Side
|
190
|
-
|
190
|
+
|
191
191
|
Sea Island
|
192
192
|
PP
|
193
|
-
|
193
|
+
|
194
194
|
Richmond
|
195
195
|
TEXT
|
196
196
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: henkei
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.3.0.1
|
4
|
+
version: 2.4.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Erol Fornoles
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2022-
|
12
|
+
date: 2022-05-28 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: json
|
@@ -127,14 +127,70 @@ dependencies:
|
|
127
127
|
requirements:
|
128
128
|
- - "~>"
|
129
129
|
- !ruby/object:Gem::Version
|
130
|
-
version: '
|
130
|
+
version: '1.26'
|
131
131
|
type: :development
|
132
132
|
prerelease: false
|
133
133
|
version_requirements: !ruby/object:Gem::Requirement
|
134
134
|
requirements:
|
135
135
|
- - "~>"
|
136
136
|
- !ruby/object:Gem::Version
|
137
|
-
version: '
|
137
|
+
version: '1.26'
|
138
|
+
- !ruby/object:Gem::Dependency
|
139
|
+
name: rubocop-performance
|
140
|
+
requirement: !ruby/object:Gem::Requirement
|
141
|
+
requirements:
|
142
|
+
- - "~>"
|
143
|
+
- !ruby/object:Gem::Version
|
144
|
+
version: '1.13'
|
145
|
+
type: :development
|
146
|
+
prerelease: false
|
147
|
+
version_requirements: !ruby/object:Gem::Requirement
|
148
|
+
requirements:
|
149
|
+
- - "~>"
|
150
|
+
- !ruby/object:Gem::Version
|
151
|
+
version: '1.13'
|
152
|
+
- !ruby/object:Gem::Dependency
|
153
|
+
name: rubocop-rails
|
154
|
+
requirement: !ruby/object:Gem::Requirement
|
155
|
+
requirements:
|
156
|
+
- - "~>"
|
157
|
+
- !ruby/object:Gem::Version
|
158
|
+
version: '2.14'
|
159
|
+
type: :development
|
160
|
+
prerelease: false
|
161
|
+
version_requirements: !ruby/object:Gem::Requirement
|
162
|
+
requirements:
|
163
|
+
- - "~>"
|
164
|
+
- !ruby/object:Gem::Version
|
165
|
+
version: '2.14'
|
166
|
+
- !ruby/object:Gem::Dependency
|
167
|
+
name: rubocop-rake
|
168
|
+
requirement: !ruby/object:Gem::Requirement
|
169
|
+
requirements:
|
170
|
+
- - "~>"
|
171
|
+
- !ruby/object:Gem::Version
|
172
|
+
version: '0.6'
|
173
|
+
type: :development
|
174
|
+
prerelease: false
|
175
|
+
version_requirements: !ruby/object:Gem::Requirement
|
176
|
+
requirements:
|
177
|
+
- - "~>"
|
178
|
+
- !ruby/object:Gem::Version
|
179
|
+
version: '0.6'
|
180
|
+
- !ruby/object:Gem::Dependency
|
181
|
+
name: rubocop-rspec
|
182
|
+
requirement: !ruby/object:Gem::Requirement
|
183
|
+
requirements:
|
184
|
+
- - "~>"
|
185
|
+
- !ruby/object:Gem::Version
|
186
|
+
version: '2.9'
|
187
|
+
type: :development
|
188
|
+
prerelease: false
|
189
|
+
version_requirements: !ruby/object:Gem::Requirement
|
190
|
+
requirements:
|
191
|
+
- - "~>"
|
192
|
+
- !ruby/object:Gem::Version
|
193
|
+
version: '2.9'
|
138
194
|
- !ruby/object:Gem::Dependency
|
139
195
|
name: simplecov
|
140
196
|
requirement: !ruby/object:Gem::Requirement
|
@@ -142,6 +198,9 @@ dependencies:
|
|
142
198
|
- - "~>"
|
143
199
|
- !ruby/object:Gem::Version
|
144
200
|
version: '0.15'
|
201
|
+
- - "<"
|
202
|
+
- !ruby/object:Gem::Version
|
203
|
+
version: '0.18'
|
145
204
|
type: :development
|
146
205
|
prerelease: false
|
147
206
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -149,6 +208,9 @@ dependencies:
|
|
149
208
|
- - "~>"
|
150
209
|
- !ruby/object:Gem::Version
|
151
210
|
version: '0.15'
|
211
|
+
- - "<"
|
212
|
+
- !ruby/object:Gem::Version
|
213
|
+
version: '0.18'
|
152
214
|
description: Read text and metadata from files and documents using Apache Tika toolkit
|
153
215
|
email:
|
154
216
|
- erol.fornoles@gmail.com
|
@@ -158,10 +220,10 @@ executables:
|
|
158
220
|
extensions: []
|
159
221
|
extra_rdoc_files: []
|
160
222
|
files:
|
223
|
+
- ".github/workflows/test.yml"
|
161
224
|
- ".gitignore"
|
162
225
|
- ".rspec"
|
163
226
|
- ".rubocop.yml"
|
164
|
-
- ".travis.yml"
|
165
227
|
- Gemfile
|
166
228
|
- LICENSE
|
167
229
|
- NOTICE.txt
|
@@ -169,7 +231,7 @@ files:
|
|
169
231
|
- Rakefile
|
170
232
|
- bin/console
|
171
233
|
- henkei.gemspec
|
172
|
-
- jar/tika-app-2.3.0.jar
|
234
|
+
- jar/tika-app-2.4.0.jar
|
173
235
|
- jar/tika-config-without-ocr.xml
|
174
236
|
- jar/tika-config.xml
|
175
237
|
- lib/henkei.rb
|
@@ -183,10 +245,12 @@ files:
|
|
183
245
|
- spec/samples/sample-metadata-values-with-colons.doc
|
184
246
|
- spec/samples/sample.docx
|
185
247
|
- spec/samples/sample.pages
|
186
|
-
homepage:
|
248
|
+
homepage: https://github.com/abrom/henkei
|
187
249
|
licenses:
|
188
250
|
- MIT
|
189
|
-
metadata:
|
251
|
+
metadata:
|
252
|
+
allowed_push_host: https://rubygems.org
|
253
|
+
rubygems_mfa_required: 'true'
|
190
254
|
post_install_message:
|
191
255
|
rdoc_options: []
|
192
256
|
require_paths:
|
@@ -205,16 +269,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
205
269
|
- !ruby/object:Gem::Version
|
206
270
|
version: '0'
|
207
271
|
requirements: []
|
208
|
-
rubygems_version: 3.
|
272
|
+
rubygems_version: 3.2.3
|
209
273
|
signing_key:
|
210
274
|
specification_version: 4
|
211
275
|
summary: Read text and metadata from files and documents (.doc, .docx, .pages, .odt,
|
212
276
|
.rtf, .pdf) using Apache Tika toolkit
|
213
|
-
test_files:
|
214
|
-
- spec/helper.rb
|
215
|
-
- spec/henkei_spec.rb
|
216
|
-
- spec/samples/pipe-error.png
|
217
|
-
- spec/samples/sample filename with spaces.pages
|
218
|
-
- spec/samples/sample-metadata-values-with-colons.doc
|
219
|
-
- spec/samples/sample.docx
|
220
|
-
- spec/samples/sample.pages
|
277
|
+
test_files: []
|
data/.travis.yml
DELETED
@@ -1,32 +0,0 @@
|
|
1
|
-
env:
|
2
|
-
global:
|
3
|
-
- CC_TEST_REPORTER_ID=bb96c1ff9dc66724c38fb4eb54486dd72dc88a7fd6e727c034b9cf8d747d069e
|
4
|
-
jobs:
|
5
|
-
- INCLUDE_RAILS=false
|
6
|
-
- INCLUDE_RAILS=true
|
7
|
-
|
8
|
-
language: ruby
|
9
|
-
rvm:
|
10
|
-
- 2.6
|
11
|
-
- 2.7
|
12
|
-
- 3.0
|
13
|
-
- 3.1
|
14
|
-
|
15
|
-
before_install:
|
16
|
-
- gem update bundler
|
17
|
-
|
18
|
-
install:
|
19
|
-
- bundle install --jobs=3 --retry=3
|
20
|
-
- gem install rubocop
|
21
|
-
|
22
|
-
before_script:
|
23
|
-
- curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter
|
24
|
-
- chmod +x ./cc-test-reporter
|
25
|
-
- ./cc-test-reporter before-build
|
26
|
-
|
27
|
-
script:
|
28
|
-
- bundle exec rubocop
|
29
|
-
- bundle exec rspec
|
30
|
-
|
31
|
-
after_script:
|
32
|
-
- ./cc-test-reporter after-build --exit-code $TRAVIS_TEST_RESULT
|