henkei 2.3.0.1 → 2.4.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/test.yml +37 -0
- data/README.md +1 -1
- data/henkei.gemspec +15 -5
- data/jar/{tika-app-2.3.0.jar → tika-app-2.4.0.jar} +0 -0
- data/lib/henkei/version.rb +1 -1
- data/lib/henkei.rb +3 -4
- data/spec/henkei_spec.rb +8 -8
- metadata +74 -17
- data/.travis.yml +0 -32
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 52de171e9cba852d1551459674a12adfca8fd6166cb5e5707f3bc6a7cec9415c
|
4
|
+
data.tar.gz: '08807feea85b577c37153c290331c8f9c4441c2eef6c2600d630948a27b9ba5e'
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6481f5588edeb5cf7e806cd9326636d14e936bee95064de49017614e999f295f9683b3cbe6346dbb4d0611753288d7b628435514fe13ff0b039723de55262db1
|
7
|
+
data.tar.gz: 27a33e20e068708563324db99798abf56a81b25386899e41cc4b5e15097df11e7bc61f69b1705a5a0d537ac3488c608e28c6337e6bc8d4fe6af3b2aba6e19416
|
@@ -0,0 +1,37 @@
|
|
1
|
+
name: Test Henkei Ruby gem
|
2
|
+
|
3
|
+
on:
|
4
|
+
push:
|
5
|
+
branches: [main]
|
6
|
+
pull_request:
|
7
|
+
branches: [main]
|
8
|
+
|
9
|
+
env:
|
10
|
+
CI: true
|
11
|
+
|
12
|
+
jobs:
|
13
|
+
test:
|
14
|
+
runs-on: ubuntu-latest
|
15
|
+
strategy:
|
16
|
+
matrix:
|
17
|
+
ruby-version: ['2.6', '2.7', '3.0', '3.1']
|
18
|
+
|
19
|
+
steps:
|
20
|
+
- uses: actions/checkout@v2
|
21
|
+
|
22
|
+
- name: Set up Ruby
|
23
|
+
uses: ruby/setup-ruby@v1
|
24
|
+
with:
|
25
|
+
ruby-version: ${{ matrix.ruby-version }}
|
26
|
+
bundler-cache: true
|
27
|
+
|
28
|
+
- name: Lint code - Rubocop
|
29
|
+
run: bundle exec rubocop
|
30
|
+
|
31
|
+
- name: Run tests
|
32
|
+
run: bundle exec rspec
|
33
|
+
|
34
|
+
- name: Test & publish code coverage
|
35
|
+
uses: paambaati/codeclimate-action@v3.0.0
|
36
|
+
env:
|
37
|
+
CC_TEST_REPORTER_ID: bb96c1ff9dc66724c38fb4eb54486dd72dc88a7fd6e727c034b9cf8d747d069e
|
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
[![
|
1
|
+
[![Github Build Status](https://github.com/abrom/henkei/actions/workflows/test.yml/badge.svg)](https://github.com/abrom/henkei/actions/workflows/test.yml)
|
2
2
|
[![Maintainability](https://api.codeclimate.com/v1/badges/d06e8c917cf7d8c07234/maintainability)](https://codeclimate.com/github/abrom/henkei/maintainability)
|
3
3
|
[![Test Coverage](https://api.codeclimate.com/v1/badges/d06e8c917cf7d8c07234/test_coverage)](https://codeclimate.com/github/abrom/henkei/test_coverage)
|
4
4
|
[![Gem Version](http://img.shields.io/gem/v/henkei.svg?style=flat)](#)
|
data/henkei.gemspec
CHANGED
@@ -5,7 +5,7 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
5
5
|
|
6
6
|
require 'henkei/version'
|
7
7
|
|
8
|
-
Gem::Specification.new do |spec|
|
8
|
+
Gem::Specification.new do |spec| # rubocop:disable Metrics/BlockLength
|
9
9
|
spec.name = 'henkei'
|
10
10
|
spec.version = Henkei::VERSION
|
11
11
|
spec.authors = ['Erol Fornoles', 'Andrew Bromwich']
|
@@ -13,13 +13,19 @@ Gem::Specification.new do |spec|
|
|
13
13
|
spec.description = 'Read text and metadata from files and documents using Apache Tika toolkit'
|
14
14
|
spec.summary = 'Read text and metadata from files and documents ' \
|
15
15
|
'(.doc, .docx, .pages, .odt, .rtf, .pdf) using Apache Tika toolkit'
|
16
|
-
spec.homepage = '
|
16
|
+
spec.homepage = 'https://github.com/abrom/henkei'
|
17
17
|
spec.license = 'MIT'
|
18
18
|
spec.required_ruby_version = ['>= 2.6.0', '< 3.2.0']
|
19
19
|
|
20
|
+
# Restrict pushes of this gem to RubyGems.org only by setting 'allowed_push_host', or
|
21
|
+
# delete this section to allow pushing this gem to any host.
|
22
|
+
raise 'RubyGems 2.0 or newer is required to protect against public gem pushes.' unless spec.respond_to?(:metadata)
|
23
|
+
|
24
|
+
spec.metadata['allowed_push_host'] = 'https://rubygems.org'
|
25
|
+
spec.metadata['rubygems_mfa_required'] = 'true'
|
26
|
+
|
20
27
|
spec.files = `git ls-files`.split("\n")
|
21
28
|
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
22
|
-
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
23
29
|
spec.require_paths = ['lib']
|
24
30
|
|
25
31
|
spec.add_runtime_dependency 'json', '>= 1.8', '< 3'
|
@@ -30,6 +36,10 @@ Gem::Specification.new do |spec|
|
|
30
36
|
spec.add_development_dependency 'rails', '~> 5.0'
|
31
37
|
spec.add_development_dependency 'rake', '~> 12.3'
|
32
38
|
spec.add_development_dependency 'rspec', '~> 3.7'
|
33
|
-
spec.add_development_dependency 'rubocop', '~>
|
34
|
-
spec.add_development_dependency '
|
39
|
+
spec.add_development_dependency 'rubocop', '~> 1.26'
|
40
|
+
spec.add_development_dependency 'rubocop-performance', '~> 1.13'
|
41
|
+
spec.add_development_dependency 'rubocop-rails', '~> 2.14'
|
42
|
+
spec.add_development_dependency 'rubocop-rake', '~> 0.6'
|
43
|
+
spec.add_development_dependency 'rubocop-rspec', '~> 2.9'
|
44
|
+
spec.add_development_dependency 'simplecov', '~> 0.15', '< 0.18'
|
35
45
|
end
|
Binary file
|
data/lib/henkei/version.rb
CHANGED
data/lib/henkei.rb
CHANGED
@@ -25,14 +25,14 @@ require 'open3'
|
|
25
25
|
# Read text and metadata from files and documents using Apache Tika toolkit
|
26
26
|
class Henkei # rubocop:disable Metrics/ClassLength
|
27
27
|
GEM_PATH = File.dirname(File.dirname(__FILE__))
|
28
|
-
JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-2.
|
28
|
+
JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-2.4.0.jar')
|
29
29
|
CONFIG_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-config.xml')
|
30
30
|
CONFIG_WITHOUT_OCR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-config-without-ocr.xml')
|
31
31
|
|
32
32
|
def self.mimetype(content_type)
|
33
33
|
if Henkei.configuration.mime_library == 'mime/types' && defined?(MIME::Types)
|
34
34
|
warn '[DEPRECATION] `mime/types` is deprecated. Please use `mini_mime` instead.'\
|
35
|
-
|
35
|
+
' Use Henkei.configure and assign "mini_mime" to `mime_library`.'
|
36
36
|
MIME::Types[content_type].first
|
37
37
|
else
|
38
38
|
MiniMime.lookup_by_content_type(content_type).tap do |object|
|
@@ -51,8 +51,7 @@ class Henkei # rubocop:disable Metrics/ClassLength
|
|
51
51
|
result = client_read(type, data, include_ocr: include_ocr)
|
52
52
|
|
53
53
|
case type
|
54
|
-
when :text then result
|
55
|
-
when :html then result
|
54
|
+
when :text, :html then result
|
56
55
|
when :metadata then JSON.parse(result)
|
57
56
|
when :mimetype then Henkei.mimetype(JSON.parse(result)['Content-Type'])
|
58
57
|
end
|
data/spec/henkei_spec.rb
CHANGED
@@ -7,8 +7,8 @@ require 'nokogiri'
|
|
7
7
|
# Some of the tests have been known to fail in weird and wonderful ways when `rails` is included
|
8
8
|
require 'rails' if ENV['INCLUDE_RAILS'] == 'true'
|
9
9
|
|
10
|
-
def
|
11
|
-
ENV['CI'] == 'true'
|
10
|
+
def ci?
|
11
|
+
ENV['CI'] == 'true'
|
12
12
|
end
|
13
13
|
|
14
14
|
describe Henkei do
|
@@ -58,17 +58,17 @@ describe Henkei do
|
|
58
58
|
expect(text).to eq ''
|
59
59
|
end
|
60
60
|
|
61
|
-
unless
|
61
|
+
unless ci?
|
62
62
|
context 'when `include_ocr` is enabled' do
|
63
63
|
it 'returns parsed plain text in the image' do
|
64
64
|
text = Henkei.read :text, data, include_ocr: true
|
65
65
|
|
66
66
|
expect(text).to include <<~TEXT
|
67
67
|
West Side
|
68
|
-
|
68
|
+
|
69
69
|
Sea Island
|
70
70
|
PP
|
71
|
-
|
71
|
+
|
72
72
|
Richmond
|
73
73
|
TEXT
|
74
74
|
end
|
@@ -182,15 +182,15 @@ describe Henkei do
|
|
182
182
|
expect(henkei.mimetype.content_type).to eq 'image/png'
|
183
183
|
end
|
184
184
|
|
185
|
-
unless
|
185
|
+
unless ci?
|
186
186
|
context 'when `include_ocr` is enabled' do
|
187
187
|
it '#text returns plain text of parsed text in the image' do
|
188
188
|
expect(henkei.text(include_ocr: true)).to include <<~TEXT
|
189
189
|
West Side
|
190
|
-
|
190
|
+
|
191
191
|
Sea Island
|
192
192
|
PP
|
193
|
-
|
193
|
+
|
194
194
|
Richmond
|
195
195
|
TEXT
|
196
196
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: henkei
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.
|
4
|
+
version: 2.4.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Erol Fornoles
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2022-
|
12
|
+
date: 2022-05-28 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: json
|
@@ -127,14 +127,70 @@ dependencies:
|
|
127
127
|
requirements:
|
128
128
|
- - "~>"
|
129
129
|
- !ruby/object:Gem::Version
|
130
|
-
version: '
|
130
|
+
version: '1.26'
|
131
131
|
type: :development
|
132
132
|
prerelease: false
|
133
133
|
version_requirements: !ruby/object:Gem::Requirement
|
134
134
|
requirements:
|
135
135
|
- - "~>"
|
136
136
|
- !ruby/object:Gem::Version
|
137
|
-
version: '
|
137
|
+
version: '1.26'
|
138
|
+
- !ruby/object:Gem::Dependency
|
139
|
+
name: rubocop-performance
|
140
|
+
requirement: !ruby/object:Gem::Requirement
|
141
|
+
requirements:
|
142
|
+
- - "~>"
|
143
|
+
- !ruby/object:Gem::Version
|
144
|
+
version: '1.13'
|
145
|
+
type: :development
|
146
|
+
prerelease: false
|
147
|
+
version_requirements: !ruby/object:Gem::Requirement
|
148
|
+
requirements:
|
149
|
+
- - "~>"
|
150
|
+
- !ruby/object:Gem::Version
|
151
|
+
version: '1.13'
|
152
|
+
- !ruby/object:Gem::Dependency
|
153
|
+
name: rubocop-rails
|
154
|
+
requirement: !ruby/object:Gem::Requirement
|
155
|
+
requirements:
|
156
|
+
- - "~>"
|
157
|
+
- !ruby/object:Gem::Version
|
158
|
+
version: '2.14'
|
159
|
+
type: :development
|
160
|
+
prerelease: false
|
161
|
+
version_requirements: !ruby/object:Gem::Requirement
|
162
|
+
requirements:
|
163
|
+
- - "~>"
|
164
|
+
- !ruby/object:Gem::Version
|
165
|
+
version: '2.14'
|
166
|
+
- !ruby/object:Gem::Dependency
|
167
|
+
name: rubocop-rake
|
168
|
+
requirement: !ruby/object:Gem::Requirement
|
169
|
+
requirements:
|
170
|
+
- - "~>"
|
171
|
+
- !ruby/object:Gem::Version
|
172
|
+
version: '0.6'
|
173
|
+
type: :development
|
174
|
+
prerelease: false
|
175
|
+
version_requirements: !ruby/object:Gem::Requirement
|
176
|
+
requirements:
|
177
|
+
- - "~>"
|
178
|
+
- !ruby/object:Gem::Version
|
179
|
+
version: '0.6'
|
180
|
+
- !ruby/object:Gem::Dependency
|
181
|
+
name: rubocop-rspec
|
182
|
+
requirement: !ruby/object:Gem::Requirement
|
183
|
+
requirements:
|
184
|
+
- - "~>"
|
185
|
+
- !ruby/object:Gem::Version
|
186
|
+
version: '2.9'
|
187
|
+
type: :development
|
188
|
+
prerelease: false
|
189
|
+
version_requirements: !ruby/object:Gem::Requirement
|
190
|
+
requirements:
|
191
|
+
- - "~>"
|
192
|
+
- !ruby/object:Gem::Version
|
193
|
+
version: '2.9'
|
138
194
|
- !ruby/object:Gem::Dependency
|
139
195
|
name: simplecov
|
140
196
|
requirement: !ruby/object:Gem::Requirement
|
@@ -142,6 +198,9 @@ dependencies:
|
|
142
198
|
- - "~>"
|
143
199
|
- !ruby/object:Gem::Version
|
144
200
|
version: '0.15'
|
201
|
+
- - "<"
|
202
|
+
- !ruby/object:Gem::Version
|
203
|
+
version: '0.18'
|
145
204
|
type: :development
|
146
205
|
prerelease: false
|
147
206
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -149,6 +208,9 @@ dependencies:
|
|
149
208
|
- - "~>"
|
150
209
|
- !ruby/object:Gem::Version
|
151
210
|
version: '0.15'
|
211
|
+
- - "<"
|
212
|
+
- !ruby/object:Gem::Version
|
213
|
+
version: '0.18'
|
152
214
|
description: Read text and metadata from files and documents using Apache Tika toolkit
|
153
215
|
email:
|
154
216
|
- erol.fornoles@gmail.com
|
@@ -158,10 +220,10 @@ executables:
|
|
158
220
|
extensions: []
|
159
221
|
extra_rdoc_files: []
|
160
222
|
files:
|
223
|
+
- ".github/workflows/test.yml"
|
161
224
|
- ".gitignore"
|
162
225
|
- ".rspec"
|
163
226
|
- ".rubocop.yml"
|
164
|
-
- ".travis.yml"
|
165
227
|
- Gemfile
|
166
228
|
- LICENSE
|
167
229
|
- NOTICE.txt
|
@@ -169,7 +231,7 @@ files:
|
|
169
231
|
- Rakefile
|
170
232
|
- bin/console
|
171
233
|
- henkei.gemspec
|
172
|
-
- jar/tika-app-2.
|
234
|
+
- jar/tika-app-2.4.0.jar
|
173
235
|
- jar/tika-config-without-ocr.xml
|
174
236
|
- jar/tika-config.xml
|
175
237
|
- lib/henkei.rb
|
@@ -183,10 +245,12 @@ files:
|
|
183
245
|
- spec/samples/sample-metadata-values-with-colons.doc
|
184
246
|
- spec/samples/sample.docx
|
185
247
|
- spec/samples/sample.pages
|
186
|
-
homepage:
|
248
|
+
homepage: https://github.com/abrom/henkei
|
187
249
|
licenses:
|
188
250
|
- MIT
|
189
|
-
metadata:
|
251
|
+
metadata:
|
252
|
+
allowed_push_host: https://rubygems.org
|
253
|
+
rubygems_mfa_required: 'true'
|
190
254
|
post_install_message:
|
191
255
|
rdoc_options: []
|
192
256
|
require_paths:
|
@@ -205,16 +269,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
205
269
|
- !ruby/object:Gem::Version
|
206
270
|
version: '0'
|
207
271
|
requirements: []
|
208
|
-
rubygems_version: 3.
|
272
|
+
rubygems_version: 3.2.3
|
209
273
|
signing_key:
|
210
274
|
specification_version: 4
|
211
275
|
summary: Read text and metadata from files and documents (.doc, .docx, .pages, .odt,
|
212
276
|
.rtf, .pdf) using Apache Tika toolkit
|
213
|
-
test_files:
|
214
|
-
- spec/helper.rb
|
215
|
-
- spec/henkei_spec.rb
|
216
|
-
- spec/samples/pipe-error.png
|
217
|
-
- spec/samples/sample filename with spaces.pages
|
218
|
-
- spec/samples/sample-metadata-values-with-colons.doc
|
219
|
-
- spec/samples/sample.docx
|
220
|
-
- spec/samples/sample.pages
|
277
|
+
test_files: []
|
data/.travis.yml
DELETED
@@ -1,32 +0,0 @@
|
|
1
|
-
env:
|
2
|
-
global:
|
3
|
-
- CC_TEST_REPORTER_ID=bb96c1ff9dc66724c38fb4eb54486dd72dc88a7fd6e727c034b9cf8d747d069e
|
4
|
-
jobs:
|
5
|
-
- INCLUDE_RAILS=false
|
6
|
-
- INCLUDE_RAILS=true
|
7
|
-
|
8
|
-
language: ruby
|
9
|
-
rvm:
|
10
|
-
- 2.6
|
11
|
-
- 2.7
|
12
|
-
- 3.0
|
13
|
-
- 3.1
|
14
|
-
|
15
|
-
before_install:
|
16
|
-
- gem update bundler
|
17
|
-
|
18
|
-
install:
|
19
|
-
- bundle install --jobs=3 --retry=3
|
20
|
-
- gem install rubocop
|
21
|
-
|
22
|
-
before_script:
|
23
|
-
- curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter
|
24
|
-
- chmod +x ./cc-test-reporter
|
25
|
-
- ./cc-test-reporter before-build
|
26
|
-
|
27
|
-
script:
|
28
|
-
- bundle exec rubocop
|
29
|
-
- bundle exec rspec
|
30
|
-
|
31
|
-
after_script:
|
32
|
-
- ./cc-test-reporter after-build --exit-code $TRAVIS_TEST_RESULT
|