henkei 1.28.3.1 → 2.2.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: bd6ddc3967c88a20c41845c884623e60d689bf20f84e95de5050a8abcdad8037
4
- data.tar.gz: 197f4ee86dad00184c1129c58a27bd511244e238e94ca50e0eaffa77f15a2052
3
+ metadata.gz: 64f2ec330c97bf77b16e2f7e14e08f90c405ae42c7aeddce733c0d889eeb4782
4
+ data.tar.gz: c3b3b91c569c7093bf22a5c751153426f1fdaa62da61f8c40f8b8cabc6ce072c
5
5
  SHA512:
6
- metadata.gz: 4cf5bd57225bcdfa44884d05eb100aac25d46b0ebf08024b4f8e8923eea40f0a25635e4c89ef20972a51e9b6278e348bd47db299db704db82264ae7138d3eb3e
7
- data.tar.gz: a801fc206f243dc029b04d85d5796f6f005a3ccfe4b61a8501ebfc7a32250e397b73dcd317d8ea61ea7c9f4f4be5f806ae97d0b18349e1acf2c671e45dcf4a24
6
+ metadata.gz: b9f2263c057cb9e958039930aa3c0244c31b5bcc5e636515a757e3b1c5d43a6b52c932a76a8c132af73707453df1e2652e6f02d9926276a75bceb4deb69ffa59
7
+ data.tar.gz: f8ea0d87ad2bc75213483824a30edba37507110fc721434f0a40477500dc4674d8ce8d1c2a47b06624249b48228bbe9c9a9f9c86bb19e5cb2dcc18a8dec23dbe
data/.rubocop.yml CHANGED
@@ -1,6 +1,5 @@
1
1
  AllCops:
2
2
  NewCops: enable
3
- TargetRubyVersion: 2.6
4
3
 
5
4
  Layout/EmptyLinesAroundAttributeAccessor:
6
5
  Enabled: true
data/.travis.yml ADDED
@@ -0,0 +1,32 @@
1
+ env:
2
+ global:
3
+ - CC_TEST_REPORTER_ID=bb96c1ff9dc66724c38fb4eb54486dd72dc88a7fd6e727c034b9cf8d747d069e
4
+ jobs:
5
+ - INCLUDE_RAILS=false
6
+ - INCLUDE_RAILS=true
7
+
8
+ language: ruby
9
+ rvm:
10
+ - 2.5
11
+ - 2.6
12
+ - 2.7
13
+ - 3.0
14
+
15
+ before_install:
16
+ - gem update bundler
17
+
18
+ install:
19
+ - bundle install --jobs=3 --retry=3
20
+ - gem install rubocop
21
+
22
+ before_script:
23
+ - curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter
24
+ - chmod +x ./cc-test-reporter
25
+ - ./cc-test-reporter before-build
26
+
27
+ script:
28
+ - bundle exec rubocop
29
+ - bundle exec rspec
30
+
31
+ after_script:
32
+ - ./cc-test-reporter after-build --exit-code $TRAVIS_TEST_RESULT
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- [![Github Build Status](https://github.com/abrom/henkei/actions/workflows/test.yml/badge.svg)](https://github.com/abrom/henkei/actions/workflows/test.yml)
1
+ [![Travis Build Status](http://img.shields.io/travis/abrom/henkei.svg?style=flat)](https://travis-ci.org/abrom/henkei)
2
2
  [![Maintainability](https://api.codeclimate.com/v1/badges/d06e8c917cf7d8c07234/maintainability)](https://codeclimate.com/github/abrom/henkei/maintainability)
3
3
  [![Test Coverage](https://api.codeclimate.com/v1/badges/d06e8c917cf7d8c07234/test_coverage)](https://codeclimate.com/github/abrom/henkei/test_coverage)
4
4
  [![Gem Version](http://img.shields.io/gem/v/henkei.svg?style=flat)](#)
@@ -21,6 +21,15 @@ Here are some of the formats supported:
21
21
  For the complete list of supported formats, please visit the Apache Tika
22
22
  [Supported Document Formats](http://tika.apache.org/0.9/formats.html) page.
23
23
 
24
+ ## Upgrading from v1.x to v2.x
25
+
26
+ Apache Tika v2.x brings with it some changes. One key change is that the Tika client and server applications have
27
+ been split up. To keep the gem size down Henkei will only include the client app. That is to say, each time you
28
+ call Henkei, a new Java process is started, runs your command, and then terminates.
29
+
30
+ Another change is the metadata keys. A lot of duplicate keys have been removed in favour of a more
31
+ standards-based approach. A list of the old vs new key names can be found [here](https://cwiki.apache.org/confluence/display/TIKA/Migrating+to+Tika+2.0.0#MigratingtoTika2.0.0-Metadata).
32
+
24
33
  ## Usage
25
34
 
26
35
  Text, metadata and MIME type information can be extracted by calling `Henkei.read` directly:
@@ -69,6 +78,20 @@ post '/:name/:filename' do
69
78
  end
70
79
  ```
71
80
 
81
+ ### Reading text from inside images (OCR)
82
+
83
+ You can enable OCR by passing the optional `include_ocr: true` argument when calling the `text` or `html` instance methods,
84
+ or the `read` class method. Note that, according to Tika, this will greatly increase processing time.
85
+
86
+ ```ruby
87
+ henkei = Henkei.new 'sample.pages'
88
+ text_with_ocr = henkei.text(include_ocr: true)
89
+ html_with_ocr = henkei.html(include_ocr: true)
90
+
91
+ data = File.read 'sample.pages'
92
+ text_with_ocr = Henkei.read :text, data, include_ocr: true
93
+ ```
94
+
72
95
  ### Reading metadata
73
96
 
74
97
  Metadata is returned as a hash.
data/henkei.gemspec CHANGED
@@ -5,7 +5,7 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
5
5
 
6
6
  require 'henkei/version'
7
7
 
8
- Gem::Specification.new do |spec| # rubocop:disable Metrics/BlockLength
8
+ Gem::Specification.new do |spec|
9
9
  spec.name = 'henkei'
10
10
  spec.version = Henkei::VERSION
11
11
  spec.authors = ['Erol Fornoles', 'Andrew Bromwich']
@@ -13,32 +13,23 @@ Gem::Specification.new do |spec| # rubocop:disable Metrics/BlockLength
13
13
  spec.description = 'Read text and metadata from files and documents using Apache Tika toolkit'
14
14
  spec.summary = 'Read text and metadata from files and documents ' \
15
15
  '(.doc, .docx, .pages, .odt, .rtf, .pdf) using Apache Tika toolkit'
16
- spec.homepage = 'https://github.com/abrom/henkei'
16
+ spec.homepage = 'http://github.com/abrom/henkei'
17
17
  spec.license = 'MIT'
18
- spec.required_ruby_version = ['>= 2.6.0', '< 3.2.0']
19
-
20
- # Prevent pushing this gem to RubyGems.org by setting 'allowed_push_host', or
21
- # delete this section to allow pushing this gem to any host.
22
- raise 'RubyGems 2.0 or newer is required to protect against public gem pushes.' unless spec.respond_to?(:metadata)
23
-
24
- spec.metadata['allowed_push_host'] = 'https://rubygems.org'
25
- spec.metadata['rubygems_mfa_required'] = 'true'
18
+ spec.required_ruby_version = ['>= 2.4.0', '< 3.1.0']
26
19
 
27
20
  spec.files = `git ls-files`.split("\n")
28
21
  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
22
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
29
23
  spec.require_paths = ['lib']
30
24
 
31
25
  spec.add_runtime_dependency 'json', '>= 1.8', '< 3'
32
26
  spec.add_runtime_dependency 'mini_mime', '>= 0.1.1', '< 2'
33
27
 
34
28
  spec.add_development_dependency 'bundler', '~> 2.0'
29
+ spec.add_development_dependency 'nokogiri', '~> 1.12'
35
30
  spec.add_development_dependency 'rails', '~> 5.0'
36
31
  spec.add_development_dependency 'rake', '~> 12.3'
37
32
  spec.add_development_dependency 'rspec', '~> 3.7'
38
- spec.add_development_dependency 'rubocop', '~> 1.26'
39
- spec.add_development_dependency 'rubocop-performance', '~> 1.13'
40
- spec.add_development_dependency 'rubocop-rails', '~> 2.14'
41
- spec.add_development_dependency 'rubocop-rake', '~> 0.6'
42
- spec.add_development_dependency 'rubocop-rspec', '~> 2.9'
43
- spec.add_development_dependency 'simplecov', '~> 0.15', '< 0.18'
33
+ spec.add_development_dependency 'rubocop', '~> 0.71'
34
+ spec.add_development_dependency 'simplecov', '~> 0.15'
44
35
  end
@@ -0,0 +1,9 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <properties>
3
+ <service-loader initializableProblemHandler="ignore"/>
4
+ <parsers>
5
+ <parser class="org.apache.tika.parser.DefaultParser">
6
+ <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
7
+ </parser>
8
+ </parsers>
9
+ </properties>
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  class Henkei
4
- VERSION = '1.28.3.1'
4
+ VERSION = '2.2.0.1'
5
5
  end
data/lib/henkei.rb CHANGED
@@ -25,17 +25,14 @@ require 'open3'
25
25
  # Read text and metadata from files and documents using Apache Tika toolkit
26
26
  class Henkei # rubocop:disable Metrics/ClassLength
27
27
  GEM_PATH = File.dirname(File.dirname(__FILE__))
28
- JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-1.28.3.jar')
28
+ JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-2.2.0.jar')
29
29
  CONFIG_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-config.xml')
30
- DEFAULT_SERVER_PORT = 9293 # an arbitrary, but perfectly cromulent, port
31
-
32
- @@server_port = nil
33
- @@server_pid = nil
30
+ CONFIG_WITHOUT_OCR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-config-without-ocr.xml')
34
31
 
35
32
  def self.mimetype(content_type)
36
33
  if Henkei.configuration.mime_library == 'mime/types' && defined?(MIME::Types)
37
34
  warn '[DEPRECATION] `mime/types` is deprecated. Please use `mini_mime` instead.'\
38
- ' Use Henkei.configure and assign "mini_mime" to `mime_library`.'
35
+ ' Use Henkei.configure and assign "mini_mime" to `mime_library`.'
39
36
  MIME::Types[content_type].first
40
37
  else
41
38
  MiniMime.lookup_by_content_type(content_type).tap do |object|
@@ -50,11 +47,12 @@ class Henkei # rubocop:disable Metrics/ClassLength
50
47
  # text = Henkei.read :text, data
51
48
  # metadata = Henkei.read :metadata, data
52
49
  #
53
- def self.read(type, data)
54
- result = @@server_pid ? server_read(data) : client_read(type, data)
50
+ def self.read(type, data, include_ocr: false)
51
+ result = client_read(type, data, include_ocr: include_ocr)
55
52
 
56
53
  case type
57
- when :text, :html then result
54
+ when :text then result
55
+ when :html then result
58
56
  when :metadata then JSON.parse(result)
59
57
  when :mimetype then Henkei.mimetype(JSON.parse(result)['Content-Type'])
60
58
  end
@@ -95,10 +93,14 @@ class Henkei # rubocop:disable Metrics/ClassLength
95
93
  # henkei = Henkei.new 'sample.pages'
96
94
  # henkei.text
97
95
  #
98
- def text
96
+ # Include OCR results from images (includes embedded images in pages/docx/pdf etc)
97
+ #
98
+ # henkei.text(include_ocr: true)
99
+ #
100
+ def text(include_ocr: false)
99
101
  return @text if defined? @text
100
102
 
101
- @text = Henkei.read :text, data
103
+ @text = Henkei.read :text, data, include_ocr: include_ocr
102
104
  end
103
105
 
104
106
  # Returns the text content of the Henkei document in HTML.
@@ -106,10 +108,14 @@ class Henkei # rubocop:disable Metrics/ClassLength
106
108
  # henkei = Henkei.new 'sample.pages'
107
109
  # henkei.html
108
110
  #
109
- def html
111
+ # Include OCR results from images (includes embedded images in pages/docx/pdf etc)
112
+ #
113
+ # henkei.html(include_ocr: true)
114
+ #
115
+ def html(include_ocr: false)
110
116
  return @html if defined? @html
111
117
 
112
- @html = Henkei.read :html, data
118
+ @html = Henkei.read :html, data, include_ocr: include_ocr
113
119
  end
114
120
 
115
121
  # Returns the metadata hash of the Henkei document.
@@ -143,9 +149,9 @@ class Henkei # rubocop:disable Metrics/ClassLength
143
149
  #
144
150
  def creation_date
145
151
  return @creation_date if defined? @creation_date
146
- return unless metadata['Creation-Date']
152
+ return unless metadata['dcterms:created']
147
153
 
148
- @creation_date = Time.parse(metadata['Creation-Date'])
154
+ @creation_date = Time.parse(metadata['dcterms:created'])
149
155
  end
150
156
 
151
157
  # Returns +true+ if the Henkei document was specified using a file path.
@@ -195,44 +201,6 @@ class Henkei # rubocop:disable Metrics/ClassLength
195
201
  @data
196
202
  end
197
203
 
198
- # Returns pid of Tika server, started as a new spawned process.
199
- #
200
- # type :html, :text or :metadata
201
- # custom_port e.g. 9293
202
- #
203
- # Henkei.server(:text, 9294)
204
- #
205
- def self.server(type, custom_port = nil)
206
- @@server_port = custom_port || DEFAULT_SERVER_PORT
207
-
208
- @@server_pid = Process.spawn(*tika_command(type, server: true))
209
- sleep(2) # Give the server 2 seconds to spin up.
210
- @@server_pid
211
- end
212
-
213
- # Kills server started by Henkei.server
214
- #
215
- # Always run this when you're done, or else Tika might run until you kill it manually
216
- # You might try putting your extraction in a begin..rescue...ensure...end block and
217
- # putting this method in the ensure block.
218
- #
219
- # Henkei.server(:text)
220
- # reports = ["report1.docx", "report2.doc", "report3.pdf"]
221
- # begin
222
- # my_texts = reports.map{ |report_path| Henkei.new(report_path).text }
223
- # rescue
224
- # ensure
225
- # Henkei.kill_server!
226
- # end
227
- #
228
- def self.kill_server!
229
- return unless @@server_pid
230
-
231
- Process.kill('INT', @@server_pid)
232
- @@server_pid = nil
233
- @@server_port = nil
234
- end
235
-
236
204
  ### Private class methods
237
205
 
238
206
  # Provide the path to the Java binary
@@ -244,44 +212,21 @@ class Henkei # rubocop:disable Metrics/ClassLength
244
212
 
245
213
  # Internal helper for calling to Tika library directly
246
214
  #
247
- def self.client_read(type, data)
248
- filter_response Open3.capture2(*tika_command(type), stdin_data: data, binmode: true).first
215
+ def self.client_read(type, data, include_ocr: false)
216
+ Open3.capture2(*tika_command(type, include_ocr: include_ocr), stdin_data: data, binmode: true).first
249
217
  end
250
218
  private_class_method :client_read
251
219
 
252
- # Internal helper for calling to running Tika server
253
- #
254
- def self.server_read(data)
255
- s = TCPSocket.new('localhost', @@server_port)
256
- file = StringIO.new(data, 'r')
257
-
258
- loop do
259
- chunk = file.read(65_536)
260
- break unless chunk
261
-
262
- s.write(chunk)
263
- end
264
-
265
- # tell Tika that we're done sending data
266
- s.shutdown(Socket::SHUT_WR)
267
-
268
- resp = String.new ''
269
- loop do
270
- chunk = s.recv(65_536)
271
- break if chunk.empty? || !chunk
272
-
273
- resp << chunk
274
- end
275
- filter_response resp
276
- end
277
- private_class_method :server_read
278
-
279
220
  # Internal helper for building the Java command to call Tika
280
221
  #
281
- def self.tika_command(type, server: false)
282
- command = [java_path, '-Djava.awt.headless=true', '-jar', Henkei::JAR_PATH, "--config=#{Henkei::CONFIG_PATH}"]
283
- command += ['--server', '--port', @@server_port.to_s] if server
284
- command + switch_for_type(type)
222
+ def self.tika_command(type, include_ocr: false)
223
+ [
224
+ java_path,
225
+ '-Djava.awt.headless=true',
226
+ '-jar',
227
+ Henkei::JAR_PATH,
228
+ "--config=#{include_ocr ? Henkei::CONFIG_PATH : Henkei::CONFIG_WITHOUT_OCR_PATH}"
229
+ ] + switch_for_type(type)
285
230
  end
286
231
  private_class_method :tika_command
287
232
 
@@ -296,14 +241,4 @@ class Henkei # rubocop:disable Metrics/ClassLength
296
241
  }[type]
297
242
  end
298
243
  private_class_method :switch_for_type
299
-
300
- # Internal helper to remove erroneous output
301
- #
302
- def self.filter_response(response)
303
- response.gsub(
304
- /\AWARNING: sun\.reflect\.Reflection\.getCallerClass is not supported\. This will impact performance\.\n/,
305
- ''
306
- )
307
- end
308
- private_class_method :filter_response
309
244
  end
data/spec/henkei_spec.rb CHANGED
@@ -2,10 +2,15 @@
2
2
 
3
3
  require 'helper'
4
4
  require 'henkei'
5
+ require 'nokogiri'
5
6
 
6
7
  # Some of the tests have been known to fail in weird and wonderful ways when `rails` is included
7
8
  require 'rails' if ENV['INCLUDE_RAILS'] == 'true'
8
9
 
10
+ def travis_ci?
11
+ ENV['CI'] == 'true' && ENV['TRAVIS'] == 'true'
12
+ end
13
+
9
14
  describe Henkei do
10
15
  let(:data) { File.read 'spec/samples/sample.docx' }
11
16
 
@@ -52,6 +57,23 @@ describe Henkei do
52
57
 
53
58
  expect(text).to eq ''
54
59
  end
60
+
61
+ unless travis_ci?
62
+ context 'when `include_ocr` is enabled' do
63
+ it 'returns parsed plain text in the image' do
64
+ text = Henkei.read :text, data, include_ocr: true
65
+
66
+ expect(text).to include <<~TEXT
67
+ West Side
68
+
69
+ Sea Island
70
+ PP
71
+
72
+ Richmond
73
+ TEXT
74
+ end
75
+ end
76
+ end
55
77
  end
56
78
  end
57
79
 
@@ -115,6 +137,7 @@ describe Henkei do
115
137
 
116
138
  describe '.creation_date' do
117
139
  let(:henkei) { Henkei.new 'spec/samples/sample.pages' }
140
+
118
141
  it 'should return Time' do
119
142
  expect(henkei.creation_date).to be_a Time
120
143
  end
@@ -158,6 +181,30 @@ describe Henkei do
158
181
  it '#mimetype returns `image/png`' do
159
182
  expect(henkei.mimetype.content_type).to eq 'image/png'
160
183
  end
184
+
185
+ unless travis_ci?
186
+ context 'when `include_ocr` is enabled' do
187
+ it '#text returns plain text of parsed text in the image' do
188
+ expect(henkei.text(include_ocr: true)).to include <<~TEXT
189
+ West Side
190
+
191
+ Sea Island
192
+ PP
193
+
194
+ Richmond
195
+ TEXT
196
+ end
197
+
198
+ it '#html returns HTML of parsed text in the image' do
199
+ expect(henkei.html(include_ocr: true)).to include '<meta name="tiff:ImageWidth" content="792"/>'
200
+
201
+ html_body = Nokogiri::HTML(henkei.html(include_ocr: true)).at_xpath('//body')
202
+ ['Anmore', 'Coquitlam', 'West Side', 'Sea Island', 'Richmond', 'Steveston'].each do |location|
203
+ expect(html_body.text).to include location
204
+ end
205
+ end
206
+ end
207
+ end
161
208
  end
162
209
  end
163
210
 
@@ -198,36 +245,4 @@ describe Henkei do
198
245
  expect(henkei.metadata['Content-Type']).to eq 'application/pdf'
199
246
  end
200
247
  end
201
-
202
- context 'working as server mode' do
203
- specify '#starts and kills server' do
204
- Henkei.server(:text)
205
- expect(Henkei.class_variable_get(:@@server_pid)).not_to be_nil
206
- expect(Henkei.class_variable_get(:@@server_port)).not_to be_nil
207
-
208
- s = TCPSocket.new('localhost', Henkei.class_variable_get(:@@server_port))
209
- expect(s).to be_a TCPSocket
210
- s.close
211
- ensure
212
- port = Henkei.class_variable_get(:@@server_port)
213
- Henkei.kill_server!
214
- sleep 2
215
- expect { TCPSocket.new('localhost', port) }.to raise_error Errno::ECONNREFUSED
216
- end
217
-
218
- specify '#runs samples through server mode' do
219
- Henkei.server(:text)
220
- expect(Henkei.new('spec/samples/sample.pages').text).to(
221
- include 'The quick brown fox jumped over the lazy cat.'
222
- )
223
- expect(Henkei.new('spec/samples/sample filename with spaces.pages').text).to(
224
- include 'The quick brown fox jumped over the lazy cat.'
225
- )
226
- expect(Henkei.new('spec/samples/sample.docx').text).to(
227
- include 'The quick brown fox jumped over the lazy cat.'
228
- )
229
- ensure
230
- Henkei.kill_server!
231
- end
232
- end
233
248
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: henkei
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.28.3.1
4
+ version: 2.2.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Erol Fornoles
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2022-05-28 00:00:00.000000000 Z
12
+ date: 2021-12-27 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: json
@@ -66,117 +66,75 @@ dependencies:
66
66
  - !ruby/object:Gem::Version
67
67
  version: '2.0'
68
68
  - !ruby/object:Gem::Dependency
69
- name: rails
70
- requirement: !ruby/object:Gem::Requirement
71
- requirements:
72
- - - "~>"
73
- - !ruby/object:Gem::Version
74
- version: '5.0'
75
- type: :development
76
- prerelease: false
77
- version_requirements: !ruby/object:Gem::Requirement
78
- requirements:
79
- - - "~>"
80
- - !ruby/object:Gem::Version
81
- version: '5.0'
82
- - !ruby/object:Gem::Dependency
83
- name: rake
84
- requirement: !ruby/object:Gem::Requirement
85
- requirements:
86
- - - "~>"
87
- - !ruby/object:Gem::Version
88
- version: '12.3'
89
- type: :development
90
- prerelease: false
91
- version_requirements: !ruby/object:Gem::Requirement
92
- requirements:
93
- - - "~>"
94
- - !ruby/object:Gem::Version
95
- version: '12.3'
96
- - !ruby/object:Gem::Dependency
97
- name: rspec
98
- requirement: !ruby/object:Gem::Requirement
99
- requirements:
100
- - - "~>"
101
- - !ruby/object:Gem::Version
102
- version: '3.7'
103
- type: :development
104
- prerelease: false
105
- version_requirements: !ruby/object:Gem::Requirement
106
- requirements:
107
- - - "~>"
108
- - !ruby/object:Gem::Version
109
- version: '3.7'
110
- - !ruby/object:Gem::Dependency
111
- name: rubocop
69
+ name: nokogiri
112
70
  requirement: !ruby/object:Gem::Requirement
113
71
  requirements:
114
72
  - - "~>"
115
73
  - !ruby/object:Gem::Version
116
- version: '1.26'
74
+ version: '1.12'
117
75
  type: :development
118
76
  prerelease: false
119
77
  version_requirements: !ruby/object:Gem::Requirement
120
78
  requirements:
121
79
  - - "~>"
122
80
  - !ruby/object:Gem::Version
123
- version: '1.26'
81
+ version: '1.12'
124
82
  - !ruby/object:Gem::Dependency
125
- name: rubocop-performance
83
+ name: rails
126
84
  requirement: !ruby/object:Gem::Requirement
127
85
  requirements:
128
86
  - - "~>"
129
87
  - !ruby/object:Gem::Version
130
- version: '1.13'
88
+ version: '5.0'
131
89
  type: :development
132
90
  prerelease: false
133
91
  version_requirements: !ruby/object:Gem::Requirement
134
92
  requirements:
135
93
  - - "~>"
136
94
  - !ruby/object:Gem::Version
137
- version: '1.13'
95
+ version: '5.0'
138
96
  - !ruby/object:Gem::Dependency
139
- name: rubocop-rails
97
+ name: rake
140
98
  requirement: !ruby/object:Gem::Requirement
141
99
  requirements:
142
100
  - - "~>"
143
101
  - !ruby/object:Gem::Version
144
- version: '2.14'
102
+ version: '12.3'
145
103
  type: :development
146
104
  prerelease: false
147
105
  version_requirements: !ruby/object:Gem::Requirement
148
106
  requirements:
149
107
  - - "~>"
150
108
  - !ruby/object:Gem::Version
151
- version: '2.14'
109
+ version: '12.3'
152
110
  - !ruby/object:Gem::Dependency
153
- name: rubocop-rake
111
+ name: rspec
154
112
  requirement: !ruby/object:Gem::Requirement
155
113
  requirements:
156
114
  - - "~>"
157
115
  - !ruby/object:Gem::Version
158
- version: '0.6'
116
+ version: '3.7'
159
117
  type: :development
160
118
  prerelease: false
161
119
  version_requirements: !ruby/object:Gem::Requirement
162
120
  requirements:
163
121
  - - "~>"
164
122
  - !ruby/object:Gem::Version
165
- version: '0.6'
123
+ version: '3.7'
166
124
  - !ruby/object:Gem::Dependency
167
- name: rubocop-rspec
125
+ name: rubocop
168
126
  requirement: !ruby/object:Gem::Requirement
169
127
  requirements:
170
128
  - - "~>"
171
129
  - !ruby/object:Gem::Version
172
- version: '2.9'
130
+ version: '0.71'
173
131
  type: :development
174
132
  prerelease: false
175
133
  version_requirements: !ruby/object:Gem::Requirement
176
134
  requirements:
177
135
  - - "~>"
178
136
  - !ruby/object:Gem::Version
179
- version: '2.9'
137
+ version: '0.71'
180
138
  - !ruby/object:Gem::Dependency
181
139
  name: simplecov
182
140
  requirement: !ruby/object:Gem::Requirement
@@ -184,9 +142,6 @@ dependencies:
184
142
  - - "~>"
185
143
  - !ruby/object:Gem::Version
186
144
  version: '0.15'
187
- - - "<"
188
- - !ruby/object:Gem::Version
189
- version: '0.18'
190
145
  type: :development
191
146
  prerelease: false
192
147
  version_requirements: !ruby/object:Gem::Requirement
@@ -194,9 +149,6 @@ dependencies:
194
149
  - - "~>"
195
150
  - !ruby/object:Gem::Version
196
151
  version: '0.15'
197
- - - "<"
198
- - !ruby/object:Gem::Version
199
- version: '0.18'
200
152
  description: Read text and metadata from files and documents using Apache Tika toolkit
201
153
  email:
202
154
  - erol.fornoles@gmail.com
@@ -206,10 +158,10 @@ executables:
206
158
  extensions: []
207
159
  extra_rdoc_files: []
208
160
  files:
209
- - ".github/workflows/test.yml"
210
161
  - ".gitignore"
211
162
  - ".rspec"
212
163
  - ".rubocop.yml"
164
+ - ".travis.yml"
213
165
  - Gemfile
214
166
  - LICENSE
215
167
  - NOTICE.txt
@@ -217,7 +169,8 @@ files:
217
169
  - Rakefile
218
170
  - bin/console
219
171
  - henkei.gemspec
220
- - jar/tika-app-1.28.3.jar
172
+ - jar/tika-app-2.2.0.jar
173
+ - jar/tika-config-without-ocr.xml
221
174
  - jar/tika-config.xml
222
175
  - lib/henkei.rb
223
176
  - lib/henkei/configuration.rb
@@ -230,12 +183,10 @@ files:
230
183
  - spec/samples/sample-metadata-values-with-colons.doc
231
184
  - spec/samples/sample.docx
232
185
  - spec/samples/sample.pages
233
- homepage: https://github.com/abrom/henkei
186
+ homepage: http://github.com/abrom/henkei
234
187
  licenses:
235
188
  - MIT
236
- metadata:
237
- allowed_push_host: https://rubygems.org
238
- rubygems_mfa_required: 'true'
189
+ metadata: {}
239
190
  post_install_message:
240
191
  rdoc_options: []
241
192
  require_paths:
@@ -244,19 +195,26 @@ required_ruby_version: !ruby/object:Gem::Requirement
244
195
  requirements:
245
196
  - - ">="
246
197
  - !ruby/object:Gem::Version
247
- version: 2.6.0
198
+ version: 2.4.0
248
199
  - - "<"
249
200
  - !ruby/object:Gem::Version
250
- version: 3.2.0
201
+ version: 3.1.0
251
202
  required_rubygems_version: !ruby/object:Gem::Requirement
252
203
  requirements:
253
204
  - - ">="
254
205
  - !ruby/object:Gem::Version
255
206
  version: '0'
256
207
  requirements: []
257
- rubygems_version: 3.2.3
208
+ rubygems_version: 3.0.9
258
209
  signing_key:
259
210
  specification_version: 4
260
211
  summary: Read text and metadata from files and documents (.doc, .docx, .pages, .odt,
261
212
  .rtf, .pdf) using Apache Tika toolkit
262
- test_files: []
213
+ test_files:
214
+ - spec/helper.rb
215
+ - spec/henkei_spec.rb
216
+ - spec/samples/pipe-error.png
217
+ - spec/samples/sample filename with spaces.pages
218
+ - spec/samples/sample-metadata-values-with-colons.doc
219
+ - spec/samples/sample.docx
220
+ - spec/samples/sample.pages
@@ -1,37 +0,0 @@
1
- name: Test Henkei Ruby gem
2
-
3
- on:
4
- push:
5
- branches: [1.x]
6
- pull_request:
7
- branches: [1.x]
8
-
9
- env:
10
- CI: true
11
-
12
- jobs:
13
- test:
14
- runs-on: ubuntu-latest
15
- strategy:
16
- matrix:
17
- ruby-version: ['2.6', '2.7', '3.0', '3.1']
18
-
19
- steps:
20
- - uses: actions/checkout@v2
21
-
22
- - name: Set up Ruby
23
- uses: ruby/setup-ruby@v1
24
- with:
25
- ruby-version: ${{ matrix.ruby-version }}
26
- bundler-cache: true
27
-
28
- - name: Lint code - Rubocop
29
- run: bundle exec rubocop
30
-
31
- - name: Run tests
32
- run: bundle exec rspec
33
-
34
- - name: Test & publish code coverage
35
- uses: paambaati/codeclimate-action@v3.0.0
36
- env:
37
- CC_TEST_REPORTER_ID: bb96c1ff9dc66724c38fb4eb54486dd72dc88a7fd6e727c034b9cf8d747d069e