henkei 1.27.1 → 2.2.1.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7f17479851871389aaeaa73798025c62d10cabc40a6ac703c2b25385258ae179
4
- data.tar.gz: 3e1ff327a0ff55ca55b8ce2d44ffce9bd493dcf2894129ecfc143b965d765e84
3
+ metadata.gz: 8722cfd61bfea12a7c0c9568d8b8dca05ffcb25ef10e4757a59611ac03fd452d
4
+ data.tar.gz: e2993c4a9a4144143915eff24334380d69e1c63f3d807dd76d8acf8a814c0953
5
5
  SHA512:
6
- metadata.gz: ab3a4254edd84f35c990013f684b1f3fbfb98f19c8d6e74aa3f27ffd4e3280c8e84b1679508b751af5178f3cd827cc5fb717d5d8819819fcc1cf77057272bae5
7
- data.tar.gz: 8c6d9fcc938653550877fdce23de4a15d0427e41de023a113fd27c3e0bf213765df445b9d14e45496302148a1dc8a7120591505ed68e55b50f4927bdcafb940b
6
+ metadata.gz: 2f4329296268adfc06d6fa2dbb68c3d6a97d54266a6fcde2503624aa9c1399b60bc7fe550c0af5844c9adc9bcd395ccf08780e4407f98325d9c720a9dc222401
7
+ data.tar.gz: cc12a3fdfa40e528068185c61fa1ed39416a26139989f30a4f52c2bb4f14c192a4927cb250927f231b70f0105c81e23cf90792108abf78326eff922a9ed7e261
data/.rubocop.yml CHANGED
@@ -1,5 +1,6 @@
1
1
  AllCops:
2
2
  NewCops: enable
3
+ TargetRubyVersion: 2.6
3
4
 
4
5
  Layout/EmptyLinesAroundAttributeAccessor:
5
6
  Enabled: true
data/.travis.yml CHANGED
@@ -7,10 +7,10 @@ env:
7
7
 
8
8
  language: ruby
9
9
  rvm:
10
- - 2.5
11
10
  - 2.6
12
11
  - 2.7
13
12
  - 3.0
13
+ - 3.1
14
14
 
15
15
  before_install:
16
16
  - gem update bundler
data/README.md CHANGED
@@ -21,6 +21,15 @@ Here are some of the formats supported:
21
21
  For the complete list of supported formats, please visit the Apache Tika
22
22
  [Supported Document Formats](http://tika.apache.org/0.9/formats.html) page.
23
23
 
24
+ ## Upgrading from v1.x to v2.x
25
+
26
+ Apache Tika v2.x brings with it some changes. One key change is that the Tika client and server applications have
27
+ been split up. To keep the gem size down, Henkei will only include the client app. That is to say, each time you
28
+ call Henkei, a new Java process is started, runs your command, and then terminates.
29
+
30
+ Another change is the metadata keys. A lot of duplicate keys have been removed in favour of a more
31
+ standards-based approach. A list of the old vs new key names can be found [here](https://cwiki.apache.org/confluence/display/TIKA/Migrating+to+Tika+2.0.0#MigratingtoTika2.0.0-Metadata).
32
+
24
33
  ## Usage
25
34
 
26
35
  Text, metadata and MIME type information can be extracted by calling `Henkei.read` directly:
@@ -69,6 +78,20 @@ post '/:name/:filename' do
69
78
  end
70
79
  ```
71
80
 
81
+ ### Reading text from inside images (OCR)
82
+
83
+ You can enable OCR by specifying the optional `include_ocr: true` argument when calling the `text` or `html` instance methods,
84
+ as well as the `read` class method. Note that, according to Tika, this will greatly increase processing time.
85
+
86
+ ```ruby
87
+ henkei = Henkei.new 'sample.pages'
88
+ text_with_ocr = henkei.text(include_ocr: true)
89
+ html_with_ocr = henkei.html(include_ocr: true)
90
+
91
+ data = File.read 'sample.pages'
92
+ text_with_ocr = Henkei.read :text, data, include_ocr: true
93
+ ```
94
+
72
95
  ### Reading metadata
73
96
 
74
97
  Metadata is returned as a hash.
data/henkei.gemspec CHANGED
@@ -15,7 +15,7 @@ Gem::Specification.new do |spec|
15
15
  '(.doc, .docx, .pages, .odt, .rtf, .pdf) using Apache Tika toolkit'
16
16
  spec.homepage = 'http://github.com/abrom/henkei'
17
17
  spec.license = 'MIT'
18
- spec.required_ruby_version = ['>= 2.4.0', '< 3.1.0']
18
+ spec.required_ruby_version = ['>= 2.6.0', '< 3.2.0']
19
19
 
20
20
  spec.files = `git ls-files`.split("\n")
21
21
  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
@@ -26,6 +26,7 @@ Gem::Specification.new do |spec|
26
26
  spec.add_runtime_dependency 'mini_mime', '>= 0.1.1', '< 2'
27
27
 
28
28
  spec.add_development_dependency 'bundler', '~> 2.0'
29
+ spec.add_development_dependency 'nokogiri', '~> 1.12'
29
30
  spec.add_development_dependency 'rails', '~> 5.0'
30
31
  spec.add_development_dependency 'rake', '~> 12.3'
31
32
  spec.add_development_dependency 'rspec', '~> 3.7'
@@ -0,0 +1,9 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <properties>
3
+ <service-loader initializableProblemHandler="ignore"/>
4
+ <parsers>
5
+ <parser class="org.apache.tika.parser.DefaultParser">
6
+ <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
7
+ </parser>
8
+ </parsers>
9
+ </properties>
data/jar/tika-config.xml CHANGED
@@ -1,3 +1,4 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
1
2
  <properties>
2
3
  <service-loader initializableProblemHandler="ignore"/>
3
4
  </properties>
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  class Henkei
4
- VERSION = '1.27.1'
4
+ VERSION = '2.2.1.2'
5
5
  end
data/lib/henkei.rb CHANGED
@@ -25,12 +25,9 @@ require 'open3'
25
25
  # Read text and metadata from files and documents using Apache Tika toolkit
26
26
  class Henkei # rubocop:disable Metrics/ClassLength
27
27
  GEM_PATH = File.dirname(File.dirname(__FILE__))
28
- JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-1.27.jar')
28
+ JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-2.2.1.jar')
29
29
  CONFIG_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-config.xml')
30
- DEFAULT_SERVER_PORT = 9293 # an arbitrary, but perfectly cromulent, port
31
-
32
- @@server_port = nil
33
- @@server_pid = nil
30
+ CONFIG_WITHOUT_OCR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-config-without-ocr.xml')
34
31
 
35
32
  def self.mimetype(content_type)
36
33
  if Henkei.configuration.mime_library == 'mime/types' && defined?(MIME::Types)
@@ -50,8 +47,8 @@ class Henkei # rubocop:disable Metrics/ClassLength
50
47
  # text = Henkei.read :text, data
51
48
  # metadata = Henkei.read :metadata, data
52
49
  #
53
- def self.read(type, data)
54
- result = @@server_pid ? server_read(data) : client_read(type, data)
50
+ def self.read(type, data, include_ocr: false)
51
+ result = client_read(type, data, include_ocr: include_ocr)
55
52
 
56
53
  case type
57
54
  when :text then result
@@ -96,10 +93,14 @@ class Henkei # rubocop:disable Metrics/ClassLength
96
93
  # henkei = Henkei.new 'sample.pages'
97
94
  # henkei.text
98
95
  #
99
- def text
96
+ # Include OCR results from images (includes embedded images in pages/docx/pdf etc)
97
+ #
98
+ # henkei.text(include_ocr: true)
99
+ #
100
+ def text(include_ocr: false)
100
101
  return @text if defined? @text
101
102
 
102
- @text = Henkei.read :text, data
103
+ @text = Henkei.read :text, data, include_ocr: include_ocr
103
104
  end
104
105
 
105
106
  # Returns the text content of the Henkei document in HTML.
@@ -107,10 +108,14 @@ class Henkei # rubocop:disable Metrics/ClassLength
107
108
  # henkei = Henkei.new 'sample.pages'
108
109
  # henkei.html
109
110
  #
110
- def html
111
+ # Include OCR results from images (includes embedded images in pages/docx/pdf etc)
112
+ #
113
+ # henkei.html(include_ocr: true)
114
+ #
115
+ def html(include_ocr: false)
111
116
  return @html if defined? @html
112
117
 
113
- @html = Henkei.read :html, data
118
+ @html = Henkei.read :html, data, include_ocr: include_ocr
114
119
  end
115
120
 
116
121
  # Returns the metadata hash of the Henkei document.
@@ -144,9 +149,9 @@ class Henkei # rubocop:disable Metrics/ClassLength
144
149
  #
145
150
  def creation_date
146
151
  return @creation_date if defined? @creation_date
147
- return unless metadata['Creation-Date']
152
+ return unless metadata['dcterms:created']
148
153
 
149
- @creation_date = Time.parse(metadata['Creation-Date'])
154
+ @creation_date = Time.parse(metadata['dcterms:created'])
150
155
  end
151
156
 
152
157
  # Returns +true+ if the Henkei document was specified using a file path.
@@ -196,44 +201,6 @@ class Henkei # rubocop:disable Metrics/ClassLength
196
201
  @data
197
202
  end
198
203
 
199
- # Returns pid of Tika server, started as a new spawned process.
200
- #
201
- # type :html, :text or :metadata
202
- # custom_port e.g. 9293
203
- #
204
- # Henkei.server(:text, 9294)
205
- #
206
- def self.server(type, custom_port = nil)
207
- @@server_port = custom_port || DEFAULT_SERVER_PORT
208
-
209
- @@server_pid = Process.spawn(*tika_command(type, server: true))
210
- sleep(2) # Give the server 2 seconds to spin up.
211
- @@server_pid
212
- end
213
-
214
- # Kills server started by Henkei.server
215
- #
216
- # Always run this when you're done, or else Tika might run until you kill it manually
217
- # You might try putting your extraction in a begin..rescue...ensure...end block and
218
- # putting this method in the ensure block.
219
- #
220
- # Henkei.server(:text)
221
- # reports = ["report1.docx", "report2.doc", "report3.pdf"]
222
- # begin
223
- # my_texts = reports.map{ |report_path| Henkei.new(report_path).text }
224
- # rescue
225
- # ensure
226
- # Henkei.kill_server!
227
- # end
228
- #
229
- def self.kill_server!
230
- return unless @@server_pid
231
-
232
- Process.kill('INT', @@server_pid)
233
- @@server_pid = nil
234
- @@server_port = nil
235
- end
236
-
237
204
  ### Private class methods
238
205
 
239
206
  # Provide the path to the Java binary
@@ -245,44 +212,21 @@ class Henkei # rubocop:disable Metrics/ClassLength
245
212
 
246
213
  # Internal helper for calling to Tika library directly
247
214
  #
248
- def self.client_read(type, data)
249
- Open3.capture2(*tika_command(type), stdin_data: data, binmode: true).first
215
+ def self.client_read(type, data, include_ocr: false)
216
+ Open3.capture2(*tika_command(type, include_ocr: include_ocr), stdin_data: data, binmode: true).first
250
217
  end
251
218
  private_class_method :client_read
252
219
 
253
- # Internal helper for calling to running Tika server
254
- #
255
- def self.server_read(data)
256
- s = TCPSocket.new('localhost', @@server_port)
257
- file = StringIO.new(data, 'r')
258
-
259
- loop do
260
- chunk = file.read(65_536)
261
- break unless chunk
262
-
263
- s.write(chunk)
264
- end
265
-
266
- # tell Tika that we're done sending data
267
- s.shutdown(Socket::SHUT_WR)
268
-
269
- resp = String.new ''
270
- loop do
271
- chunk = s.recv(65_536)
272
- break if chunk.empty? || !chunk
273
-
274
- resp << chunk
275
- end
276
- resp
277
- end
278
- private_class_method :server_read
279
-
280
220
  # Internal helper for building the Java command to call Tika
281
221
  #
282
- def self.tika_command(type, server: false)
283
- command = [java_path, '-Djava.awt.headless=true', '-jar', Henkei::JAR_PATH, "--config=#{Henkei::CONFIG_PATH}"]
284
- command += ['--server', '--port', @@server_port.to_s] if server
285
- command + switch_for_type(type)
222
+ def self.tika_command(type, include_ocr: false)
223
+ [
224
+ java_path,
225
+ '-Djava.awt.headless=true',
226
+ '-jar',
227
+ Henkei::JAR_PATH,
228
+ "--config=#{include_ocr ? Henkei::CONFIG_PATH : Henkei::CONFIG_WITHOUT_OCR_PATH}"
229
+ ] + switch_for_type(type)
286
230
  end
287
231
  private_class_method :tika_command
288
232
 
data/spec/henkei_spec.rb CHANGED
@@ -2,10 +2,15 @@
2
2
 
3
3
  require 'helper'
4
4
  require 'henkei'
5
+ require 'nokogiri'
5
6
 
6
7
  # Some of the tests have been known to fail in weird and wonderful ways when `rails` is included
7
8
  require 'rails' if ENV['INCLUDE_RAILS'] == 'true'
8
9
 
10
+ def travis_ci?
11
+ ENV['CI'] == 'true' && ENV['TRAVIS'] == 'true'
12
+ end
13
+
9
14
  describe Henkei do
10
15
  let(:data) { File.read 'spec/samples/sample.docx' }
11
16
 
@@ -52,6 +57,23 @@ describe Henkei do
52
57
 
53
58
  expect(text).to eq ''
54
59
  end
60
+
61
+ unless travis_ci?
62
+ context 'when `include_ocr` is enabled' do
63
+ it 'returns parsed plain text in the image' do
64
+ text = Henkei.read :text, data, include_ocr: true
65
+
66
+ expect(text).to include <<~TEXT
67
+ West Side
68
+
69
+ Sea Island
70
+ PP
71
+
72
+ Richmond
73
+ TEXT
74
+ end
75
+ end
76
+ end
55
77
  end
56
78
  end
57
79
 
@@ -115,6 +137,7 @@ describe Henkei do
115
137
 
116
138
  describe '.creation_date' do
117
139
  let(:henkei) { Henkei.new 'spec/samples/sample.pages' }
140
+
118
141
  it 'should return Time' do
119
142
  expect(henkei.creation_date).to be_a Time
120
143
  end
@@ -158,6 +181,30 @@ describe Henkei do
158
181
  it '#mimetype returns `image/png`' do
159
182
  expect(henkei.mimetype.content_type).to eq 'image/png'
160
183
  end
184
+
185
+ unless travis_ci?
186
+ context 'when `include_ocr` is enabled' do
187
+ it '#text returns plain text of parsed text in the image' do
188
+ expect(henkei.text(include_ocr: true)).to include <<~TEXT
189
+ West Side
190
+
191
+ Sea Island
192
+ PP
193
+
194
+ Richmond
195
+ TEXT
196
+ end
197
+
198
+ it '#html returns HTML of parsed text in the image' do
199
+ expect(henkei.html(include_ocr: true)).to include '<meta name="tiff:ImageWidth" content="792"/>'
200
+
201
+ html_body = Nokogiri::HTML(henkei.html(include_ocr: true)).at_xpath('//body')
202
+ ['Anmore', 'Coquitlam', 'West Side', 'Sea Island', 'Richmond', 'Steveston'].each do |location|
203
+ expect(html_body.text).to include location
204
+ end
205
+ end
206
+ end
207
+ end
161
208
  end
162
209
  end
163
210
 
@@ -198,40 +245,4 @@ describe Henkei do
198
245
  expect(henkei.metadata['Content-Type']).to eq 'application/pdf'
199
246
  end
200
247
  end
201
-
202
- context 'working as server mode' do
203
- specify '#starts and kills server' do
204
- begin
205
- Henkei.server(:text)
206
- expect(Henkei.class_variable_get(:@@server_pid)).not_to be_nil
207
- expect(Henkei.class_variable_get(:@@server_port)).not_to be_nil
208
-
209
- s = TCPSocket.new('localhost', Henkei.class_variable_get(:@@server_port))
210
- expect(s).to be_a TCPSocket
211
- s.close
212
- ensure
213
- port = Henkei.class_variable_get(:@@server_port)
214
- Henkei.kill_server!
215
- sleep 2
216
- expect { TCPSocket.new('localhost', port) }.to raise_error Errno::ECONNREFUSED
217
- end
218
- end
219
-
220
- specify '#runs samples through server mode' do
221
- begin
222
- Henkei.server(:text)
223
- expect(Henkei.new('spec/samples/sample.pages').text).to(
224
- include 'The quick brown fox jumped over the lazy cat.'
225
- )
226
- expect(Henkei.new('spec/samples/sample filename with spaces.pages').text).to(
227
- include 'The quick brown fox jumped over the lazy cat.'
228
- )
229
- expect(Henkei.new('spec/samples/sample.docx').text).to(
230
- include 'The quick brown fox jumped over the lazy cat.'
231
- )
232
- ensure
233
- Henkei.kill_server!
234
- end
235
- end
236
- end
237
248
  end
metadata CHANGED
@@ -1,15 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: henkei
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.27.1
4
+ version: 2.2.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Erol Fornoles
8
8
  - Andrew Bromwich
9
- autorequire:
9
+ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2021-07-19 00:00:00.000000000 Z
12
+ date: 2022-02-24 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: json
@@ -65,6 +65,20 @@ dependencies:
65
65
  - - "~>"
66
66
  - !ruby/object:Gem::Version
67
67
  version: '2.0'
68
+ - !ruby/object:Gem::Dependency
69
+ name: nokogiri
70
+ requirement: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - "~>"
73
+ - !ruby/object:Gem::Version
74
+ version: '1.12'
75
+ type: :development
76
+ prerelease: false
77
+ version_requirements: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - "~>"
80
+ - !ruby/object:Gem::Version
81
+ version: '1.12'
68
82
  - !ruby/object:Gem::Dependency
69
83
  name: rails
70
84
  requirement: !ruby/object:Gem::Requirement
@@ -155,7 +169,8 @@ files:
155
169
  - Rakefile
156
170
  - bin/console
157
171
  - henkei.gemspec
158
- - jar/tika-app-1.27.jar
172
+ - jar/tika-app-2.2.1.jar
173
+ - jar/tika-config-without-ocr.xml
159
174
  - jar/tika-config.xml
160
175
  - lib/henkei.rb
161
176
  - lib/henkei/configuration.rb
@@ -172,7 +187,7 @@ homepage: http://github.com/abrom/henkei
172
187
  licenses:
173
188
  - MIT
174
189
  metadata: {}
175
- post_install_message:
190
+ post_install_message:
176
191
  rdoc_options: []
177
192
  require_paths:
178
193
  - lib
@@ -180,18 +195,18 @@ required_ruby_version: !ruby/object:Gem::Requirement
180
195
  requirements:
181
196
  - - ">="
182
197
  - !ruby/object:Gem::Version
183
- version: 2.4.0
198
+ version: 2.6.0
184
199
  - - "<"
185
200
  - !ruby/object:Gem::Version
186
- version: 3.1.0
201
+ version: 3.2.0
187
202
  required_rubygems_version: !ruby/object:Gem::Requirement
188
203
  requirements:
189
204
  - - ">="
190
205
  - !ruby/object:Gem::Version
191
206
  version: '0'
192
207
  requirements: []
193
- rubygems_version: 3.0.6
194
- signing_key:
208
+ rubygems_version: 3.3.5
209
+ signing_key:
195
210
  specification_version: 4
196
211
  summary: Read text and metadata from files and documents (.doc, .docx, .pages, .odt,
197
212
  .rtf, .pdf) using Apache Tika toolkit