henkei 2.9.2.1 → 2.9.2.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: db0832d80d7445a7cfd495055c4b9081af1a6aa987adccfd162985dccd4bb2f2
4
- data.tar.gz: 746bee81b787098e87b2c684a1cc33c48e3d34995b47c1d6848dd8e8f3d90991
3
+ metadata.gz: 3ea707f5995f2ff0c9036cb992b991e91ccf9f62b9298222cd81d24876678f96
4
+ data.tar.gz: 4dcd7c3394b3971a58ea11478147320388212bb67622602c795743a7b9d545f6
5
5
  SHA512:
6
- metadata.gz: df61c967907e043f3ebfb1e60a76d097e6e565b1d0d0cebf71da021af295d8c3cfc0d897c7d49f5301cd1356b8e9a13ce9ec482174a1dbdad0aa3a021f267afe
7
- data.tar.gz: 410971738b61f572796a652d260658025fe8c64c15813774a617485d98882af210daaf8fc91e64946a9844e9a61feb6e9c6f16042647b7cf83fe561f80163781
6
+ metadata.gz: 03a2c7e3be4f4065385351ddb1694ae9e5c53eae1ddfc8826a3b137868ded1c3a30efb95f515e97cf779fed09ef321565e4ad39c7c232cf315df3ba19c443625
7
+ data.tar.gz: cbccc2cad4ca82935aee2ffb2c60404264da5dc876f4a3cf05768fb70859dfebfbb436d2d62284e0d5b16d8b246920b0f38e7349d633cabb42facd8bd287ccca
data/README.md CHANGED
@@ -28,7 +28,7 @@ been split up. To keep the gem size down Henkei will only include the client app
28
28
  call to Henkei, a new Java process will be started, run your command, then terminate.
29
29
 
30
30
  Another change is the metadata keys. A lot of duplicate keys have been removed in favour of a more standards
31
- based approach. A list of the old vs new key names can be found [here](https://cwiki.apache.org/confluence/display/TIKA/Migrating+to+Tika+2.0.0#MigratingtoTika2.0.0-Metadata)
31
+ based approach. A list of the old vs new key names can be found [here](https://cwiki.apache.org/confluence/display/TIKA/Migrating+to+Tika+2.0.0#MigratingtoTika2.0.0-Metadata)
32
32
 
33
33
  ## Usage
34
34
 
@@ -111,12 +111,26 @@ henkei.mimetype.content_type #=> "application/vnd.openxmlformats-officedocument.
111
111
  henkei.mimetype.extensions #=> ['docx']
112
112
  ```
113
113
 
114
+ ### Output text in a specific character encoding
115
+
116
+ You can specify the output character encoding by passing in the optional `encoding` argument when calling the
117
+ `text` or `html` instance methods, as well as the `read` class method.
118
+
119
+ ```ruby
120
+ henkei = Henkei.new 'sample.pages'
121
+ utf_8_text = henkei.text(encoding: 'UTF-8')
122
+ utf_16_html = henkei.html(encoding: 'UTF-16')
123
+
124
+ data = File.read 'sample.pages'
125
+ utf_32_text = Henkei.read :text, data, encoding: 'UTF-32'
126
+ ```
127
+
114
128
  ## Installation and Dependencies
115
129
 
116
130
  ### Java Runtime
117
131
 
118
132
  Henkei packages the Apache Tika application jar and requires a working JRE for it to work.
119
- Check that you either have the `JAVA_HOME` environment variable set, or that `java` is in your path.
133
+ Check that you either have the `JAVA_HOME` environment variable set, or that `java` is in your path.
120
134
 
121
135
  ### Gem
122
136
 
@@ -131,7 +145,7 @@ And then execute:
131
145
  Or install it yourself as:
132
146
 
133
147
  $ gem install henkei
134
-
148
+
135
149
  ### Heroku
136
150
 
137
151
  Add the JVM Buildpack to your Heroku project:
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  class Henkei
4
- VERSION = '2.9.2.1'
4
+ VERSION = '2.9.2.2'
5
5
  end
data/lib/henkei.rb CHANGED
@@ -47,8 +47,8 @@ class Henkei # rubocop:disable Metrics/ClassLength
47
47
  # text = Henkei.read :text, data
48
48
  # metadata = Henkei.read :metadata, data
49
49
  #
50
- def self.read(type, data, include_ocr: false)
51
- result = client_read(type, data, include_ocr: include_ocr)
50
+ def self.read(type, data, include_ocr: false, encoding: nil)
51
+ result = client_read(type, data, include_ocr: include_ocr, encoding: encoding)
52
52
 
53
53
  case type
54
54
  when :text, :html then result
@@ -96,10 +96,14 @@ class Henkei # rubocop:disable Metrics/ClassLength
96
96
  #
97
97
  # henkei.text(include_ocr: true)
98
98
  #
99
- def text(include_ocr: false)
99
+ # Set the output character encoding (e.g. 'UTF-8')
100
+ #
101
+ # henkei.text(encoding: 'UTF-8')
102
+ #
103
+ def text(include_ocr: false, encoding: nil)
100
104
  return @text if defined? @text
101
105
 
102
- @text = Henkei.read :text, data, include_ocr: include_ocr
106
+ @text = Henkei.read :text, data, include_ocr: include_ocr, encoding: encoding
103
107
  end
104
108
 
105
109
  # Returns the text content of the Henkei document in HTML.
@@ -111,10 +115,14 @@ class Henkei # rubocop:disable Metrics/ClassLength
111
115
  #
112
116
  # henkei.html(include_ocr: true)
113
117
  #
114
- def html(include_ocr: false)
118
+ # Set the output character encoding (e.g. 'UTF-8')
119
+ #
120
+ # henkei.html(encoding: 'UTF-8')
121
+ #
122
+ def html(include_ocr: false, encoding: nil)
115
123
  return @html if defined? @html
116
124
 
117
- @html = Henkei.read :html, data, include_ocr: include_ocr
125
+ @html = Henkei.read :html, data, include_ocr: include_ocr, encoding: encoding
118
126
  end
119
127
 
120
128
  # Returns the metadata hash of the Henkei document.
@@ -211,20 +219,37 @@ class Henkei # rubocop:disable Metrics/ClassLength
211
219
 
212
220
  # Internal helper for calling to Tika library directly
213
221
  #
214
- def self.client_read(type, data, include_ocr: false)
215
- Open3.capture2(*tika_command(type, include_ocr: include_ocr), stdin_data: data, binmode: true).first
222
+ def self.client_read(type, data, include_ocr: false, encoding: nil)
223
+ unless encoding.nil? || Encoding.name_list.include?(encoding)
224
+ raise ArgumentError, "unsupported encoding - #{encoding}"
225
+ end
226
+
227
+ Open3.popen2(*tika_command(type, include_ocr: include_ocr, encoding: encoding)) do |stdin, stdout|
228
+ stdin.binmode
229
+ stdout.binmode
230
+ stdout.set_encoding encoding unless encoding.nil?
231
+
232
+ out_reader = Thread.new { stdout.read }
233
+
234
+ write_data_to_stdin(data, stdin)
235
+
236
+ stdin.close
237
+
238
+ out_reader.value
239
+ end
216
240
  end
217
241
  private_class_method :client_read
218
242
 
219
243
  # Internal helper for building the Java command to call Tika
220
244
  #
221
- def self.tika_command(type, include_ocr: false)
245
+ def self.tika_command(type, include_ocr: false, encoding: nil)
222
246
  [
223
247
  java_path,
224
248
  '-Djava.awt.headless=true',
225
249
  '-jar',
226
250
  Henkei::JAR_PATH,
227
- "--config=#{include_ocr ? Henkei::CONFIG_PATH : Henkei::CONFIG_WITHOUT_OCR_PATH}"
251
+ "--config=#{include_ocr ? Henkei::CONFIG_PATH : Henkei::CONFIG_WITHOUT_OCR_PATH}",
252
+ *("--encoding=#{encoding}" unless encoding.nil?)
228
253
  ] + switch_for_type(type)
229
254
  end
230
255
  private_class_method :tika_command
@@ -240,4 +265,21 @@ class Henkei # rubocop:disable Metrics/ClassLength
240
265
  }[type]
241
266
  end
242
267
  private_class_method :switch_for_type
268
+
269
+ # Internal helper for writing the input data to stdin when calling Tika
270
+ #
271
+ def self.write_data_to_stdin(data, stdin)
272
+ return unless data
273
+
274
+ begin
275
+ if data.respond_to? :readpartial
276
+ IO.copy_stream(data, stdin)
277
+ else
278
+ stdin.write data
279
+ end
280
+ rescue Errno::EPIPE
281
+ # Catch broken pipe.
282
+ end
283
+ end
284
+ private_class_method :write_data_to_stdin
243
285
  end
data/spec/henkei_spec.rb CHANGED
@@ -72,6 +72,26 @@ describe Henkei do
72
72
  end
73
73
  end
74
74
  end
75
+
76
+ context 'when a valid `encoding` value is provided' do
77
+ let(:encoding) { 'UTF-32' }
78
+
79
+ it 'returns the parsed text in the specified encoding' do
80
+ text = described_class.read :text, data, encoding: encoding
81
+
82
+ expect(text.encoding.name).to eq encoding
83
+ end
84
+ end
85
+
86
+ context 'when an invalid `encoding` value is provided' do
87
+ let(:encoding) { 'Beef' }
88
+
89
+ it 'raises an error' do
90
+ expect do
91
+ described_class.read :text, data, encoding: encoding
92
+ end.to raise_error(ArgumentError, "unsupported encoding - #{encoding}")
93
+ end
94
+ end
75
95
  end
76
96
 
77
97
  describe '.new' do
metadata CHANGED
@@ -1,15 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: henkei
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.9.2.1
4
+ version: 2.9.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Erol Fornoles
8
8
  - Andrew Bromwich
9
- autorequire:
9
+ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2024-05-01 00:00:00.000000000 Z
12
+ date: 2024-06-13 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: json
@@ -91,7 +91,7 @@ licenses:
91
91
  metadata:
92
92
  allowed_push_host: https://rubygems.org
93
93
  rubygems_mfa_required: 'true'
94
- post_install_message:
94
+ post_install_message:
95
95
  rdoc_options: []
96
96
  require_paths:
97
97
  - lib
@@ -109,8 +109,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
109
109
  - !ruby/object:Gem::Version
110
110
  version: '0'
111
111
  requirements: []
112
- rubygems_version: 3.3.7
113
- signing_key:
112
+ rubygems_version: 3.0.0
113
+ signing_key:
114
114
  specification_version: 4
115
115
  summary: Read text and metadata from files and documents (.doc, .docx, .pages, .odt,
116
116
  .rtf, .pdf) using Apache Tika toolkit