henkei 2.9.2.1 → 2.9.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: db0832d80d7445a7cfd495055c4b9081af1a6aa987adccfd162985dccd4bb2f2
4
- data.tar.gz: 746bee81b787098e87b2c684a1cc33c48e3d34995b47c1d6848dd8e8f3d90991
3
+ metadata.gz: 3ea707f5995f2ff0c9036cb992b991e91ccf9f62b9298222cd81d24876678f96
4
+ data.tar.gz: 4dcd7c3394b3971a58ea11478147320388212bb67622602c795743a7b9d545f6
5
5
  SHA512:
6
- metadata.gz: df61c967907e043f3ebfb1e60a76d097e6e565b1d0d0cebf71da021af295d8c3cfc0d897c7d49f5301cd1356b8e9a13ce9ec482174a1dbdad0aa3a021f267afe
7
- data.tar.gz: 410971738b61f572796a652d260658025fe8c64c15813774a617485d98882af210daaf8fc91e64946a9844e9a61feb6e9c6f16042647b7cf83fe561f80163781
6
+ metadata.gz: 03a2c7e3be4f4065385351ddb1694ae9e5c53eae1ddfc8826a3b137868ded1c3a30efb95f515e97cf779fed09ef321565e4ad39c7c232cf315df3ba19c443625
7
+ data.tar.gz: cbccc2cad4ca82935aee2ffb2c60404264da5dc876f4a3cf05768fb70859dfebfbb436d2d62284e0d5b16d8b246920b0f38e7349d633cabb42facd8bd287ccca
data/README.md CHANGED
@@ -28,7 +28,7 @@ been split up. To keep the gem size down Henkei will only include the client app
28
28
  call to Henkei, a new Java process will be started, run your command, then terminate.
29
29
 
30
30
  Another change is the metadata keys. A lot of duplicate keys have been removed in favour of a more standards
31
- based approach. A list of the old vs new key names can be found [here](https://cwiki.apache.org/confluence/display/TIKA/Migrating+to+Tika+2.0.0#MigratingtoTika2.0.0-Metadata)
31
+ based approach. A list of the old vs new key names can be found [here](https://cwiki.apache.org/confluence/display/TIKA/Migrating+to+Tika+2.0.0#MigratingtoTika2.0.0-Metadata)
32
32
 
33
33
  ## Usage
34
34
 
@@ -111,12 +111,26 @@ henkei.mimetype.content_type #=> "application/vnd.openxmlformats-officedocument.
111
111
  henkei.mimetype.extensions #=> ['docx']
112
112
  ```
113
113
 
114
+ ### Output text in a specific character encoding
115
+
116
+ You can specify the output character encoding by passing the optional `encoding` argument when calling the
117
+ `text` or `html` instance methods, as well as the `read` class method.
118
+
119
+ ```ruby
120
+ henkei = Henkei.new 'sample.pages'
121
+ utf_8_text = henkei.text(encoding: 'UTF-8')
122
+ utf_16_html = henkei.html(encoding: 'UTF-16')
123
+
124
+ data = File.read 'sample.pages'
125
+ utf_32_text = Henkei.read :text, data, encoding: 'UTF-32'
126
+ ```
127
+
114
128
  ## Installation and Dependencies
115
129
 
116
130
  ### Java Runtime
117
131
 
118
132
  Henkei packages the Apache Tika application jar and requires a working JRE for it to work.
119
- Check that you either have the `JAVA_HOME` environment variable set, or that `java` is in your path.
133
+ Check that you either have the `JAVA_HOME` environment variable set, or that `java` is in your path.
120
134
 
121
135
  ### Gem
122
136
 
@@ -131,7 +145,7 @@ And then execute:
131
145
  Or install it yourself as:
132
146
 
133
147
  $ gem install henkei
134
-
148
+
135
149
  ### Heroku
136
150
 
137
151
  Add the JVM Buildpack to your Heroku project:
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  class Henkei
4
- VERSION = '2.9.2.1'
4
+ VERSION = '2.9.2.2'
5
5
  end
data/lib/henkei.rb CHANGED
@@ -47,8 +47,8 @@ class Henkei # rubocop:disable Metrics/ClassLength
47
47
  # text = Henkei.read :text, data
48
48
  # metadata = Henkei.read :metadata, data
49
49
  #
50
- def self.read(type, data, include_ocr: false)
51
- result = client_read(type, data, include_ocr: include_ocr)
50
+ def self.read(type, data, include_ocr: false, encoding: nil)
51
+ result = client_read(type, data, include_ocr: include_ocr, encoding: encoding)
52
52
 
53
53
  case type
54
54
  when :text, :html then result
@@ -96,10 +96,14 @@ class Henkei # rubocop:disable Metrics/ClassLength
96
96
  #
97
97
  # henkei.text(include_ocr: true)
98
98
  #
99
- def text(include_ocr: false)
99
+ # Set the output character encoding (e.g. 'UTF-8')
100
+ #
101
+ # henkei.text(encoding: 'UTF-8')
102
+ #
103
+ def text(include_ocr: false, encoding: nil)
100
104
  return @text if defined? @text
101
105
 
102
- @text = Henkei.read :text, data, include_ocr: include_ocr
106
+ @text = Henkei.read :text, data, include_ocr: include_ocr, encoding: encoding
103
107
  end
104
108
 
105
109
  # Returns the text content of the Henkei document in HTML.
@@ -111,10 +115,14 @@ class Henkei # rubocop:disable Metrics/ClassLength
111
115
  #
112
116
  # henkei.html(include_ocr: true)
113
117
  #
114
- def html(include_ocr: false)
118
+ # Set the output character encoding (e.g. 'UTF-8')
119
+ #
120
+ # henkei.html(encoding: 'UTF-8')
121
+ #
122
+ def html(include_ocr: false, encoding: nil)
115
123
  return @html if defined? @html
116
124
 
117
- @html = Henkei.read :html, data, include_ocr: include_ocr
125
+ @html = Henkei.read :html, data, include_ocr: include_ocr, encoding: encoding
118
126
  end
119
127
 
120
128
  # Returns the metadata hash of the Henkei document.
@@ -211,20 +219,37 @@ class Henkei # rubocop:disable Metrics/ClassLength
211
219
 
212
220
  # Internal helper for calling to Tika library directly
213
221
  #
214
- def self.client_read(type, data, include_ocr: false)
215
- Open3.capture2(*tika_command(type, include_ocr: include_ocr), stdin_data: data, binmode: true).first
222
+ def self.client_read(type, data, include_ocr: false, encoding: nil)
223
+ unless encoding.nil? || Encoding.name_list.include?(encoding)
224
+ raise ArgumentError, "unsupported encoding - #{encoding}"
225
+ end
226
+
227
+ Open3.popen2(*tika_command(type, include_ocr: include_ocr, encoding: encoding)) do |stdin, stdout|
228
+ stdin.binmode
229
+ stdout.binmode
230
+ stdout.set_encoding encoding unless encoding.nil?
231
+
232
+ out_reader = Thread.new { stdout.read }
233
+
234
+ write_data_to_stdin(data, stdin)
235
+
236
+ stdin.close
237
+
238
+ out_reader.value
239
+ end
216
240
  end
217
241
  private_class_method :client_read
218
242
 
219
243
  # Internal helper for building the Java command to call Tika
220
244
  #
221
- def self.tika_command(type, include_ocr: false)
245
+ def self.tika_command(type, include_ocr: false, encoding: nil)
222
246
  [
223
247
  java_path,
224
248
  '-Djava.awt.headless=true',
225
249
  '-jar',
226
250
  Henkei::JAR_PATH,
227
- "--config=#{include_ocr ? Henkei::CONFIG_PATH : Henkei::CONFIG_WITHOUT_OCR_PATH}"
251
+ "--config=#{include_ocr ? Henkei::CONFIG_PATH : Henkei::CONFIG_WITHOUT_OCR_PATH}",
252
+ *("--encoding=#{encoding}" unless encoding.nil?)
228
253
  ] + switch_for_type(type)
229
254
  end
230
255
  private_class_method :tika_command
@@ -240,4 +265,21 @@ class Henkei # rubocop:disable Metrics/ClassLength
240
265
  }[type]
241
266
  end
242
267
  private_class_method :switch_for_type
268
+
269
+ # Internal helper for writing the input data to stdin when calling Tika
270
+ #
271
+ def self.write_data_to_stdin(data, stdin)
272
+ return unless data
273
+
274
+ begin
275
+ if data.respond_to? :readpartial
276
+ IO.copy_stream(data, stdin)
277
+ else
278
+ stdin.write data
279
+ end
280
+ rescue Errno::EPIPE
281
+ # Catch broken pipe.
282
+ end
283
+ end
284
+ private_class_method :write_data_to_stdin
243
285
  end
data/spec/henkei_spec.rb CHANGED
@@ -72,6 +72,26 @@ describe Henkei do
72
72
  end
73
73
  end
74
74
  end
75
+
76
+ context 'when a valid `encoding` value is provided' do
77
+ let(:encoding) { 'UTF-32' }
78
+
79
+ it 'returns the parsed text in the specified encoding' do
80
+ text = described_class.read :text, data, encoding: encoding
81
+
82
+ expect(text.encoding.name).to eq encoding
83
+ end
84
+ end
85
+
86
+ context 'when an invalid `encoding` value is provided' do
87
+ let(:encoding) { 'Beef' }
88
+
89
+ it 'raises an error' do
90
+ expect do
91
+ described_class.read :text, data, encoding: encoding
92
+ end.to raise_error(ArgumentError, "unsupported encoding - #{encoding}")
93
+ end
94
+ end
75
95
  end
76
96
 
77
97
  describe '.new' do
metadata CHANGED
@@ -1,15 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: henkei
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.9.2.1
4
+ version: 2.9.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Erol Fornoles
8
8
  - Andrew Bromwich
9
- autorequire:
9
+ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2024-05-01 00:00:00.000000000 Z
12
+ date: 2024-06-13 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: json
@@ -91,7 +91,7 @@ licenses:
91
91
  metadata:
92
92
  allowed_push_host: https://rubygems.org
93
93
  rubygems_mfa_required: 'true'
94
- post_install_message:
94
+ post_install_message:
95
95
  rdoc_options: []
96
96
  require_paths:
97
97
  - lib
@@ -109,8 +109,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
109
109
  - !ruby/object:Gem::Version
110
110
  version: '0'
111
111
  requirements: []
112
- rubygems_version: 3.3.7
113
- signing_key:
112
+ rubygems_version: 3.0.0
113
+ signing_key:
114
114
  specification_version: 4
115
115
  summary: Read text and metadata from files and documents (.doc, .docx, .pages, .odt,
116
116
  .rtf, .pdf) using Apache Tika toolkit