henkei 2.9.2.1 → 2.9.2.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: db0832d80d7445a7cfd495055c4b9081af1a6aa987adccfd162985dccd4bb2f2
4
- data.tar.gz: 746bee81b787098e87b2c684a1cc33c48e3d34995b47c1d6848dd8e8f3d90991
3
+ metadata.gz: e944ff75afd6a3dc86edd752aaf8b9eddf1567dea93b1c1710cdf25be710b662
4
+ data.tar.gz: d8a4a86fb7c46fddaa17c9ad438ad8f48902e14d1e38021412590df3197306a4
5
5
  SHA512:
6
- metadata.gz: df61c967907e043f3ebfb1e60a76d097e6e565b1d0d0cebf71da021af295d8c3cfc0d897c7d49f5301cd1356b8e9a13ce9ec482174a1dbdad0aa3a021f267afe
7
- data.tar.gz: 410971738b61f572796a652d260658025fe8c64c15813774a617485d98882af210daaf8fc91e64946a9844e9a61feb6e9c6f16042647b7cf83fe561f80163781
6
+ metadata.gz: 8aa16da88e03ff8589c3605775cce4096a9a35d95528441171af5dcb4754279e4159ebdcae4d08d2789a33ac562d82386ff565daf02f3f800458e628af64f77e
7
+ data.tar.gz: 9cda283d4cfee5c766e6212eaa25834b0761190115c13243f8e6ad9b82e59d5ccaf9e44cb9a82fb9453e53ffc69ed92350ca521a3bf97cea065fffefafe28b70
@@ -14,7 +14,7 @@ jobs:
14
14
  runs-on: ubuntu-latest
15
15
  strategy:
16
16
  matrix:
17
- ruby-version: ['3.0', '3.1', '3.2', '3.3']
17
+ ruby-version: ['3.0', '3.1', '3.2', '3.3', '3.4']
18
18
 
19
19
  steps:
20
20
  - uses: actions/checkout@v3
data/README.md CHANGED
@@ -28,7 +28,7 @@ been split up. To keep the gem size down Henkei will only include the client app
28
28
  call to Henkei, a new Java process will be started, run your command, then terminate.
29
29
 
30
30
  Another change is the metadata keys. A lot of duplicate keys have been removed in favour of a more standards
31
- based approach. A list of the old vs new key names can be found [here](https://cwiki.apache.org/confluence/display/TIKA/Migrating+to+Tika+2.0.0#MigratingtoTika2.0.0-Metadata)
31
+ based approach. A list of the old vs new key names can be found [here](https://cwiki.apache.org/confluence/display/TIKA/Migrating+to+Tika+2.0.0#MigratingtoTika2.0.0-Metadata)
32
32
 
33
33
  ## Usage
34
34
 
@@ -111,12 +111,26 @@ henkei.mimetype.content_type #=> "application/vnd.openxmlformats-officedocument.
111
111
  henkei.mimetype.extensions #=> ['docx']
112
112
  ```
113
113
 
114
+ ### Output text in a specific character encoding
115
+
116
+ You can specify the output character encoding by passing in the optional `encoding` argument when calling the
117
+ `text` or `html` instance methods, as well as the `read` class method.
118
+
119
+ ```ruby
120
+ henkei = Henkei.new 'sample.pages'
121
+ utf_8_text = henkei.text(encoding: 'UTF-8')
122
+ utf_16_html = henkei.html(encoding: 'UTF-16')
123
+
124
+ data = File.read 'sample.pages'
125
+ utf_32_text = Henkei.read :text, data, encoding: 'UTF-32'
126
+ ```
127
+
114
128
  ## Installation and Dependencies
115
129
 
116
130
  ### Java Runtime
117
131
 
118
132
  Henkei packages the Apache Tika application jar and requires a working JRE to run.
119
- Check that you either have the `JAVA_HOME` environment variable set, or that `java` is in your path.
133
+ Check that you either have the `JAVA_HOME` environment variable set, or that `java` is in your path.
120
134
 
121
135
  ### Gem
122
136
 
@@ -131,7 +145,7 @@ And then execute:
131
145
  Or install it yourself as:
132
146
 
133
147
  $ gem install henkei
134
-
148
+
135
149
  ### Heroku
136
150
 
137
151
  Add the JVM Buildpack to your Heroku project:
data/henkei.gemspec CHANGED
@@ -15,7 +15,7 @@ Gem::Specification.new do |spec|
15
15
  '(.doc, .docx, .pages, .odt, .rtf, .pdf) using Apache Tika toolkit'
16
16
  spec.homepage = 'https://github.com/abrom/henkei'
17
17
  spec.license = 'MIT'
18
- spec.required_ruby_version = ['>= 3.0.0', '< 3.4.0']
18
+ spec.required_ruby_version = ['>= 3.0.0', '< 3.5.0']
19
19
 
20
20
  # Prevent pushing this gem to RubyGems.org by setting 'allowed_push_host', or
21
21
  # delete this section to allow pushing this gem to any host.
@@ -28,6 +28,6 @@ Gem::Specification.new do |spec|
28
28
  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
29
29
  spec.require_paths = ['lib']
30
30
 
31
- spec.add_runtime_dependency 'json', '>= 1.8', '< 3'
32
- spec.add_runtime_dependency 'mini_mime', '>= 0.1.1', '< 2'
31
+ spec.add_dependency 'json', '>= 1.8', '< 3'
32
+ spec.add_dependency 'mini_mime', '>= 0.1.1', '< 2'
33
33
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  class Henkei
4
- VERSION = '2.9.2.1'
4
+ VERSION = '2.9.2.3'
5
5
  end
data/lib/henkei.rb CHANGED
@@ -47,8 +47,8 @@ class Henkei # rubocop:disable Metrics/ClassLength
47
47
  # text = Henkei.read :text, data
48
48
  # metadata = Henkei.read :metadata, data
49
49
  #
50
- def self.read(type, data, include_ocr: false)
51
- result = client_read(type, data, include_ocr: include_ocr)
50
+ def self.read(type, data, include_ocr: false, encoding: nil)
51
+ result = client_read(type, data, include_ocr: include_ocr, encoding: encoding)
52
52
 
53
53
  case type
54
54
  when :text, :html then result
@@ -96,10 +96,14 @@ class Henkei # rubocop:disable Metrics/ClassLength
96
96
  #
97
97
  # henkei.text(include_ocr: true)
98
98
  #
99
- def text(include_ocr: false)
99
+ # Set the output character encoding (e.g. 'UTF-8')
100
+ #
101
+ # henkei.text(encoding: 'UTF-8')
102
+ #
103
+ def text(include_ocr: false, encoding: nil)
100
104
  return @text if defined? @text
101
105
 
102
- @text = Henkei.read :text, data, include_ocr: include_ocr
106
+ @text = Henkei.read :text, data, include_ocr: include_ocr, encoding: encoding
103
107
  end
104
108
 
105
109
  # Returns the text content of the Henkei document in HTML.
@@ -111,10 +115,14 @@ class Henkei # rubocop:disable Metrics/ClassLength
111
115
  #
112
116
  # henkei.html(include_ocr: true)
113
117
  #
114
- def html(include_ocr: false)
118
+ # Set the output character encoding (e.g. 'UTF-8')
119
+ #
120
+ # henkei.html(encoding: 'UTF-8')
121
+ #
122
+ def html(include_ocr: false, encoding: nil)
115
123
  return @html if defined? @html
116
124
 
117
- @html = Henkei.read :html, data, include_ocr: include_ocr
125
+ @html = Henkei.read :html, data, include_ocr: include_ocr, encoding: encoding
118
126
  end
119
127
 
120
128
  # Returns the metadata hash of the Henkei document.
@@ -211,20 +219,37 @@ class Henkei # rubocop:disable Metrics/ClassLength
211
219
 
212
220
  # Internal helper for calling to Tika library directly
213
221
  #
214
- def self.client_read(type, data, include_ocr: false)
215
- Open3.capture2(*tika_command(type, include_ocr: include_ocr), stdin_data: data, binmode: true).first
222
+ def self.client_read(type, data, include_ocr: false, encoding: nil)
223
+ unless encoding.nil? || Encoding.name_list.include?(encoding)
224
+ raise ArgumentError, "unsupported encoding - #{encoding}"
225
+ end
226
+
227
+ Open3.popen2(*tika_command(type, include_ocr: include_ocr, encoding: encoding)) do |stdin, stdout|
228
+ stdin.binmode
229
+ stdout.binmode
230
+ stdout.set_encoding encoding unless encoding.nil?
231
+
232
+ out_reader = Thread.new { stdout.read }
233
+
234
+ write_data_to_stdin(data, stdin)
235
+
236
+ stdin.close
237
+
238
+ out_reader.value
239
+ end
216
240
  end
217
241
  private_class_method :client_read
218
242
 
219
243
  # Internal helper for building the Java command to call Tika
220
244
  #
221
- def self.tika_command(type, include_ocr: false)
245
+ def self.tika_command(type, include_ocr: false, encoding: nil)
222
246
  [
223
247
  java_path,
224
248
  '-Djava.awt.headless=true',
225
249
  '-jar',
226
250
  Henkei::JAR_PATH,
227
- "--config=#{include_ocr ? Henkei::CONFIG_PATH : Henkei::CONFIG_WITHOUT_OCR_PATH}"
251
+ "--config=#{include_ocr ? Henkei::CONFIG_PATH : Henkei::CONFIG_WITHOUT_OCR_PATH}",
252
+ *("--encoding=#{encoding}" unless encoding.nil?)
228
253
  ] + switch_for_type(type)
229
254
  end
230
255
  private_class_method :tika_command
@@ -240,4 +265,21 @@ class Henkei # rubocop:disable Metrics/ClassLength
240
265
  }[type]
241
266
  end
242
267
  private_class_method :switch_for_type
268
+
269
+ # Internal helper for writing the input data to stdin when calling Tika
270
+ #
271
+ def self.write_data_to_stdin(data, stdin)
272
+ return unless data
273
+
274
+ begin
275
+ if data.respond_to? :readpartial
276
+ IO.copy_stream(data, stdin)
277
+ else
278
+ stdin.write data
279
+ end
280
+ rescue Errno::EPIPE
281
+ # Catch broken pipe.
282
+ end
283
+ end
284
+ private_class_method :write_data_to_stdin
243
285
  end
data/spec/henkei_spec.rb CHANGED
@@ -72,6 +72,26 @@ describe Henkei do
72
72
  end
73
73
  end
74
74
  end
75
+
76
+ context 'when a valid `encoding` value is provided' do
77
+ let(:encoding) { 'UTF-32' }
78
+
79
+ it 'returns the parsed text in the specified encoding' do
80
+ text = described_class.read :text, data, encoding: encoding
81
+
82
+ expect(text.encoding.name).to eq encoding
83
+ end
84
+ end
85
+
86
+ context 'when an invalid `encoding` value is provided' do
87
+ let(:encoding) { 'Beef' }
88
+
89
+ it 'raises an error' do
90
+ expect do
91
+ described_class.read :text, data, encoding: encoding
92
+ end.to raise_error(ArgumentError, "unsupported encoding - #{encoding}")
93
+ end
94
+ end
75
95
  end
76
96
 
77
97
  describe '.new' do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: henkei
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.9.2.1
4
+ version: 2.9.2.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Erol Fornoles
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2024-05-01 00:00:00.000000000 Z
12
+ date: 2025-01-13 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: json
@@ -102,7 +102,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
102
102
  version: 3.0.0
103
103
  - - "<"
104
104
  - !ruby/object:Gem::Version
105
- version: 3.4.0
105
+ version: 3.5.0
106
106
  required_rubygems_version: !ruby/object:Gem::Requirement
107
107
  requirements:
108
108
  - - ">="