henkei 1.17.3 → 1.17.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 5d629bcc0d435522497e749752307e6c33844d29
4
- data.tar.gz: 490b5e0c89b43f4ec83434e3861c85d69de7a8bd
3
+ metadata.gz: 4215a4071c7a618e66330cb7eca8873eae851b7b
4
+ data.tar.gz: 488366df7fdf1863272dc121b7fa74f077036352
5
5
  SHA512:
6
- metadata.gz: 6f4e76efb3cf67bca58db9e6dc074511af3e8661b5c03800bbf28cce9939b01774c33aaa65ae24df5e46a5346cda97347e17add23e934cbaf0cbaf7c48e73246
7
- data.tar.gz: 21cf99d84f4428f3db892aabc5827f02c848e70bb693080ac33789f4ea66d64de03f88856af7e89c14a4b783a038e33c5fba7ad6229e316f99be2cc2c0ab5fa1
6
+ metadata.gz: 21ba327a8c1139b586944ca1d28e35158c66386a721da9fc6cb71d400f52e9f339931d17f39a3e1b2344e65f65a7ccddeeff84e515dc6c5d491e4005749071f4
7
+ data.tar.gz: 880bd8f6721d5c6659da8d10aac9949a582d670bdd18b64de16089c738c57c64fe7499d5405e0af736571317f79dedcf783b8ba0389b9794b90cbc939548916c
@@ -0,0 +1,15 @@
1
+ Metrics/BlockLength:
2
+ Exclude:
3
+ - 'spec/**/*'
4
+
5
+ Metrics/LineLength:
6
+ Max: 120
7
+
8
+ Metrics/MethodLength:
9
+ Max: 15
10
+
11
+ Style/ClassVars:
12
+ Enabled: false
13
+
14
+ Style/DoubleNegation:
15
+ Enabled: false
@@ -4,8 +4,6 @@ env:
4
4
 
5
5
  language: ruby
6
6
  rvm:
7
- - 1.9.3
8
- - 2.0.0
9
7
  - 2.1.10
10
8
  - 2.2.7
11
9
  - 2.3.6
@@ -15,12 +13,17 @@ rvm:
15
13
  before_install:
16
14
  - gem update bundler
17
15
 
16
+ install:
17
+ - bundle install --jobs=3 --retry=3
18
+ - gem install rubocop
19
+
18
20
  before_script:
19
21
  - curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter
20
22
  - chmod +x ./cc-test-reporter
21
23
  - ./cc-test-reporter before-build
22
24
 
23
25
  script:
26
+ - rubocop
24
27
  - bundle exec rspec
25
28
 
26
29
  after_script:
data/Rakefile CHANGED
@@ -5,4 +5,4 @@ require 'rspec/core/rake_task'
5
5
 
6
6
  RSpec::Core::RakeTask.new 'spec'
7
7
 
8
- task :default => :spec
8
+ task default: :spec
@@ -1,6 +1,6 @@
1
- # coding: utf-8
2
- lib = File.expand_path('../lib', __FILE__)
1
+ lib = File.expand_path('lib', __dir__)
3
2
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
+
4
4
  require 'henkei/version'
5
5
 
6
6
  Gem::Specification.new do |spec|
@@ -9,20 +9,22 @@ Gem::Specification.new do |spec|
9
9
  spec.authors = ['Erol Fornoles', 'Andrew Bromwich']
10
10
  spec.email = %w[erol.fornoles@gmail.com a.bromwich@gmail.com]
11
11
  spec.description = 'Read text and metadata from files and documents using Apache Tika toolkit'
12
- spec.summary = 'Read text and metadata from files and documents (.doc, .docx, .pages, .odt, .rtf, .pdf) using Apache Tika toolkit'
12
+ spec.summary = 'Read text and metadata from files and documents ' \
13
+ '(.doc, .docx, .pages, .odt, .rtf, .pdf) using Apache Tika toolkit'
13
14
  spec.homepage = 'http://github.com/abrom/henkei'
14
15
  spec.license = 'MIT'
15
16
 
16
- spec.files = `git ls-files`.split($/)
17
+ spec.files = `git ls-files`.split("\n")
17
18
  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
19
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
20
  spec.require_paths = ['lib']
20
21
 
21
- spec.add_runtime_dependency 'mime-types', '>= 1.23', '< 4'
22
22
  spec.add_runtime_dependency 'json', '>= 1.8', '< 3'
23
+ spec.add_runtime_dependency 'mime-types', '>= 1.23', '< 4'
23
24
 
24
25
  spec.add_development_dependency 'bundler', '~> 1.3'
25
26
  spec.add_development_dependency 'rake', '~> 12.3'
26
27
  spec.add_development_dependency 'rspec', '~> 3.7'
28
+ spec.add_development_dependency 'rubocop', '~> 0.53'
27
29
  spec.add_development_dependency 'simplecov', '~> 0.15'
28
30
  end
@@ -9,7 +9,8 @@ require 'json'
9
9
  require 'socket'
10
10
  require 'stringio'
11
11
 
12
- class Henkei
12
+ # Read text and metadata from files and documents using Apache Tika toolkit
13
+ class Henkei # rubocop:disable Metrics/ClassLength
13
14
  GEM_PATH = File.dirname(File.dirname(__FILE__))
14
15
  JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-1.17.jar')
15
16
  CONFIG_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-config.xml')
@@ -25,17 +26,13 @@ class Henkei
25
26
  # metadata = Henkei.read :metadata, data
26
27
  #
27
28
  def self.read(type, data)
28
- result = @@server_pid ? server_read(type, data) : client_read(type, data)
29
+ result = @@server_pid ? server_read(data) : client_read(type, data)
29
30
 
30
31
  case type
31
- when :text
32
- result
33
- when :html
34
- result
35
- when :metadata
36
- JSON.parse(result)
37
- when :mimetype
38
- MIME::Types[JSON.parse(result)['Content-Type']].first
32
+ when :text then result
33
+ when :html then result
34
+ when :metadata then JSON.parse(result)
35
+ when :mimetype then MIME::Types[JSON.parse(result)['Content-Type']].first
39
36
  end
40
37
  end
41
38
 
@@ -55,17 +52,17 @@ class Henkei
55
52
  #
56
53
  def initialize(input)
57
54
  if input.is_a? String
58
- if File.exists? input
55
+ if File.exist? input
59
56
  @path = input
60
- elsif input =~ URI::regexp
57
+ elsif input =~ URI::DEFAULT_PARSER.make_regexp
61
58
  @uri = URI.parse input
62
59
  else
63
- raise Errno::ENOENT.new "missing file or invalid URI - #{input}"
60
+ raise Errno::ENOENT, "missing file or invalid URI - #{input}"
64
61
  end
65
62
  elsif input.respond_to? :read
66
63
  @stream = input
67
64
  else
68
- raise TypeError.new "can't read from #{input.class.name}"
65
+ raise TypeError, "can't read from #{input.class.name}"
69
66
  end
70
67
  end
71
68
 
@@ -112,7 +109,7 @@ class Henkei
112
109
  return @mimetype if defined? @mimetype
113
110
 
114
111
  type = metadata['Content-Type'].is_a?(Array) ? metadata['Content-Type'].first : metadata['Content-Type']
115
-
112
+
116
113
  @mimetype = MIME::Types[type].first
117
114
  end
118
115
 
@@ -123,12 +120,8 @@ class Henkei
123
120
  #
124
121
  def creation_date
125
122
  return @creation_date if defined? @creation_date
126
-
127
- if metadata['Creation-Date']
128
- @creation_date = Time.parse(metadata['Creation-Date'])
129
- else
130
- nil
131
- end
123
+ return unless metadata['Creation-Date']
124
+ @creation_date = Time.parse(metadata['Creation-Date'])
132
125
  end
133
126
 
134
127
  # Returns +true+ if the Henkei document was specified using a file path.
@@ -182,19 +175,19 @@ class Henkei
182
175
  #
183
176
  # type :html, :text or :metadata
184
177
  # custom_port e.g. 9293
185
- #
178
+ #
186
179
  # Henkei.server(:text, 9294)
187
180
  #
188
- def self.server(type, custom_port=nil)
181
+ def self.server(type, custom_port = nil)
189
182
  @@server_port = custom_port || DEFAULT_SERVER_PORT
190
-
183
+
191
184
  @@server_pid = Process.spawn tika_command(type, true)
192
185
  sleep(2) # Give the server 2 seconds to spin up.
193
186
  @@server_pid
194
187
  end
195
188
 
196
189
  # Kills server started by Henkei.server
197
- #
190
+ #
198
191
  # Always run this when you're done, or else Tika might run until you kill it manually
199
192
  # You might try putting your extraction in a begin..rescue...ensure...end block and
200
193
  # putting this method in the ensure block.
@@ -209,11 +202,11 @@ class Henkei
209
202
  # end
210
203
  #
211
204
  def self.kill_server!
212
- if @@server_pid
213
- Process.kill('INT', @@server_pid)
214
- @@server_pid = nil
215
- @@server_port = nil
216
- end
205
+ return unless @@server_pid
206
+
207
+ Process.kill('INT', @@server_pid)
208
+ @@server_pid = nil
209
+ @@server_port = nil
217
210
  end
218
211
 
219
212
  ### Private class methods
@@ -238,12 +231,12 @@ class Henkei
238
231
 
239
232
  # Internal helper for calling to running Tika server
240
233
  #
241
- def self.server_read(_, data)
234
+ def self.server_read(data)
242
235
  s = TCPSocket.new('localhost', @@server_port)
243
236
  file = StringIO.new(data, 'r')
244
237
 
245
- while 1
246
- chunk = file.read(65536)
238
+ loop do
239
+ chunk = file.read(65_536)
247
240
  break unless chunk
248
241
  s.write(chunk)
249
242
  end
@@ -252,8 +245,8 @@ class Henkei
252
245
  s.shutdown(Socket::SHUT_WR)
253
246
 
254
247
  resp = ''
255
- while 1
256
- chunk = s.recv(65536)
248
+ loop do
249
+ chunk = s.recv(65_536)
257
250
  break if chunk.empty? || !chunk
258
251
  resp << chunk
259
252
  end
@@ -1,3 +1,3 @@
1
1
  class Henkei
2
- VERSION = '1.17.3'
2
+ VERSION = '1.17.4'.freeze
3
3
  end
@@ -12,27 +12,31 @@ describe Henkei do
12
12
  it 'reads text' do
13
13
  text = Henkei.read :text, data
14
14
 
15
- expect( text ).to include 'The quick brown fox jumped over the lazy cat.'
15
+ expect(text).to include 'The quick brown fox jumped over the lazy cat.'
16
16
  end
17
17
 
18
18
  it 'reads metadata' do
19
19
  metadata = Henkei.read :metadata, data
20
20
 
21
- expect( metadata['Content-Type'] ).to eql 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
21
+ expect(metadata['Content-Type']).to(
22
+ eq 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
23
+ )
22
24
  end
23
25
 
24
26
  it 'reads metadata values with colons as strings' do
25
27
  data = File.read 'spec/samples/sample-metadata-values-with-colons.doc'
26
28
  metadata = Henkei.read :metadata, data
27
29
 
28
- expect( metadata['dc:title'] ).to eql 'problem: test'
30
+ expect(metadata['dc:title']).to eq 'problem: test'
29
31
  end
30
32
 
31
33
  it 'reads mimetype' do
32
34
  mimetype = Henkei.read :mimetype, data
33
35
 
34
- expect( mimetype.content_type ).to eql 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
35
- expect( mimetype.extensions ).to include 'docx'
36
+ expect(mimetype.content_type).to(
37
+ eq 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
38
+ )
39
+ expect(mimetype.extensions).to include 'docx'
36
40
  end
37
41
  end
38
42
 
@@ -44,47 +48,47 @@ describe Henkei do
44
48
  it 'accepts a root path' do
45
49
  henkei = Henkei.new 'spec/samples/sample.pages'
46
50
 
47
- expect( henkei ).to be_path
48
- expect( henkei ).not_to be_uri
49
- expect( henkei ).not_to be_stream
51
+ expect(henkei).to be_path
52
+ expect(henkei).not_to be_uri
53
+ expect(henkei).not_to be_stream
50
54
  end
51
55
 
52
56
  it 'accepts a relative path' do
53
57
  henkei = Henkei.new 'spec/samples/sample.pages'
54
58
 
55
- expect( henkei ).to be_path
56
- expect( henkei ).not_to be_uri
57
- expect( henkei ).not_to be_stream
59
+ expect(henkei).to be_path
60
+ expect(henkei).not_to be_uri
61
+ expect(henkei).not_to be_stream
58
62
  end
59
63
 
60
64
  it 'accepts a path with spaces' do
61
65
  henkei = Henkei.new 'spec/samples/sample filename with spaces.pages'
62
66
 
63
- expect( henkei ).to be_path
64
- expect( henkei ).not_to be_uri
65
- expect( henkei ).not_to be_stream
67
+ expect(henkei).to be_path
68
+ expect(henkei).not_to be_uri
69
+ expect(henkei).not_to be_stream
66
70
  end
67
71
 
68
72
  it 'accepts a URI' do
69
73
  henkei = Henkei.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx'
70
74
 
71
- expect( henkei ).to be_uri
72
- expect( henkei ).not_to be_path
73
- expect( henkei ).not_to be_stream
75
+ expect(henkei).to be_uri
76
+ expect(henkei).not_to be_path
77
+ expect(henkei).not_to be_stream
74
78
  end
75
79
 
76
80
  it 'accepts a stream or object that can be read' do
77
81
  File.open 'spec/samples/sample.pages', 'r' do |file|
78
82
  henkei = Henkei.new file
79
83
 
80
- expect( henkei ).to be_stream
81
- expect( henkei ).not_to be_path
82
- expect( henkei ).not_to be_uri
84
+ expect(henkei).to be_stream
85
+ expect(henkei).not_to be_path
86
+ expect(henkei).not_to be_uri
83
87
  end
84
88
  end
85
89
 
86
90
  it 'refuses a path to a missing file' do
87
- expect { Henkei.new 'test/sample/missing.pages'}.to raise_error Errno::ENOENT
91
+ expect { Henkei.new 'test/sample/missing.pages' }.to raise_error Errno::ENOENT
88
92
  end
89
93
 
90
94
  it 'refuses other objects' do
@@ -94,23 +98,22 @@ describe Henkei do
94
98
  end
95
99
  end
96
100
 
97
-
98
101
  describe '.creation_date' do
99
102
  let(:henkei) { Henkei.new 'spec/samples/sample.pages' }
100
103
  it 'should return Time' do
101
- expect( henkei.creation_date ).to be_a Time
104
+ expect(henkei.creation_date).to be_a Time
102
105
  end
103
106
  end
104
107
 
105
108
  describe '.java' do
106
109
  specify 'with no specified JAVA_HOME' do
107
- expect( Henkei.send(:java_path) ).to eql 'java'
110
+ expect(Henkei.send(:java_path)).to eq 'java'
108
111
  end
109
112
 
110
113
  specify 'with a specified JAVA_HOME' do
111
114
  ENV['JAVA_HOME'] = '/path/to/java/home'
112
115
 
113
- expect( Henkei.send(:java_path) ).to eql '/path/to/java/home/bin/java'
116
+ expect(Henkei.send(:java_path)).to eq '/path/to/java/home/bin/java'
114
117
  end
115
118
  end
116
119
 
@@ -118,11 +121,11 @@ describe Henkei do
118
121
  let(:henkei) { Henkei.new 'spec/samples/sample.pages' }
119
122
 
120
123
  specify '#text reads text' do
121
- expect( henkei.text).to include 'The quick brown fox jumped over the lazy cat.'
124
+ expect(henkei.text).to include 'The quick brown fox jumped over the lazy cat.'
122
125
  end
123
126
 
124
127
  specify '#metadata reads metadata' do
125
- expect( henkei.metadata['Content-Type'] ).to eql ['application/vnd.apple.pages', 'application/vnd.apple.pages']
128
+ expect(henkei.metadata['Content-Type']).to eq %w[application/vnd.apple.pages application/vnd.apple.pages]
126
129
  end
127
130
  end
128
131
 
@@ -130,11 +133,13 @@ describe Henkei do
130
133
  let(:henkei) { Henkei.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx' }
131
134
 
132
135
  specify '#text reads text' do
133
- expect( henkei.text ).to include 'Lorem ipsum dolor sit amet, consectetuer adipiscing elit.'
136
+ expect(henkei.text).to include 'Lorem ipsum dolor sit amet, consectetuer adipiscing elit.'
134
137
  end
135
138
 
136
139
  specify '#metadata reads metadata' do
137
- expect( henkei.metadata['Content-Type'] ).to eql 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
140
+ expect(henkei.metadata['Content-Type']).to(
141
+ eq 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
142
+ )
138
143
  end
139
144
  end
140
145
 
@@ -142,11 +147,11 @@ describe Henkei do
142
147
  let(:henkei) { Henkei.new File.open('spec/samples/sample.pages', 'rb') }
143
148
 
144
149
  specify '#text reads text' do
145
- expect( henkei.text ).to include 'The quick brown fox jumped over the lazy cat.'
150
+ expect(henkei.text).to include 'The quick brown fox jumped over the lazy cat.'
146
151
  end
147
152
 
148
153
  specify '#metadata reads metadata' do
149
- expect( henkei.metadata['Content-Type'] ).to eql ['application/vnd.apple.pages', 'application/vnd.apple.pages']
154
+ expect(henkei.metadata['Content-Type']).to eq %w[application/vnd.apple.pages application/vnd.apple.pages]
150
155
  end
151
156
  end
152
157
 
@@ -171,9 +176,15 @@ describe Henkei do
171
176
  specify '#runs samples through server mode' do
172
177
  begin
173
178
  Henkei.server(:text)
174
- expect(Henkei.new('spec/samples/sample.pages').text).to include 'The quick brown fox jumped over the lazy cat.'
175
- expect(Henkei.new('spec/samples/sample filename with spaces.pages').text).to include 'The quick brown fox jumped over the lazy cat.'
176
- expect(Henkei.new('spec/samples/sample.docx').text).to include 'The quick brown fox jumped over the lazy cat.'
179
+ expect(Henkei.new('spec/samples/sample.pages').text).to(
180
+ include 'The quick brown fox jumped over the lazy cat.'
181
+ )
182
+ expect(Henkei.new('spec/samples/sample filename with spaces.pages').text).to(
183
+ include 'The quick brown fox jumped over the lazy cat.'
184
+ )
185
+ expect(Henkei.new('spec/samples/sample.docx').text).to(
186
+ include 'The quick brown fox jumped over the lazy cat.'
187
+ )
177
188
  ensure
178
189
  Henkei.kill_server!
179
190
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: henkei
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.17.3
4
+ version: 1.17.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Erol Fornoles
@@ -12,45 +12,45 @@ cert_chain: []
12
12
  date: 2018-03-07 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
- name: mime-types
15
+ name: json
16
16
  requirement: !ruby/object:Gem::Requirement
17
17
  requirements:
18
18
  - - ">="
19
19
  - !ruby/object:Gem::Version
20
- version: '1.23'
20
+ version: '1.8'
21
21
  - - "<"
22
22
  - !ruby/object:Gem::Version
23
- version: '4'
23
+ version: '3'
24
24
  type: :runtime
25
25
  prerelease: false
26
26
  version_requirements: !ruby/object:Gem::Requirement
27
27
  requirements:
28
28
  - - ">="
29
29
  - !ruby/object:Gem::Version
30
- version: '1.23'
30
+ version: '1.8'
31
31
  - - "<"
32
32
  - !ruby/object:Gem::Version
33
- version: '4'
33
+ version: '3'
34
34
  - !ruby/object:Gem::Dependency
35
- name: json
35
+ name: mime-types
36
36
  requirement: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - ">="
39
39
  - !ruby/object:Gem::Version
40
- version: '1.8'
40
+ version: '1.23'
41
41
  - - "<"
42
42
  - !ruby/object:Gem::Version
43
- version: '3'
43
+ version: '4'
44
44
  type: :runtime
45
45
  prerelease: false
46
46
  version_requirements: !ruby/object:Gem::Requirement
47
47
  requirements:
48
48
  - - ">="
49
49
  - !ruby/object:Gem::Version
50
- version: '1.8'
50
+ version: '1.23'
51
51
  - - "<"
52
52
  - !ruby/object:Gem::Version
53
- version: '3'
53
+ version: '4'
54
54
  - !ruby/object:Gem::Dependency
55
55
  name: bundler
56
56
  requirement: !ruby/object:Gem::Requirement
@@ -93,6 +93,20 @@ dependencies:
93
93
  - - "~>"
94
94
  - !ruby/object:Gem::Version
95
95
  version: '3.7'
96
+ - !ruby/object:Gem::Dependency
97
+ name: rubocop
98
+ requirement: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - "~>"
101
+ - !ruby/object:Gem::Version
102
+ version: '0.53'
103
+ type: :development
104
+ prerelease: false
105
+ version_requirements: !ruby/object:Gem::Requirement
106
+ requirements:
107
+ - - "~>"
108
+ - !ruby/object:Gem::Version
109
+ version: '0.53'
96
110
  - !ruby/object:Gem::Dependency
97
111
  name: simplecov
98
112
  requirement: !ruby/object:Gem::Requirement
@@ -117,6 +131,7 @@ extra_rdoc_files: []
117
131
  files:
118
132
  - ".gitignore"
119
133
  - ".rspec"
134
+ - ".rubocop.yml"
120
135
  - ".travis.yml"
121
136
  - Gemfile
122
137
  - LICENSE