henkei 1.17.3 → 1.17.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 5d629bcc0d435522497e749752307e6c33844d29
4
- data.tar.gz: 490b5e0c89b43f4ec83434e3861c85d69de7a8bd
3
+ metadata.gz: 4215a4071c7a618e66330cb7eca8873eae851b7b
4
+ data.tar.gz: 488366df7fdf1863272dc121b7fa74f077036352
5
5
  SHA512:
6
- metadata.gz: 6f4e76efb3cf67bca58db9e6dc074511af3e8661b5c03800bbf28cce9939b01774c33aaa65ae24df5e46a5346cda97347e17add23e934cbaf0cbaf7c48e73246
7
- data.tar.gz: 21cf99d84f4428f3db892aabc5827f02c848e70bb693080ac33789f4ea66d64de03f88856af7e89c14a4b783a038e33c5fba7ad6229e316f99be2cc2c0ab5fa1
6
+ metadata.gz: 21ba327a8c1139b586944ca1d28e35158c66386a721da9fc6cb71d400f52e9f339931d17f39a3e1b2344e65f65a7ccddeeff84e515dc6c5d491e4005749071f4
7
+ data.tar.gz: 880bd8f6721d5c6659da8d10aac9949a582d670bdd18b64de16089c738c57c64fe7499d5405e0af736571317f79dedcf783b8ba0389b9794b90cbc939548916c
@@ -0,0 +1,15 @@
1
+ Metrics/BlockLength:
2
+ Exclude:
3
+ - 'spec/**/*'
4
+
5
+ Metrics/LineLength:
6
+ Max: 120
7
+
8
+ Metrics/MethodLength:
9
+ Max: 15
10
+
11
+ Style/ClassVars:
12
+ Enabled: false
13
+
14
+ Style/DoubleNegation:
15
+ Enabled: false
@@ -4,8 +4,6 @@ env:
4
4
 
5
5
  language: ruby
6
6
  rvm:
7
- - 1.9.3
8
- - 2.0.0
9
7
  - 2.1.10
10
8
  - 2.2.7
11
9
  - 2.3.6
@@ -15,12 +13,17 @@ rvm:
15
13
  before_install:
16
14
  - gem update bundler
17
15
 
16
+ install:
17
+ - bundle install --jobs=3 --retry=3
18
+ - gem install rubocop
19
+
18
20
  before_script:
19
21
  - curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter
20
22
  - chmod +x ./cc-test-reporter
21
23
  - ./cc-test-reporter before-build
22
24
 
23
25
  script:
26
+ - rubocop
24
27
  - bundle exec rspec
25
28
 
26
29
  after_script:
data/Rakefile CHANGED
@@ -5,4 +5,4 @@ require 'rspec/core/rake_task'
5
5
 
6
6
  RSpec::Core::RakeTask.new 'spec'
7
7
 
8
- task :default => :spec
8
+ task default: :spec
@@ -1,6 +1,6 @@
1
- # coding: utf-8
2
- lib = File.expand_path('../lib', __FILE__)
1
+ lib = File.expand_path('lib', __dir__)
3
2
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
+
4
4
  require 'henkei/version'
5
5
 
6
6
  Gem::Specification.new do |spec|
@@ -9,20 +9,22 @@ Gem::Specification.new do |spec|
9
9
  spec.authors = ['Erol Fornoles', 'Andrew Bromwich']
10
10
  spec.email = %w[erol.fornoles@gmail.com a.bromwich@gmail.com]
11
11
  spec.description = 'Read text and metadata from files and documents using Apache Tika toolkit'
12
- spec.summary = 'Read text and metadata from files and documents (.doc, .docx, .pages, .odt, .rtf, .pdf) using Apache Tika toolkit'
12
+ spec.summary = 'Read text and metadata from files and documents ' \
13
+ '(.doc, .docx, .pages, .odt, .rtf, .pdf) using Apache Tika toolkit'
13
14
  spec.homepage = 'http://github.com/abrom/henkei'
14
15
  spec.license = 'MIT'
15
16
 
16
- spec.files = `git ls-files`.split($/)
17
+ spec.files = `git ls-files`.split("\n")
17
18
  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
19
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
20
  spec.require_paths = ['lib']
20
21
 
21
- spec.add_runtime_dependency 'mime-types', '>= 1.23', '< 4'
22
22
  spec.add_runtime_dependency 'json', '>= 1.8', '< 3'
23
+ spec.add_runtime_dependency 'mime-types', '>= 1.23', '< 4'
23
24
 
24
25
  spec.add_development_dependency 'bundler', '~> 1.3'
25
26
  spec.add_development_dependency 'rake', '~> 12.3'
26
27
  spec.add_development_dependency 'rspec', '~> 3.7'
28
+ spec.add_development_dependency 'rubocop', '~> 0.53'
27
29
  spec.add_development_dependency 'simplecov', '~> 0.15'
28
30
  end
@@ -9,7 +9,8 @@ require 'json'
9
9
  require 'socket'
10
10
  require 'stringio'
11
11
 
12
- class Henkei
12
+ # Read text and metadata from files and documents using Apache Tika toolkit
13
+ class Henkei # rubocop:disable Metrics/ClassLength
13
14
  GEM_PATH = File.dirname(File.dirname(__FILE__))
14
15
  JAR_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-app-1.17.jar')
15
16
  CONFIG_PATH = File.join(Henkei::GEM_PATH, 'jar', 'tika-config.xml')
@@ -25,17 +26,13 @@ class Henkei
25
26
  # metadata = Henkei.read :metadata, data
26
27
  #
27
28
  def self.read(type, data)
28
- result = @@server_pid ? server_read(type, data) : client_read(type, data)
29
+ result = @@server_pid ? server_read(data) : client_read(type, data)
29
30
 
30
31
  case type
31
- when :text
32
- result
33
- when :html
34
- result
35
- when :metadata
36
- JSON.parse(result)
37
- when :mimetype
38
- MIME::Types[JSON.parse(result)['Content-Type']].first
32
+ when :text then result
33
+ when :html then result
34
+ when :metadata then JSON.parse(result)
35
+ when :mimetype then MIME::Types[JSON.parse(result)['Content-Type']].first
39
36
  end
40
37
  end
41
38
 
@@ -55,17 +52,17 @@ class Henkei
55
52
  #
56
53
  def initialize(input)
57
54
  if input.is_a? String
58
- if File.exists? input
55
+ if File.exist? input
59
56
  @path = input
60
- elsif input =~ URI::regexp
57
+ elsif input =~ URI::DEFAULT_PARSER.make_regexp
61
58
  @uri = URI.parse input
62
59
  else
63
- raise Errno::ENOENT.new "missing file or invalid URI - #{input}"
60
+ raise Errno::ENOENT, "missing file or invalid URI - #{input}"
64
61
  end
65
62
  elsif input.respond_to? :read
66
63
  @stream = input
67
64
  else
68
- raise TypeError.new "can't read from #{input.class.name}"
65
+ raise TypeError, "can't read from #{input.class.name}"
69
66
  end
70
67
  end
71
68
 
@@ -112,7 +109,7 @@ class Henkei
112
109
  return @mimetype if defined? @mimetype
113
110
 
114
111
  type = metadata['Content-Type'].is_a?(Array) ? metadata['Content-Type'].first : metadata['Content-Type']
115
-
112
+
116
113
  @mimetype = MIME::Types[type].first
117
114
  end
118
115
 
@@ -123,12 +120,8 @@ class Henkei
123
120
  #
124
121
  def creation_date
125
122
  return @creation_date if defined? @creation_date
126
-
127
- if metadata['Creation-Date']
128
- @creation_date = Time.parse(metadata['Creation-Date'])
129
- else
130
- nil
131
- end
123
+ return unless metadata['Creation-Date']
124
+ @creation_date = Time.parse(metadata['Creation-Date'])
132
125
  end
133
126
 
134
127
  # Returns +true+ if the Henkei document was specified using a file path.
@@ -182,19 +175,19 @@ class Henkei
182
175
  #
183
176
  # type :html, :text or :metadata
184
177
  # custom_port e.g. 9293
185
- #
178
+ #
186
179
  # Henkei.server(:text, 9294)
187
180
  #
188
- def self.server(type, custom_port=nil)
181
+ def self.server(type, custom_port = nil)
189
182
  @@server_port = custom_port || DEFAULT_SERVER_PORT
190
-
183
+
191
184
  @@server_pid = Process.spawn tika_command(type, true)
192
185
  sleep(2) # Give the server 2 seconds to spin up.
193
186
  @@server_pid
194
187
  end
195
188
 
196
189
  # Kills server started by Henkei.server
197
- #
190
+ #
198
191
  # Always run this when you're done, or else Tika might run until you kill it manually
199
192
  # You might try putting your extraction in a begin..rescue...ensure...end block and
200
193
  # putting this method in the ensure block.
@@ -209,11 +202,11 @@ class Henkei
209
202
  # end
210
203
  #
211
204
  def self.kill_server!
212
- if @@server_pid
213
- Process.kill('INT', @@server_pid)
214
- @@server_pid = nil
215
- @@server_port = nil
216
- end
205
+ return unless @@server_pid
206
+
207
+ Process.kill('INT', @@server_pid)
208
+ @@server_pid = nil
209
+ @@server_port = nil
217
210
  end
218
211
 
219
212
  ### Private class methods
@@ -238,12 +231,12 @@ class Henkei
238
231
 
239
232
  # Internal helper for calling to running Tika server
240
233
  #
241
- def self.server_read(_, data)
234
+ def self.server_read(data)
242
235
  s = TCPSocket.new('localhost', @@server_port)
243
236
  file = StringIO.new(data, 'r')
244
237
 
245
- while 1
246
- chunk = file.read(65536)
238
+ loop do
239
+ chunk = file.read(65_536)
247
240
  break unless chunk
248
241
  s.write(chunk)
249
242
  end
@@ -252,8 +245,8 @@ class Henkei
252
245
  s.shutdown(Socket::SHUT_WR)
253
246
 
254
247
  resp = ''
255
- while 1
256
- chunk = s.recv(65536)
248
+ loop do
249
+ chunk = s.recv(65_536)
257
250
  break if chunk.empty? || !chunk
258
251
  resp << chunk
259
252
  end
@@ -1,3 +1,3 @@
1
1
  class Henkei
2
- VERSION = '1.17.3'
2
+ VERSION = '1.17.4'.freeze
3
3
  end
@@ -12,27 +12,31 @@ describe Henkei do
12
12
  it 'reads text' do
13
13
  text = Henkei.read :text, data
14
14
 
15
- expect( text ).to include 'The quick brown fox jumped over the lazy cat.'
15
+ expect(text).to include 'The quick brown fox jumped over the lazy cat.'
16
16
  end
17
17
 
18
18
  it 'reads metadata' do
19
19
  metadata = Henkei.read :metadata, data
20
20
 
21
- expect( metadata['Content-Type'] ).to eql 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
21
+ expect(metadata['Content-Type']).to(
22
+ eq 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
23
+ )
22
24
  end
23
25
 
24
26
  it 'reads metadata values with colons as strings' do
25
27
  data = File.read 'spec/samples/sample-metadata-values-with-colons.doc'
26
28
  metadata = Henkei.read :metadata, data
27
29
 
28
- expect( metadata['dc:title'] ).to eql 'problem: test'
30
+ expect(metadata['dc:title']).to eq 'problem: test'
29
31
  end
30
32
 
31
33
  it 'reads mimetype' do
32
34
  mimetype = Henkei.read :mimetype, data
33
35
 
34
- expect( mimetype.content_type ).to eql 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
35
- expect( mimetype.extensions ).to include 'docx'
36
+ expect(mimetype.content_type).to(
37
+ eq 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
38
+ )
39
+ expect(mimetype.extensions).to include 'docx'
36
40
  end
37
41
  end
38
42
 
@@ -44,47 +48,47 @@ describe Henkei do
44
48
  it 'accepts a root path' do
45
49
  henkei = Henkei.new 'spec/samples/sample.pages'
46
50
 
47
- expect( henkei ).to be_path
48
- expect( henkei ).not_to be_uri
49
- expect( henkei ).not_to be_stream
51
+ expect(henkei).to be_path
52
+ expect(henkei).not_to be_uri
53
+ expect(henkei).not_to be_stream
50
54
  end
51
55
 
52
56
  it 'accepts a relative path' do
53
57
  henkei = Henkei.new 'spec/samples/sample.pages'
54
58
 
55
- expect( henkei ).to be_path
56
- expect( henkei ).not_to be_uri
57
- expect( henkei ).not_to be_stream
59
+ expect(henkei).to be_path
60
+ expect(henkei).not_to be_uri
61
+ expect(henkei).not_to be_stream
58
62
  end
59
63
 
60
64
  it 'accepts a path with spaces' do
61
65
  henkei = Henkei.new 'spec/samples/sample filename with spaces.pages'
62
66
 
63
- expect( henkei ).to be_path
64
- expect( henkei ).not_to be_uri
65
- expect( henkei ).not_to be_stream
67
+ expect(henkei).to be_path
68
+ expect(henkei).not_to be_uri
69
+ expect(henkei).not_to be_stream
66
70
  end
67
71
 
68
72
  it 'accepts a URI' do
69
73
  henkei = Henkei.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx'
70
74
 
71
- expect( henkei ).to be_uri
72
- expect( henkei ).not_to be_path
73
- expect( henkei ).not_to be_stream
75
+ expect(henkei).to be_uri
76
+ expect(henkei).not_to be_path
77
+ expect(henkei).not_to be_stream
74
78
  end
75
79
 
76
80
  it 'accepts a stream or object that can be read' do
77
81
  File.open 'spec/samples/sample.pages', 'r' do |file|
78
82
  henkei = Henkei.new file
79
83
 
80
- expect( henkei ).to be_stream
81
- expect( henkei ).not_to be_path
82
- expect( henkei ).not_to be_uri
84
+ expect(henkei).to be_stream
85
+ expect(henkei).not_to be_path
86
+ expect(henkei).not_to be_uri
83
87
  end
84
88
  end
85
89
 
86
90
  it 'refuses a path to a missing file' do
87
- expect { Henkei.new 'test/sample/missing.pages'}.to raise_error Errno::ENOENT
91
+ expect { Henkei.new 'test/sample/missing.pages' }.to raise_error Errno::ENOENT
88
92
  end
89
93
 
90
94
  it 'refuses other objects' do
@@ -94,23 +98,22 @@ describe Henkei do
94
98
  end
95
99
  end
96
100
 
97
-
98
101
  describe '.creation_date' do
99
102
  let(:henkei) { Henkei.new 'spec/samples/sample.pages' }
100
103
  it 'should return Time' do
101
- expect( henkei.creation_date ).to be_a Time
104
+ expect(henkei.creation_date).to be_a Time
102
105
  end
103
106
  end
104
107
 
105
108
  describe '.java' do
106
109
  specify 'with no specified JAVA_HOME' do
107
- expect( Henkei.send(:java_path) ).to eql 'java'
110
+ expect(Henkei.send(:java_path)).to eq 'java'
108
111
  end
109
112
 
110
113
  specify 'with a specified JAVA_HOME' do
111
114
  ENV['JAVA_HOME'] = '/path/to/java/home'
112
115
 
113
- expect( Henkei.send(:java_path) ).to eql '/path/to/java/home/bin/java'
116
+ expect(Henkei.send(:java_path)).to eq '/path/to/java/home/bin/java'
114
117
  end
115
118
  end
116
119
 
@@ -118,11 +121,11 @@ describe Henkei do
118
121
  let(:henkei) { Henkei.new 'spec/samples/sample.pages' }
119
122
 
120
123
  specify '#text reads text' do
121
- expect( henkei.text).to include 'The quick brown fox jumped over the lazy cat.'
124
+ expect(henkei.text).to include 'The quick brown fox jumped over the lazy cat.'
122
125
  end
123
126
 
124
127
  specify '#metadata reads metadata' do
125
- expect( henkei.metadata['Content-Type'] ).to eql ['application/vnd.apple.pages', 'application/vnd.apple.pages']
128
+ expect(henkei.metadata['Content-Type']).to eq %w[application/vnd.apple.pages application/vnd.apple.pages]
126
129
  end
127
130
  end
128
131
 
@@ -130,11 +133,13 @@ describe Henkei do
130
133
  let(:henkei) { Henkei.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx' }
131
134
 
132
135
  specify '#text reads text' do
133
- expect( henkei.text ).to include 'Lorem ipsum dolor sit amet, consectetuer adipiscing elit.'
136
+ expect(henkei.text).to include 'Lorem ipsum dolor sit amet, consectetuer adipiscing elit.'
134
137
  end
135
138
 
136
139
  specify '#metadata reads metadata' do
137
- expect( henkei.metadata['Content-Type'] ).to eql 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
140
+ expect(henkei.metadata['Content-Type']).to(
141
+ eq 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
142
+ )
138
143
  end
139
144
  end
140
145
 
@@ -142,11 +147,11 @@ describe Henkei do
142
147
  let(:henkei) { Henkei.new File.open('spec/samples/sample.pages', 'rb') }
143
148
 
144
149
  specify '#text reads text' do
145
- expect( henkei.text ).to include 'The quick brown fox jumped over the lazy cat.'
150
+ expect(henkei.text).to include 'The quick brown fox jumped over the lazy cat.'
146
151
  end
147
152
 
148
153
  specify '#metadata reads metadata' do
149
- expect( henkei.metadata['Content-Type'] ).to eql ['application/vnd.apple.pages', 'application/vnd.apple.pages']
154
+ expect(henkei.metadata['Content-Type']).to eq %w[application/vnd.apple.pages application/vnd.apple.pages]
150
155
  end
151
156
  end
152
157
 
@@ -171,9 +176,15 @@ describe Henkei do
171
176
  specify '#runs samples through server mode' do
172
177
  begin
173
178
  Henkei.server(:text)
174
- expect(Henkei.new('spec/samples/sample.pages').text).to include 'The quick brown fox jumped over the lazy cat.'
175
- expect(Henkei.new('spec/samples/sample filename with spaces.pages').text).to include 'The quick brown fox jumped over the lazy cat.'
176
- expect(Henkei.new('spec/samples/sample.docx').text).to include 'The quick brown fox jumped over the lazy cat.'
179
+ expect(Henkei.new('spec/samples/sample.pages').text).to(
180
+ include 'The quick brown fox jumped over the lazy cat.'
181
+ )
182
+ expect(Henkei.new('spec/samples/sample filename with spaces.pages').text).to(
183
+ include 'The quick brown fox jumped over the lazy cat.'
184
+ )
185
+ expect(Henkei.new('spec/samples/sample.docx').text).to(
186
+ include 'The quick brown fox jumped over the lazy cat.'
187
+ )
177
188
  ensure
178
189
  Henkei.kill_server!
179
190
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: henkei
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.17.3
4
+ version: 1.17.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Erol Fornoles
@@ -12,45 +12,45 @@ cert_chain: []
12
12
  date: 2018-03-07 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
- name: mime-types
15
+ name: json
16
16
  requirement: !ruby/object:Gem::Requirement
17
17
  requirements:
18
18
  - - ">="
19
19
  - !ruby/object:Gem::Version
20
- version: '1.23'
20
+ version: '1.8'
21
21
  - - "<"
22
22
  - !ruby/object:Gem::Version
23
- version: '4'
23
+ version: '3'
24
24
  type: :runtime
25
25
  prerelease: false
26
26
  version_requirements: !ruby/object:Gem::Requirement
27
27
  requirements:
28
28
  - - ">="
29
29
  - !ruby/object:Gem::Version
30
- version: '1.23'
30
+ version: '1.8'
31
31
  - - "<"
32
32
  - !ruby/object:Gem::Version
33
- version: '4'
33
+ version: '3'
34
34
  - !ruby/object:Gem::Dependency
35
- name: json
35
+ name: mime-types
36
36
  requirement: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - ">="
39
39
  - !ruby/object:Gem::Version
40
- version: '1.8'
40
+ version: '1.23'
41
41
  - - "<"
42
42
  - !ruby/object:Gem::Version
43
- version: '3'
43
+ version: '4'
44
44
  type: :runtime
45
45
  prerelease: false
46
46
  version_requirements: !ruby/object:Gem::Requirement
47
47
  requirements:
48
48
  - - ">="
49
49
  - !ruby/object:Gem::Version
50
- version: '1.8'
50
+ version: '1.23'
51
51
  - - "<"
52
52
  - !ruby/object:Gem::Version
53
- version: '3'
53
+ version: '4'
54
54
  - !ruby/object:Gem::Dependency
55
55
  name: bundler
56
56
  requirement: !ruby/object:Gem::Requirement
@@ -93,6 +93,20 @@ dependencies:
93
93
  - - "~>"
94
94
  - !ruby/object:Gem::Version
95
95
  version: '3.7'
96
+ - !ruby/object:Gem::Dependency
97
+ name: rubocop
98
+ requirement: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - "~>"
101
+ - !ruby/object:Gem::Version
102
+ version: '0.53'
103
+ type: :development
104
+ prerelease: false
105
+ version_requirements: !ruby/object:Gem::Requirement
106
+ requirements:
107
+ - - "~>"
108
+ - !ruby/object:Gem::Version
109
+ version: '0.53'
96
110
  - !ruby/object:Gem::Dependency
97
111
  name: simplecov
98
112
  requirement: !ruby/object:Gem::Requirement
@@ -117,6 +131,7 @@ extra_rdoc_files: []
117
131
  files:
118
132
  - ".gitignore"
119
133
  - ".rspec"
134
+ - ".rubocop.yml"
120
135
  - ".travis.yml"
121
136
  - Gemfile
122
137
  - LICENSE