ruby_tika_app 1.5.0 → 1.8.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 23bb9e8faef2930749c2a92c52bca6991c6f4cfd
4
- data.tar.gz: aff9a6f879ed8f100e3e72cc13077c3e81093eb0
2
+ SHA256:
3
+ metadata.gz: bbd4069c575a64c475bf64daa5529fb6d2d552ddd5282791708de6344bbe2755
4
+ data.tar.gz: b4254a9b725fa8a320fbe6a90d1c41077d972a181fb1f3a90447061fa43be9f6
5
5
  SHA512:
6
- metadata.gz: afc3e589189d68e809f5a7f56742676ad3b9daa520c2fa412f7af84671c416b31e18b90cf89aa713dcdbb8752e605cf511d854c939ea29b4bfbf509d59e742e2
7
- data.tar.gz: fc3a95fd56ff9cc646ae01f8ddbfb8942038af77fc8ae7dc6c7de7297eec182b6fad334b09f96ebf065a95c4967da109e6b207b3e4daebd660255753c50ce909
6
+ metadata.gz: f87262cb29711262b8ceef0ec55a4dd6db9a964999190734242efa00f92e4847a7ab72e1fae3e53318f93fb08b82f40dbd345f2a79021e4839a861f59878a970
7
+ data.tar.gz: cc425ee26c3a8f7879ae58b2872704e15bf589961b22d40c73bd43caea0b1720a589670c5897dee03acb707dbe067a7b97587bb0ce48818c541282f604b3b8da
data/LICENSE CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2011 Chris Parker
1
+ Copyright (c) 2011-2018 Chris Parker
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person obtaining
4
4
  a copy of this software and associated documentation files (the
@@ -0,0 +1,13 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <properties>
3
+ <service-loader initializableProblemHandler="ignore"/>
4
+ <parsers>
5
+ <!-- Default Parser for most things, except for 2 mime types, and never
6
+ use the Executable Parser -->
7
+ <parser class="org.apache.tika.parser.DefaultParser">
8
+ <mime-exclude>image/jpeg</mime-exclude>
9
+ <mime-exclude>application/x-sqlite3</mime-exclude>
10
+ <parser-exclude class="org.apache.tika.parser.jdbc.SQLite3Parser"/>
11
+ </parser>
12
+ </parsers>
13
+ </properties>
data/lib/ruby_tika_app.rb CHANGED
@@ -5,7 +5,6 @@ require 'stringio'
5
5
  require 'open4'
6
6
 
7
7
  class RubyTikaApp
8
-
9
8
  class Error < RuntimeError; end
10
9
 
11
10
  class CommandFailedError < Error
@@ -16,17 +15,19 @@ class RubyTikaApp
16
15
  end
17
16
 
18
17
  def initialize(document)
19
- if (document =~ /https?:\/\/[\S]+/) == 0
20
- @document = document
21
- else
22
- @document = "file://#{document}"
23
- end
18
+ @document = if (document =~ %r{https?:\/\/[\S]+}) == 0
19
+ document
20
+ else
21
+ "file://#{document}"
22
+ end
24
23
 
25
24
  java_cmd = 'java'
26
25
  java_args = '-server -Djava.awt.headless=true'
27
- tika_path = "#{File.join(File.dirname(__FILE__))}/../ext/tika-app-1.9.jar"
26
+ ext_dir = File.join(File.dirname(__FILE__))
27
+ tika_path = "#{ext_dir}/../ext/tika-app-1.19.1.jar"
28
+ tika_config_path = "#{ext_dir}/../ext/tika-config.xml"
28
29
 
29
- @tika_cmd = "#{java_cmd} #{java_args} -jar '#{tika_path}'"
30
+ @tika_cmd = "#{java_cmd} #{java_args} -jar '#{tika_path}' --config='#{tika_config_path}'"
30
31
  end
31
32
 
32
33
  def to_xml
@@ -58,7 +59,7 @@ class RubyTikaApp
58
59
  def run_tika(option)
59
60
  final_cmd = "#{@tika_cmd} #{option} '#{@document}'"
60
61
 
61
- pid, stdin, stdout, stderr = Open4::popen4(final_cmd)
62
+ _, stdin, stdout, stderr = Open4.popen4(final_cmd)
62
63
 
63
64
  stdout_result = stdout.read.strip
64
65
  stderr_result = stderr.read.strip
@@ -75,8 +76,11 @@ class RubyTikaApp
75
76
  stderr.close
76
77
  end
77
78
 
78
- def strip_stderr(s)
79
- s.gsub(/^(info|warn) - .*$/i, '').strip
79
+ def strip_stderr(err)
80
+ err
81
+ .gsub(/^(info|warn) - .*$/i, '')
82
+ .strip
83
+ .gsub(/Picked up JAVA_TOOL_OPTIONS: .+ -Dfile.encoding=UTF-8/i, '')
84
+ .strip
80
85
  end
81
-
82
86
  end
@@ -1,32 +1,31 @@
1
- # -*- encoding: utf-8 -*-
2
- $:.push File.expand_path('../lib', __FILE__)
1
+ $LOAD_PATH.push File.expand_path('lib', __dir__)
3
2
 
4
3
  Gem::Specification.new do |s|
5
4
  s.name = 'ruby_tika_app'
6
- s.version = '1.5.0'
5
+ s.version = '1.8.0'
7
6
  s.platform = Gem::Platform::RUBY
8
7
  s.authors = ['Chris Parker']
9
- s.email = %w(mrcsparker@gmail.com)
8
+ s.email = %w[mrcsparker@gmail.com]
10
9
  s.homepage = 'https://github.com/mrcsparker/ruby_tika_app'
11
- s.summary = %q{Wrapper around the tika-app jar}
12
- s.description = %q{Wrapper around the tika-app jar}
10
+ s.summary = 'Wrapper around the tika-app jar'
11
+ s.description = 'Wrapper around the tika-app jar'
13
12
 
14
13
  s.rubyforge_project = 'ruby_tika_app'
15
14
 
16
- s.files = `git ls-files`.split("\n") +
17
- %w(LICENSE README.md HISTORY)
18
- s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
19
- s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
20
- s.require_paths = %w(lib)
15
+ s.files = `git ls-files`.split("\n") +
16
+ %w[LICENSE README.md HISTORY]
17
+ s.executables = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
18
+ s.require_paths = %w[lib]
21
19
  s.test_files = Dir.glob('spec/**/*')
22
20
 
23
21
  s.add_runtime_dependency('open4')
24
22
 
25
- s.add_development_dependency('rake')
26
- s.add_development_dependency('rspec', '~> 3.3.0')
27
23
  s.add_development_dependency('bundler', '>= 1.0.15')
28
- s.add_development_dependency('simplecov')
29
24
  s.add_development_dependency('json')
25
+ s.add_development_dependency('pry')
30
26
  s.add_development_dependency('rack')
27
+ s.add_development_dependency('rake')
28
+ s.add_development_dependency('rspec', '~> 3.8.0')
29
+ s.add_development_dependency('simplecov')
31
30
  s.add_development_dependency('thin')
32
31
  end
@@ -1,10 +1,9 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  describe RubyTikaApp do
4
-
5
4
  before(:each) do
6
5
  doc_path = "#{File.join(File.dirname(__FILE__))}/docs"
7
-
6
+
8
7
  @test_file = "#{doc_path}/graph sampling simplex - 11.pdf"
9
8
 
10
9
  @cnn_com_file = "#{doc_path}/cnn.com"
@@ -13,50 +12,50 @@ describe RubyTikaApp do
13
12
 
14
13
  describe 'Error' do
15
14
  it 'has an error' do
16
- expect {
15
+ expect do
17
16
  rta = RubyTikaApp.new('No file')
18
17
  rta.to_xml
19
- }.to raise_error(RuntimeError)
18
+ end.to raise_error(RuntimeError)
20
19
  end
21
20
  end
22
21
 
23
22
  describe '#to_xml' do
24
23
  it 'header' do
25
24
  rta = RubyTikaApp.new(@test_file)
26
- expect(rta.to_xml[0..37]).to eq("<?xml version=\"1.0\" encoding=\"UTF-8\"?>")
25
+ expect(rta.to_xml[0..37]).to eq('<?xml version="1.0" encoding="UTF-8"?>')
27
26
  end
28
-
27
+
29
28
  it 'middle' do
30
29
  rta = RubyTikaApp.new(@test_file)
31
30
  xml = rta.to_xml
32
31
 
33
32
  xml_size = xml.size / 2
34
33
 
35
- expect(xml[xml_size..(xml_size + 100)]).to eq("plicated nodes make the node distribution converge\nto uniform distribution. We do not need to conside")
34
+ expect(xml[xml_size..(xml_size + 100)]).to eq("ph\nG. This methodology is also used in Frontier Sampling (FS).\nSince this is the only difference betw")
36
35
  end
37
36
  end
38
37
 
39
38
  describe '#to_html' do
40
39
  it 'header' do
41
40
  rta = RubyTikaApp.new(@test_file)
42
- expect(rta.to_html[0..42]).to eq("<html xmlns=\"http://www.w3.org/1999/xhtml\">")
41
+ expect(rta.to_html[0..42]).to eq('<html xmlns="http://www.w3.org/1999/xhtml">')
43
42
  end
44
43
 
45
44
  it 'middle' do
46
45
  rta = RubyTikaApp.new(@test_file)
47
- expect(rta.to_html[1000 ... 1100]).to eq("on/pdf\"/>\n<meta name=\"X-Parsed-By\" content=\"org.apache.tika.parser.DefaultParser\"/>\n<meta name=\"X-Pa")
46
+ expect(rta.to_html[1000...1100]).to eq("Z\"/>\n<meta name=\"meta:save-date\" content=\"2011-03-29T13:00:16Z\"/>\n<meta name=\"pdf:encrypted\" content")
48
47
  end
49
48
  end
50
49
 
51
50
  describe '#to_json' do
52
51
  it 'header' do
53
52
  rta = RubyTikaApp.new(@test_file)
54
- expect(rta.to_json[0..42]).to eq("{\"Application\":\"\\u0027Certified by IEEE PDF")
53
+ expect(rta.to_json[0..42]).to eq('{"Application":"\\u0027Certified by IEEE PDF')
55
54
  end
56
55
 
57
56
  it 'middle' do
58
57
  rta = RubyTikaApp.new(@test_file)
59
- expect(rta.to_json[100 ... 150]).to eq("\"171510\",\"Content-Type\":\"application/pdf\",\"Creatio")
58
+ expect(rta.to_json[100...150]).to eq('"171510","Content-Type":"application/pdf","Creatio')
60
59
  end
61
60
  end
62
61
 
@@ -68,7 +67,7 @@ describe RubyTikaApp do
68
67
 
69
68
  it 'middle' do
70
69
  rta = RubyTikaApp.new(@test_file)
71
- expect(rta.to_text[100 ... 150]).to eq("n Zhang3, Tianyin Xu2\n\nLong Jin1, Pan Hui4, Beixin")
70
+ expect(rta.to_text[100...150]).to eq("in Zhang3, Tianyin Xu2\n\nLong Jin1, Pan Hui4, Beixi")
72
71
  end
73
72
  end
74
73
 
@@ -80,7 +79,7 @@ describe RubyTikaApp do
80
79
 
81
80
  it 'middle' do
82
81
  rta = RubyTikaApp.new(@test_file)
83
- expect(rta.to_text_main[100 ... 150]).to eq("n Zhang3, Tianyin Xu2\nLong Jin1, Pan Hui4, Beixing")
82
+ expect(rta.to_text_main[100...150]).to eq("n Zhang3, Tianyin Xu2\nLong Jin1, Pan Hui4, Beixing")
84
83
  end
85
84
  end
86
85
 
@@ -92,7 +91,7 @@ describe RubyTikaApp do
92
91
 
93
92
  it 'middle' do
94
93
  rta = RubyTikaApp.new(@test_file)
95
- expect(rta.to_metadata[100 ... 150]).to eq("Type: application/pdf\nCreation-Date: 2011-03-29T12")
94
+ expect(rta.to_metadata[100...150]).to eq("Type: application/pdf\nCreation-Date: 2011-03-29T12")
96
95
  end
97
96
  end
98
97
 
@@ -109,5 +108,4 @@ describe RubyTikaApp do
109
108
  expect(rta.to_text).to eq(RubyTikaApp.new(@news_ycombinator_com_file).to_text)
110
109
  end
111
110
  end
112
-
113
111
  end
data/spec/spec_helper.rb CHANGED
@@ -8,12 +8,11 @@ require 'ruby_tika_app'
8
8
  require 'rspec'
9
9
 
10
10
  # Include all files under spec/support
11
- Dir['./spec/support/**/*.rb'].each {|f| require f}
11
+ Dir['./spec/support/**/*.rb'].each { |f| require f }
12
12
 
13
13
  # Start a local rack server to serve up test pages.
14
14
  @server_thread = Thread.new do
15
- Rack::Handler::Thin.run MyApp::Test::Server.new, :Port => 9299
15
+ Rack::Handler::Thin.run MyApp::Test::Server.new, Port: 9299
16
16
  end
17
17
 
18
18
  sleep(1) # wait a sec for the server to be booted
19
-
@@ -5,15 +5,15 @@ module MyApp
5
5
  module Test
6
6
  class Server
7
7
  def call(env)
8
- @root = "#{File.expand_path(File.dirname(__FILE__))}/../docs/"
8
+ @root = "#{__dir__}/../docs/"
9
9
  path = Rack::Utils.unescape(env['PATH_INFO'])
10
10
  path += 'index.html' if path == '/'
11
- file = @root + "#{path}"
11
+ file = @root + path.to_s
12
12
 
13
- if File.exists?(file)
14
- [ 200, {'Content-Type' => 'text/html'}, File.read(file) ]
13
+ if File.exist?(file)
14
+ [200, { 'Content-Type' => 'text/html' }, File.read(file)]
15
15
  else
16
- [ 404, {'Content-Type' => 'text/plain'}, 'file not found' ]
16
+ [404, { 'Content-Type' => 'text/plain' }, 'file not found']
17
17
  end
18
18
  end
19
19
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby_tika_app
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.5.0
4
+ version: 1.8.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Parker
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-06-30 00:00:00.000000000 Z
11
+ date: 2018-10-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: open4
@@ -25,49 +25,49 @@ dependencies:
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
27
  - !ruby/object:Gem::Dependency
28
- name: rake
28
+ name: bundler
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - ">="
32
32
  - !ruby/object:Gem::Version
33
- version: '0'
33
+ version: 1.0.15
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - ">="
39
39
  - !ruby/object:Gem::Version
40
- version: '0'
40
+ version: 1.0.15
41
41
  - !ruby/object:Gem::Dependency
42
- name: rspec
42
+ name: json
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - "~>"
45
+ - - ">="
46
46
  - !ruby/object:Gem::Version
47
- version: 3.3.0
47
+ version: '0'
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - "~>"
52
+ - - ">="
53
53
  - !ruby/object:Gem::Version
54
- version: 3.3.0
54
+ version: '0'
55
55
  - !ruby/object:Gem::Dependency
56
- name: bundler
56
+ name: pry
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
59
  - - ">="
60
60
  - !ruby/object:Gem::Version
61
- version: 1.0.15
61
+ version: '0'
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
66
  - - ">="
67
67
  - !ruby/object:Gem::Version
68
- version: 1.0.15
68
+ version: '0'
69
69
  - !ruby/object:Gem::Dependency
70
- name: simplecov
70
+ name: rack
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
73
  - - ">="
@@ -81,7 +81,7 @@ dependencies:
81
81
  - !ruby/object:Gem::Version
82
82
  version: '0'
83
83
  - !ruby/object:Gem::Dependency
84
- name: json
84
+ name: rake
85
85
  requirement: !ruby/object:Gem::Requirement
86
86
  requirements:
87
87
  - - ">="
@@ -95,7 +95,21 @@ dependencies:
95
95
  - !ruby/object:Gem::Version
96
96
  version: '0'
97
97
  - !ruby/object:Gem::Dependency
98
- name: rack
98
+ name: rspec
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: 3.8.0
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: 3.8.0
111
+ - !ruby/object:Gem::Dependency
112
+ name: simplecov
99
113
  requirement: !ruby/object:Gem::Requirement
100
114
  requirements:
101
115
  - - ">="
@@ -136,7 +150,8 @@ files:
136
150
  - LICENSE
137
151
  - README.md
138
152
  - Rakefile
139
- - ext/tika-app-1.9.jar
153
+ - ext/tika-app-1.19.1.jar
154
+ - ext/tika-config.xml
140
155
  - lib/ruby_tika_app.rb
141
156
  - ruby_tika_app.gemspec
142
157
  - spec/docs/cnn.com
@@ -164,14 +179,14 @@ required_rubygems_version: !ruby/object:Gem::Requirement
164
179
  version: '0'
165
180
  requirements: []
166
181
  rubyforge_project: ruby_tika_app
167
- rubygems_version: 2.4.5
182
+ rubygems_version: 2.7.7
168
183
  signing_key:
169
184
  specification_version: 4
170
185
  summary: Wrapper around the tika-app jar
171
186
  test_files:
172
- - spec/docs/cnn.com
187
+ - spec/ruby_tika_app_spec.rb
173
188
  - spec/docs/graph sampling simplex - 11.pdf
189
+ - spec/docs/cnn.com
174
190
  - spec/docs/news.ycombinator.com
175
- - spec/ruby_tika_app_spec.rb
176
- - spec/spec_helper.rb
177
191
  - spec/support/test_server.rb
192
+ - spec/spec_helper.rb