ruby_tika_app 1.5.0 → 1.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 23bb9e8faef2930749c2a92c52bca6991c6f4cfd
4
- data.tar.gz: aff9a6f879ed8f100e3e72cc13077c3e81093eb0
2
+ SHA256:
3
+ metadata.gz: bbd4069c575a64c475bf64daa5529fb6d2d552ddd5282791708de6344bbe2755
4
+ data.tar.gz: b4254a9b725fa8a320fbe6a90d1c41077d972a181fb1f3a90447061fa43be9f6
5
5
  SHA512:
6
- metadata.gz: afc3e589189d68e809f5a7f56742676ad3b9daa520c2fa412f7af84671c416b31e18b90cf89aa713dcdbb8752e605cf511d854c939ea29b4bfbf509d59e742e2
7
- data.tar.gz: fc3a95fd56ff9cc646ae01f8ddbfb8942038af77fc8ae7dc6c7de7297eec182b6fad334b09f96ebf065a95c4967da109e6b207b3e4daebd660255753c50ce909
6
+ metadata.gz: f87262cb29711262b8ceef0ec55a4dd6db9a964999190734242efa00f92e4847a7ab72e1fae3e53318f93fb08b82f40dbd345f2a79021e4839a861f59878a970
7
+ data.tar.gz: cc425ee26c3a8f7879ae58b2872704e15bf589961b22d40c73bd43caea0b1720a589670c5897dee03acb707dbe067a7b97587bb0ce48818c541282f604b3b8da
data/LICENSE CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2011 Chris Parker
1
+ Copyright (c) 2011-2018 Chris Parker
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person obtaining
4
4
  a copy of this software and associated documentation files (the
@@ -0,0 +1,13 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <properties>
3
+ <service-loader initializableProblemHandler="ignore"/>
4
+ <parsers>
5
+ <!-- Default Parser for most things, except for 2 mime types, and never
6
+ use the Executable Parser -->
7
+ <parser class="org.apache.tika.parser.DefaultParser">
8
+ <mime-exclude>image/jpeg</mime-exclude>
9
+ <mime-exclude>application/x-sqlite3</mime-exclude>
10
+ <parser-exclude class="org.apache.tika.parser.jdbc.SQLite3Parser"/>
11
+ </parser>
12
+ </parsers>
13
+ </properties>
data/lib/ruby_tika_app.rb CHANGED
@@ -5,7 +5,6 @@ require 'stringio'
5
5
  require 'open4'
6
6
 
7
7
  class RubyTikaApp
8
-
9
8
  class Error < RuntimeError; end
10
9
 
11
10
  class CommandFailedError < Error
@@ -16,17 +15,19 @@ class RubyTikaApp
16
15
  end
17
16
 
18
17
  def initialize(document)
19
- if (document =~ /https?:\/\/[\S]+/) == 0
20
- @document = document
21
- else
22
- @document = "file://#{document}"
23
- end
18
+ @document = if (document =~ %r{https?:\/\/[\S]+}) == 0
19
+ document
20
+ else
21
+ "file://#{document}"
22
+ end
24
23
 
25
24
  java_cmd = 'java'
26
25
  java_args = '-server -Djava.awt.headless=true'
27
- tika_path = "#{File.join(File.dirname(__FILE__))}/../ext/tika-app-1.9.jar"
26
+ ext_dir = File.join(File.dirname(__FILE__))
27
+ tika_path = "#{ext_dir}/../ext/tika-app-1.19.1.jar"
28
+ tika_config_path = "#{ext_dir}/../ext/tika-config.xml"
28
29
 
29
- @tika_cmd = "#{java_cmd} #{java_args} -jar '#{tika_path}'"
30
+ @tika_cmd = "#{java_cmd} #{java_args} -jar '#{tika_path}' --config='#{tika_config_path}'"
30
31
  end
31
32
 
32
33
  def to_xml
@@ -58,7 +59,7 @@ class RubyTikaApp
58
59
  def run_tika(option)
59
60
  final_cmd = "#{@tika_cmd} #{option} '#{@document}'"
60
61
 
61
- pid, stdin, stdout, stderr = Open4::popen4(final_cmd)
62
+ _, stdin, stdout, stderr = Open4.popen4(final_cmd)
62
63
 
63
64
  stdout_result = stdout.read.strip
64
65
  stderr_result = stderr.read.strip
@@ -75,8 +76,11 @@ class RubyTikaApp
75
76
  stderr.close
76
77
  end
77
78
 
78
- def strip_stderr(s)
79
- s.gsub(/^(info|warn) - .*$/i, '').strip
79
+ def strip_stderr(err)
80
+ err
81
+ .gsub(/^(info|warn) - .*$/i, '')
82
+ .strip
83
+ .gsub(/Picked up JAVA_TOOL_OPTIONS: .+ -Dfile.encoding=UTF-8/i, '')
84
+ .strip
80
85
  end
81
-
82
86
  end
@@ -1,32 +1,31 @@
1
- # -*- encoding: utf-8 -*-
2
- $:.push File.expand_path('../lib', __FILE__)
1
+ $LOAD_PATH.push File.expand_path('lib', __dir__)
3
2
 
4
3
  Gem::Specification.new do |s|
5
4
  s.name = 'ruby_tika_app'
6
- s.version = '1.5.0'
5
+ s.version = '1.8.0'
7
6
  s.platform = Gem::Platform::RUBY
8
7
  s.authors = ['Chris Parker']
9
- s.email = %w(mrcsparker@gmail.com)
8
+ s.email = %w[mrcsparker@gmail.com]
10
9
  s.homepage = 'https://github.com/mrcsparker/ruby_tika_app'
11
- s.summary = %q{Wrapper around the tika-app jar}
12
- s.description = %q{Wrapper around the tika-app jar}
10
+ s.summary = 'Wrapper around the tika-app jar'
11
+ s.description = 'Wrapper around the tika-app jar'
13
12
 
14
13
  s.rubyforge_project = 'ruby_tika_app'
15
14
 
16
- s.files = `git ls-files`.split("\n") +
17
- %w(LICENSE README.md HISTORY)
18
- s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
19
- s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
20
- s.require_paths = %w(lib)
15
+ s.files = `git ls-files`.split("\n") +
16
+ %w[LICENSE README.md HISTORY]
17
+ s.executables = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
18
+ s.require_paths = %w[lib]
21
19
  s.test_files = Dir.glob('spec/**/*')
22
20
 
23
21
  s.add_runtime_dependency('open4')
24
22
 
25
- s.add_development_dependency('rake')
26
- s.add_development_dependency('rspec', '~> 3.3.0')
27
23
  s.add_development_dependency('bundler', '>= 1.0.15')
28
- s.add_development_dependency('simplecov')
29
24
  s.add_development_dependency('json')
25
+ s.add_development_dependency('pry')
30
26
  s.add_development_dependency('rack')
27
+ s.add_development_dependency('rake')
28
+ s.add_development_dependency('rspec', '~> 3.8.0')
29
+ s.add_development_dependency('simplecov')
31
30
  s.add_development_dependency('thin')
32
31
  end
@@ -1,10 +1,9 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  describe RubyTikaApp do
4
-
5
4
  before(:each) do
6
5
  doc_path = "#{File.join(File.dirname(__FILE__))}/docs"
7
-
6
+
8
7
  @test_file = "#{doc_path}/graph sampling simplex - 11.pdf"
9
8
 
10
9
  @cnn_com_file = "#{doc_path}/cnn.com"
@@ -13,50 +12,50 @@ describe RubyTikaApp do
13
12
 
14
13
  describe 'Error' do
15
14
  it 'has an error' do
16
- expect {
15
+ expect do
17
16
  rta = RubyTikaApp.new('No file')
18
17
  rta.to_xml
19
- }.to raise_error(RuntimeError)
18
+ end.to raise_error(RuntimeError)
20
19
  end
21
20
  end
22
21
 
23
22
  describe '#to_xml' do
24
23
  it 'header' do
25
24
  rta = RubyTikaApp.new(@test_file)
26
- expect(rta.to_xml[0..37]).to eq("<?xml version=\"1.0\" encoding=\"UTF-8\"?>")
25
+ expect(rta.to_xml[0..37]).to eq('<?xml version="1.0" encoding="UTF-8"?>')
27
26
  end
28
-
27
+
29
28
  it 'middle' do
30
29
  rta = RubyTikaApp.new(@test_file)
31
30
  xml = rta.to_xml
32
31
 
33
32
  xml_size = xml.size / 2
34
33
 
35
- expect(xml[xml_size..(xml_size + 100)]).to eq("plicated nodes make the node distribution converge\nto uniform distribution. We do not need to conside")
34
+ expect(xml[xml_size..(xml_size + 100)]).to eq("ph\nG. This methodology is also used in Frontier Sampling (FS).\nSince this is the only difference betw")
36
35
  end
37
36
  end
38
37
 
39
38
  describe '#to_html' do
40
39
  it 'header' do
41
40
  rta = RubyTikaApp.new(@test_file)
42
- expect(rta.to_html[0..42]).to eq("<html xmlns=\"http://www.w3.org/1999/xhtml\">")
41
+ expect(rta.to_html[0..42]).to eq('<html xmlns="http://www.w3.org/1999/xhtml">')
43
42
  end
44
43
 
45
44
  it 'middle' do
46
45
  rta = RubyTikaApp.new(@test_file)
47
- expect(rta.to_html[1000 ... 1100]).to eq("on/pdf\"/>\n<meta name=\"X-Parsed-By\" content=\"org.apache.tika.parser.DefaultParser\"/>\n<meta name=\"X-Pa")
46
+ expect(rta.to_html[1000...1100]).to eq("Z\"/>\n<meta name=\"meta:save-date\" content=\"2011-03-29T13:00:16Z\"/>\n<meta name=\"pdf:encrypted\" content")
48
47
  end
49
48
  end
50
49
 
51
50
  describe '#to_json' do
52
51
  it 'header' do
53
52
  rta = RubyTikaApp.new(@test_file)
54
- expect(rta.to_json[0..42]).to eq("{\"Application\":\"\\u0027Certified by IEEE PDF")
53
+ expect(rta.to_json[0..42]).to eq('{"Application":"\\u0027Certified by IEEE PDF')
55
54
  end
56
55
 
57
56
  it 'middle' do
58
57
  rta = RubyTikaApp.new(@test_file)
59
- expect(rta.to_json[100 ... 150]).to eq("\"171510\",\"Content-Type\":\"application/pdf\",\"Creatio")
58
+ expect(rta.to_json[100...150]).to eq('"171510","Content-Type":"application/pdf","Creatio')
60
59
  end
61
60
  end
62
61
 
@@ -68,7 +67,7 @@ describe RubyTikaApp do
68
67
 
69
68
  it 'middle' do
70
69
  rta = RubyTikaApp.new(@test_file)
71
- expect(rta.to_text[100 ... 150]).to eq("n Zhang3, Tianyin Xu2\n\nLong Jin1, Pan Hui4, Beixin")
70
+ expect(rta.to_text[100...150]).to eq("in Zhang3, Tianyin Xu2\n\nLong Jin1, Pan Hui4, Beixi")
72
71
  end
73
72
  end
74
73
 
@@ -80,7 +79,7 @@ describe RubyTikaApp do
80
79
 
81
80
  it 'middle' do
82
81
  rta = RubyTikaApp.new(@test_file)
83
- expect(rta.to_text_main[100 ... 150]).to eq("n Zhang3, Tianyin Xu2\nLong Jin1, Pan Hui4, Beixing")
82
+ expect(rta.to_text_main[100...150]).to eq("n Zhang3, Tianyin Xu2\nLong Jin1, Pan Hui4, Beixing")
84
83
  end
85
84
  end
86
85
 
@@ -92,7 +91,7 @@ describe RubyTikaApp do
92
91
 
93
92
  it 'middle' do
94
93
  rta = RubyTikaApp.new(@test_file)
95
- expect(rta.to_metadata[100 ... 150]).to eq("Type: application/pdf\nCreation-Date: 2011-03-29T12")
94
+ expect(rta.to_metadata[100...150]).to eq("Type: application/pdf\nCreation-Date: 2011-03-29T12")
96
95
  end
97
96
  end
98
97
 
@@ -109,5 +108,4 @@ describe RubyTikaApp do
109
108
  expect(rta.to_text).to eq(RubyTikaApp.new(@news_ycombinator_com_file).to_text)
110
109
  end
111
110
  end
112
-
113
111
  end
data/spec/spec_helper.rb CHANGED
@@ -8,12 +8,11 @@ require 'ruby_tika_app'
8
8
  require 'rspec'
9
9
 
10
10
  # Include all files under spec/support
11
- Dir['./spec/support/**/*.rb'].each {|f| require f}
11
+ Dir['./spec/support/**/*.rb'].each { |f| require f }
12
12
 
13
13
  # Start a local rack server to serve up test pages.
14
14
  @server_thread = Thread.new do
15
- Rack::Handler::Thin.run MyApp::Test::Server.new, :Port => 9299
15
+ Rack::Handler::Thin.run MyApp::Test::Server.new, Port: 9299
16
16
  end
17
17
 
18
18
  sleep(1) # wait a sec for the server to be booted
19
-
@@ -5,15 +5,15 @@ module MyApp
5
5
  module Test
6
6
  class Server
7
7
  def call(env)
8
- @root = "#{File.expand_path(File.dirname(__FILE__))}/../docs/"
8
+ @root = "#{__dir__}/../docs/"
9
9
  path = Rack::Utils.unescape(env['PATH_INFO'])
10
10
  path += 'index.html' if path == '/'
11
- file = @root + "#{path}"
11
+ file = @root + path.to_s
12
12
 
13
- if File.exists?(file)
14
- [ 200, {'Content-Type' => 'text/html'}, File.read(file) ]
13
+ if File.exist?(file)
14
+ [200, { 'Content-Type' => 'text/html' }, File.read(file)]
15
15
  else
16
- [ 404, {'Content-Type' => 'text/plain'}, 'file not found' ]
16
+ [404, { 'Content-Type' => 'text/plain' }, 'file not found']
17
17
  end
18
18
  end
19
19
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby_tika_app
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.5.0
4
+ version: 1.8.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Parker
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-06-30 00:00:00.000000000 Z
11
+ date: 2018-10-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: open4
@@ -25,49 +25,49 @@ dependencies:
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
27
  - !ruby/object:Gem::Dependency
28
- name: rake
28
+ name: bundler
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - ">="
32
32
  - !ruby/object:Gem::Version
33
- version: '0'
33
+ version: 1.0.15
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - ">="
39
39
  - !ruby/object:Gem::Version
40
- version: '0'
40
+ version: 1.0.15
41
41
  - !ruby/object:Gem::Dependency
42
- name: rspec
42
+ name: json
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - "~>"
45
+ - - ">="
46
46
  - !ruby/object:Gem::Version
47
- version: 3.3.0
47
+ version: '0'
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - "~>"
52
+ - - ">="
53
53
  - !ruby/object:Gem::Version
54
- version: 3.3.0
54
+ version: '0'
55
55
  - !ruby/object:Gem::Dependency
56
- name: bundler
56
+ name: pry
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
59
  - - ">="
60
60
  - !ruby/object:Gem::Version
61
- version: 1.0.15
61
+ version: '0'
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
66
  - - ">="
67
67
  - !ruby/object:Gem::Version
68
- version: 1.0.15
68
+ version: '0'
69
69
  - !ruby/object:Gem::Dependency
70
- name: simplecov
70
+ name: rack
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
73
  - - ">="
@@ -81,7 +81,7 @@ dependencies:
81
81
  - !ruby/object:Gem::Version
82
82
  version: '0'
83
83
  - !ruby/object:Gem::Dependency
84
- name: json
84
+ name: rake
85
85
  requirement: !ruby/object:Gem::Requirement
86
86
  requirements:
87
87
  - - ">="
@@ -95,7 +95,21 @@ dependencies:
95
95
  - !ruby/object:Gem::Version
96
96
  version: '0'
97
97
  - !ruby/object:Gem::Dependency
98
- name: rack
98
+ name: rspec
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: 3.8.0
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: 3.8.0
111
+ - !ruby/object:Gem::Dependency
112
+ name: simplecov
99
113
  requirement: !ruby/object:Gem::Requirement
100
114
  requirements:
101
115
  - - ">="
@@ -136,7 +150,8 @@ files:
136
150
  - LICENSE
137
151
  - README.md
138
152
  - Rakefile
139
- - ext/tika-app-1.9.jar
153
+ - ext/tika-app-1.19.1.jar
154
+ - ext/tika-config.xml
140
155
  - lib/ruby_tika_app.rb
141
156
  - ruby_tika_app.gemspec
142
157
  - spec/docs/cnn.com
@@ -164,14 +179,14 @@ required_rubygems_version: !ruby/object:Gem::Requirement
164
179
  version: '0'
165
180
  requirements: []
166
181
  rubyforge_project: ruby_tika_app
167
- rubygems_version: 2.4.5
182
+ rubygems_version: 2.7.7
168
183
  signing_key:
169
184
  specification_version: 4
170
185
  summary: Wrapper around the tika-app jar
171
186
  test_files:
172
- - spec/docs/cnn.com
187
+ - spec/ruby_tika_app_spec.rb
173
188
  - spec/docs/graph sampling simplex - 11.pdf
189
+ - spec/docs/cnn.com
174
190
  - spec/docs/news.ycombinator.com
175
- - spec/ruby_tika_app_spec.rb
176
- - spec/spec_helper.rb
177
191
  - spec/support/test_server.rb
192
+ - spec/spec_helper.rb