ruby_tika_app 1.5.0 → 1.8.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/LICENSE +1 -1
- data/ext/{tika-app-1.9.jar → tika-app-1.19.1.jar} +0 -0
- data/ext/tika-config.xml +13 -0
- data/lib/ruby_tika_app.rb +16 -12
- data/ruby_tika_app.gemspec +13 -14
- data/spec/ruby_tika_app_spec.rb +13 -15
- data/spec/spec_helper.rb +2 -3
- data/spec/support/test_server.rb +5 -5
- metadata +36 -21
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: bbd4069c575a64c475bf64daa5529fb6d2d552ddd5282791708de6344bbe2755
|
4
|
+
data.tar.gz: b4254a9b725fa8a320fbe6a90d1c41077d972a181fb1f3a90447061fa43be9f6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f87262cb29711262b8ceef0ec55a4dd6db9a964999190734242efa00f92e4847a7ab72e1fae3e53318f93fb08b82f40dbd345f2a79021e4839a861f59878a970
|
7
|
+
data.tar.gz: cc425ee26c3a8f7879ae58b2872704e15bf589961b22d40c73bd43caea0b1720a589670c5897dee03acb707dbe067a7b97587bb0ce48818c541282f604b3b8da
|
data/LICENSE
CHANGED
Binary file
|
data/ext/tika-config.xml
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<properties>
|
3
|
+
<service-loader initializableProblemHandler="ignore"/>
|
4
|
+
<parsers>
|
5
|
+
<!-- Default Parser for most things, except for 2 mime types, and never
|
6
|
+
use the Executable Parser -->
|
7
|
+
<parser class="org.apache.tika.parser.DefaultParser">
|
8
|
+
<mime-exclude>image/jpeg</mime-exclude>
|
9
|
+
<mime-exclude>application/x-sqlite3</mime-exclude>
|
10
|
+
<parser-exclude class="org.apache.tika.parser.jdbc.SQLite3Parser"/>
|
11
|
+
</parser>
|
12
|
+
</parsers>
|
13
|
+
</properties>
|
data/lib/ruby_tika_app.rb
CHANGED
@@ -5,7 +5,6 @@ require 'stringio'
|
|
5
5
|
require 'open4'
|
6
6
|
|
7
7
|
class RubyTikaApp
|
8
|
-
|
9
8
|
class Error < RuntimeError; end
|
10
9
|
|
11
10
|
class CommandFailedError < Error
|
@@ -16,17 +15,19 @@ class RubyTikaApp
|
|
16
15
|
end
|
17
16
|
|
18
17
|
def initialize(document)
|
19
|
-
if (document =~
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
18
|
+
@document = if (document =~ %r{https?:\/\/[\S]+}) == 0
|
19
|
+
document
|
20
|
+
else
|
21
|
+
"file://#{document}"
|
22
|
+
end
|
24
23
|
|
25
24
|
java_cmd = 'java'
|
26
25
|
java_args = '-server -Djava.awt.headless=true'
|
27
|
-
|
26
|
+
ext_dir = File.join(File.dirname(__FILE__))
|
27
|
+
tika_path = "#{ext_dir}/../ext/tika-app-1.19.1.jar"
|
28
|
+
tika_config_path = "#{ext_dir}/../ext/tika-config.xml"
|
28
29
|
|
29
|
-
@tika_cmd = "#{java_cmd} #{java_args} -jar '#{tika_path}'"
|
30
|
+
@tika_cmd = "#{java_cmd} #{java_args} -jar '#{tika_path}' --config='#{tika_config_path}'"
|
30
31
|
end
|
31
32
|
|
32
33
|
def to_xml
|
@@ -58,7 +59,7 @@ class RubyTikaApp
|
|
58
59
|
def run_tika(option)
|
59
60
|
final_cmd = "#{@tika_cmd} #{option} '#{@document}'"
|
60
61
|
|
61
|
-
|
62
|
+
_, stdin, stdout, stderr = Open4.popen4(final_cmd)
|
62
63
|
|
63
64
|
stdout_result = stdout.read.strip
|
64
65
|
stderr_result = stderr.read.strip
|
@@ -75,8 +76,11 @@ class RubyTikaApp
|
|
75
76
|
stderr.close
|
76
77
|
end
|
77
78
|
|
78
|
-
def strip_stderr(
|
79
|
-
|
79
|
+
def strip_stderr(err)
|
80
|
+
err
|
81
|
+
.gsub(/^(info|warn) - .*$/i, '')
|
82
|
+
.strip
|
83
|
+
.gsub(/Picked up JAVA_TOOL_OPTIONS: .+ -Dfile.encoding=UTF-8/i, '')
|
84
|
+
.strip
|
80
85
|
end
|
81
|
-
|
82
86
|
end
|
data/ruby_tika_app.gemspec
CHANGED
@@ -1,32 +1,31 @@
|
|
1
|
-
|
2
|
-
$:.push File.expand_path('../lib', __FILE__)
|
1
|
+
$LOAD_PATH.push File.expand_path('lib', __dir__)
|
3
2
|
|
4
3
|
Gem::Specification.new do |s|
|
5
4
|
s.name = 'ruby_tika_app'
|
6
|
-
s.version = '1.5.0'
|
5
|
+
s.version = '1.8.0'
|
7
6
|
s.platform = Gem::Platform::RUBY
|
8
7
|
s.authors = ['Chris Parker']
|
9
|
-
s.email = %w
|
8
|
+
s.email = %w[mrcsparker@gmail.com]
|
10
9
|
s.homepage = 'https://github.com/mrcsparker/ruby_tika_app'
|
11
|
-
s.summary =
|
12
|
-
s.description =
|
10
|
+
s.summary = 'Wrapper around the tika-app jar'
|
11
|
+
s.description = 'Wrapper around the tika-app jar'
|
13
12
|
|
14
13
|
s.rubyforge_project = 'ruby_tika_app'
|
15
14
|
|
16
|
-
s.files = `git ls-files`.split("\n") +
|
17
|
-
|
18
|
-
s.
|
19
|
-
s.
|
20
|
-
s.require_paths = %w(lib)
|
15
|
+
s.files = `git ls-files`.split("\n") +
|
16
|
+
%w[LICENSE README.md HISTORY]
|
17
|
+
s.executables = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
|
18
|
+
s.require_paths = %w[lib]
|
21
19
|
s.test_files = Dir.glob('spec/**/*')
|
22
20
|
|
23
21
|
s.add_runtime_dependency('open4')
|
24
22
|
|
25
|
-
s.add_development_dependency('rake')
|
26
|
-
s.add_development_dependency('rspec', '~> 3.3.0')
|
27
23
|
s.add_development_dependency('bundler', '>= 1.0.15')
|
28
|
-
s.add_development_dependency('simplecov')
|
29
24
|
s.add_development_dependency('json')
|
25
|
+
s.add_development_dependency('pry')
|
30
26
|
s.add_development_dependency('rack')
|
27
|
+
s.add_development_dependency('rake')
|
28
|
+
s.add_development_dependency('rspec', '~> 3.8.0')
|
29
|
+
s.add_development_dependency('simplecov')
|
31
30
|
s.add_development_dependency('thin')
|
32
31
|
end
|
data/spec/ruby_tika_app_spec.rb
CHANGED
@@ -1,10 +1,9 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
describe RubyTikaApp do
|
4
|
-
|
5
4
|
before(:each) do
|
6
5
|
doc_path = "#{File.join(File.dirname(__FILE__))}/docs"
|
7
|
-
|
6
|
+
|
8
7
|
@test_file = "#{doc_path}/graph sampling simplex - 11.pdf"
|
9
8
|
|
10
9
|
@cnn_com_file = "#{doc_path}/cnn.com"
|
@@ -13,50 +12,50 @@ describe RubyTikaApp do
|
|
13
12
|
|
14
13
|
describe 'Error' do
|
15
14
|
it 'has an error' do
|
16
|
-
expect
|
15
|
+
expect do
|
17
16
|
rta = RubyTikaApp.new('No file')
|
18
17
|
rta.to_xml
|
19
|
-
|
18
|
+
end.to raise_error(RuntimeError)
|
20
19
|
end
|
21
20
|
end
|
22
21
|
|
23
22
|
describe '#to_xml' do
|
24
23
|
it 'header' do
|
25
24
|
rta = RubyTikaApp.new(@test_file)
|
26
|
-
expect(rta.to_xml[0..37]).to eq(
|
25
|
+
expect(rta.to_xml[0..37]).to eq('<?xml version="1.0" encoding="UTF-8"?>')
|
27
26
|
end
|
28
|
-
|
27
|
+
|
29
28
|
it 'middle' do
|
30
29
|
rta = RubyTikaApp.new(@test_file)
|
31
30
|
xml = rta.to_xml
|
32
31
|
|
33
32
|
xml_size = xml.size / 2
|
34
33
|
|
35
|
-
expect(xml[xml_size..(xml_size + 100)]).to eq("
|
34
|
+
expect(xml[xml_size..(xml_size + 100)]).to eq("ph\nG. This methodology is also used in Frontier Sampling (FS).\nSince this is the only difference betw")
|
36
35
|
end
|
37
36
|
end
|
38
37
|
|
39
38
|
describe '#to_html' do
|
40
39
|
it 'header' do
|
41
40
|
rta = RubyTikaApp.new(@test_file)
|
42
|
-
expect(rta.to_html[0..42]).to eq(
|
41
|
+
expect(rta.to_html[0..42]).to eq('<html xmlns="http://www.w3.org/1999/xhtml">')
|
43
42
|
end
|
44
43
|
|
45
44
|
it 'middle' do
|
46
45
|
rta = RubyTikaApp.new(@test_file)
|
47
|
-
expect(rta.to_html[1000
|
46
|
+
expect(rta.to_html[1000...1100]).to eq("Z\"/>\n<meta name=\"meta:save-date\" content=\"2011-03-29T13:00:16Z\"/>\n<meta name=\"pdf:encrypted\" content")
|
48
47
|
end
|
49
48
|
end
|
50
49
|
|
51
50
|
describe '#to_json' do
|
52
51
|
it 'header' do
|
53
52
|
rta = RubyTikaApp.new(@test_file)
|
54
|
-
expect(rta.to_json[0..42]).to eq(
|
53
|
+
expect(rta.to_json[0..42]).to eq('{"Application":"\\u0027Certified by IEEE PDF')
|
55
54
|
end
|
56
55
|
|
57
56
|
it 'middle' do
|
58
57
|
rta = RubyTikaApp.new(@test_file)
|
59
|
-
expect(rta.to_json[100
|
58
|
+
expect(rta.to_json[100...150]).to eq('"171510","Content-Type":"application/pdf","Creatio')
|
60
59
|
end
|
61
60
|
end
|
62
61
|
|
@@ -68,7 +67,7 @@ describe RubyTikaApp do
|
|
68
67
|
|
69
68
|
it 'middle' do
|
70
69
|
rta = RubyTikaApp.new(@test_file)
|
71
|
-
expect(rta.to_text[100
|
70
|
+
expect(rta.to_text[100...150]).to eq("in Zhang3, Tianyin Xu2\n\nLong Jin1, Pan Hui4, Beixi")
|
72
71
|
end
|
73
72
|
end
|
74
73
|
|
@@ -80,7 +79,7 @@ describe RubyTikaApp do
|
|
80
79
|
|
81
80
|
it 'middle' do
|
82
81
|
rta = RubyTikaApp.new(@test_file)
|
83
|
-
expect(rta.to_text_main[100
|
82
|
+
expect(rta.to_text_main[100...150]).to eq("n Zhang3, Tianyin Xu2\nLong Jin1, Pan Hui4, Beixing")
|
84
83
|
end
|
85
84
|
end
|
86
85
|
|
@@ -92,7 +91,7 @@ describe RubyTikaApp do
|
|
92
91
|
|
93
92
|
it 'middle' do
|
94
93
|
rta = RubyTikaApp.new(@test_file)
|
95
|
-
expect(rta.to_metadata[100
|
94
|
+
expect(rta.to_metadata[100...150]).to eq("Type: application/pdf\nCreation-Date: 2011-03-29T12")
|
96
95
|
end
|
97
96
|
end
|
98
97
|
|
@@ -109,5 +108,4 @@ describe RubyTikaApp do
|
|
109
108
|
expect(rta.to_text).to eq(RubyTikaApp.new(@news_ycombinator_com_file).to_text)
|
110
109
|
end
|
111
110
|
end
|
112
|
-
|
113
111
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -8,12 +8,11 @@ require 'ruby_tika_app'
|
|
8
8
|
require 'rspec'
|
9
9
|
|
10
10
|
# Include all files under spec/support
|
11
|
-
Dir['./spec/support/**/*.rb'].each {|f| require f}
|
11
|
+
Dir['./spec/support/**/*.rb'].each { |f| require f }
|
12
12
|
|
13
13
|
# Start a local rack server to serve up test pages.
|
14
14
|
@server_thread = Thread.new do
|
15
|
-
Rack::Handler::Thin.run MyApp::Test::Server.new, :
|
15
|
+
Rack::Handler::Thin.run MyApp::Test::Server.new, Port: 9299
|
16
16
|
end
|
17
17
|
|
18
18
|
sleep(1) # wait a sec for the server to be booted
|
19
|
-
|
data/spec/support/test_server.rb
CHANGED
@@ -5,15 +5,15 @@ module MyApp
|
|
5
5
|
module Test
|
6
6
|
class Server
|
7
7
|
def call(env)
|
8
|
-
@root = "#{
|
8
|
+
@root = "#{__dir__}/../docs/"
|
9
9
|
path = Rack::Utils.unescape(env['PATH_INFO'])
|
10
10
|
path += 'index.html' if path == '/'
|
11
|
-
file = @root +
|
11
|
+
file = @root + path.to_s
|
12
12
|
|
13
|
-
if File.
|
14
|
-
[
|
13
|
+
if File.exist?(file)
|
14
|
+
[200, { 'Content-Type' => 'text/html' }, File.read(file)]
|
15
15
|
else
|
16
|
-
[
|
16
|
+
[404, { 'Content-Type' => 'text/plain' }, 'file not found']
|
17
17
|
end
|
18
18
|
end
|
19
19
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ruby_tika_app
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.5.0
|
4
|
+
version: 1.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Parker
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-10-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: open4
|
@@ -25,49 +25,49 @@ dependencies:
|
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: bundler
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - ">="
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version:
|
33
|
+
version: 1.0.15
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version:
|
40
|
+
version: 1.0.15
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: json
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- - "
|
45
|
+
- - ">="
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version:
|
47
|
+
version: '0'
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
|
-
- - "
|
52
|
+
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version:
|
54
|
+
version: '0'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
56
|
+
name: pry
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
59
|
- - ">="
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version:
|
61
|
+
version: '0'
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
66
|
- - ">="
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version:
|
68
|
+
version: '0'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
|
-
name:
|
70
|
+
name: rack
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
73
|
- - ">="
|
@@ -81,7 +81,7 @@ dependencies:
|
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '0'
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
|
-
name:
|
84
|
+
name: rake
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
87
|
- - ">="
|
@@ -95,7 +95,21 @@ dependencies:
|
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '0'
|
97
97
|
- !ruby/object:Gem::Dependency
|
98
|
-
name:
|
98
|
+
name: rspec
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - "~>"
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: 3.8.0
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - "~>"
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: 3.8.0
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: simplecov
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|
100
114
|
requirements:
|
101
115
|
- - ">="
|
@@ -136,7 +150,8 @@ files:
|
|
136
150
|
- LICENSE
|
137
151
|
- README.md
|
138
152
|
- Rakefile
|
139
|
-
- ext/tika-app-1.9.jar
|
153
|
+
- ext/tika-app-1.19.1.jar
|
154
|
+
- ext/tika-config.xml
|
140
155
|
- lib/ruby_tika_app.rb
|
141
156
|
- ruby_tika_app.gemspec
|
142
157
|
- spec/docs/cnn.com
|
@@ -164,14 +179,14 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
164
179
|
version: '0'
|
165
180
|
requirements: []
|
166
181
|
rubyforge_project: ruby_tika_app
|
167
|
-
rubygems_version: 2.
|
182
|
+
rubygems_version: 2.7.7
|
168
183
|
signing_key:
|
169
184
|
specification_version: 4
|
170
185
|
summary: Wrapper around the tika-app jar
|
171
186
|
test_files:
|
172
|
-
- spec/
|
187
|
+
- spec/ruby_tika_app_spec.rb
|
173
188
|
- spec/docs/graph sampling simplex - 11.pdf
|
189
|
+
- spec/docs/cnn.com
|
174
190
|
- spec/docs/news.ycombinator.com
|
175
|
-
- spec/ruby_tika_app_spec.rb
|
176
|
-
- spec/spec_helper.rb
|
177
191
|
- spec/support/test_server.rb
|
192
|
+
- spec/spec_helper.rb
|