ruby_tika_app 1.5.0 → 1.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/LICENSE +1 -1
- data/ext/{tika-app-1.9.jar → tika-app-1.19.1.jar} +0 -0
- data/ext/tika-config.xml +13 -0
- data/lib/ruby_tika_app.rb +16 -12
- data/ruby_tika_app.gemspec +13 -14
- data/spec/ruby_tika_app_spec.rb +13 -15
- data/spec/spec_helper.rb +2 -3
- data/spec/support/test_server.rb +5 -5
- metadata +36 -21
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: bbd4069c575a64c475bf64daa5529fb6d2d552ddd5282791708de6344bbe2755
|
4
|
+
data.tar.gz: b4254a9b725fa8a320fbe6a90d1c41077d972a181fb1f3a90447061fa43be9f6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f87262cb29711262b8ceef0ec55a4dd6db9a964999190734242efa00f92e4847a7ab72e1fae3e53318f93fb08b82f40dbd345f2a79021e4839a861f59878a970
|
7
|
+
data.tar.gz: cc425ee26c3a8f7879ae58b2872704e15bf589961b22d40c73bd43caea0b1720a589670c5897dee03acb707dbe067a7b97587bb0ce48818c541282f604b3b8da
|
data/LICENSE
CHANGED
Binary file
|
data/ext/tika-config.xml
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<properties>
|
3
|
+
<service-loader initializableProblemHandler="ignore"/>
|
4
|
+
<parsers>
|
5
|
+
<!-- Default Parser for most things, except for 2 mime types, and never
|
6
|
+
use the Executable Parser -->
|
7
|
+
<parser class="org.apache.tika.parser.DefaultParser">
|
8
|
+
<mime-exclude>image/jpeg</mime-exclude>
|
9
|
+
<mime-exclude>application/x-sqlite3</mime-exclude>
|
10
|
+
<parser-exclude class="org.apache.tika.parser.jdbc.SQLite3Parser"/>
|
11
|
+
</parser>
|
12
|
+
</parsers>
|
13
|
+
</properties>
|
data/lib/ruby_tika_app.rb
CHANGED
@@ -5,7 +5,6 @@ require 'stringio'
|
|
5
5
|
require 'open4'
|
6
6
|
|
7
7
|
class RubyTikaApp
|
8
|
-
|
9
8
|
class Error < RuntimeError; end
|
10
9
|
|
11
10
|
class CommandFailedError < Error
|
@@ -16,17 +15,19 @@ class RubyTikaApp
|
|
16
15
|
end
|
17
16
|
|
18
17
|
def initialize(document)
|
19
|
-
if (document =~
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
18
|
+
@document = if (document =~ %r{https?:\/\/[\S]+}) == 0
|
19
|
+
document
|
20
|
+
else
|
21
|
+
"file://#{document}"
|
22
|
+
end
|
24
23
|
|
25
24
|
java_cmd = 'java'
|
26
25
|
java_args = '-server -Djava.awt.headless=true'
|
27
|
-
|
26
|
+
ext_dir = File.join(File.dirname(__FILE__))
|
27
|
+
tika_path = "#{ext_dir}/../ext/tika-app-1.19.1.jar"
|
28
|
+
tika_config_path = "#{ext_dir}/../ext/tika-config.xml"
|
28
29
|
|
29
|
-
@tika_cmd = "#{java_cmd} #{java_args} -jar '#{tika_path}'"
|
30
|
+
@tika_cmd = "#{java_cmd} #{java_args} -jar '#{tika_path}' --config='#{tika_config_path}'"
|
30
31
|
end
|
31
32
|
|
32
33
|
def to_xml
|
@@ -58,7 +59,7 @@ class RubyTikaApp
|
|
58
59
|
def run_tika(option)
|
59
60
|
final_cmd = "#{@tika_cmd} #{option} '#{@document}'"
|
60
61
|
|
61
|
-
|
62
|
+
_, stdin, stdout, stderr = Open4.popen4(final_cmd)
|
62
63
|
|
63
64
|
stdout_result = stdout.read.strip
|
64
65
|
stderr_result = stderr.read.strip
|
@@ -75,8 +76,11 @@ class RubyTikaApp
|
|
75
76
|
stderr.close
|
76
77
|
end
|
77
78
|
|
78
|
-
def strip_stderr(
|
79
|
-
|
79
|
+
def strip_stderr(err)
|
80
|
+
err
|
81
|
+
.gsub(/^(info|warn) - .*$/i, '')
|
82
|
+
.strip
|
83
|
+
.gsub(/Picked up JAVA_TOOL_OPTIONS: .+ -Dfile.encoding=UTF-8/i, '')
|
84
|
+
.strip
|
80
85
|
end
|
81
|
-
|
82
86
|
end
|
data/ruby_tika_app.gemspec
CHANGED
@@ -1,32 +1,31 @@
|
|
1
|
-
|
2
|
-
$:.push File.expand_path('../lib', __FILE__)
|
1
|
+
$LOAD_PATH.push File.expand_path('lib', __dir__)
|
3
2
|
|
4
3
|
Gem::Specification.new do |s|
|
5
4
|
s.name = 'ruby_tika_app'
|
6
|
-
s.version = '1.
|
5
|
+
s.version = '1.8.0'
|
7
6
|
s.platform = Gem::Platform::RUBY
|
8
7
|
s.authors = ['Chris Parker']
|
9
|
-
s.email = %w
|
8
|
+
s.email = %w[mrcsparker@gmail.com]
|
10
9
|
s.homepage = 'https://github.com/mrcsparker/ruby_tika_app'
|
11
|
-
s.summary =
|
12
|
-
s.description =
|
10
|
+
s.summary = 'Wrapper around the tika-app jar'
|
11
|
+
s.description = 'Wrapper around the tika-app jar'
|
13
12
|
|
14
13
|
s.rubyforge_project = 'ruby_tika_app'
|
15
14
|
|
16
|
-
s.files = `git ls-files`.split("\n") +
|
17
|
-
|
18
|
-
s.
|
19
|
-
s.
|
20
|
-
s.require_paths = %w(lib)
|
15
|
+
s.files = `git ls-files`.split("\n") +
|
16
|
+
%w[LICENSE README.md HISTORY]
|
17
|
+
s.executables = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
|
18
|
+
s.require_paths = %w[lib]
|
21
19
|
s.test_files = Dir.glob('spec/**/*')
|
22
20
|
|
23
21
|
s.add_runtime_dependency('open4')
|
24
22
|
|
25
|
-
s.add_development_dependency('rake')
|
26
|
-
s.add_development_dependency('rspec', '~> 3.3.0')
|
27
23
|
s.add_development_dependency('bundler', '>= 1.0.15')
|
28
|
-
s.add_development_dependency('simplecov')
|
29
24
|
s.add_development_dependency('json')
|
25
|
+
s.add_development_dependency('pry')
|
30
26
|
s.add_development_dependency('rack')
|
27
|
+
s.add_development_dependency('rake')
|
28
|
+
s.add_development_dependency('rspec', '~> 3.8.0')
|
29
|
+
s.add_development_dependency('simplecov')
|
31
30
|
s.add_development_dependency('thin')
|
32
31
|
end
|
data/spec/ruby_tika_app_spec.rb
CHANGED
@@ -1,10 +1,9 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
describe RubyTikaApp do
|
4
|
-
|
5
4
|
before(:each) do
|
6
5
|
doc_path = "#{File.join(File.dirname(__FILE__))}/docs"
|
7
|
-
|
6
|
+
|
8
7
|
@test_file = "#{doc_path}/graph sampling simplex - 11.pdf"
|
9
8
|
|
10
9
|
@cnn_com_file = "#{doc_path}/cnn.com"
|
@@ -13,50 +12,50 @@ describe RubyTikaApp do
|
|
13
12
|
|
14
13
|
describe 'Error' do
|
15
14
|
it 'has an error' do
|
16
|
-
expect
|
15
|
+
expect do
|
17
16
|
rta = RubyTikaApp.new('No file')
|
18
17
|
rta.to_xml
|
19
|
-
|
18
|
+
end.to raise_error(RuntimeError)
|
20
19
|
end
|
21
20
|
end
|
22
21
|
|
23
22
|
describe '#to_xml' do
|
24
23
|
it 'header' do
|
25
24
|
rta = RubyTikaApp.new(@test_file)
|
26
|
-
expect(rta.to_xml[0..37]).to eq(
|
25
|
+
expect(rta.to_xml[0..37]).to eq('<?xml version="1.0" encoding="UTF-8"?>')
|
27
26
|
end
|
28
|
-
|
27
|
+
|
29
28
|
it 'middle' do
|
30
29
|
rta = RubyTikaApp.new(@test_file)
|
31
30
|
xml = rta.to_xml
|
32
31
|
|
33
32
|
xml_size = xml.size / 2
|
34
33
|
|
35
|
-
expect(xml[xml_size..(xml_size + 100)]).to eq("
|
34
|
+
expect(xml[xml_size..(xml_size + 100)]).to eq("ph\nG. This methodology is also used in Frontier Sampling (FS).\nSince this is the only difference betw")
|
36
35
|
end
|
37
36
|
end
|
38
37
|
|
39
38
|
describe '#to_html' do
|
40
39
|
it 'header' do
|
41
40
|
rta = RubyTikaApp.new(@test_file)
|
42
|
-
expect(rta.to_html[0..42]).to eq(
|
41
|
+
expect(rta.to_html[0..42]).to eq('<html xmlns="http://www.w3.org/1999/xhtml">')
|
43
42
|
end
|
44
43
|
|
45
44
|
it 'middle' do
|
46
45
|
rta = RubyTikaApp.new(@test_file)
|
47
|
-
expect(rta.to_html[1000
|
46
|
+
expect(rta.to_html[1000...1100]).to eq("Z\"/>\n<meta name=\"meta:save-date\" content=\"2011-03-29T13:00:16Z\"/>\n<meta name=\"pdf:encrypted\" content")
|
48
47
|
end
|
49
48
|
end
|
50
49
|
|
51
50
|
describe '#to_json' do
|
52
51
|
it 'header' do
|
53
52
|
rta = RubyTikaApp.new(@test_file)
|
54
|
-
expect(rta.to_json[0..42]).to eq(
|
53
|
+
expect(rta.to_json[0..42]).to eq('{"Application":"\\u0027Certified by IEEE PDF')
|
55
54
|
end
|
56
55
|
|
57
56
|
it 'middle' do
|
58
57
|
rta = RubyTikaApp.new(@test_file)
|
59
|
-
expect(rta.to_json[100
|
58
|
+
expect(rta.to_json[100...150]).to eq('"171510","Content-Type":"application/pdf","Creatio')
|
60
59
|
end
|
61
60
|
end
|
62
61
|
|
@@ -68,7 +67,7 @@ describe RubyTikaApp do
|
|
68
67
|
|
69
68
|
it 'middle' do
|
70
69
|
rta = RubyTikaApp.new(@test_file)
|
71
|
-
expect(rta.to_text[100
|
70
|
+
expect(rta.to_text[100...150]).to eq("in Zhang3, Tianyin Xu2\n\nLong Jin1, Pan Hui4, Beixi")
|
72
71
|
end
|
73
72
|
end
|
74
73
|
|
@@ -80,7 +79,7 @@ describe RubyTikaApp do
|
|
80
79
|
|
81
80
|
it 'middle' do
|
82
81
|
rta = RubyTikaApp.new(@test_file)
|
83
|
-
expect(rta.to_text_main[100
|
82
|
+
expect(rta.to_text_main[100...150]).to eq("n Zhang3, Tianyin Xu2\nLong Jin1, Pan Hui4, Beixing")
|
84
83
|
end
|
85
84
|
end
|
86
85
|
|
@@ -92,7 +91,7 @@ describe RubyTikaApp do
|
|
92
91
|
|
93
92
|
it 'middle' do
|
94
93
|
rta = RubyTikaApp.new(@test_file)
|
95
|
-
expect(rta.to_metadata[100
|
94
|
+
expect(rta.to_metadata[100...150]).to eq("Type: application/pdf\nCreation-Date: 2011-03-29T12")
|
96
95
|
end
|
97
96
|
end
|
98
97
|
|
@@ -109,5 +108,4 @@ describe RubyTikaApp do
|
|
109
108
|
expect(rta.to_text).to eq(RubyTikaApp.new(@news_ycombinator_com_file).to_text)
|
110
109
|
end
|
111
110
|
end
|
112
|
-
|
113
111
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -8,12 +8,11 @@ require 'ruby_tika_app'
|
|
8
8
|
require 'rspec'
|
9
9
|
|
10
10
|
# Include all files under spec/support
|
11
|
-
Dir['./spec/support/**/*.rb'].each {|f| require f}
|
11
|
+
Dir['./spec/support/**/*.rb'].each { |f| require f }
|
12
12
|
|
13
13
|
# Start a local rack server to serve up test pages.
|
14
14
|
@server_thread = Thread.new do
|
15
|
-
Rack::Handler::Thin.run MyApp::Test::Server.new, :
|
15
|
+
Rack::Handler::Thin.run MyApp::Test::Server.new, Port: 9299
|
16
16
|
end
|
17
17
|
|
18
18
|
sleep(1) # wait a sec for the server to be booted
|
19
|
-
|
data/spec/support/test_server.rb
CHANGED
@@ -5,15 +5,15 @@ module MyApp
|
|
5
5
|
module Test
|
6
6
|
class Server
|
7
7
|
def call(env)
|
8
|
-
@root = "#{
|
8
|
+
@root = "#{__dir__}/../docs/"
|
9
9
|
path = Rack::Utils.unescape(env['PATH_INFO'])
|
10
10
|
path += 'index.html' if path == '/'
|
11
|
-
file = @root +
|
11
|
+
file = @root + path.to_s
|
12
12
|
|
13
|
-
if File.
|
14
|
-
[
|
13
|
+
if File.exist?(file)
|
14
|
+
[200, { 'Content-Type' => 'text/html' }, File.read(file)]
|
15
15
|
else
|
16
|
-
[
|
16
|
+
[404, { 'Content-Type' => 'text/plain' }, 'file not found']
|
17
17
|
end
|
18
18
|
end
|
19
19
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ruby_tika_app
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Parker
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-10-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: open4
|
@@ -25,49 +25,49 @@ dependencies:
|
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: bundler
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - ">="
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version:
|
33
|
+
version: 1.0.15
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version:
|
40
|
+
version: 1.0.15
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: json
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- - "
|
45
|
+
- - ">="
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version:
|
47
|
+
version: '0'
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
|
-
- - "
|
52
|
+
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version:
|
54
|
+
version: '0'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
56
|
+
name: pry
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
59
|
- - ">="
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version:
|
61
|
+
version: '0'
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
66
|
- - ">="
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version:
|
68
|
+
version: '0'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
|
-
name:
|
70
|
+
name: rack
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
73
|
- - ">="
|
@@ -81,7 +81,7 @@ dependencies:
|
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '0'
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
|
-
name:
|
84
|
+
name: rake
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
87
|
- - ">="
|
@@ -95,7 +95,21 @@ dependencies:
|
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '0'
|
97
97
|
- !ruby/object:Gem::Dependency
|
98
|
-
name:
|
98
|
+
name: rspec
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - "~>"
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: 3.8.0
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - "~>"
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: 3.8.0
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: simplecov
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|
100
114
|
requirements:
|
101
115
|
- - ">="
|
@@ -136,7 +150,8 @@ files:
|
|
136
150
|
- LICENSE
|
137
151
|
- README.md
|
138
152
|
- Rakefile
|
139
|
-
- ext/tika-app-1.
|
153
|
+
- ext/tika-app-1.19.1.jar
|
154
|
+
- ext/tika-config.xml
|
140
155
|
- lib/ruby_tika_app.rb
|
141
156
|
- ruby_tika_app.gemspec
|
142
157
|
- spec/docs/cnn.com
|
@@ -164,14 +179,14 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
164
179
|
version: '0'
|
165
180
|
requirements: []
|
166
181
|
rubyforge_project: ruby_tika_app
|
167
|
-
rubygems_version: 2.
|
182
|
+
rubygems_version: 2.7.7
|
168
183
|
signing_key:
|
169
184
|
specification_version: 4
|
170
185
|
summary: Wrapper around the tika-app jar
|
171
186
|
test_files:
|
172
|
-
- spec/
|
187
|
+
- spec/ruby_tika_app_spec.rb
|
173
188
|
- spec/docs/graph sampling simplex - 11.pdf
|
189
|
+
- spec/docs/cnn.com
|
174
190
|
- spec/docs/news.ycombinator.com
|
175
|
-
- spec/ruby_tika_app_spec.rb
|
176
|
-
- spec/spec_helper.rb
|
177
191
|
- spec/support/test_server.rb
|
192
|
+
- spec/spec_helper.rb
|