ruby_tika_app 0.3 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +2 -0
- data/.rspec +2 -1
- data/Gemfile +1 -1
- data/Rakefile +1 -1
- data/lib/ruby_tika_app.rb +9 -9
- data/ruby_tika_app.gemspec +13 -11
- data/spec/docs/graph_sampling_simplex11.pdf → data/spec/docs/graph sampling simplex - 11.pdf +0 -0
- data/spec/ruby_tika_app_spec.rb +31 -23
- data/spec/spec_helper.rb +3 -0
- metadata +39 -7
data/.gitignore
CHANGED
data/.rspec
CHANGED
data/Gemfile
CHANGED
data/Rakefile
CHANGED
@@ -1 +1 @@
|
|
1
|
-
require
|
1
|
+
require 'bundler/gem_tasks'
|
data/lib/ruby_tika_app.rb
CHANGED
@@ -10,20 +10,19 @@ class RubyTikaApp
|
|
10
10
|
|
11
11
|
class CommandFailedError < Error
|
12
12
|
attr_reader :status
|
13
|
-
def initialize
|
13
|
+
def initialize(status)
|
14
14
|
@status = status
|
15
15
|
end
|
16
16
|
end
|
17
17
|
|
18
18
|
def initialize(document)
|
19
|
-
|
20
|
-
@document = document
|
19
|
+
@document = "file://#{document}"
|
21
20
|
|
22
21
|
java_cmd = 'java'
|
23
22
|
java_args = '-server -Djava.awt.headless=true'
|
24
23
|
tika_path = "#{File.join(File.dirname(__FILE__))}/../ext/tika-app-1.2.jar"
|
25
24
|
|
26
|
-
@tika_cmd = "#{java_cmd} #{java_args} -jar #{tika_path}"
|
25
|
+
@tika_cmd = "#{java_cmd} #{java_args} -jar '#{tika_path}'"
|
27
26
|
end
|
28
27
|
|
29
28
|
def to_xml
|
@@ -53,17 +52,14 @@ class RubyTikaApp
|
|
53
52
|
private
|
54
53
|
|
55
54
|
def run_tika(option)
|
56
|
-
|
57
|
-
final_cmd = "#{@tika_cmd} #{option} #{@document}"
|
58
|
-
result = []
|
59
|
-
|
55
|
+
final_cmd = "#{@tika_cmd} #{option} '#{@document}'"
|
60
56
|
|
61
57
|
pid, stdin, stdout, stderr = Open4::popen4(final_cmd)
|
62
58
|
|
63
59
|
stdout_result = stdout.read.strip
|
64
60
|
stderr_result = stderr.read.strip
|
65
61
|
|
66
|
-
unless stderr_result.
|
62
|
+
unless strip_stderr(stderr_result).empty?
|
67
63
|
raise(CommandFailedError.new(stderr_result),
|
68
64
|
"execution failed with status #{stderr_result}: #{final_cmd}")
|
69
65
|
end
|
@@ -75,4 +71,8 @@ class RubyTikaApp
|
|
75
71
|
stderr.close
|
76
72
|
end
|
77
73
|
|
74
|
+
def strip_stderr(s)
|
75
|
+
s.gsub(/^(info|warn) - .*$/i, '').strip
|
76
|
+
end
|
77
|
+
|
78
78
|
end
|
data/ruby_tika_app.gemspec
CHANGED
@@ -1,27 +1,29 @@
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
2
|
-
$:.push File.expand_path(
|
2
|
+
$:.push File.expand_path('../lib', __FILE__)
|
3
3
|
|
4
4
|
Gem::Specification.new do |s|
|
5
|
-
s.name =
|
6
|
-
s.version =
|
5
|
+
s.name = 'ruby_tika_app'
|
6
|
+
s.version = '1.0.0'
|
7
7
|
s.platform = Gem::Platform::RUBY
|
8
|
-
s.authors = [
|
9
|
-
s.email =
|
10
|
-
s.homepage =
|
8
|
+
s.authors = ['Chris Parker']
|
9
|
+
s.email = %w(mrcsparker@gmail.com)
|
10
|
+
s.homepage = 'https://github.com/mrcsparker/ruby_tika_app'
|
11
11
|
s.summary = %q{Wrapper around the tika-app jar}
|
12
12
|
s.description = %q{Wrapper around the tika-app jar}
|
13
13
|
|
14
|
-
s.rubyforge_project =
|
14
|
+
s.rubyforge_project = 'ruby_tika_app'
|
15
15
|
|
16
16
|
s.files = `git ls-files`.split("\n") +
|
17
17
|
%w(LICENSE README.textile HISTORY)
|
18
18
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
19
19
|
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
20
|
-
s.require_paths =
|
20
|
+
s.require_paths = %w(lib)
|
21
21
|
s.test_files = Dir.glob('spec/**/*')
|
22
22
|
|
23
|
-
s.add_runtime_dependency(
|
23
|
+
s.add_runtime_dependency('open4')
|
24
24
|
|
25
|
-
s.add_development_dependency(
|
26
|
-
s.add_development_dependency(
|
25
|
+
s.add_development_dependency('rake')
|
26
|
+
s.add_development_dependency('rspec', '~> 2.13.0')
|
27
|
+
s.add_development_dependency('bundler', '>= 1.0.15')
|
28
|
+
s.add_development_dependency('simplecov')
|
27
29
|
end
|
data/spec/docs/graph_sampling_simplex11.pdf → data/spec/docs/graph sampling simplex - 11.pdf
RENAMED
File without changes
|
data/spec/ruby_tika_app_spec.rb
CHANGED
@@ -3,84 +3,92 @@ require 'spec_helper'
|
|
3
3
|
describe RubyTikaApp do
|
4
4
|
|
5
5
|
before(:each) do
|
6
|
-
@test_file = "#{File.join(File.dirname(__FILE__))}/docs/
|
6
|
+
@test_file = "#{File.join(File.dirname(__FILE__))}/docs/graph sampling simplex - 11.pdf"
|
7
7
|
end
|
8
8
|
|
9
|
-
describe
|
10
|
-
it
|
9
|
+
describe 'Error' do
|
10
|
+
it 'has an error' do
|
11
|
+
expect {
|
12
|
+
rta = RubyTikaApp.new('No file')
|
13
|
+
rta.to_xml
|
14
|
+
}.to raise_error
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
describe '#to_xml' do
|
19
|
+
it 'header' do
|
11
20
|
rta = RubyTikaApp.new(@test_file)
|
12
21
|
rta.to_xml[0..37].should == "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
|
13
22
|
end
|
14
23
|
|
15
|
-
it
|
24
|
+
it 'middle' do
|
16
25
|
rta = RubyTikaApp.new(@test_file)
|
17
26
|
xml = rta.to_xml
|
18
27
|
|
19
28
|
xml_size = xml.size / 2
|
20
29
|
|
21
|
-
xml[xml_size..(xml_size + 100)].should == "(Section IV). Besides,\nMHRW performs better in well connected graphs than in\nloosely connected
|
30
|
+
xml[xml_size..(xml_size + 100)].should == "S (Section IV). Besides,\nMHRW performs better in well connected graphs than in\nloosely connected grap"
|
22
31
|
end
|
23
32
|
end
|
24
33
|
|
25
|
-
describe
|
26
|
-
it
|
34
|
+
describe '#to_html' do
|
35
|
+
it 'header' do
|
27
36
|
rta = RubyTikaApp.new(@test_file)
|
28
37
|
rta.to_html[0..42].should == "<html xmlns=\"http://www.w3.org/1999/xhtml\">"
|
29
38
|
end
|
30
39
|
|
31
|
-
it
|
40
|
+
it 'middle' do
|
32
41
|
rta = RubyTikaApp.new(@test_file)
|
33
|
-
rta.to_html[1000 ... 1100].should == "rceName\" content=\"
|
42
|
+
rta.to_html[1000 ... 1100].should == "rceName\" content=\"graph sampling simplex - 11.pdf\"/>\n<meta name=\"Last-Save-Date\" content=\"2011-03-29"
|
34
43
|
end
|
35
44
|
end
|
36
45
|
|
37
|
-
describe
|
38
|
-
it
|
46
|
+
describe '#to_json' do
|
47
|
+
it 'header' do
|
39
48
|
rta = RubyTikaApp.new(@test_file)
|
40
49
|
rta.to_json[0..42].should == "{ \"Application\":\"\\u0027Certified by IEEE PD"
|
41
50
|
end
|
42
51
|
|
43
|
-
it
|
52
|
+
it 'middle' do
|
44
53
|
rta = RubyTikaApp.new(@test_file)
|
45
54
|
rta.to_json[100 ... 150].should == "h\":171510, \n\"Content-Type\":\"application/pdf\", \n\"Cr"
|
46
55
|
end
|
47
56
|
end
|
48
57
|
|
49
|
-
describe
|
50
|
-
it
|
58
|
+
describe '#to_text' do
|
59
|
+
it 'header' do
|
51
60
|
rta = RubyTikaApp.new(@test_file)
|
52
61
|
rta.to_text[0..42].should == "Understanding Graph Sampling Algorithms\nfor"
|
53
62
|
end
|
54
63
|
|
55
|
-
it
|
64
|
+
it 'middle' do
|
56
65
|
rta = RubyTikaApp.new(@test_file)
|
57
66
|
rta.to_text[100 ... 150].should == "n Zhang3, Tianyin Xu2\n\nLong Jin1, Pan Hui4, Beixin"
|
58
67
|
end
|
59
68
|
end
|
60
69
|
|
61
|
-
describe
|
62
|
-
it
|
70
|
+
describe '#to_text_main' do
|
71
|
+
it 'header' do
|
63
72
|
rta = RubyTikaApp.new(@test_file)
|
64
|
-
rta.to_text_main[0..42].should ==
|
73
|
+
rta.to_text_main[0..42].should == 'Understanding Graph Sampling Algorithms for'
|
65
74
|
end
|
66
75
|
|
67
|
-
it
|
76
|
+
it 'middle' do
|
68
77
|
rta = RubyTikaApp.new(@test_file)
|
69
78
|
rta.to_text_main[100 ... 150].should == "n Zhang3, Tianyin Xu2\nLong Jin1, Pan Hui4, Beixing"
|
70
79
|
end
|
71
80
|
end
|
72
81
|
|
73
|
-
describe
|
74
|
-
it
|
82
|
+
describe '#to_metadata' do
|
83
|
+
it 'header' do
|
75
84
|
rta = RubyTikaApp.new(@test_file)
|
76
85
|
rta.to_metadata[0..42].should == "Application: 'Certified by IEEE PDFeXpress "
|
77
86
|
end
|
78
87
|
|
79
|
-
it
|
88
|
+
it 'middle' do
|
80
89
|
rta = RubyTikaApp.new(@test_file)
|
81
90
|
rta.to_metadata[100 ... 150].should == "Type: application/pdf\nCreation-Date: 2011-03-29T12"
|
82
91
|
end
|
83
|
-
|
84
92
|
end
|
85
93
|
|
86
94
|
end
|
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ruby_tika_app
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 1.0.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2013-03-24 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: open4
|
@@ -27,6 +27,22 @@ dependencies:
|
|
27
27
|
- - ! '>='
|
28
28
|
- !ruby/object:Gem::Version
|
29
29
|
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: rake
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :development
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
30
46
|
- !ruby/object:Gem::Dependency
|
31
47
|
name: rspec
|
32
48
|
requirement: !ruby/object:Gem::Requirement
|
@@ -34,7 +50,7 @@ dependencies:
|
|
34
50
|
requirements:
|
35
51
|
- - ~>
|
36
52
|
- !ruby/object:Gem::Version
|
37
|
-
version: 2.
|
53
|
+
version: 2.13.0
|
38
54
|
type: :development
|
39
55
|
prerelease: false
|
40
56
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -42,7 +58,7 @@ dependencies:
|
|
42
58
|
requirements:
|
43
59
|
- - ~>
|
44
60
|
- !ruby/object:Gem::Version
|
45
|
-
version: 2.
|
61
|
+
version: 2.13.0
|
46
62
|
- !ruby/object:Gem::Dependency
|
47
63
|
name: bundler
|
48
64
|
requirement: !ruby/object:Gem::Requirement
|
@@ -59,6 +75,22 @@ dependencies:
|
|
59
75
|
- - ! '>='
|
60
76
|
- !ruby/object:Gem::Version
|
61
77
|
version: 1.0.15
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: simplecov
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ! '>='
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: '0'
|
86
|
+
type: :development
|
87
|
+
prerelease: false
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
62
94
|
description: Wrapper around the tika-app jar
|
63
95
|
email:
|
64
96
|
- mrcsparker@gmail.com
|
@@ -76,7 +108,7 @@ files:
|
|
76
108
|
- ext/tika-app-1.2.jar
|
77
109
|
- lib/ruby_tika_app.rb
|
78
110
|
- ruby_tika_app.gemspec
|
79
|
-
- spec/docs/
|
111
|
+
- spec/docs/graph sampling simplex - 11.pdf
|
80
112
|
- spec/ruby_tika_app_spec.rb
|
81
113
|
- spec/spec_helper.rb
|
82
114
|
homepage: https://github.com/mrcsparker/ruby_tika_app
|
@@ -99,11 +131,11 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
99
131
|
version: '0'
|
100
132
|
requirements: []
|
101
133
|
rubyforge_project: ruby_tika_app
|
102
|
-
rubygems_version: 1.8.
|
134
|
+
rubygems_version: 1.8.25
|
103
135
|
signing_key:
|
104
136
|
specification_version: 3
|
105
137
|
summary: Wrapper around the tika-app jar
|
106
138
|
test_files:
|
107
|
-
- spec/docs/
|
139
|
+
- spec/docs/graph sampling simplex - 11.pdf
|
108
140
|
- spec/ruby_tika_app_spec.rb
|
109
141
|
- spec/spec_helper.rb
|