ruby_tika_app 0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +4 -0
- data/.rspec +2 -0
- data/Gemfile +4 -0
- data/HISTORY +7 -0
- data/LICENSE +20 -0
- data/README.textile +47 -0
- data/Rakefile +1 -0
- data/ext/tika-app-0.10.jar +0 -0
- data/lib/ruby_tika_app.rb +78 -0
- data/ruby_tika_app.gemspec +27 -0
- data/spec/docs/graph_sampling_simplex11.pdf +0 -0
- data/spec/ruby_tika_app_spec.rb +86 -0
- data/spec/spec_helper.rb +9 -0
- metadata +124 -0
data/.gitignore
ADDED
data/.rspec
ADDED
data/Gemfile
ADDED
data/HISTORY
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2011 Chris Parker
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.textile
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
h1. Ruby Tika Parser
|
2
|
+
|
3
|
+
h2. Introduction
|
4
|
+
|
5
|
+
This is a simple frontend to the Java Tika parser command line jar / app.
|
6
|
+
|
7
|
+
It is the same as running:
|
8
|
+
|
9
|
+
<pre>
|
10
|
+
java -server -Djava.awt.headless=true -jar tika-app-0.10.jar FileToParse.pdf
|
11
|
+
</pre>
|
12
|
+
|
13
|
+
with options like --xml, --text, etc.
|
14
|
+
|
15
|
+
h2. Installation
|
16
|
+
|
17
|
+
To install, add ruby_tika_app to your @Gemfile@ and run @bundle install@:
|
18
|
+
|
19
|
+
<pre>
|
20
|
+
gem 'ruby_tika_app'
|
21
|
+
</pre>
|
22
|
+
|
23
|
+
h3. Note about installation
|
24
|
+
|
25
|
+
RubyTikaApp is a pretty big gem since it bundles the Apache Tika @tika-app@ jarfile.
|
26
|
+
It might take a while to install.
|
27
|
+
|
28
|
+
h2. Usage
|
29
|
+
|
30
|
+
First, you need Java installed, and the @java@ executable needs to be in your $PATH.
|
31
|
+
|
32
|
+
Then:
|
33
|
+
|
34
|
+
<pre>
|
35
|
+
require 'ruby_tika_app'
|
36
|
+
|
37
|
+
rta = RubyTikaApp.new("sample_file.pdf")
|
38
|
+
|
39
|
+
puts rta.to_xml # <xml output>
|
40
|
+
|
41
|
+
# You also get to_html, to_json, to_text, to_text_main, and to_metadata
|
42
|
+
|
43
|
+
</pre>
|
44
|
+
|
45
|
+
h2. Contributing
|
46
|
+
|
47
|
+
Fork on GitHub and after you've committed tested patches, send a pull request.
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
Binary file
|
@@ -0,0 +1,78 @@
|
|
1
|
+
# Based on the rake remote task code
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'stringio'
|
5
|
+
require 'open4'
|
6
|
+
|
7
|
+
# Thin wrapper around the bundled tika-app jar. Each +to_*+ method launches
# +java -jar tika-app-0.10.jar <option> <document>+ and returns the text the
# process wrote to stdout.
class RubyTikaApp

  # Base class for every error raised by RubyTikaApp.
  class Error < RuntimeError; end

  # Raised by the conversion methods when the Tika process wrote anything to
  # stderr.
  class CommandFailedError < Error
    # The value handed to the constructor. run_tika passes the stripped
    # stderr text of the failed command (not a numeric exit status).
    attr_reader :status

    def initialize(status)
      @status = status
    end
  end

  # document - path of the file to hand to Tika. It is converted with +to_s+
  #            and passed to the process as a single argument.
  def initialize(document)
    @document = document

    tika_path = File.expand_path('../ext/tika-app-0.10.jar', File.dirname(__FILE__))

    # Kept as an argument vector rather than one string so the command is
    # exec'd directly and never parsed by a shell: paths containing spaces
    # or shell metacharacters reach Tika unchanged.
    @tika_cmd = ['java', '-server', '-Djava.awt.headless=true', '-jar', tika_path]
  end

  # Returns the output of Tika's --xml mode.
  def to_xml
    run_tika('--xml')
  end

  # Returns the output of Tika's --html mode.
  def to_html
    run_tika('--html')
  end

  # Returns the output of Tika's --json mode.
  def to_json
    run_tika('--json')
  end

  # Returns the output of Tika's --text mode.
  def to_text
    run_tika('--text')
  end

  # Returns the output of Tika's --text-main mode.
  def to_text_main
    run_tika('--text-main')
  end

  # Returns the output of Tika's --metadata mode.
  def to_metadata
    run_tika('--metadata')
  end

  private

  # Runs Tika with the given command line option against the document.
  #
  # Returns the process's stdout with surrounding whitespace stripped.
  # Raises CommandFailedError (status = stripped stderr text) when the
  # process wrote anything to stderr.
  def run_tika(option)
    argv = @tika_cmd + [option, @document.to_s]
    stdout_result = ''
    stderr_result = ''

    # Block form: open4 waits for the child and closes the pipes itself, so
    # no zombie process is left behind and a failed spawn is not masked by
    # cleanup code touching pipes that were never opened.
    Open4.popen4(*argv) do |_pid, stdin, stdout, stderr|
      stdin.close

      # Drain stderr concurrently; reading the two pipes one after the other
      # can deadlock once the child fills the pipe that is not being read.
      stderr_reader = Thread.new { stderr.read }

      stdout_result = stdout.read.strip
      stderr_result = stderr_reader.value.strip
    end

    unless stderr_result.empty?
      raise(CommandFailedError.new(stderr_result),
            "execution failed with status #{stderr_result}: #{argv.join(' ')}")
    end

    stdout_result
  end

end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
|
4
|
+
# Gem specification for ruby_tika_app: a wrapper around the tika-app jar
# that ships inside the gem under ext/.
Gem::Specification.new do |s|
  s.name        = "ruby_tika_app"
  s.version     = "0.2"
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Chris Parker"]
  s.email       = ["mrcsparker@gmail.com"]
  s.homepage    = "https://github.com/mrcsparker/ruby_tika_app"
  s.summary     = %q{Wrapper around the tika-app jar}
  s.description = %q{Wrapper around the tika-app jar}
  # The LICENSE file shipped with the gem is the MIT license text.
  s.licenses    = ["MIT"]

  s.rubyforge_project = "ruby_tika_app"

  # Tracked files plus the top-level docs; uniq because git normally lists
  # those docs already.
  s.files         = (`git ls-files`.split("\n") +
                     %w(LICENSE README.textile HISTORY)).uniq
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]
  # Single assignment: an earlier git-based value used to be overwritten by
  # this glob, so only the glob ever took effect.
  s.test_files    = Dir.glob('spec/**/*')

  s.add_runtime_dependency("open4")

  s.add_development_dependency("rspec", "~> 2.7.0")
  s.add_development_dependency("bundler", ">= 1.0.15")
end
|
Binary file
|
@@ -0,0 +1,86 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Runs every conversion method against the sample PDF stored next to this
# spec and compares fixed slices of the output with known text.
describe RubyTikaApp do

  # Sample document under spec/docs.
  let(:test_file) { File.join(File.dirname(__FILE__), 'docs/graph_sampling_simplex11.pdf') }

  # A fresh wrapper is built for each example that uses it.
  let(:rta) { RubyTikaApp.new(test_file) }

  describe "#to_xml" do
    it "header" do
      rta.to_xml[0, 38].should == "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
    end

    it "middle" do
      xml = rta.to_xml
      midpoint = xml.size / 2

      # 101 characters starting at the midpoint of the output.
      xml[midpoint, 101].should == "HRW considers all the duplicated nodes as valid nodes.\nThese duplicated nodes make the node distribut"
    end
  end

  describe "#to_html" do
    it "header" do
      rta.to_html[0, 43].should == "<html xmlns=\"http://www.w3.org/1999/xhtml\">"
    end

    it "middle" do
      rta.to_html[1000, 100].should == "ersity of Goettingen, Germany\n3 Department of Computer Science, U.C. Santa Barbara, USA\n4 Deutsche T"
    end
  end

  describe "#to_json" do
    it "header" do
      rta.to_json[0, 43].should == "{ \"Application\":\"\\u0027Certified by IEEE PD"
    end

    it "middle" do
      rta.to_json[100, 50].should == "h\":171510, \n\"Content-Type\":\"application/pdf\", \n\"Cr"
    end
  end

  describe "#to_text" do
    it "header" do
      rta.to_text[0, 43].should == "Understanding Graph Sampling Algorithms\nfor"
    end

    it "middle" do
      rta.to_text[100, 50].should == "n Zhang3, Tianyin Xu2\nLong Jin1, Pan Hui4, Beixing"
    end
  end

  describe "#to_text_main" do
    it "header" do
      rta.to_text_main[0, 43].should == "Understanding Graph Sampling Algorithms for"
    end

    it "middle" do
      rta.to_text_main[100, 50].should == "n Zhang3, Tianyin Xu2 Long Jin1, Pan Hui4, Beixing"
    end
  end

  describe "#to_metadata" do
    it "header" do
      rta.to_metadata[0, 43].should == "Application: 'Certified by IEEE PDFeXpress "
    end

    it "middle" do
      rta.to_metadata[100, 50].should == "Type: application/pdf\nCreation-Date: 2011-03-29T12"
    end
  end

end
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,124 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ruby_tika_app
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 15
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 2
|
9
|
+
version: "0.2"
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Chris Parker
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2011-12-20 00:00:00 Z
|
18
|
+
dependencies:
|
19
|
+
- !ruby/object:Gem::Dependency
|
20
|
+
name: open4
|
21
|
+
prerelease: false
|
22
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
23
|
+
none: false
|
24
|
+
requirements:
|
25
|
+
- - ">="
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
hash: 3
|
28
|
+
segments:
|
29
|
+
- 0
|
30
|
+
version: "0"
|
31
|
+
type: :runtime
|
32
|
+
version_requirements: *id001
|
33
|
+
- !ruby/object:Gem::Dependency
|
34
|
+
name: rspec
|
35
|
+
prerelease: false
|
36
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
37
|
+
none: false
|
38
|
+
requirements:
|
39
|
+
- - ~>
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
hash: 19
|
42
|
+
segments:
|
43
|
+
- 2
|
44
|
+
- 7
|
45
|
+
- 0
|
46
|
+
version: 2.7.0
|
47
|
+
type: :development
|
48
|
+
version_requirements: *id002
|
49
|
+
- !ruby/object:Gem::Dependency
|
50
|
+
name: bundler
|
51
|
+
prerelease: false
|
52
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
53
|
+
none: false
|
54
|
+
requirements:
|
55
|
+
- - ">="
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
hash: 9
|
58
|
+
segments:
|
59
|
+
- 1
|
60
|
+
- 0
|
61
|
+
- 15
|
62
|
+
version: 1.0.15
|
63
|
+
type: :development
|
64
|
+
version_requirements: *id003
|
65
|
+
description: Wrapper around the tika-app jar
|
66
|
+
email:
|
67
|
+
- mrcsparker@gmail.com
|
68
|
+
executables: []
|
69
|
+
|
70
|
+
extensions: []
|
71
|
+
|
72
|
+
extra_rdoc_files: []
|
73
|
+
|
74
|
+
files:
|
75
|
+
- .gitignore
|
76
|
+
- .rspec
|
77
|
+
- Gemfile
|
78
|
+
- HISTORY
|
79
|
+
- LICENSE
|
80
|
+
- README.textile
|
81
|
+
- Rakefile
|
82
|
+
- ext/tika-app-0.10.jar
|
83
|
+
- lib/ruby_tika_app.rb
|
84
|
+
- ruby_tika_app.gemspec
|
85
|
+
- spec/docs/graph_sampling_simplex11.pdf
|
86
|
+
- spec/ruby_tika_app_spec.rb
|
87
|
+
- spec/spec_helper.rb
|
88
|
+
homepage: https://github.com/mrcsparker/ruby_tika_app
|
89
|
+
licenses: []
|
90
|
+
|
91
|
+
post_install_message:
|
92
|
+
rdoc_options: []
|
93
|
+
|
94
|
+
require_paths:
|
95
|
+
- lib
|
96
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
98
|
+
requirements:
|
99
|
+
- - ">="
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
hash: 3
|
102
|
+
segments:
|
103
|
+
- 0
|
104
|
+
version: "0"
|
105
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
106
|
+
none: false
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
hash: 3
|
111
|
+
segments:
|
112
|
+
- 0
|
113
|
+
version: "0"
|
114
|
+
requirements: []
|
115
|
+
|
116
|
+
rubyforge_project: ruby_tika_app
|
117
|
+
rubygems_version: 1.8.12
|
118
|
+
signing_key:
|
119
|
+
specification_version: 3
|
120
|
+
summary: Wrapper around the tika-app jar
|
121
|
+
test_files:
|
122
|
+
- spec/docs/graph_sampling_simplex11.pdf
|
123
|
+
- spec/ruby_tika_app_spec.rb
|
124
|
+
- spec/spec_helper.rb
|