ruby_tika_app 0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -0
- data/.rspec +2 -0
- data/Gemfile +4 -0
- data/HISTORY +7 -0
- data/LICENSE +20 -0
- data/README.textile +47 -0
- data/Rakefile +1 -0
- data/ext/tika-app-0.10.jar +0 -0
- data/lib/ruby_tika_app.rb +78 -0
- data/ruby_tika_app.gemspec +27 -0
- data/spec/docs/graph_sampling_simplex11.pdf +0 -0
- data/spec/ruby_tika_app_spec.rb +86 -0
- data/spec/spec_helper.rb +9 -0
- metadata +124 -0
data/.gitignore
ADDED
data/.rspec
ADDED
data/Gemfile
ADDED
data/HISTORY
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2011 Chris Parker
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.textile
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
h1. Ruby Tika Parser
|
2
|
+
|
3
|
+
h2. Introduction
|
4
|
+
|
5
|
+
This is a simple frontend to the Java Tika parser command line jar / app.
|
6
|
+
|
7
|
+
It is the same as running:
|
8
|
+
|
9
|
+
<pre>
|
10
|
+
java -server -Djava.awt.headless=true -jar tika-app-0.10.jar FileToParse.pdf
|
11
|
+
</pre>
|
12
|
+
|
13
|
+
with options like --xml, --text, etc.
|
14
|
+
|
15
|
+
h2. Installation
|
16
|
+
|
17
|
+
To install, add ruby_tika_app to your @Gemfile@ and run @bundle install@:
|
18
|
+
|
19
|
+
<pre>
|
20
|
+
gem 'ruby_tika_app'
|
21
|
+
</pre>
|
22
|
+
|
23
|
+
h3. Note about installation
|
24
|
+
|
25
|
+
RubyTikaApp is a pretty big gem since it includes the tika-app jarfile.
|
26
|
+
It might take a while to install.
|
27
|
+
|
28
|
+
h2. Usage
|
29
|
+
|
30
|
+
First, you need Java installed, and the @java@ executable needs to be in your $PATH.
|
31
|
+
|
32
|
+
Then:
|
33
|
+
|
34
|
+
<pre>
|
35
|
+
require 'ruby_tika_app'
|
36
|
+
|
37
|
+
rta = RubyTikaApp.new("sample_file.pdf")
|
38
|
+
|
39
|
+
puts rta.to_xml # <xml output>
|
40
|
+
|
41
|
+
# You also get to_html, to_json, to_text, to_text_main, and to_metadata
|
42
|
+
|
43
|
+
</pre>
|
44
|
+
|
45
|
+
h2. Contributing
|
46
|
+
|
47
|
+
Fork on GitHub and after you've committed tested patches, send a pull request.
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
Binary file
|
@@ -0,0 +1,78 @@
|
|
1
|
+
# Based on the rake remote task code
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'stringio'
|
5
|
+
require 'open4'
|
6
|
+
|
7
|
+
# Thin wrapper around the bundled tika-app command line jar.
#
# Each +to_*+ method launches +java -jar tika-app-0.10.jar <option> <document>+
# and returns whatever the process printed on stdout, stripped of
# leading/trailing whitespace.
class RubyTikaApp

  # Base class for every error raised by RubyTikaApp.
  class Error < RuntimeError; end

  # Raised by run_tika when the subprocess writes anything to stderr.
  class CommandFailedError < Error
    # The value given to the constructor. run_tika passes the captured
    # (stripped) stderr text here.
    attr_reader :status

    def initialize(status)
      @status = status
    end
  end

  # document - path (or other single argument accepted by tika-app) of the
  #            document to parse. It is converted with +to_s+ and handed to
  #            the subprocess as ONE argument.
  def initialize(document)
    @document = document

    # Resolve <gem root>/ext/tika-app-0.10.jar relative to this file.
    tika_path = File.expand_path('../ext/tika-app-0.10.jar', File.dirname(__FILE__))

    # Kept as an argument vector rather than a single string so that the
    # command is exec'd directly and never re-parsed by a shell.
    @tika_cmd = ['java', '-server', '-Djava.awt.headless=true', '-jar', tika_path]
  end

  # Returns the output of tika-app run with --xml.
  def to_xml
    run_tika('--xml')
  end

  # Returns the output of tika-app run with --html.
  def to_html
    run_tika('--html')
  end

  # Returns the output of tika-app run with --json.
  def to_json
    run_tika('--json')
  end

  # Returns the output of tika-app run with --text.
  def to_text
    run_tika('--text')
  end

  # Returns the output of tika-app run with --text-main.
  def to_text_main
    run_tika('--text-main')
  end

  # Returns the output of tika-app run with --metadata.
  def to_metadata
    run_tika('--metadata')
  end

  private

  # Runs tika-app with +option+ against the document and returns its stripped
  # stdout.
  #
  # Raises CommandFailedError (with #status set to the stderr text) when the
  # process wrote anything to stderr. Errors raised while launching the
  # process propagate unchanged.
  def run_tika(option)
    argv = @tika_cmd + [option, @document.to_s]
    # Only used for the error message; the process is started from argv.
    final_cmd = argv.join(' ')

    # Passing several arguments makes open4 exec the program directly, so
    # spaces or shell metacharacters in the document name are not interpreted.
    pid, stdin, stdout, stderr = Open4::popen4(*argv)

    # Drain stderr concurrently: reading stdout to EOF first could block
    # forever if the child fills the stderr pipe in the meantime.
    stderr_reader = Thread.new { stderr.read }
    stdout_result = stdout.read.strip
    stderr_result = stderr_reader.value.strip

    unless stderr_result.empty?
      raise(CommandFailedError.new(stderr_result),
            "execution failed with status #{stderr_result}: #{final_cmd}")
    end

    stdout_result
  ensure
    # The handles are nil when popen4 itself raised; closing blindly would
    # replace the real error with a NoMethodError.
    [stdin, stdout, stderr].each { |io| io.close if io && !io.closed? }

    # Reap the child so it does not linger as a zombie.
    if pid
      begin
        Process.waitpid(pid)
      rescue Errno::ECHILD
        # already reaped elsewhere
      end
    end
  end

end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
|
4
|
+
# Gem specification for ruby_tika_app.
Gem::Specification.new do |s|
  s.name        = "ruby_tika_app"
  s.version     = "0.2"
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Chris Parker"]
  s.email       = ["mrcsparker@gmail.com"]
  s.homepage    = "https://github.com/mrcsparker/ruby_tika_app"
  s.summary     = %q{Wrapper around the tika-app jar}
  s.description = %q{Wrapper around the tika-app jar}
  # The bundled LICENSE file carries the MIT license text.
  s.licenses    = ["MIT"]

  s.rubyforge_project = "ruby_tika_app"

  # Tracked files plus the top-level docs; uniq avoids listing the docs twice
  # when git already reports them.
  s.files         = (`git ls-files`.split("\n") +
                     %w(LICENSE README.textile HISTORY)).uniq
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]
  # Single assignment: an earlier `git ls-files -- {test,spec,features}/*`
  # value was always overwritten by this glob, so it has been dropped.
  s.test_files    = Dir.glob('spec/**/*')

  s.add_runtime_dependency("open4")

  s.add_development_dependency("rspec", "~> 2.7.0")
  s.add_development_dependency("bundler", ">= 1.0.15")
end
|
Binary file
|
@@ -0,0 +1,86 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Exercises every RubyTikaApp converter against the sample PDF shipped in
# spec/docs, comparing fixed slices of the output with known text.
describe RubyTikaApp do

  # Sample document used by every example.
  let(:test_file) { File.dirname(__FILE__) + "/docs/graph_sampling_simplex11.pdf" }

  # A fresh wrapper per example.
  let(:rta) { RubyTikaApp.new(test_file) }

  describe "#to_xml" do
    it "header" do
      rta.to_xml[0, 38].should == "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
    end

    it "middle" do
      xml = rta.to_xml
      midpoint = xml.size / 2

      # 101 characters starting at the midpoint
      xml[midpoint, 101].should == "HRW considers all the duplicated nodes as valid nodes.\nThese duplicated nodes make the node distribut"
    end
  end

  describe "#to_html" do
    it "header" do
      rta.to_html[0, 43].should == "<html xmlns=\"http://www.w3.org/1999/xhtml\">"
    end

    it "middle" do
      rta.to_html[1000, 100].should == "ersity of Goettingen, Germany\n3 Department of Computer Science, U.C. Santa Barbara, USA\n4 Deutsche T"
    end
  end

  describe "#to_json" do
    it "header" do
      rta.to_json[0, 43].should == "{ \"Application\":\"\\u0027Certified by IEEE PD"
    end

    it "middle" do
      rta.to_json[100, 50].should == "h\":171510, \n\"Content-Type\":\"application/pdf\", \n\"Cr"
    end
  end

  describe "#to_text" do
    it "header" do
      rta.to_text[0, 43].should == "Understanding Graph Sampling Algorithms\nfor"
    end

    it "middle" do
      rta.to_text[100, 50].should == "n Zhang3, Tianyin Xu2\nLong Jin1, Pan Hui4, Beixing"
    end
  end

  describe "#to_text_main" do
    it "header" do
      rta.to_text_main[0, 43].should == "Understanding Graph Sampling Algorithms for"
    end

    it "middle" do
      rta.to_text_main[100, 50].should == "n Zhang3, Tianyin Xu2 Long Jin1, Pan Hui4, Beixing"
    end
  end

  describe "#to_metadata" do
    it "header" do
      rta.to_metadata[0, 43].should == "Application: 'Certified by IEEE PDFeXpress "
    end

    it "middle" do
      rta.to_metadata[100, 50].should == "Type: application/pdf\nCreation-Date: 2011-03-29T12"
    end
  end

end
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,124 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ruby_tika_app
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 15
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 2
|
9
|
+
version: "0.2"
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Chris Parker
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2011-12-20 00:00:00 Z
|
18
|
+
dependencies:
|
19
|
+
- !ruby/object:Gem::Dependency
|
20
|
+
name: open4
|
21
|
+
prerelease: false
|
22
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
23
|
+
none: false
|
24
|
+
requirements:
|
25
|
+
- - ">="
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
hash: 3
|
28
|
+
segments:
|
29
|
+
- 0
|
30
|
+
version: "0"
|
31
|
+
type: :runtime
|
32
|
+
version_requirements: *id001
|
33
|
+
- !ruby/object:Gem::Dependency
|
34
|
+
name: rspec
|
35
|
+
prerelease: false
|
36
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
37
|
+
none: false
|
38
|
+
requirements:
|
39
|
+
- - ~>
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
hash: 19
|
42
|
+
segments:
|
43
|
+
- 2
|
44
|
+
- 7
|
45
|
+
- 0
|
46
|
+
version: 2.7.0
|
47
|
+
type: :development
|
48
|
+
version_requirements: *id002
|
49
|
+
- !ruby/object:Gem::Dependency
|
50
|
+
name: bundler
|
51
|
+
prerelease: false
|
52
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
53
|
+
none: false
|
54
|
+
requirements:
|
55
|
+
- - ">="
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
hash: 9
|
58
|
+
segments:
|
59
|
+
- 1
|
60
|
+
- 0
|
61
|
+
- 15
|
62
|
+
version: 1.0.15
|
63
|
+
type: :development
|
64
|
+
version_requirements: *id003
|
65
|
+
description: Wrapper around the tika-app jar
|
66
|
+
email:
|
67
|
+
- mrcsparker@gmail.com
|
68
|
+
executables: []
|
69
|
+
|
70
|
+
extensions: []
|
71
|
+
|
72
|
+
extra_rdoc_files: []
|
73
|
+
|
74
|
+
files:
|
75
|
+
- .gitignore
|
76
|
+
- .rspec
|
77
|
+
- Gemfile
|
78
|
+
- HISTORY
|
79
|
+
- LICENSE
|
80
|
+
- README.textile
|
81
|
+
- Rakefile
|
82
|
+
- ext/tika-app-0.10.jar
|
83
|
+
- lib/ruby_tika_app.rb
|
84
|
+
- ruby_tika_app.gemspec
|
85
|
+
- spec/docs/graph_sampling_simplex11.pdf
|
86
|
+
- spec/ruby_tika_app_spec.rb
|
87
|
+
- spec/spec_helper.rb
|
88
|
+
homepage: https://github.com/mrcsparker/ruby_tika_app
|
89
|
+
licenses: []
|
90
|
+
|
91
|
+
post_install_message:
|
92
|
+
rdoc_options: []
|
93
|
+
|
94
|
+
require_paths:
|
95
|
+
- lib
|
96
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
98
|
+
requirements:
|
99
|
+
- - ">="
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
hash: 3
|
102
|
+
segments:
|
103
|
+
- 0
|
104
|
+
version: "0"
|
105
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
106
|
+
none: false
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
hash: 3
|
111
|
+
segments:
|
112
|
+
- 0
|
113
|
+
version: "0"
|
114
|
+
requirements: []
|
115
|
+
|
116
|
+
rubyforge_project: ruby_tika_app
|
117
|
+
rubygems_version: 1.8.12
|
118
|
+
signing_key:
|
119
|
+
specification_version: 3
|
120
|
+
summary: Wrapper around the tika-app jar
|
121
|
+
test_files:
|
122
|
+
- spec/docs/graph_sampling_simplex11.pdf
|
123
|
+
- spec/ruby_tika_app_spec.rb
|
124
|
+
- spec/spec_helper.rb
|