ruby_tika_app 0.2

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format documentation
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Fetch gems over TLS so that downloads cannot be tampered with in transit.
source "https://rubygems.org"

# Specify your gem's dependencies in ruby_tika_app.gemspec
gemspec
data/HISTORY ADDED
@@ -0,0 +1,7 @@
1
+ 0.2 - November 30, 2011
2
+ * Fixed open4 bundler issue - a file that needs open4 was being required before the open4 dependency was declared with add_dependency
3
+ * Added README info, HISTORY
4
+ * Added more tests
5
+
6
+ 0.1 - November 29, 2011
7
+ * Initial release
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2011 Chris Parker
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.textile ADDED
@@ -0,0 +1,47 @@
1
+ h1. Ruby Tika Parser
2
+
3
+ h2. Introduction
4
+
5
+ This is a simple Ruby front end to the command-line application (the tika-app jar) of the Java Tika parser.
6
+
7
+ It is the same as running:
8
+
9
+ <pre>
10
+ java -server -Djava.awt.headless=true -jar tika-app-0.10.jar FileToParse.pdf
11
+ </pre>
12
+
13
+ with options like --xml, --text, etc.
14
+
15
+ h2. Installation
16
+
17
+ To install, add ruby_tika_app to your @Gemfile@ and run @bundle install@:
18
+
19
+ <pre>
20
+ gem 'ruby_tika_app'
21
+ </pre>
22
+
23
+ h3. Note about installation
24
+
25
+ RubyTikaApp is a pretty big gem since it includes the tika-app jar file.
26
+ It might take a while to install.
27
+
28
+ h2. Usage
29
+
30
+ First, you need Java installed, and the @java@ executable needs to be in your $PATH.
31
+
32
+ Then:
33
+
34
+ <pre>
35
+ require 'ruby_tika_app'
36
+
37
+ rta = RubyTikaApp.new("sample_file.pdf")
38
+
39
+ puts rta.to_xml # <xml output>
40
+
41
+ # You also get to_html, to_json, to_text, to_text_main, and to_metadata
42
+
43
+ </pre>
44
+
45
+ h2. Contributing
46
+
47
+ Fork on GitHub and after you've committed tested patches, send a pull request.
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
Binary file
@@ -0,0 +1,78 @@
1
+ # Based on the rake remote task code
2
+
3
+ require 'rubygems'
4
+ require 'stringio'
5
+ require 'open4'
6
+
7
# Thin Ruby front end to the bundled tika-app command-line jar.
#
# Every +to_*+ method starts a new +java -jar tika-app-0.10.jar+ process for
# the document passed to the constructor and returns whatever that process
# printed on standard output, with surrounding whitespace stripped.
class RubyTikaApp

  # Base class for all errors raised by RubyTikaApp.
  class Error < RuntimeError; end

  # Raised when the Tika process writes to standard error or exits with a
  # non-zero status.
  class CommandFailedError < Error
    # The text Tika wrote to standard error, or "exit code N" when the
    # process failed without writing anything there.
    attr_reader :status

    def initialize(status)
      @status = status
    end
  end

  # document - path of the file to hand to Tika. It is passed to the JVM as a
  #            single argument, so it may contain spaces or shell
  #            metacharacters.
  def initialize(document)
    @document = document

    # Absolute path, so the jar is still found after the caller changes the
    # working directory.
    tika_path = File.expand_path('../../ext/tika-app-0.10.jar', __FILE__)

    # Kept as an argument vector rather than one string so that no shell is
    # involved when the command is executed.
    @tika_cmd = ['java', '-server', '-Djava.awt.headless=true', '-jar', tika_path]
  end

  # Returns the output of running Tika with --xml.
  def to_xml
    run_tika('--xml')
  end

  # Returns the output of running Tika with --html.
  def to_html
    run_tika('--html')
  end

  # Returns the output of running Tika with --json.
  def to_json
    run_tika('--json')
  end

  # Returns the output of running Tika with --text.
  def to_text
    run_tika('--text')
  end

  # Returns the output of running Tika with --text-main.
  def to_text_main
    run_tika('--text-main')
  end

  # Returns the output of running Tika with --metadata.
  def to_metadata
    run_tika('--metadata')
  end

  private

  # Builds the full argument vector for one Tika invocation:
  # java, JVM flags, -jar <jar>, the output option and the document path.
  def tika_command(option)
    @tika_cmd + [option, @document.to_s]
  end

  # Runs Tika with the given output option and returns its stripped stdout.
  #
  # Raises CommandFailedError when the process writes anything to stderr
  # or exits with a non-zero status.
  def run_tika(option)
    argv = tika_command(option)

    # Passing several arguments (instead of one string) bypasses the shell.
    pid, stdin, stdout, stderr = Open4.popen4(*argv)

    # Nothing is ever written to Tika; closing stdin right away keeps the
    # child from waiting on input it will never receive.
    stdin.close

    # Drain stderr concurrently: reading stdout to EOF first can deadlock
    # once the child has filled the stderr pipe.
    stderr_reader = Thread.new { stderr.read }
    stdout_result = stdout.read.strip
    stderr_result = stderr_reader.value.strip

    # Reap the child so it does not linger as a zombie, and keep its status.
    exit_status = Process.waitpid2(pid).last

    if !stderr_result.empty? || !exit_status.success?
      failure = stderr_result.empty? ? "exit code #{exit_status.exitstatus}" : stderr_result
      raise(CommandFailedError.new(failure),
            "execution failed with status #{failure}: #{argv.join(' ')}")
    end

    stdout_result
  ensure
    # The streams are nil when popen4 itself raised; never mask that error.
    [stdin, stdout, stderr].each { |io| io.close if io && !io.closed? }
  end

end
@@ -0,0 +1,27 @@
1
+ # -*- encoding: utf-8 -*-
2
$:.push File.expand_path("../lib", __FILE__)

# Gem specification for ruby_tika_app: a wrapper around the bundled
# tika-app jar.
Gem::Specification.new do |s|
  s.name        = "ruby_tika_app"
  s.version     = "0.2"
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Chris Parker"]
  s.email       = ["mrcsparker@gmail.com"]
  s.homepage    = "https://github.com/mrcsparker/ruby_tika_app"
  s.summary     = %q{Wrapper around the tika-app jar}
  s.description = %q{Wrapper around the tika-app jar}
  # The bundled LICENSE file is the MIT license.
  s.licenses    = ["MIT"]

  # Everything tracked by git, plus the top-level documents; uniq because the
  # documents are normally tracked by git already.
  s.files         = (`git ls-files`.split("\n") +
                     %w(LICENSE README.textile HISTORY)).uniq
  s.executables   = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.require_paths = ["lib"]
  # Single assignment: an earlier `git ls-files` based value was always
  # overwritten by this glob.
  s.test_files    = Dir.glob('spec/**/*')

  s.add_runtime_dependency("open4")

  s.add_development_dependency("rspec", "~> 2.7.0")
  s.add_development_dependency("bundler", ">= 1.0.15")
end
@@ -0,0 +1,86 @@
1
+ require 'spec_helper'
2
+
3
# Exercises every RubyTikaApp output format against the sample PDF in
# spec/docs, comparing fixed slices of the output with known text.
describe RubyTikaApp do

  # Path of the sample document, built relative to this spec file.
  let(:sample_pdf) { "#{File.dirname(__FILE__)}/docs/graph_sampling_simplex11.pdf" }

  # A fresh wrapper around the sample document for each example.
  let(:tika) { RubyTikaApp.new(sample_pdf) }

  describe "#to_xml" do
    it "header" do
      declaration = tika.to_xml[0, 38]
      declaration.should == "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
    end

    it "middle" do
      output = tika.to_xml
      midpoint = output.size / 2

      # 101 characters starting at the midpoint of the document.
      output[midpoint, 101].should == "HRW considers all the duplicated nodes as valid nodes.\nThese duplicated nodes make the node distribut"
    end
  end

  describe "#to_html" do
    it "header" do
      opening_tag = tika.to_html[0, 43]
      opening_tag.should == "<html xmlns=\"http://www.w3.org/1999/xhtml\">"
    end

    it "middle" do
      excerpt = tika.to_html[1000, 100]
      excerpt.should == "ersity of Goettingen, Germany\n3 Department of Computer Science, U.C. Santa Barbara, USA\n4 Deutsche T"
    end
  end

  describe "#to_json" do
    it "header" do
      opening = tika.to_json[0, 43]
      opening.should == "{ \"Application\":\"\\u0027Certified by IEEE PD"
    end

    it "middle" do
      excerpt = tika.to_json[100, 50]
      excerpt.should == "h\":171510, \n\"Content-Type\":\"application/pdf\", \n\"Cr"
    end
  end

  describe "#to_text" do
    it "header" do
      opening = tika.to_text[0, 43]
      opening.should == "Understanding Graph Sampling Algorithms\nfor"
    end

    it "middle" do
      excerpt = tika.to_text[100, 50]
      excerpt.should == "n Zhang3, Tianyin Xu2\nLong Jin1, Pan Hui4, Beixing"
    end
  end

  describe "#to_text_main" do
    it "header" do
      opening = tika.to_text_main[0, 43]
      opening.should == "Understanding Graph Sampling Algorithms for"
    end

    it "middle" do
      excerpt = tika.to_text_main[100, 50]
      excerpt.should == "n Zhang3, Tianyin Xu2 Long Jin1, Pan Hui4, Beixing"
    end
  end

  describe "#to_metadata" do
    it "header" do
      opening = tika.to_metadata[0, 43]
      opening.should == "Application: 'Certified by IEEE PDFeXpress "
    end

    it "middle" do
      excerpt = tika.to_metadata[100, 50]
      excerpt.should == "Type: application/pdf\nCreation-Date: 2011-03-29T12"
    end
  end

end
@@ -0,0 +1,9 @@
1
+ require 'rubygems'
2
+ require 'bundler/setup'
3
+
4
+ require 'ruby_tika_app'
5
+ require 'rspec'
6
+
7
+ RSpec.configure do |config|
8
+
9
+ end
metadata ADDED
@@ -0,0 +1,124 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ruby_tika_app
3
+ version: !ruby/object:Gem::Version
4
+ hash: 15
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 2
9
+ version: "0.2"
10
+ platform: ruby
11
+ authors:
12
+ - Chris Parker
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2011-12-20 00:00:00 Z
18
+ dependencies:
19
+ - !ruby/object:Gem::Dependency
20
+ name: open4
21
+ prerelease: false
22
+ requirement: &id001 !ruby/object:Gem::Requirement
23
+ none: false
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ hash: 3
28
+ segments:
29
+ - 0
30
+ version: "0"
31
+ type: :runtime
32
+ version_requirements: *id001
33
+ - !ruby/object:Gem::Dependency
34
+ name: rspec
35
+ prerelease: false
36
+ requirement: &id002 !ruby/object:Gem::Requirement
37
+ none: false
38
+ requirements:
39
+ - - ~>
40
+ - !ruby/object:Gem::Version
41
+ hash: 19
42
+ segments:
43
+ - 2
44
+ - 7
45
+ - 0
46
+ version: 2.7.0
47
+ type: :development
48
+ version_requirements: *id002
49
+ - !ruby/object:Gem::Dependency
50
+ name: bundler
51
+ prerelease: false
52
+ requirement: &id003 !ruby/object:Gem::Requirement
53
+ none: false
54
+ requirements:
55
+ - - ">="
56
+ - !ruby/object:Gem::Version
57
+ hash: 9
58
+ segments:
59
+ - 1
60
+ - 0
61
+ - 15
62
+ version: 1.0.15
63
+ type: :development
64
+ version_requirements: *id003
65
+ description: Wrapper around the tika-app jar
66
+ email:
67
+ - mrcsparker@gmail.com
68
+ executables: []
69
+
70
+ extensions: []
71
+
72
+ extra_rdoc_files: []
73
+
74
+ files:
75
+ - .gitignore
76
+ - .rspec
77
+ - Gemfile
78
+ - HISTORY
79
+ - LICENSE
80
+ - README.textile
81
+ - Rakefile
82
+ - ext/tika-app-0.10.jar
83
+ - lib/ruby_tika_app.rb
84
+ - ruby_tika_app.gemspec
85
+ - spec/docs/graph_sampling_simplex11.pdf
86
+ - spec/ruby_tika_app_spec.rb
87
+ - spec/spec_helper.rb
88
+ homepage: https://github.com/mrcsparker/ruby_tika_app
89
+ licenses: []
90
+
91
+ post_install_message:
92
+ rdoc_options: []
93
+
94
+ require_paths:
95
+ - lib
96
+ required_ruby_version: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ">="
100
+ - !ruby/object:Gem::Version
101
+ hash: 3
102
+ segments:
103
+ - 0
104
+ version: "0"
105
+ required_rubygems_version: !ruby/object:Gem::Requirement
106
+ none: false
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ hash: 3
111
+ segments:
112
+ - 0
113
+ version: "0"
114
+ requirements: []
115
+
116
+ rubyforge_project: ruby_tika_app
117
+ rubygems_version: 1.8.12
118
+ signing_key:
119
+ specification_version: 3
120
+ summary: Wrapper around the tika-app jar
121
+ test_files:
122
+ - spec/docs/graph_sampling_simplex11.pdf
123
+ - spec/ruby_tika_app_spec.rb
124
+ - spec/spec_helper.rb