ruby_tika_app 0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format documentation
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Fetch gems over TLS so that downloads cannot be tampered with in transit.
source "https://rubygems.org"

# Specify your gem's dependencies in ruby_tika_app.gemspec
gemspec
data/HISTORY ADDED
@@ -0,0 +1,7 @@
1
+ 0.2 - November 30, 2011
2
+ * Fixed open4 bundler issue - file was getting required that needed open4 before add_dependency
3
+ * Added README info, HISTORY
4
+ * Added more tests
5
+
6
+ 0.1 - November 29, 2011
7
+ * Initial release
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2011 Chris Parker
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.textile ADDED
@@ -0,0 +1,47 @@
1
+ h1. Ruby Tika Parser
2
+
3
+ h2. Introduction
4
+
5
+ This is a simple frontend to the Java Tika parser command line jar / app.
6
+
7
+ It is the same as running:
8
+
9
+ <pre>
10
+ java -server -Djava.awt.headless=true -jar tika-app-0.10.jar FileToParse.pdf
11
+ </pre>
12
+
13
+ with options like --xml, --text, etc.
14
+
15
+ h2. Installation
16
+
17
+ To install, add ruby_tika_app to your @Gemfile@ and run @bundle install@:
18
+
19
+ <pre>
20
+ gem 'ruby_tika_app'
21
+ </pre>
22
+
23
+ h3. Note about installation
24
+
25
+ RubyTikaApp is a pretty big gem since it includes the tika-app jarfile.
26
+ It might take a while to install.
27
+
28
+ h2. Usage
29
+
30
+ First, you need Java installed, and the @java@ executable needs to be in your $PATH.
31
+
32
+ Then:
33
+
34
+ <pre>
35
+ require 'ruby_tika_app'
36
+
37
+ rta = RubyTikaApp.new("sample_file.pdf")
38
+
39
+ puts rta.to_xml # <xml output>
40
+
41
+ # You also get to_html, to_json, to_text, to_text_main, and to_metadata
42
+
43
+ </pre>
44
+
45
+ h2. Contributing
46
+
47
+ Fork on GitHub and after you've committed tested patches, send a pull request.
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
Binary file
@@ -0,0 +1,78 @@
1
+ # Based on the rake remote task code
2
+
3
+ require 'rubygems'
4
+ require 'stringio'
5
+ require 'open4'
6
+
7
# Thin wrapper around the bundled tika-app command line jar.
#
# Every +to_*+ method starts a new +java+ process through Open4, passes the
# matching Tika output switch plus the document given to the constructor,
# and returns what the process wrote to standard output (stripped).
class RubyTikaApp

  # Base class for all errors raised by RubyTikaApp.
  class Error < RuntimeError; end

  # Raised when a Tika run is considered failed.
  class CommandFailedError < Error
    # What the failed run reported: the text the process wrote to stderr,
    # or its numeric exit status when it failed without writing to stderr.
    attr_reader :status

    def initialize(status)
      super()
      @status = status
    end
  end

  # document - path of the file handed to Tika as its last argument.
  def initialize(document)
    @document = document

    tika_path = File.expand_path('../ext/tika-app-0.10.jar', File.dirname(__FILE__))

    # Kept as an argument vector rather than one interpolated string so the
    # command is exec'ed directly and no argument is re-parsed by a shell.
    @tika_cmd = ['java', '-server', '-Djava.awt.headless=true', '-jar', tika_path]
  end

  # Returns the output of Tika's --xml mode.
  def to_xml
    run_tika('--xml')
  end

  # Returns the output of Tika's --html mode.
  def to_html
    run_tika('--html')
  end

  # Returns the output of Tika's --json mode.
  def to_json
    run_tika('--json')
  end

  # Returns the output of Tika's --text mode.
  def to_text
    run_tika('--text')
  end

  # Returns the output of Tika's --text-main mode.
  def to_text_main
    run_tika('--text-main')
  end

  # Returns the output of Tika's --metadata mode.
  def to_metadata
    run_tika('--metadata')
  end

  private

  # Runs the Tika jar with +option+ against the document.
  #
  # Returns the process's standard output with surrounding whitespace removed.
  # Raises CommandFailedError when the process writes anything to stderr
  # (the pre-existing contract) or exits unsuccessfully.
  def run_tika(option)
    argv = @tika_cmd + [option, @document.to_s]
    final_cmd = argv.join(' ') # only used for error messages
    stdout_result = nil
    stderr_result = nil

    # The block form closes all three pipes and waits for the child, so no
    # handles or zombie processes are left behind, even when reading raises.
    exit_status = Open4.popen4(*argv) do |_pid, stdin, stdout, stderr|
      stdin.close

      # Drain stderr concurrently: reading stdout to EOF first can deadlock
      # once the child blocks on a full stderr pipe.
      err_reader = Thread.new { stderr.read }
      stdout_result = stdout.read.strip
      stderr_result = err_reader.value.strip
    end

    unless stderr_result.empty?
      raise CommandFailedError.new(stderr_result),
            "execution failed with status #{stderr_result}: #{final_cmd}"
    end

    unless exit_status.success?
      raise CommandFailedError.new(exit_status.exitstatus),
            "execution failed with status #{exit_status.exitstatus}: #{final_cmd}"
    end

    stdout_result
  end

end
@@ -0,0 +1,27 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+
4
# Gem specification for ruby_tika_app, a wrapper around the tika-app jar.
Gem::Specification.new do |s|
  s.name        = "ruby_tika_app"
  s.version     = "0.2"
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Chris Parker"]
  s.email       = ["mrcsparker@gmail.com"]
  s.homepage    = "https://github.com/mrcsparker/ruby_tika_app"
  s.summary     = %q{Wrapper around the tika-app jar}
  s.description = %q{Wrapper around the tika-app jar}
  # The bundled LICENSE file carries the MIT license text.
  s.licenses    = ["MIT"]

  s.rubyforge_project = "ruby_tika_app"

  # Tracked files plus the top-level documents; uniq because those documents
  # may already be listed by git.
  s.files         = (`git ls-files`.split("\n") +
                     %w(LICENSE README.textile HISTORY)).uniq
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]
  # Single assignment: previously a git-based list was set first and then
  # immediately overwritten by this glob.
  s.test_files    = Dir.glob('spec/**/*')

  s.add_runtime_dependency("open4")

  s.add_development_dependency("rspec", "~> 2.7.0")
  s.add_development_dependency("bundler", ">= 1.0.15")
end
@@ -0,0 +1,86 @@
1
+ require 'spec_helper'
2
+
3
# Checks fixed slices of each output format produced for the sample PDF
# under spec/docs.
describe RubyTikaApp do

  # Path of the PDF fixture, resolved relative to this spec file.
  let(:sample_pdf) { "#{File.join(File.dirname(__FILE__))}/docs/graph_sampling_simplex11.pdf" }

  # A new wrapper instance for every example.
  let(:rta) { RubyTikaApp.new(sample_pdf) }

  describe "#to_xml" do
    it "header" do
      rta.to_xml[0, 38].should == "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
    end

    it "middle" do
      xml = rta.to_xml
      midpoint = xml.size / 2

      # 101 characters starting at the midpoint of the document.
      xml[midpoint, 101].should == "HRW considers all the duplicated nodes as valid nodes.\nThese duplicated nodes make the node distribut"
    end
  end

  describe "#to_html" do
    it "header" do
      rta.to_html[0, 43].should == "<html xmlns=\"http://www.w3.org/1999/xhtml\">"
    end

    it "middle" do
      rta.to_html[1000, 100].should == "ersity of Goettingen, Germany\n3 Department of Computer Science, U.C. Santa Barbara, USA\n4 Deutsche T"
    end
  end

  describe "#to_json" do
    it "header" do
      rta.to_json[0, 43].should == "{ \"Application\":\"\\u0027Certified by IEEE PD"
    end

    it "middle" do
      rta.to_json[100, 50].should == "h\":171510, \n\"Content-Type\":\"application/pdf\", \n\"Cr"
    end
  end

  describe "#to_text" do
    it "header" do
      rta.to_text[0, 43].should == "Understanding Graph Sampling Algorithms\nfor"
    end

    it "middle" do
      rta.to_text[100, 50].should == "n Zhang3, Tianyin Xu2\nLong Jin1, Pan Hui4, Beixing"
    end
  end

  describe "#to_text_main" do
    it "header" do
      rta.to_text_main[0, 43].should == "Understanding Graph Sampling Algorithms for"
    end

    it "middle" do
      rta.to_text_main[100, 50].should == "n Zhang3, Tianyin Xu2 Long Jin1, Pan Hui4, Beixing"
    end
  end

  describe "#to_metadata" do
    it "header" do
      rta.to_metadata[0, 43].should == "Application: 'Certified by IEEE PDFeXpress "
    end

    it "middle" do
      rta.to_metadata[100, 50].should == "Type: application/pdf\nCreation-Date: 2011-03-29T12"
    end
  end

end
@@ -0,0 +1,9 @@
1
+ require 'rubygems'
2
+ require 'bundler/setup'
3
+
4
+ require 'ruby_tika_app'
5
+ require 'rspec'
6
+
7
# Suite-wide RSpec configuration hook; no options are set here.
RSpec.configure { |_config| }
metadata ADDED
@@ -0,0 +1,124 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ruby_tika_app
3
+ version: !ruby/object:Gem::Version
4
+ hash: 15
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 2
9
+ version: "0.2"
10
+ platform: ruby
11
+ authors:
12
+ - Chris Parker
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2011-12-20 00:00:00 Z
18
+ dependencies:
19
+ - !ruby/object:Gem::Dependency
20
+ name: open4
21
+ prerelease: false
22
+ requirement: &id001 !ruby/object:Gem::Requirement
23
+ none: false
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ hash: 3
28
+ segments:
29
+ - 0
30
+ version: "0"
31
+ type: :runtime
32
+ version_requirements: *id001
33
+ - !ruby/object:Gem::Dependency
34
+ name: rspec
35
+ prerelease: false
36
+ requirement: &id002 !ruby/object:Gem::Requirement
37
+ none: false
38
+ requirements:
39
+ - - ~>
40
+ - !ruby/object:Gem::Version
41
+ hash: 19
42
+ segments:
43
+ - 2
44
+ - 7
45
+ - 0
46
+ version: 2.7.0
47
+ type: :development
48
+ version_requirements: *id002
49
+ - !ruby/object:Gem::Dependency
50
+ name: bundler
51
+ prerelease: false
52
+ requirement: &id003 !ruby/object:Gem::Requirement
53
+ none: false
54
+ requirements:
55
+ - - ">="
56
+ - !ruby/object:Gem::Version
57
+ hash: 9
58
+ segments:
59
+ - 1
60
+ - 0
61
+ - 15
62
+ version: 1.0.15
63
+ type: :development
64
+ version_requirements: *id003
65
+ description: Wrapper around the tika-app jar
66
+ email:
67
+ - mrcsparker@gmail.com
68
+ executables: []
69
+
70
+ extensions: []
71
+
72
+ extra_rdoc_files: []
73
+
74
+ files:
75
+ - .gitignore
76
+ - .rspec
77
+ - Gemfile
78
+ - HISTORY
79
+ - LICENSE
80
+ - README.textile
81
+ - Rakefile
82
+ - ext/tika-app-0.10.jar
83
+ - lib/ruby_tika_app.rb
84
+ - ruby_tika_app.gemspec
85
+ - spec/docs/graph_sampling_simplex11.pdf
86
+ - spec/ruby_tika_app_spec.rb
87
+ - spec/spec_helper.rb
88
+ homepage: https://github.com/mrcsparker/ruby_tika_app
89
+ licenses: []
90
+
91
+ post_install_message:
92
+ rdoc_options: []
93
+
94
+ require_paths:
95
+ - lib
96
+ required_ruby_version: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ">="
100
+ - !ruby/object:Gem::Version
101
+ hash: 3
102
+ segments:
103
+ - 0
104
+ version: "0"
105
+ required_rubygems_version: !ruby/object:Gem::Requirement
106
+ none: false
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ hash: 3
111
+ segments:
112
+ - 0
113
+ version: "0"
114
+ requirements: []
115
+
116
+ rubyforge_project: ruby_tika_app
117
+ rubygems_version: 1.8.12
118
+ signing_key:
119
+ specification_version: 3
120
+ summary: Wrapper around the tika-app jar
121
+ test_files:
122
+ - spec/docs/graph_sampling_simplex11.pdf
123
+ - spec/ruby_tika_app_spec.rb
124
+ - spec/spec_helper.rb