rtika 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.gitignore +21 -0
- data/LICENSE +20 -0
- data/README.rdoc +33 -0
- data/Rakefile +55 -0
- data/VERSION +1 -0
- data/lib/rtika.rb +110 -0
- data/lib/tika-app-0.7.jar +0 -0
- data/lib/tika-bundle-0.7.jar +0 -0
- data/lib/tika-core-0.7.jar +0 -0
- data/lib/tika-parsers-0.7.jar +0 -0
- data/rtika.gemspec +55 -0
- data/test/helper.rb +10 -0
- data/test/test_rtika.rb +7 -0
- metadata +77 -0
data/.document
ADDED
data/.gitignore
ADDED
data/LICENSE
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
Copyright (c) 2009 Pradeep Elankumaran
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
|
4
|
+
a copy of this software and associated documentation files (the
|
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
|
9
|
+
the following conditions:
|
|
10
|
+
|
|
11
|
+
The above copyright notice and this permission notice shall be
|
|
12
|
+
included in all copies or substantial portions of the Software.
|
|
13
|
+
|
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
= rTika
|
|
2
|
+
|
|
3
|
+
A JRuby wrapper around the excellent Apache Tika content extraction library.
|
|
4
|
+
Feed rTika your files and get extracted text and metadata in return.
|
|
5
|
+
|
|
6
|
+
== Usage
|
|
7
|
+
Make sure you're on JRuby first.
|
|
8
|
+
|
|
9
|
+
require 'rubygems'
|
|
10
|
+
require 'rtika'
|
|
11
|
+
|
|
12
|
+
result = RTika::FileParser.parse("mywordfile.doc")
|
|
13
|
+
puts result.content
|
|
14
|
+
puts result.title
|
|
15
|
+
puts result.author
|
|
16
|
+
|
|
17
|
+
result = RTika::StringParser.parse("<html><body>this is my very ... long ... string</body></html>")
|
|
18
|
+
puts result.content
|
|
19
|
+
puts result.title
|
|
20
|
+
|
|
21
|
+
== Note on Patches/Pull Requests
|
|
22
|
+
|
|
23
|
+
* Fork the project.
|
|
24
|
+
* Make your feature addition or bug fix.
|
|
25
|
+
* Add tests for it. This is important so I don't break it in a
|
|
26
|
+
future version unintentionally.
|
|
27
|
+
* Commit, do not mess with rakefile, version, or history.
|
|
28
|
+
(If you want to have your own version, that is fine, but bump the version in a separate commit that I can ignore when I pull.)
|
|
29
|
+
* Send me a pull request. Bonus points for topic branches.
|
|
30
|
+
|
|
31
|
+
== Copyright
|
|
32
|
+
|
|
33
|
+
Copyright (c) 2010 Pradeep Elankumaran. See LICENSE for details.
|
data/Rakefile
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# Rake tasks for packaging, testing and documenting the rtika gem.
require 'rubygems'
require 'rake'

# Packaging and release tasks are provided by Jeweler. Everything that
# touches the Jeweler constant lives inside this guard so that a missing
# Jeweler only prints a hint instead of aborting the whole Rakefile.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "rtika"
    gem.summary = %Q{A JRuby wrapper around the Apache Tika library}
    gem.description = %Q{rTika is a JRuby wrapper around the Apache Tika content extraction library}
    gem.email = "pradeepe@gmail.com"
    gem.homepage = "http://github.com/skyfallsin/rtika"
    gem.authors = ["Pradeep Elankumaran"]
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  # Registered exactly once, and only when Jeweler loaded successfully.
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# `rake test` runs every test/**/test_*.rb with lib/ and test/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# `rake rcov` runs the same tests under RCov when it is installed; otherwise
# the task is still defined but aborts with installation instructions.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/test_*.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# NOTE(review): :check_dependencies is not defined in this file; presumably it
# comes from Jeweler::Tasks above -- confirm before relying on `rake test`
# when Jeweler is not installed.
task :test => :check_dependencies

task :default => :test

# `rake rdoc` builds API docs from the README and lib/, titled with the
# contents of the VERSION file when that file exists.
require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  version = File.exist?('VERSION') ? File.read('VERSION') : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "rtika #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
|
55
|
+
|
data/VERSION
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.1.0
|
data/lib/rtika.rb
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
# Refuse to load anywhere but JRuby: the rest of this file requires 'java'
# and imports Java classes.
raise "You need JRuby to use rTika" unless RUBY_PLATFORM =~ /java/
require 'java'

# Require every .jar that sits in the same directory as this file.
Dir[File.join(File.dirname(__FILE__), "*.jar")].each do |jar|
  require jar
end
|
|
8
|
+
|
|
9
|
+
module RTika
|
|
10
|
+
import org.apache.tika.sax.BodyContentHandler
|
|
11
|
+
import org.apache.tika.parser.AutoDetectParser
|
|
12
|
+
import org.apache.tika.metadata.Metadata
|
|
13
|
+
|
|
14
|
+
# Result of a parse: holds the content handler and the metadata object that
# were filled in during parsing and exposes convenience readers over them.
class ParsedResult
  # The content reader is written by hand below, so only the writer is
  # generated here; metadata gets both reader and writer.
  attr_writer :content
  attr_accessor :metadata

  # content  - object that responds to +to_string+
  # metadata - object that responds to +get(key)+
  def initialize(content, metadata)
    @content = content
    @metadata = metadata
  end

  # Returns whatever +to_string+ yields on the stored content object.
  def content
    @content.to_string
  end

  # Metadata value stored under Metadata::TITLE.
  def title
    lookup(Metadata::TITLE)
  end

  # Metadata value stored under Metadata::AUTHOR.
  def author
    lookup(Metadata::AUTHOR)
  end

  # Metadata value stored under Metadata::CONTENT_TYPE.
  def content_type
    lookup(Metadata::CONTENT_TYPE)
  end

  # Metadata value stored under the literal key "filename".
  def filename
    lookup("filename")
  end

  private

  # Fetches a single entry from the metadata object.
  def lookup(key)
    @metadata.get(key)
  end
end
|
|
40
|
+
|
|
41
|
+
# Base class shared by the concrete parsers. Subclasses supply the input
# through their constructor and implement #process.
class GenericParser
  class << self
    # Builds a parser from +args+ and immediately runs it, returning the
    # value of #parse.
    def parse(*args)
      instance = new(*args)
      instance.parse
    end
  end

  # Creates a fresh RTika::AutoDetectParser in @parser, runs #process and
  # wraps the two values it returns in an RTika::ParsedResult.
  def parse
    @parser = RTika::AutoDetectParser.new
    extracted, meta = process
    RTika::ParsedResult.new(extracted, meta)
  end

  # Hook for subclasses: must return [content, metadata].
  # The base implementation always raises.
  def process
    raise "override this in your parser, return content and metadata"
  end
end
|
|
56
|
+
|
|
57
|
+
# Parses content that is already held in memory as a string.
class StringParser < GenericParser
  # string - the text/markup to hand to the parser
  def initialize(string)
    @input_string = string
  end

  # Feeds the string's bytes to @parser (set by GenericParser#parse before
  # this method runs) and returns [content_handler, metadata].
  def process
    input_stream = java.io.ByteArrayInputStream.new(@input_string.to_java.get_bytes)
    begin
      content = RTika::BodyContentHandler.new
      metadata = RTika::Metadata.new

      @parser.parse(input_stream, content, metadata)

      [content, metadata]
    ensure
      # Close the stream even when parsing raises.
      input_stream.close
    end
  end
end
|
|
73
|
+
|
|
74
|
+
# Parses a file on disk, identified by its path.
class FileParser < GenericParser
  # filename - path of the file to parse
  def initialize(filename)
    @filename = filename
  end

  # Opens the file, records its basename under the "filename" metadata key,
  # runs @parser (set by GenericParser#parse before this method runs) over
  # the stream and returns [content_handler, metadata].
  def process
    input_stream = java.io.FileInputStream.new(java.io.File.new(@filename))
    begin
      content = RTika::BodyContentHandler.new
      metadata = RTika::Metadata.new
      metadata.set("filename", File.basename(@filename))

      @parser.parse(input_stream, content, metadata)

      [content, metadata]
    ensure
      # Without this, a parse error would leave the file handle open.
      input_stream.close
    end
  end
end
|
|
91
|
+
|
|
92
|
+
# Parses content that the caller has already fetched from a URL. The URL
# itself is only used to derive the "filename" metadata entry.
class UrlParser < GenericParser
  # url     - the address the content came from
  # content - the already-downloaded body, as a string
  def initialize(url, content)
    @url = url
    @content = content
  end

  # Feeds the content's bytes to @parser (set by GenericParser#parse before
  # this method runs), records File.basename(url) under the "filename"
  # metadata key and returns [content_handler, metadata].
  def process
    input_stream = java.io.ByteArrayInputStream.new(@content.to_java.get_bytes)
    begin
      content = RTika::BodyContentHandler.new
      metadata = RTika::Metadata.new
      metadata.set("filename", File.basename(@url))

      @parser.parse(input_stream, content, metadata)

      [content, metadata]
    ensure
      # Close the stream even when parsing raises.
      input_stream.close
    end
  end
end
|
|
110
|
+
end
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
data/rtika.gemspec
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# Generated by jeweler
|
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
|
|
4
|
+
# -*- encoding: utf-8 -*-
|
|
5
|
+
|
|
6
|
+
# Gem specification for rtika 0.1.0: identity, packaged files and docs options.
Gem::Specification.new do |s|
  s.name = %q{rtika}
  s.version = "0.1.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Pradeep Elankumaran"]
  s.date = %q{2010-10-28}
  s.description = %q{rTika is a JRuby wrapper around the Apache Tika content extraction library}
  s.email = %q{pradeepe@gmail.com}
  s.extra_rdoc_files = [
    "LICENSE",
    "README.rdoc"
  ]
  # Everything shipped in the gem, including the bundled Tika 0.7 jars.
  s.files = [
    ".document",
    ".gitignore",
    "LICENSE",
    "README.rdoc",
    "Rakefile",
    "VERSION",
    "lib/rtika.rb",
    "lib/tika-app-0.7.jar",
    "lib/tika-bundle-0.7.jar",
    "lib/tika-core-0.7.jar",
    "lib/tika-parsers-0.7.jar",
    "rtika.gemspec",
    "test/helper.rb",
    "test/test_rtika.rb"
  ]
  s.homepage = %q{http://github.com/skyfallsin/rtika}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.6}
  s.summary = %q{A JRuby wrapper around the Apache Tika library}
  s.test_files = [
    "test/helper.rb",
    "test/test_rtika.rb"
  ]

  # The gem declares no dependencies, so the version-dependent branches that
  # previously followed were empty. They were dropped together with their
  # reference to the legacy Gem::RubyGemsVersion constant.
  # NOTE(review): that constant appears to be absent from newer RubyGems -- confirm.
  s.specification_version = 3 if s.respond_to? :specification_version
end
|
|
55
|
+
|
data/test/helper.rb
ADDED
data/test/test_rtika.rb
ADDED
metadata
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: rtika
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
prerelease: false
|
|
5
|
+
segments:
|
|
6
|
+
- 0
|
|
7
|
+
- 1
|
|
8
|
+
- 0
|
|
9
|
+
version: 0.1.0
|
|
10
|
+
platform: ruby
|
|
11
|
+
authors:
|
|
12
|
+
- Pradeep Elankumaran
|
|
13
|
+
autorequire:
|
|
14
|
+
bindir: bin
|
|
15
|
+
cert_chain: []
|
|
16
|
+
|
|
17
|
+
date: 2010-10-28 00:00:00 -07:00
|
|
18
|
+
default_executable:
|
|
19
|
+
dependencies: []
|
|
20
|
+
|
|
21
|
+
description: rTika is a JRuby wrapper around the Apache Tika content extraction library
|
|
22
|
+
email: pradeepe@gmail.com
|
|
23
|
+
executables: []
|
|
24
|
+
|
|
25
|
+
extensions: []
|
|
26
|
+
|
|
27
|
+
extra_rdoc_files:
|
|
28
|
+
- LICENSE
|
|
29
|
+
- README.rdoc
|
|
30
|
+
files:
|
|
31
|
+
- .document
|
|
32
|
+
- .gitignore
|
|
33
|
+
- LICENSE
|
|
34
|
+
- README.rdoc
|
|
35
|
+
- Rakefile
|
|
36
|
+
- VERSION
|
|
37
|
+
- lib/rtika.rb
|
|
38
|
+
- lib/tika-app-0.7.jar
|
|
39
|
+
- lib/tika-bundle-0.7.jar
|
|
40
|
+
- lib/tika-core-0.7.jar
|
|
41
|
+
- lib/tika-parsers-0.7.jar
|
|
42
|
+
- rtika.gemspec
|
|
43
|
+
- test/helper.rb
|
|
44
|
+
- test/test_rtika.rb
|
|
45
|
+
has_rdoc: true
|
|
46
|
+
homepage: http://github.com/skyfallsin/rtika
|
|
47
|
+
licenses: []
|
|
48
|
+
|
|
49
|
+
post_install_message:
|
|
50
|
+
rdoc_options:
|
|
51
|
+
- --charset=UTF-8
|
|
52
|
+
require_paths:
|
|
53
|
+
- lib
|
|
54
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
55
|
+
requirements:
|
|
56
|
+
- - ">="
|
|
57
|
+
- !ruby/object:Gem::Version
|
|
58
|
+
segments:
|
|
59
|
+
- 0
|
|
60
|
+
version: "0"
|
|
61
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
62
|
+
requirements:
|
|
63
|
+
- - ">="
|
|
64
|
+
- !ruby/object:Gem::Version
|
|
65
|
+
segments:
|
|
66
|
+
- 0
|
|
67
|
+
version: "0"
|
|
68
|
+
requirements: []
|
|
69
|
+
|
|
70
|
+
rubyforge_project:
|
|
71
|
+
rubygems_version: 1.3.6
|
|
72
|
+
signing_key:
|
|
73
|
+
specification_version: 3
|
|
74
|
+
summary: A JRuby wrapper around the Apache Tika library
|
|
75
|
+
test_files:
|
|
76
|
+
- test/helper.rb
|
|
77
|
+
- test/test_rtika.rb
|