rtika 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
@@ -0,0 +1,21 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ pkg
20
+
21
+ ## PROJECT::SPECIFIC
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Pradeep Elankumaran
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,33 @@
1
+ = rTika
2
+
3
+ A JRuby wrapper around the excellent Apache Tika content extraction library.
4
+ Feed rTika your files and get extracted text and metadata in return.
5
+
6
+ == Usage
7
+ Make sure you're on JRuby first.
8
+
9
+ require 'rubygems'
10
+ require 'rtika'
11
+
12
+ result = RTika::FileParser.parse("mywordfile.doc")
13
+ puts result.content
14
+ puts result.title
15
+ puts result.author
16
+
17
+ result = RTika::StringParser.parse("<html><body>this is my very ... long ... string</body></html>")
18
+ puts result.content
19
+ puts result.title
20
+
21
+ == Note on Patches/Pull Requests
22
+
23
+ * Fork the project.
24
+ * Make your feature addition or bug fix.
25
+ * Add tests for it. This is important so I don't break it in a
26
+ future version unintentionally.
27
+ * Commit, but do not touch the Rakefile, version, or history.
27
+ (If you want to have your own version, that is fine, but bump the version in a separate commit that I can ignore when I pull.)
29
+ * Send me a pull request. Bonus points for topic branches.
30
+
31
+ == Copyright
32
+
33
+ Copyright (c) 2010 Pradeep Elankumaran. See LICENSE for details.
@@ -0,0 +1,66 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
+ # Gem packaging and release tasks. Jeweler is optional: when it is missing
5
+ # only these tasks are skipped and the rest of the Rakefile still loads.
6
+ begin
7
+   require 'jeweler'
8
+   Jeweler::Tasks.new do |gem|
9
+     gem.name = "rtika"
10
+     gem.summary = %Q{A JRuby wrapper around the Apache Tika library}
11
+     gem.description = %Q{rTika is a JRuby wrapper around the Apache Tika content extraction library}
12
+     gem.email = "pradeepe@gmail.com"
13
+     gem.homepage = "http://github.com/skyfallsin/rtika"
14
+     gem.authors = ["Pradeep Elankumaran"]
15
+     # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
16
+   end
17
+   Jeweler::GemcutterTasks.new
18
+ rescue LoadError
19
+   puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
20
+ end
21
+
22
+ require 'rake/testtask'
23
+ Rake::TestTask.new(:test) do |test|
24
+   test.libs << 'lib' << 'test'
25
+   test.pattern = 'test/**/test_*.rb'
26
+   test.verbose = true
27
+ end
28
+
29
+ begin
30
+   require 'rcov/rcovtask'
31
+   Rcov::RcovTask.new do |test|
32
+     test.libs << 'test'
33
+     test.pattern = 'test/**/test_*.rb'
34
+     test.verbose = true
35
+   end
36
+ rescue LoadError
37
+   task :rcov do
38
+     abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
39
+   end
40
+ end
41
+
42
+ # check_dependencies is not defined in this Rakefile (Jeweler normally provides
43
+ # it); depend on it only when it exists so `rake test` still runs without Jeweler.
44
+ task :test => :check_dependencies if Rake::Task.task_defined?(:check_dependencies)
45
+
46
+ task :default => :test
47
+
48
+ # rake/rdoctask was removed from newer Rake releases in favour of rdoc/task;
49
+ # fall back to the old location on installations that predate rdoc/task.
50
+ begin
51
+   require 'rdoc/task'
52
+   rdoc_task = RDoc::Task
53
+ rescue LoadError
54
+   require 'rake/rdoctask'
55
+   rdoc_task = Rake::RDocTask
56
+ end
57
+
58
+ rdoc_task.new do |rdoc|
59
+   # Strip the trailing newline of the VERSION file so the title stays on one line.
60
+   version = File.exist?('VERSION') ? File.read('VERSION').strip : ""
61
+
62
+   rdoc.rdoc_dir = 'rdoc'
63
+   rdoc.title = "rtika #{version}"
64
+   rdoc.rdoc_files.include('README*')
65
+   rdoc.rdoc_files.include('lib/**/*.rb')
66
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
@@ -0,0 +1,146 @@
1
+ # rTika: a thin JRuby wrapper around the Apache Tika content extraction
2
+ # library. Loading this file requires every jar that sits next to it and
3
+ # defines the RTika parser classes.
4
+ raise "You need JRuby to use rTika" unless RUBY_PLATFORM =~ /java/
5
+ require 'java'
6
+
7
+ # Sorted so the jars load in the same order on every platform; Dir[] makes
8
+ # no ordering promise on older Rubies.
9
+ Dir[File.join(File.dirname(__FILE__), "*.jar")].sort.each do |jar|
10
+   require jar
11
+ end
12
+
13
+ module RTika
14
+   # java_import exposes each Java class as a constant inside this module
15
+   # (RTika::Metadata, ...); JRuby recommends it over the bare `import`.
16
+   java_import org.apache.tika.sax.BodyContentHandler
17
+   java_import org.apache.tika.parser.AutoDetectParser
18
+   java_import org.apache.tika.metadata.Metadata
19
+
20
+   # Result of a parse: the extracted text plus the metadata filled in by Tika.
21
+   class ParsedResult
22
+     # Only the writer is generated for content; the reader is defined below.
23
+     attr_writer :content
24
+     attr_accessor :metadata
25
+
26
+     # content::  object holding the extracted text (a BodyContentHandler when
27
+     #            built by the parsers in this file)
28
+     # metadata:: the Metadata instance filled in during parsing
29
+     def initialize(content, metadata)
30
+       @content, @metadata = content, metadata
31
+     end
32
+
33
+     # Returns the extracted text. to_s (rather than the Java-only to_string)
34
+     # also accepts a plain Ruby String assigned through content=.
35
+     def content
36
+       @content.to_s
37
+     end
38
+
39
+     # Returns the Metadata::TITLE entry of the metadata.
40
+     def title
41
+       @metadata.get(Metadata::TITLE)
42
+     end
43
+
44
+     # Returns the Metadata::AUTHOR entry of the metadata.
45
+     def author
46
+       @metadata.get(Metadata::AUTHOR)
47
+     end
48
+
49
+     # Returns the Metadata::CONTENT_TYPE entry of the metadata.
50
+     def content_type
51
+       @metadata.get(Metadata::CONTENT_TYPE)
52
+     end
53
+
54
+     # Returns the "filename" entry, which FileParser and UrlParser set.
55
+     def filename
56
+       @metadata.get("filename")
57
+     end
58
+   end
59
+
60
+   # Base class of the parsers. Subclasses implement #process and return a
61
+   # [content, metadata] pair.
62
+   class GenericParser
63
+     # Builds a parser from the given arguments and runs it in one call.
64
+     def self.parse(*args)
65
+       new(*args).parse
66
+     end
67
+
68
+     # Creates a fresh AutoDetectParser, runs #process and wraps the pair it
69
+     # returns in a ParsedResult.
70
+     def parse
71
+       @parser = RTika::AutoDetectParser.new
72
+       content, metadata = process
73
+       RTika::ParsedResult.new(content, metadata)
74
+     end
75
+
76
+     # Template method; raises a RuntimeError unless a subclass overrides it.
77
+     def process
78
+       raise "override this in your parser, return content and metadata"
79
+     end
80
+
81
+     protected
82
+
83
+     # Feeds input_stream to @parser and returns [content, metadata].
84
+     # The stream is closed in the ensure clause, so it is released even when
85
+     # the parser raises.
86
+     def run_parser(input_stream, metadata)
87
+       content = RTika::BodyContentHandler.new
88
+       @parser.parse(input_stream, content, metadata)
89
+       [content, metadata]
90
+     ensure
91
+       input_stream.close
92
+     end
93
+
94
+     # Wraps the raw bytes of a Ruby String in a Java input stream.
95
+     # to_java_bytes hands the bytes over unchanged, whereas
96
+     # to_java.get_bytes re-encodes through the JVM default charset and can
97
+     # corrupt binary or differently encoded input.
98
+     def byte_stream(string)
99
+       java.io.ByteArrayInputStream.new(string.to_java_bytes)
100
+     end
101
+   end
102
+
103
+   # Parses text that is already in memory.
104
+   class StringParser < GenericParser
105
+     def initialize(string)
106
+       @input_string = string
107
+     end
108
+
109
+     def process
110
+       run_parser(byte_stream(@input_string), RTika::Metadata.new)
111
+     end
112
+   end
113
+
114
+   # Parses a file on disk and records its base name as "filename" metadata.
115
+   class FileParser < GenericParser
116
+     def initialize(filename)
117
+       @filename = filename
118
+     end
119
+
120
+     def process
121
+       metadata = RTika::Metadata.new
122
+       metadata.set("filename", File.basename(@filename))
123
+       # Expand against Ruby's working directory first: the JVM resolves
124
+       # relative paths against its own user.dir, which (as far as JRuby
125
+       # documents) Dir.chdir does not change.
126
+       run_parser(java.io.FileInputStream.new(File.expand_path(@filename)), metadata)
127
+     end
128
+   end
129
+
130
+   # Parses content the caller has already obtained for a URL and records the
131
+   # base name of that URL as "filename" metadata. No network access happens here.
132
+   class UrlParser < GenericParser
133
+     # url::     where the content came from; only its base name is used
134
+     # content:: the body to parse, as a String
135
+     def initialize(url, content)
136
+       @url = url
137
+       @content = content
138
+     end
139
+
140
+     def process
141
+       metadata = RTika::Metadata.new
142
+       metadata.set("filename", File.basename(@url))
143
+       run_parser(byte_stream(@content), metadata)
144
+     end
145
+   end
146
+ end
Binary file
Binary file
Binary file
Binary file
@@ -0,0 +1,55 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{rtika}
8
+ s.version = "0.1.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Pradeep Elankumaran"]
12
+ s.date = %q{2010-10-28}
13
+ s.description = %q{rTika is a JRuby wrapper around the Apache Tika content extraction library}
14
+ s.email = %q{pradeepe@gmail.com}
15
+ s.extra_rdoc_files = [
16
+ "LICENSE",
17
+ "README.rdoc"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ ".gitignore",
22
+ "LICENSE",
23
+ "README.rdoc",
24
+ "Rakefile",
25
+ "VERSION",
26
+ "lib/rtika.rb",
27
+ "lib/tika-app-0.7.jar",
28
+ "lib/tika-bundle-0.7.jar",
29
+ "lib/tika-core-0.7.jar",
30
+ "lib/tika-parsers-0.7.jar",
31
+ "rtika.gemspec",
32
+ "test/helper.rb",
33
+ "test/test_rtika.rb"
34
+ ]
35
+ s.homepage = %q{http://github.com/skyfallsin/rtika}
36
+ s.rdoc_options = ["--charset=UTF-8"]
37
+ s.require_paths = ["lib"]
38
+ s.rubygems_version = %q{1.3.6}
39
+ s.summary = %q{A JRuby wrapper around the Apache Tika library}
40
+ s.test_files = [
41
+ "test/helper.rb",
42
+ "test/test_rtika.rb"
43
+ ]
44
+
45
+ if s.respond_to? :specification_version then
46
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
47
+ s.specification_version = 3
48
+
49
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
50
+ else
51
+ end
52
+ else
53
+ end
54
+ end
55
+
@@ -0,0 +1,10 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'shoulda'
4
+
5
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
6
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
7
+ require 'rtika'
8
+
9
+ class Test::Unit::TestCase # Reopened with an empty body; presumably a hook for shared test helpers.
10
+ end
@@ -0,0 +1,16 @@
1
+ require 'helper'
2
+
3
+ # Smoke tests for the public parser entry points (run under JRuby).
4
+ class TestRtika < Test::Unit::TestCase
5
+   context "RTika::StringParser" do
6
+     should "extract the body text of an HTML string" do
7
+       result = RTika::StringParser.parse("<html><body>hello tika</body></html>")
8
+       assert_match(/hello tika/, result.content)
9
+     end
10
+   end
11
+
12
+   context "RTika::GenericParser" do
13
+     should "refuse to parse until a subclass overrides process" do
14
+       assert_raise(RuntimeError) { RTika::GenericParser.parse }
15
+     end
16
+   end
17
+ end
metadata ADDED
@@ -0,0 +1,77 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rtika
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 1
8
+ - 0
9
+ version: 0.1.0
10
+ platform: ruby
11
+ authors:
12
+ - Pradeep Elankumaran
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-10-28 00:00:00 -07:00
18
+ default_executable:
19
+ dependencies: []
20
+
21
+ description: rTika is a JRuby wrapper around the Apache Tika content extraction library
22
+ email: pradeepe@gmail.com
23
+ executables: []
24
+
25
+ extensions: []
26
+
27
+ extra_rdoc_files:
28
+ - LICENSE
29
+ - README.rdoc
30
+ files:
31
+ - .document
32
+ - .gitignore
33
+ - LICENSE
34
+ - README.rdoc
35
+ - Rakefile
36
+ - VERSION
37
+ - lib/rtika.rb
38
+ - lib/tika-app-0.7.jar
39
+ - lib/tika-bundle-0.7.jar
40
+ - lib/tika-core-0.7.jar
41
+ - lib/tika-parsers-0.7.jar
42
+ - rtika.gemspec
43
+ - test/helper.rb
44
+ - test/test_rtika.rb
45
+ has_rdoc: true
46
+ homepage: http://github.com/skyfallsin/rtika
47
+ licenses: []
48
+
49
+ post_install_message:
50
+ rdoc_options:
51
+ - --charset=UTF-8
52
+ require_paths:
53
+ - lib
54
+ required_ruby_version: !ruby/object:Gem::Requirement
55
+ requirements:
56
+ - - ">="
57
+ - !ruby/object:Gem::Version
58
+ segments:
59
+ - 0
60
+ version: "0"
61
+ required_rubygems_version: !ruby/object:Gem::Requirement
62
+ requirements:
63
+ - - ">="
64
+ - !ruby/object:Gem::Version
65
+ segments:
66
+ - 0
67
+ version: "0"
68
+ requirements: []
69
+
70
+ rubyforge_project:
71
+ rubygems_version: 1.3.6
72
+ signing_key:
73
+ specification_version: 3
74
+ summary: A JRuby wrapper around the Apache Tika library
75
+ test_files:
76
+ - test/helper.rb
77
+ - test/test_rtika.rb