rtika 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
@@ -0,0 +1,21 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ pkg
20
+
21
+ ## PROJECT::SPECIFIC
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Pradeep Elankumaran
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,33 @@
1
+ = rTika
2
+
3
+ A JRuby wrapper around the excellent Apache Tika content extraction library.
4
+ Feed rTika your files and get extracted text and metadata in return.
5
+
6
+ == Usage
7
+ Make sure you're on JRuby first.
8
+
9
+ require 'rubygems'
10
+ require 'rtika'
11
+
12
+ result = RTika::FileParser.parse("mywordfile.doc")
13
+ puts result.content
14
+ puts result.title
15
+ puts result.author
16
+
17
+ result = RTika::StringParser.parse("<html><body>this is my very ... long ... string</body></html>")
18
+ puts result.content
19
+ puts result.title
20
+
21
+ == Note on Patches/Pull Requests
22
+
23
+ * Fork the project.
24
+ * Make your feature addition or bug fix.
25
+ * Add tests for it. This is important so I don't break it in a
26
+ future version unintentionally.
27
+ * Commit, do not mess with rakefile, version, or history.
28
+ (If you want to have your own version, that is fine, but bump the version in a commit by itself so that I can ignore it when I pull.)
29
+ * Send me a pull request. Bonus points for topic branches.
30
+
31
+ == Copyright
32
+
33
+ Copyright (c) 2010 Pradeep Elankumaran. See LICENSE for details.
@@ -0,0 +1,55 @@
1
# Rake tasks for packaging, testing and documenting the rtika gem.
require 'rubygems'
require 'rake'

# Packaging and release tasks come from Jeweler, which is optional. Every
# reference to the Jeweler constant stays inside this guard so that a missing
# Jeweler only prints a hint instead of aborting the Rakefile with NameError.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "rtika"
    gem.summary = %Q{A JRuby wrapper around the Apache Tika library}
    gem.description = %Q{rTika is a JRuby wrapper around the Apache Tika content extraction library}
    gem.email = "pradeepe@gmail.com"
    gem.homepage = "http://github.com/skyfallsin/rtika"
    gem.authors = ["Pradeep Elankumaran"]
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# `rake test` runs every test/**/test_*.rb file with lib/ and test/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# Coverage is optional as well: without rcov, `rake rcov` aborts with
# installation instructions.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/test_*.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# :check_dependencies is not defined anywhere in this Rakefile (presumably it
# is registered by Jeweler::Tasks -- confirm). Only depend on it when it
# exists so that `rake test` still works without it.
if Rake::Task.task_defined?(:check_dependencies)
  task :test => :check_dependencies
end

task :default => :test

# Prefer the task class shipped with RDoc and fall back to the legacy
# rake/rdoctask location for older Rake installations.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end

rdoc_task_class.new do |rdoc|
  # Strip the trailing newline so it does not end up in the title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "rtika #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
55
+
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
@@ -0,0 +1,110 @@
1
# This library relies on JRuby's Java integration, so refuse to load on any
# other Ruby implementation.
raise "You need JRuby to use rTika" unless RUBY_PLATFORM =~ /java/
require 'java'

# Require every jar file that sits next to this file. The list is sorted so
# the load order is the same on every platform.
Dir[File.join(File.dirname(__FILE__), "*.jar")].sort.each do |jar|
  require jar
end
8
+
9
+ module RTika
10
+ import org.apache.tika.sax.BodyContentHandler
11
+ import org.apache.tika.parser.AutoDetectParser
12
+ import org.apache.tika.metadata.Metadata
13
+
14
# Value object returned by the parsers: pairs the content handler with the
# metadata object that were passed to the parser.
class ParsedResult
  # Only a writer is generated for content; the reader is defined explicitly
  # below because it converts the handler to a string.
  attr_writer :content
  attr_accessor :metadata

  # content  - object responding to #to_string (the content handler).
  # metadata - object responding to #get(key) (the metadata store).
  def initialize(content, metadata)
    @content = content
    @metadata = metadata
  end

  # Returns the extracted text, i.e. the handler's #to_string value.
  def content
    @content.to_string
  end

  # Returns the metadata value stored under Metadata::TITLE.
  def title
    @metadata.get(Metadata::TITLE)
  end

  # Returns the metadata value stored under Metadata::AUTHOR.
  def author
    @metadata.get(Metadata::AUTHOR)
  end

  # Returns the metadata value stored under Metadata::CONTENT_TYPE.
  def content_type
    @metadata.get(Metadata::CONTENT_TYPE)
  end

  # Returns the metadata value stored under the "filename" key.
  def filename
    @metadata.get("filename")
  end
end
40
+
41
# Base class for the concrete parsers. Subclasses store their input in
# #initialize and implement #process.
class GenericParser
  # Convenience entry point: builds a parser from the given arguments and
  # runs it. Returns an RTika::ParsedResult.
  def self.parse(*args)
    new(*args).parse
  end

  # Creates a fresh RTika::AutoDetectParser (available to #process as
  # @parser), runs #process and wraps its two return values in an
  # RTika::ParsedResult.
  def parse
    @parser = RTika::AutoDetectParser.new
    content, metadata = process
    RTika::ParsedResult.new(content, metadata)
  end

  # Hook for subclasses: must return [content, metadata].
  # Raises RuntimeError when not overridden.
  def process
    raise "override this in your parser, return content and metadata"
  end
end
56
+
57
# Parses content that is already held in memory as a string.
class StringParser < GenericParser
  # string - the text to parse.
  def initialize(string)
    @input_string = string
  end

  # Feeds the string's bytes to @parser through an in-memory stream.
  # Returns [content_handler, metadata].
  def process
    # NOTE(review): get_bytes is called without a charset argument, so the
    # byte encoding is presumably Java's platform default -- confirm.
    input_stream = java.io.ByteArrayInputStream.new(@input_string.to_java.get_bytes)
    content = RTika::BodyContentHandler.new
    metadata = RTika::Metadata.new

    begin
      @parser.parse(input_stream, content, metadata)
    ensure
      # Close the stream even when parsing raises.
      input_stream.close
    end

    [content, metadata]
  end
end
73
+
74
# Parses a file on disk.
class FileParser < GenericParser
  # filename - path of the file to parse.
  def initialize(filename)
    @filename = filename
  end

  # Opens the file, hands the stream to @parser and records the file's base
  # name under the "filename" metadata key.
  # Returns [content_handler, metadata].
  def process
    input_stream = java.io.FileInputStream.new(java.io.File.new(@filename))
    content = RTika::BodyContentHandler.new
    metadata = RTika::Metadata.new
    metadata.set("filename", File.basename(@filename))

    begin
      @parser.parse(input_stream, content, metadata)
    ensure
      # Release the file handle even when parsing raises.
      input_stream.close
    end

    [content, metadata]
  end
end
91
+
92
# Parses content that the caller has already obtained for a URL. This class
# performs no network access itself: the URL is only used to derive the
# "filename" metadata entry.
class UrlParser < GenericParser
  # url     - the URL the content belongs to.
  # content - the content to parse, as a string.
  def initialize(url, content)
    @url = url
    @content = content
  end

  # Feeds the content's bytes to @parser through an in-memory stream and
  # records the URL's base name under the "filename" metadata key.
  # Returns [content_handler, metadata].
  def process
    # NOTE(review): get_bytes is called without a charset argument, so the
    # byte encoding is presumably Java's platform default -- confirm.
    input_stream = java.io.ByteArrayInputStream.new(@content.to_java.get_bytes)
    content = RTika::BodyContentHandler.new
    metadata = RTika::Metadata.new
    metadata.set("filename", File.basename(@url))

    begin
      @parser.parse(input_stream, content, metadata)
    ensure
      # Close the stream even when parsing raises.
      input_stream.close
    end

    [content, metadata]
  end
end
110
+ end
Binary file
Binary file
Binary file
Binary file
@@ -0,0 +1,55 @@
1
# Generated by jeweler
# DO NOT EDIT THIS FILE DIRECTLY
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
# -*- encoding: utf-8 -*-
#
# NOTE(review): the generated version of this file ended with an empty
# if/else on Gem::RubyGemsVersion. It declared nothing, and the constant is
# not defined by current RubyGems, so the dead branch was dropped here.
# Regenerate from the Rakefile rather than maintaining this file by hand.

Gem::Specification.new do |s|
  s.name = %q{rtika}
  s.version = "0.1.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Pradeep Elankumaran"]
  s.date = %q{2010-10-28}
  s.description = %q{rTika is a JRuby wrapper around the Apache Tika content extraction library}
  s.email = %q{pradeepe@gmail.com}
  s.extra_rdoc_files = [
    "LICENSE",
    "README.rdoc"
  ]
  s.files = [
    ".document",
    ".gitignore",
    "LICENSE",
    "README.rdoc",
    "Rakefile",
    "VERSION",
    "lib/rtika.rb",
    "lib/tika-app-0.7.jar",
    "lib/tika-bundle-0.7.jar",
    "lib/tika-core-0.7.jar",
    "lib/tika-parsers-0.7.jar",
    "rtika.gemspec",
    "test/helper.rb",
    "test/test_rtika.rb"
  ]
  s.homepage = %q{http://github.com/skyfallsin/rtika}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.6}
  s.summary = %q{A JRuby wrapper around the Apache Tika library}
  s.test_files = [
    "test/helper.rb",
    "test/test_rtika.rb"
  ]

  # No runtime or development dependencies are declared, so only the
  # specification version needs to be set.
  s.specification_version = 3 if s.respond_to? :specification_version
end
55
+
@@ -0,0 +1,10 @@
1
# Shared setup required by every test file.
require 'rubygems'
require 'test/unit'
require 'shoulda'

# Make ../lib and the test directory itself requirable before loading the
# library under test.
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
$LOAD_PATH.unshift(File.dirname(__FILE__))
require 'rtika'

# Reopened as a place for helpers shared by all test cases; currently empty.
class Test::Unit::TestCase
end
@@ -0,0 +1,7 @@
1
require 'helper'

# Smoke tests for the public parser entry points. They replace the generated
# placeholder test, which failed unconditionally.
class TestRtika < Test::Unit::TestCase
  should "extract body text from an HTML string" do
    html = "<html><head><title>greeting</title></head><body>hello tika</body></html>"
    result = RTika::StringParser.parse(html)

    assert_match(/hello tika/, result.content)
  end

  should "require GenericParser subclasses to implement process" do
    assert_raise(RuntimeError) { RTika::GenericParser.parse }
  end
end
metadata ADDED
@@ -0,0 +1,77 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rtika
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 1
8
+ - 0
9
+ version: 0.1.0
10
+ platform: ruby
11
+ authors:
12
+ - Pradeep Elankumaran
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-10-28 00:00:00 -07:00
18
+ default_executable:
19
+ dependencies: []
20
+
21
+ description: rTika is a JRuby wrapper around the Apache Tika content extraction library
22
+ email: pradeepe@gmail.com
23
+ executables: []
24
+
25
+ extensions: []
26
+
27
+ extra_rdoc_files:
28
+ - LICENSE
29
+ - README.rdoc
30
+ files:
31
+ - .document
32
+ - .gitignore
33
+ - LICENSE
34
+ - README.rdoc
35
+ - Rakefile
36
+ - VERSION
37
+ - lib/rtika.rb
38
+ - lib/tika-app-0.7.jar
39
+ - lib/tika-bundle-0.7.jar
40
+ - lib/tika-core-0.7.jar
41
+ - lib/tika-parsers-0.7.jar
42
+ - rtika.gemspec
43
+ - test/helper.rb
44
+ - test/test_rtika.rb
45
+ has_rdoc: true
46
+ homepage: http://github.com/skyfallsin/rtika
47
+ licenses: []
48
+
49
+ post_install_message:
50
+ rdoc_options:
51
+ - --charset=UTF-8
52
+ require_paths:
53
+ - lib
54
+ required_ruby_version: !ruby/object:Gem::Requirement
55
+ requirements:
56
+ - - ">="
57
+ - !ruby/object:Gem::Version
58
+ segments:
59
+ - 0
60
+ version: "0"
61
+ required_rubygems_version: !ruby/object:Gem::Requirement
62
+ requirements:
63
+ - - ">="
64
+ - !ruby/object:Gem::Version
65
+ segments:
66
+ - 0
67
+ version: "0"
68
+ requirements: []
69
+
70
+ rubyforge_project:
71
+ rubygems_version: 1.3.6
72
+ signing_key:
73
+ specification_version: 3
74
+ summary: A JRuby wrapper around the Apache Tika library
75
+ test_files:
76
+ - test/helper.rb
77
+ - test/test_rtika.rb