jrtika 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/.bundle/config ADDED
@@ -0,0 +1,3 @@
1
+ ---
2
+ BUNDLE_DISABLE_SHARED_GEMS: '1'
3
+ ...
data/Gemfile ADDED
@@ -0,0 +1,14 @@
1
+ source "http://rubygems.org"
2
+ # Add dependencies required to use your gem here.
3
+ # Example:
4
+ # gem "activesupport", ">= 2.3.5"
5
+
6
+ # Add dependencies to develop your gem here.
7
+ # Include everything needed to run rake, tests, features, etc.
8
+ group :development do
9
+ gem "shoulda", ">= 0"
10
+ gem "rdoc", "~> 3.12"
11
+ gem "bundler", "~> 1.1.3"
12
+ gem "jeweler", "~> 1.8.3"
13
+ gem "simplecov"
14
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,33 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ git (1.2.5)
5
+ jeweler (1.8.3)
6
+ bundler (~> 1.0)
7
+ git (>= 1.2.5)
8
+ rake
9
+ rdoc
10
+ json (1.7.0-java)
11
+ multi_json (1.3.4)
12
+ rake (0.9.2.2)
13
+ rdoc (3.12)
14
+ json (~> 1.4)
15
+ shoulda (3.0.1)
16
+ shoulda-context (~> 1.0.0)
17
+ shoulda-matchers (~> 1.0.0)
18
+ shoulda-context (1.0.0)
19
+ shoulda-matchers (1.0.0)
20
+ simplecov (0.6.2)
21
+ multi_json (~> 1.3)
22
+ simplecov-html (~> 0.5.3)
23
+ simplecov-html (0.5.3)
24
+
25
+ PLATFORMS
26
+ java
27
+
28
+ DEPENDENCIES
29
+ bundler (~> 1.1.3)
30
+ jeweler (~> 1.8.3)
31
+ rdoc (~> 3.12)
32
+ shoulda
33
+ simplecov
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2012 gauravgaur111
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,23 @@
1
+ = jrtika
2
+
3
+ JRUBY Tika connector to read text and metadata from files using Apache Tika 1.1.
4
+ Usage: Jrtika.read(full_file_path) where full_file_path is the full path of the file including file names.
5
+ Returns : Hash containing text and metadata of the document. {:text => "text of the document extracted by tika", :metadata => "metadata of the document determined by tika"}
6
+ Uses Tika AutoDetectParser to detect file type. Supported file types are at : http://tika.apache.org/1.1/formats.html
7
+ Uses Tika BodyContentHandler as the handler.
8
+
9
+ == Contributing to jrtika
10
+
11
+ * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet.
12
+ * Check out the issue tracker to make sure someone already hasn't requested it and/or contributed it.
13
+ * Fork the project.
14
+ * Start a feature/bugfix branch.
15
+ * Commit and push until you are happy with your contribution.
16
+ * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
17
+ * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or is otherwise necessary, that is fine, but please isolate to its own commit so I can cherry-pick around it.
18
+
19
+ == Copyright
20
+
21
+ Copyright (c) 2012 gauravgaur111. See LICENSE.txt for
22
+ further details.
23
+
data/Rakefile ADDED
@@ -0,0 +1,53 @@
1
+ # encoding: utf-8
2
+
3
+ require 'rubygems'
4
+ require 'bundler'
5
+ begin
6
+ Bundler.setup(:default, :development)
7
+ rescue Bundler::BundlerError => e
8
+ $stderr.puts e.message
9
+ $stderr.puts "Run `bundle install` to install missing gems"
10
+ exit e.status_code
11
+ end
12
+ require 'rake'
13
+
14
+ require 'jeweler'
15
+ Jeweler::Tasks.new do |gem|
16
+ # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
17
+ gem.name = "jrtika"
18
+ gem.homepage = "http://github.com/gauravgaur111/jrtika"
19
+ gem.license = "MIT"
20
+ gem.summary = %Q{JRuby Tika connector to read text and metadata from files using Apache Tika}
21
+ gem.description = %Q{JRUBY Tika connector to read text and metadata from files using Apache Tika 1.1. Usage: Jrtika.read(full_file_path)}
22
+ gem.email = "gauravgaur111@gmail.com"
23
+ gem.authors = ["gauravgaur111"]
24
+ # dependencies defined in Gemfile
25
+ end
26
+ Jeweler::RubygemsDotOrgTasks.new
27
+
28
+ require 'rake/testtask'
29
+ Rake::TestTask.new(:test) do |test|
30
+ test.libs << 'lib' << 'test'
31
+ test.pattern = 'test/**/test_*.rb'
32
+ test.verbose = true
33
+ end
34
+
35
+ #require 'simplecov/simplecovtask'
36
+ #Rcov::RcovTask.new do |test|
37
+ # test.libs << 'test'
38
+ # test.pattern = 'test/**/test_*.rb'
39
+ # test.verbose = true
40
+ # test.rcov_opts << '--exclude "gems/*"'
41
+ #end
42
+
43
+ task :default => :test
44
+
45
+ require 'rdoc/task'
46
+ Rake::RDocTask.new do |rdoc|
47
+ version = File.exist?('VERSION') ? File.read('VERSION') : ""
48
+
49
+ rdoc.rdoc_dir = 'rdoc'
50
+ rdoc.title = "jrtika #{version}"
51
+ rdoc.rdoc_files.include('README*')
52
+ rdoc.rdoc_files.include('lib/**/*.rb')
53
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
Binary file
data/jrtika.gemspec ADDED
@@ -0,0 +1,63 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = "jrtika"
8
+ s.version = "0.1.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["gauravgaur111"]
12
+ s.date = "2012-04-30"
13
+ s.description = "JRUBY Tika connector to read text and metadata from files using Apache Tika 1.1. Usage: Jrtika.read(full_file_path)"
14
+ s.email = "gauravgaur111@gmail.com"
15
+ s.extra_rdoc_files = [
16
+ "LICENSE.txt",
17
+ "README.rdoc"
18
+ ]
19
+ s.files = [
20
+ ".bundle/config",
21
+ "Gemfile",
22
+ "Gemfile.lock",
23
+ "LICENSE.txt",
24
+ "README.rdoc",
25
+ "Rakefile",
26
+ "VERSION",
27
+ "jar/tika-app-1.1.jar",
28
+ "jrtika.gemspec",
29
+ "lib/jrtika.rb",
30
+ "test/helper.rb",
31
+ "test/test_jrtika.rb"
32
+ ]
33
+ s.homepage = "http://github.com/gauravgaur111/jrtika"
34
+ s.licenses = ["MIT"]
35
+ s.require_paths = ["lib"]
36
+ s.rubygems_version = "1.8.15"
37
+ s.summary = "JRuby Tika connector to read text and metadata from files using Apache Tika"
38
+
39
+ if s.respond_to? :specification_version then
40
+ s.specification_version = 3
41
+
42
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
43
+ s.add_development_dependency(%q<shoulda>, [">= 0"])
44
+ s.add_development_dependency(%q<rdoc>, ["~> 3.12"])
45
+ s.add_development_dependency(%q<bundler>, ["~> 1.1.3"])
46
+ s.add_development_dependency(%q<jeweler>, ["~> 1.8.3"])
47
+ s.add_development_dependency(%q<simplecov>, [">= 0"])
48
+ else
49
+ s.add_dependency(%q<shoulda>, [">= 0"])
50
+ s.add_dependency(%q<rdoc>, ["~> 3.12"])
51
+ s.add_dependency(%q<bundler>, ["~> 1.1.3"])
52
+ s.add_dependency(%q<jeweler>, ["~> 1.8.3"])
53
+ s.add_dependency(%q<simplecov>, [">= 0"])
54
+ end
55
+ else
56
+ s.add_dependency(%q<shoulda>, [">= 0"])
57
+ s.add_dependency(%q<rdoc>, ["~> 3.12"])
58
+ s.add_dependency(%q<bundler>, ["~> 1.1.3"])
59
+ s.add_dependency(%q<jeweler>, ["~> 1.8.3"])
60
+ s.add_dependency(%q<simplecov>, [">= 0"])
61
+ end
62
+ end
63
+
data/lib/jrtika.rb ADDED
@@ -0,0 +1,28 @@
1
+ require 'java'
2
+ $CLASSPATH << File.expand_path(File.join('..', 'jar', 'tika-app-1.1.jar'))
3
+
4
+ class Jrtika
5
+
6
+ # Read text and metadata from a file and return a hash containing both.
7
+ #
8
+ # Jrtika.read file_path
9
+
10
+ def self.read(file_path)
11
+ @RESOURCE_NAME_KEY = ""
12
+
13
+ is = java.io.FileInputStream.new(file_path)
14
+ parser = org.apache.tika.parser.AutoDetectParser.new()
15
+
16
+ handler = org.apache.tika.sax.BodyContentHandler.new(-1)
17
+ meta = org.apache.tika.metadata.Metadata.new()
18
+
19
+ meta.set(@RESOURCE_NAME_KEY,file_path)
20
+
21
+ parser.parse(is, handler, meta)
22
+ is.close()
23
+ {:text => handler.toString(), :metadata => meta.toString()}
24
+
25
+ end
26
+
27
+ end
28
+
data/test/helper.rb ADDED
@@ -0,0 +1,18 @@
1
+ require 'rubygems'
2
+ require 'bundler'
3
+ begin
4
+ Bundler.setup(:default, :development)
5
+ rescue Bundler::BundlerError => e
6
+ $stderr.puts e.message
7
+ $stderr.puts "Run `bundle install` to install missing gems"
8
+ exit e.status_code
9
+ end
10
+ require 'test/unit'
11
+ require 'shoulda'
12
+
13
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
14
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
15
+ require 'jrtika'
16
+
17
+ class Test::Unit::TestCase
18
+ end
@@ -0,0 +1,7 @@
1
+ require 'helper'
2
+
3
+ class TestJrtika < Test::Unit::TestCase
4
+ should "probably rename this file and start testing for real" do
5
+ flunk "hey buddy, you should probably rename this file and start testing for real"
6
+ end
7
+ end
metadata ADDED
@@ -0,0 +1,123 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: jrtika
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.1.0
6
+ platform: ruby
7
+ authors:
8
+ - gauravgaur111
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2012-04-30 00:00:00 Z
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: shoulda
17
+ version_requirements: &id001 !ruby/object:Gem::Requirement
18
+ none: false
19
+ requirements:
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: "0"
23
+ requirement: *id001
24
+ prerelease: false
25
+ type: :development
26
+ - !ruby/object:Gem::Dependency
27
+ name: rdoc
28
+ version_requirements: &id002 !ruby/object:Gem::Requirement
29
+ none: false
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: "3.12"
34
+ requirement: *id002
35
+ prerelease: false
36
+ type: :development
37
+ - !ruby/object:Gem::Dependency
38
+ name: bundler
39
+ version_requirements: &id003 !ruby/object:Gem::Requirement
40
+ none: false
41
+ requirements:
42
+ - - ~>
43
+ - !ruby/object:Gem::Version
44
+ version: 1.1.3
45
+ requirement: *id003
46
+ prerelease: false
47
+ type: :development
48
+ - !ruby/object:Gem::Dependency
49
+ name: jeweler
50
+ version_requirements: &id004 !ruby/object:Gem::Requirement
51
+ none: false
52
+ requirements:
53
+ - - ~>
54
+ - !ruby/object:Gem::Version
55
+ version: 1.8.3
56
+ requirement: *id004
57
+ prerelease: false
58
+ type: :development
59
+ - !ruby/object:Gem::Dependency
60
+ name: simplecov
61
+ version_requirements: &id005 !ruby/object:Gem::Requirement
62
+ none: false
63
+ requirements:
64
+ - - ">="
65
+ - !ruby/object:Gem::Version
66
+ version: "0"
67
+ requirement: *id005
68
+ prerelease: false
69
+ type: :development
70
+ description: "JRUBY Tika connector to read text and metadata from files using Apache Tika 1.1. Usage: Jrtika.read(full_file_path)"
71
+ email: gauravgaur111@gmail.com
72
+ executables: []
73
+
74
+ extensions: []
75
+
76
+ extra_rdoc_files:
77
+ - LICENSE.txt
78
+ - README.rdoc
79
+ files:
80
+ - .bundle/config
81
+ - Gemfile
82
+ - Gemfile.lock
83
+ - LICENSE.txt
84
+ - README.rdoc
85
+ - Rakefile
86
+ - VERSION
87
+ - jar/tika-app-1.1.jar
88
+ - jrtika.gemspec
89
+ - lib/jrtika.rb
90
+ - test/helper.rb
91
+ - test/test_jrtika.rb
92
+ homepage: http://github.com/gauravgaur111/jrtika
93
+ licenses:
94
+ - MIT
95
+ post_install_message:
96
+ rdoc_options: []
97
+
98
+ require_paths:
99
+ - lib
100
+ required_ruby_version: !ruby/object:Gem::Requirement
101
+ none: false
102
+ requirements:
103
+ - - ">="
104
+ - !ruby/object:Gem::Version
105
+ hash: 2
106
+ segments:
107
+ - 0
108
+ version: "0"
109
+ required_rubygems_version: !ruby/object:Gem::Requirement
110
+ none: false
111
+ requirements:
112
+ - - ">="
113
+ - !ruby/object:Gem::Version
114
+ version: "0"
115
+ requirements: []
116
+
117
+ rubyforge_project:
118
+ rubygems_version: 1.8.15
119
+ signing_key:
120
+ specification_version: 3
121
+ summary: JRuby Tika connector to read text and metadata from files using Apache Tika
122
+ test_files: []
123
+