rika 0.9.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +19 -0
- data/.rspec +2 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +43 -0
- data/Rakefile +1 -0
- data/lib/apache-mime4j-core-0.7.2.jar +0 -0
- data/lib/apache-mime4j-dom-0.7.2.jar +0 -0
- data/lib/asm-3.1.jar +0 -0
- data/lib/aspectjrt-1.6.11.jar +0 -0
- data/lib/bcmail-jdk15-1.45.jar +0 -0
- data/lib/bcprov-jdk15-1.45.jar +0 -0
- data/lib/boilerpipe-1.1.0.jar +0 -0
- data/lib/commons-codec-1.5.jar +0 -0
- data/lib/commons-compress-1.4.1.jar +0 -0
- data/lib/commons-logging-1.1.1.jar +0 -0
- data/lib/dom4j-1.6.1.jar +0 -0
- data/lib/fontbox-1.7.0.jar +0 -0
- data/lib/isoparser-1.0-RC-1.jar +0 -0
- data/lib/jempbox-1.7.0.jar +0 -0
- data/lib/juniversalchardet-1.0.3.jar +0 -0
- data/lib/metadata-extractor-2.4.0-beta-1.jar +0 -0
- data/lib/pdfbox-1.7.0.jar +0 -0
- data/lib/poi-3.8.jar +0 -0
- data/lib/poi-ooxml-3.8.jar +0 -0
- data/lib/poi-ooxml-schemas-3.8.jar +0 -0
- data/lib/poi-scratchpad-3.8.jar +0 -0
- data/lib/rika.rb +64 -0
- data/lib/rika/version.rb +3 -0
- data/lib/rome-0.9.jar +0 -0
- data/lib/tagsoup-1.2.1.jar +0 -0
- data/lib/tika-core-1.2.jar +0 -0
- data/lib/tika-parsers-1.2.jar +0 -0
- data/lib/vorbis-java-core-0.1-tests.jar +0 -0
- data/lib/vorbis-java-core-0.1.jar +0 -0
- data/lib/vorbis-java-tika-0.1.jar +0 -0
- data/lib/xmlbeans-2.3.0.jar +0 -0
- data/lib/xz-1.0.jar +0 -0
- data/rika.gemspec +20 -0
- data/spec/fixtures/document.doc +0 -0
- data/spec/fixtures/document.docx +0 -0
- data/spec/fixtures/document.pdf +0 -0
- data/spec/fixtures/image.jpg +0 -0
- data/spec/fixtures/text_file.txt +1 -0
- data/spec/rika_spec.rb +80 -0
- data/spec/spec_helper.rb +13 -0
- metadata +119 -0
data/.gitignore
ADDED
data/.rspec
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2012 Richard Nyström
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
# Rika
|
2
|
+
|
3
|
+
A JRuby wrapper for Apache Tika to extract text and metadata from various file formats.
|
4
|
+
|
5
|
+
More information about Apache Tika can be found here: http://tika.apache.org/
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Add this line to your application's Gemfile:
|
10
|
+
|
11
|
+
gem 'rika'
|
12
|
+
|
13
|
+
Remember that this gem only works on JRuby.
|
14
|
+
|
15
|
+
And then execute:
|
16
|
+
|
17
|
+
$ bundle
|
18
|
+
|
19
|
+
Or install it yourself as:
|
20
|
+
|
21
|
+
$ gem install rika
|
22
|
+
|
23
|
+
## Usage
|
24
|
+
|
25
|
+
Something like this:
|
26
|
+
|
27
|
+
require 'rika'
|
28
|
+
|
29
|
+
parser = Rika::Parser.new('document.pdf')
|
30
|
+
|
31
|
+
parser.content # Returns the content of the document as text
|
32
|
+
|
33
|
+
parser.metadata["title"] if parser.metadata_exists?("title") # Returns the metadata field title if it exists
|
34
|
+
|
35
|
+
parser.available_metadata # Returns all the available metadata keys that can be read from the document
|
36
|
+
|
37
|
+
## Contributing
|
38
|
+
|
39
|
+
1. Fork it
|
40
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
41
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
42
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
43
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
Binary file
|
Binary file
|
data/lib/asm-3.1.jar
ADDED
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/lib/dom4j-1.6.1.jar
ADDED
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/lib/poi-3.8.jar
ADDED
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/lib/rika.rb
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
# Rika talks to Apache Tika through JRuby's Java bridge, so loading it on any
# other Ruby platform is refused up front.
unless RUBY_PLATFORM =~ /java/
  raise "You need to run JRuby to use Rika"
end

require "rika/version"
require 'java'

# Load every jar that sits in the same directory as this file.
bundled_jars = File.join(File.dirname(__FILE__), "*.jar")
Dir.glob(bundled_jars).each { |jar_path| require jar_path }
|
9
|
+
|
10
|
+
# Namespace for the Rika gem, a JRuby wrapper around Apache Tika.
module Rika
  import org.apache.tika.sax.BodyContentHandler
  import org.apache.tika.parser.AutoDetectParser
  import org.apache.tika.metadata.Metadata

  # Parses one file with Tika's AutoDetectParser when the object is created
  # and exposes the extracted text and metadata afterwards.
  class Parser
    # Parses +filename+ immediately.
    #
    # filename - path of the file to parse.
    #
    # Raises IOError ("File does not exist") when File.exist?(filename) is false.
    def initialize(filename)
      # File.exist? replaces the deprecated File.exists?, which newer Rubies removed.
      raise IOError, "File does not exist" unless File.exist?(filename)

      @filename = filename
      perform
    end

    # Returns the string form of the Tika content handler filled during parsing.
    def content
      @content.to_s
    end

    # Returns a Hash mapping every metadata name reported by Tika to the value
    # returned by Metadata#get for that name.
    def metadata
      @metadata.names.to_a.each_with_object({}) do |name, fields|
        fields[name] = @metadata.get(name)
      end
    end

    # Returns an Array with the names of all metadata fields Tika reported.
    def available_metadata
      @metadata.names.to_a
    end

    # Returns true when Tika holds a value for the metadata field +name+,
    # false when Metadata#get returns nil for it.
    def metadata_exists?(name)
      !@metadata.get(name).nil?
    end

    protected

    # Opens @filename, runs it through AutoDetectParser and stores the resulting
    # content handler and metadata in instance variables.
    def perform
      input_stream = java.io.FileInputStream.new(java.io.File.new(@filename))
      begin
        @metadata = Metadata.new
        @metadata.set("filename", File.basename(@filename))
        @parser = AutoDetectParser.new
        @content = BodyContentHandler.new
        @parser.parse(input_stream, @content, @metadata)
      ensure
        # Close the stream even when parsing raises, so the file handle is not leaked.
        input_stream.close
      end
    end
  end
end
|
data/lib/rika/version.rb
ADDED
data/lib/rome-0.9.jar
ADDED
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/lib/xz-1.0.jar
ADDED
Binary file
|
data/rika.gemspec
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
# Gem specification for rika. The magic comment above must stay on the first
# line because the author name below contains a non-ASCII character.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'rika/version'

Gem::Specification.new do |gem|
  gem.name          = "rika"
  gem.version       = Rika::VERSION
  gem.authors       = ["Richard Nyström"]
  gem.email         = ["ricny046@gmail.com"]
  # No padding inside %q{} so the published text has no stray leading/trailing spaces.
  gem.description   = %q{A JRuby wrapper for Apache Tika to extract text and metadata from various file formats.}
  gem.summary       = %q{A JRuby wrapper for Apache Tika to extract text and metadata from various file formats.}
  gem.homepage      = "https://github.com/ricn/rika"
  # Declared to match the MIT terms shipped in LICENSE.txt.
  gem.license       = "MIT"

  # Package whatever git tracks; executables and test files are derived from that list.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.add_development_dependency "rspec"
  gem.platform = "java"
end
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -0,0 +1 @@
|
|
1
|
+
First they ignore you, then they ridicule you, then they fight you, then you win.
|
data/spec/rika_spec.rb
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
describe Rika::Parser do
  # Sentence the txt, docx and pdf fixtures are expected to contain.
  expected_sentence = "First they ignore you, then they ridicule you, then they fight you, then you win."

  # Parse each fixture once for the whole group.
  before(:all) do
    @txt_parser   = Rika::Parser.new(file_path("text_file.txt"))
    @docx_parser  = Rika::Parser.new(file_path("document.docx"))
    @pdf_parser   = Rika::Parser.new(file_path("document.pdf"))
    @image_parser = Rika::Parser.new(file_path("image.jpg"))
  end

  it "should crash if file does not exists" do
    expect { Rika::Parser.new(file_path("nonsense.txt")) }.to raise_error(IOError, "File does not exist")
  end

  describe '#content' do
    # One example per text-bearing fixture; the parser is looked up by ivar name.
    [["text", :@txt_parser], ["docx", :@docx_parser], ["pdf", :@pdf_parser]].each do |label, parser_ivar|
      it "should return the content in a #{label} file" do
        extracted = instance_variable_get(parser_ivar).content
        extracted.strip.should == expected_sentence
      end
    end

    it "should return no content for an image" do
      @image_parser.content.should be_empty
    end
  end

  # Only a few metadata fields of some common file formats are checked here,
  # to make sure the integration with Apache Tika works. Apache Tika already
  # has tests for all the file formats it supports, so those are not retested.
  describe '#metadata' do
    it "should return nil if metadata field does not exists" do
      txt_fields = @txt_parser.metadata
      txt_fields["nonsense"].should be_nil
    end

    it "should return metadata from a text file" do
      txt_fields = @txt_parser.metadata
      txt_fields["filename"].should == "text_file.txt"
    end

    it "should return metadata from a docx file" do
      docx_fields = @docx_parser.metadata
      docx_fields["Page-Count"].should == "1"
    end

    it "should return metadata from a pdf file" do
      pdf_fields = @pdf_parser.metadata
      pdf_fields["title"].should == "A simple title"
    end

    it "should return metadata from an image" do
      @image_parser.metadata["Image Height"].should == "72 pixels"
      @image_parser.metadata["Image Width"].should == "72 pixels"
    end
  end

  describe '#available_metadata' do
    it "should return available metadata fields" do
      field_names = @txt_parser.available_metadata
      field_names.should_not be_empty
    end

    it "should be an array" do
      field_names = @txt_parser.available_metadata
      field_names.kind_of?(Array).should == true
    end
  end

  describe '#metadata_exists?' do
    it "should return false if metadata does not exists" do
      title_present = @txt_parser.metadata_exists?("title")
      title_present.should == false
    end

    it "should return true if metadata exists" do
      title_present = @docx_parser.metadata_exists?("title")
      title_present.should == true
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
require "rika"
|
2
|
+
|
3
|
+
# Builds the absolute path of an entry below the `fixtures` directory that
# sits next to this file.
#
# paths - zero or more path segments appended after `fixtures`.
#
# Returns the expanded path as a String.
def file_path(*paths)
  fixtures_dir = File.join(File.dirname(__FILE__), 'fixtures')
  File.expand_path(File.join(fixtures_dir, *paths))
end
|
6
|
+
# Suite-wide RSpec settings.
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
#
# NOTE(review): the statements are kept in their original order on the
# assumption that `filter_run :focus` relies on the symbol-metadata flag being
# set before it - confirm against the RSpec version in use before reordering.
RSpec.configure do |rspec|
  rspec.treat_symbols_as_metadata_keys_with_true_values = true
  rspec.run_all_when_everything_filtered = true
  rspec.filter_run :focus

  rspec.order = 'random'
end
|
metadata
ADDED
@@ -0,0 +1,119 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: rika
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 0.9.0
|
6
|
+
platform: java
|
7
|
+
authors:
|
8
|
+
- Richard Nyström
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-09-16 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rspec
|
16
|
+
version_requirements: !ruby/object:Gem::Requirement
|
17
|
+
requirements:
|
18
|
+
- - ! '>='
|
19
|
+
- !ruby/object:Gem::Version
|
20
|
+
version: !binary |-
|
21
|
+
MA==
|
22
|
+
none: false
|
23
|
+
requirement: !ruby/object:Gem::Requirement
|
24
|
+
requirements:
|
25
|
+
- - ! '>='
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
version: !binary |-
|
28
|
+
MA==
|
29
|
+
none: false
|
30
|
+
prerelease: false
|
31
|
+
type: :development
|
32
|
+
description: ! ' A JRuby wrapper for Apache Tika to extract text and metadata from
|
33
|
+
various file formats. '
|
34
|
+
email:
|
35
|
+
- ricny046@gmail.com
|
36
|
+
executables: []
|
37
|
+
extensions: []
|
38
|
+
extra_rdoc_files: []
|
39
|
+
files:
|
40
|
+
- .gitignore
|
41
|
+
- .rspec
|
42
|
+
- Gemfile
|
43
|
+
- LICENSE.txt
|
44
|
+
- README.md
|
45
|
+
- Rakefile
|
46
|
+
- lib/apache-mime4j-core-0.7.2.jar
|
47
|
+
- lib/apache-mime4j-dom-0.7.2.jar
|
48
|
+
- lib/asm-3.1.jar
|
49
|
+
- lib/aspectjrt-1.6.11.jar
|
50
|
+
- lib/bcmail-jdk15-1.45.jar
|
51
|
+
- lib/bcprov-jdk15-1.45.jar
|
52
|
+
- lib/boilerpipe-1.1.0.jar
|
53
|
+
- lib/commons-codec-1.5.jar
|
54
|
+
- lib/commons-compress-1.4.1.jar
|
55
|
+
- lib/commons-logging-1.1.1.jar
|
56
|
+
- lib/dom4j-1.6.1.jar
|
57
|
+
- lib/fontbox-1.7.0.jar
|
58
|
+
- lib/isoparser-1.0-RC-1.jar
|
59
|
+
- lib/jempbox-1.7.0.jar
|
60
|
+
- lib/juniversalchardet-1.0.3.jar
|
61
|
+
- lib/metadata-extractor-2.4.0-beta-1.jar
|
62
|
+
- lib/pdfbox-1.7.0.jar
|
63
|
+
- lib/poi-3.8.jar
|
64
|
+
- lib/poi-ooxml-3.8.jar
|
65
|
+
- lib/poi-ooxml-schemas-3.8.jar
|
66
|
+
- lib/poi-scratchpad-3.8.jar
|
67
|
+
- lib/rika.rb
|
68
|
+
- lib/rika/version.rb
|
69
|
+
- lib/rome-0.9.jar
|
70
|
+
- lib/tagsoup-1.2.1.jar
|
71
|
+
- lib/tika-core-1.2.jar
|
72
|
+
- lib/tika-parsers-1.2.jar
|
73
|
+
- lib/vorbis-java-core-0.1-tests.jar
|
74
|
+
- lib/vorbis-java-core-0.1.jar
|
75
|
+
- lib/vorbis-java-tika-0.1.jar
|
76
|
+
- lib/xmlbeans-2.3.0.jar
|
77
|
+
- lib/xz-1.0.jar
|
78
|
+
- rika.gemspec
|
79
|
+
- spec/fixtures/document.doc
|
80
|
+
- spec/fixtures/document.docx
|
81
|
+
- spec/fixtures/document.pdf
|
82
|
+
- spec/fixtures/image.jpg
|
83
|
+
- spec/fixtures/text_file.txt
|
84
|
+
- spec/rika_spec.rb
|
85
|
+
- spec/spec_helper.rb
|
86
|
+
homepage: https://github.com/ricn/rika
|
87
|
+
licenses: []
|
88
|
+
post_install_message:
|
89
|
+
rdoc_options: []
|
90
|
+
require_paths:
|
91
|
+
- lib
|
92
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ! '>='
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: !binary |-
|
97
|
+
MA==
|
98
|
+
none: false
|
99
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ! '>='
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: !binary |-
|
104
|
+
MA==
|
105
|
+
none: false
|
106
|
+
requirements: []
|
107
|
+
rubyforge_project:
|
108
|
+
rubygems_version: 1.8.24
|
109
|
+
signing_key:
|
110
|
+
specification_version: 3
|
111
|
+
summary: A JRuby wrapper for Apache Tika to extract text and metadata from various file formats.
|
112
|
+
test_files:
|
113
|
+
- spec/fixtures/document.doc
|
114
|
+
- spec/fixtures/document.docx
|
115
|
+
- spec/fixtures/document.pdf
|
116
|
+
- spec/fixtures/image.jpg
|
117
|
+
- spec/fixtures/text_file.txt
|
118
|
+
- spec/rika_spec.rb
|
119
|
+
- spec/spec_helper.rb
|