rika 0.9.0-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47) hide show
  1. data/.gitignore +19 -0
  2. data/.rspec +2 -0
  3. data/Gemfile +4 -0
  4. data/LICENSE.txt +22 -0
  5. data/README.md +43 -0
  6. data/Rakefile +1 -0
  7. data/lib/apache-mime4j-core-0.7.2.jar +0 -0
  8. data/lib/apache-mime4j-dom-0.7.2.jar +0 -0
  9. data/lib/asm-3.1.jar +0 -0
  10. data/lib/aspectjrt-1.6.11.jar +0 -0
  11. data/lib/bcmail-jdk15-1.45.jar +0 -0
  12. data/lib/bcprov-jdk15-1.45.jar +0 -0
  13. data/lib/boilerpipe-1.1.0.jar +0 -0
  14. data/lib/commons-codec-1.5.jar +0 -0
  15. data/lib/commons-compress-1.4.1.jar +0 -0
  16. data/lib/commons-logging-1.1.1.jar +0 -0
  17. data/lib/dom4j-1.6.1.jar +0 -0
  18. data/lib/fontbox-1.7.0.jar +0 -0
  19. data/lib/isoparser-1.0-RC-1.jar +0 -0
  20. data/lib/jempbox-1.7.0.jar +0 -0
  21. data/lib/juniversalchardet-1.0.3.jar +0 -0
  22. data/lib/metadata-extractor-2.4.0-beta-1.jar +0 -0
  23. data/lib/pdfbox-1.7.0.jar +0 -0
  24. data/lib/poi-3.8.jar +0 -0
  25. data/lib/poi-ooxml-3.8.jar +0 -0
  26. data/lib/poi-ooxml-schemas-3.8.jar +0 -0
  27. data/lib/poi-scratchpad-3.8.jar +0 -0
  28. data/lib/rika.rb +64 -0
  29. data/lib/rika/version.rb +3 -0
  30. data/lib/rome-0.9.jar +0 -0
  31. data/lib/tagsoup-1.2.1.jar +0 -0
  32. data/lib/tika-core-1.2.jar +0 -0
  33. data/lib/tika-parsers-1.2.jar +0 -0
  34. data/lib/vorbis-java-core-0.1-tests.jar +0 -0
  35. data/lib/vorbis-java-core-0.1.jar +0 -0
  36. data/lib/vorbis-java-tika-0.1.jar +0 -0
  37. data/lib/xmlbeans-2.3.0.jar +0 -0
  38. data/lib/xz-1.0.jar +0 -0
  39. data/rika.gemspec +20 -0
  40. data/spec/fixtures/document.doc +0 -0
  41. data/spec/fixtures/document.docx +0 -0
  42. data/spec/fixtures/document.pdf +0 -0
  43. data/spec/fixtures/image.jpg +0 -0
  44. data/spec/fixtures/text_file.txt +1 -0
  45. data/spec/rika_spec.rb +80 -0
  46. data/spec/spec_helper.rb +13 -0
  47. metadata +119 -0
data/.gitignore ADDED
@@ -0,0 +1,19 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+
19
+ .DS_Store
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format progress
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in rika.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 Richard Nyström
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,43 @@
1
+ # Rika
2
+
3
+ A JRuby wrapper for Apache Tika to extract text and metadata from various file formats.
4
+
5
+ More information about Apache Tika can be found here: http://tika.apache.org/
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ gem 'rika'
12
+
13
+ Remember that this gem only works on JRuby.
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install rika
22
+
23
+ ## Usage
24
+
25
+ Something like this:
26
+
27
+ require 'rika'
28
+
29
+ parser = Rika::Parser.new('document.pdf')
30
+
31
+ parser.content # Returns the content of the document as text
32
+
33
+ parser.metadata["title"] if parser.metadata_exists?("title") # Returns the metadata field title if it exists
34
+
35
+ parser.available_metadata # Returns all the available metadata keys that can be read from the document
36
+
37
+ ## Contributing
38
+
39
+ 1. Fork it
40
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
41
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
42
+ 4. Push to the branch (`git push origin my-new-feature`)
43
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
Binary file
Binary file
data/lib/asm-3.1.jar ADDED
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
data/lib/poi-3.8.jar ADDED
Binary file
Binary file
Binary file
Binary file
data/lib/rika.rb ADDED
@@ -0,0 +1,64 @@
# Rika drives Apache Tika through JRuby's Java integration, so refuse to load
# on any other Ruby runtime.
raise "You need to run JRuby to use Rika" unless RUBY_PLATFORM =~ /java/

require "rika/version"
require 'java'

# Require every jar that ships next to this file so the Tika classes (and the
# libraries they depend on) are on the classpath. The list is sorted because
# Dir[] does not guarantee an ordering on older Rubies, and a stable load
# order keeps start-up reproducible across machines.
Dir[File.join(File.dirname(__FILE__), "*.jar")].sort.each do |jar|
  require jar
end

# JRuby wrapper around Apache Tika for extracting text and metadata from files.
module Rika
  # java_import is used rather than the bare `import` alias: `import` is also
  # a Rake method, so the alias can resolve to the wrong thing when both are
  # loaded in the same process.
  java_import org.apache.tika.sax.BodyContentHandler
  java_import org.apache.tika.parser.AutoDetectParser
  java_import org.apache.tika.metadata.Metadata

  # Parses a single file with Tika's AutoDetectParser at construction time and
  # exposes the extracted text and metadata afterwards.
  class Parser
    # @param filename [String] path of the file to parse
    # @raise [IOError] when nothing exists at +filename+
    def initialize(filename)
      # File.exist? replaces File.exists?, which is deprecated and has been
      # removed from newer Rubies.
      raise IOError, "File does not exist" unless File.exist?(filename)

      @filename = filename
      perform
    end

    # @return [String] the text collected by the content handler
    def content
      @content.to_s
    end

    # Builds a fresh Hash on every call.
    #
    # @return [Hash] every metadata name reported for the file mapped to its value
    def metadata
      metadata_hash = {}

      @metadata.names.each do |name|
        metadata_hash[name] = @metadata.get(name)
      end

      metadata_hash
    end

    # @return [Array] the metadata names reported for the file
    def available_metadata
      @metadata.names.to_a
    end

    # @param name [String] metadata field name
    # @return [Boolean] true when the parsed metadata holds a value for +name+
    def metadata_exists?(name)
      !@metadata.get(name).nil?
    end

    protected

    # Runs the Tika parse, populating @metadata and @content.
    #
    # The "filename" metadata entry is seeded with the file's base name before
    # parsing.
    #
    # NOTE(review): the no-argument BodyContentHandler applies Tika's default
    # write limit, so very large documents may raise instead of returning all
    # text -- confirm whether that cap is wanted.
    def perform
      input_stream = java.io.FileInputStream.new(java.io.File.new(@filename))
      begin
        @metadata = Metadata.new
        @metadata.set("filename", File.basename(@filename))
        @parser = AutoDetectParser.new
        @content = BodyContentHandler.new
        @parser.parse(input_stream, @content, @metadata)
      ensure
        # Close the stream even when parsing raises, so the file handle is not leaked.
        input_stream.close
      end
    end
  end
end
@@ -0,0 +1,3 @@
module Rika
  # Gem version; read by rika.gemspec when the gem is built.
  VERSION = "0.9.0"
end
data/lib/rome-0.9.jar ADDED
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
data/lib/xz-1.0.jar ADDED
Binary file
data/rika.gemspec ADDED
@@ -0,0 +1,20 @@
# -*- encoding: utf-8 -*-
# Gem specification for rika. The version is read from lib/rika/version.rb, so
# lib/ is put on the load path before that file is required.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'rika/version'

# Description and summary use the same text; keep it in one place.
blurb = %q{ A JRuby wrapper for Apache Tika to extract text and metadata from various file formats. }

Gem::Specification.new do |gem|
  gem.name          = "rika"
  gem.version       = Rika::VERSION
  gem.authors       = ["Richard Nyström"]
  gem.email         = ["ricny046@gmail.com"]
  gem.description   = blurb
  gem.summary       = blurb
  gem.homepage      = "https://github.com/ricn/rika"
  # Package exactly the files tracked by git (this includes the bundled jars).
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]
  gem.add_development_dependency "rspec"
  # The gem is published for the java platform only.
  gem.platform      = "java"
end
Binary file
Binary file
Binary file
Binary file
@@ -0,0 +1 @@
1
+ First they ignore you, then they ridicule you, then they fight you, then you win.
data/spec/rika_spec.rb ADDED
@@ -0,0 +1,80 @@
# encoding: utf-8

require 'spec_helper'

# Integration specs: each fixture is parsed once up front and the resulting
# parsers are shared by all examples.
describe Rika::Parser do
  before(:all) do
    @txt_parser = Rika::Parser.new(file_path("text_file.txt"))
    @docx_parser = Rika::Parser.new(file_path("document.docx"))
    @pdf_parser = Rika::Parser.new(file_path("document.pdf"))
    @image_parser = Rika::Parser.new(file_path("image.jpg"))
  end

  it "should crash if file does not exists" do
    lambda { Rika::Parser.new(file_path("nonsense.txt")) }.should raise_error(IOError, "File does not exist")
  end

  describe '#content' do
    it "should return the content in a text file" do
      @txt_parser.content.strip.should == "First they ignore you, then they ridicule you, then they fight you, then you win."
    end

    it "should return the content in a docx file" do
      @docx_parser.content.strip.should == "First they ignore you, then they ridicule you, then they fight you, then you win."
    end

    it "should return the content in a pdf file" do
      @pdf_parser.content.strip.should == "First they ignore you, then they ridicule you, then they fight you, then you win."
    end

    it "should return no content for an image" do
      @image_parser.content.should be_empty
    end
  end

  # We only test a few metadata fields for some common file formats to make
  # sure the integration with Apache Tika works. Apache Tika already has
  # tests for all the file formats it supports, so we won't retest those.
  describe '#metadata' do
    it "should return nil if metadata field does not exists" do
      @txt_parser.metadata["nonsense"].should be_nil
    end

    it "should return metadata from a text file" do
      @txt_parser.metadata["filename"].should == "text_file.txt"
    end

    it "should return metadata from a docx file" do
      @docx_parser.metadata["Page-Count"].should == "1"
    end

    it "should return metadata from a pdf file" do
      @pdf_parser.metadata["title"].should == "A simple title"
    end

    it "should return metadata from an image" do
      @image_parser.metadata["Image Height"].should == "72 pixels"
      @image_parser.metadata["Image Width"].should == "72 pixels"
    end
  end

  describe '#available_metadata' do
    it "should return available metadata fields" do
      @txt_parser.available_metadata.should_not be_empty
    end

    it "should be an array" do
      @txt_parser.available_metadata.is_a?(Array).should == true
    end
  end

  describe '#metadata_exists?' do
    it "should return false if metadata does not exists" do
      @txt_parser.metadata_exists?("title").should == false
    end

    it "should return true if metadata exists" do
      @docx_parser.metadata_exists?("title").should == true
    end
  end
end
@@ -0,0 +1,13 @@
1
+ require "rika"
2
+
# Builds the absolute path of a file under the fixtures directory that sits
# next to this helper.
#
# @param paths [Array<String>] path segments below fixtures/
# @return [String] absolute path to the requested fixture
def file_path(*paths)
  fixtures_dir = File.join(File.dirname(__FILE__), 'fixtures')
  File.expand_path(File.join(fixtures_dir, *paths))
end
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
RSpec.configure do |config|
  # Going by the option name, a bare symbol such as :focus is treated as
  # metadata with a true value -- see the rspec-core docs linked above.
  config.treat_symbols_as_metadata_keys_with_true_values = true
  # Run only examples tagged :focus; if that filter matches nothing, run the
  # whole suite instead.
  config.run_all_when_everything_filtered = true
  config.filter_run :focus

  # Randomise example order.
  config.order = 'random'
end
metadata ADDED
@@ -0,0 +1,119 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rika
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.9.0
6
+ platform: java
7
+ authors:
8
+ - Richard Nyström
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-09-16 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rspec
16
+ version_requirements: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - ! '>='
19
+ - !ruby/object:Gem::Version
20
+ version: !binary |-
21
+ MA==
22
+ none: false
23
+ requirement: !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ! '>='
26
+ - !ruby/object:Gem::Version
27
+ version: !binary |-
28
+ MA==
29
+ none: false
30
+ prerelease: false
31
+ type: :development
32
+ description: ! ' A JRuby wrapper for Apache Tika to extract text and metadata from
33
+ various file formats. '
34
+ email:
35
+ - ricny046@gmail.com
36
+ executables: []
37
+ extensions: []
38
+ extra_rdoc_files: []
39
+ files:
40
+ - .gitignore
41
+ - .rspec
42
+ - Gemfile
43
+ - LICENSE.txt
44
+ - README.md
45
+ - Rakefile
46
+ - lib/apache-mime4j-core-0.7.2.jar
47
+ - lib/apache-mime4j-dom-0.7.2.jar
48
+ - lib/asm-3.1.jar
49
+ - lib/aspectjrt-1.6.11.jar
50
+ - lib/bcmail-jdk15-1.45.jar
51
+ - lib/bcprov-jdk15-1.45.jar
52
+ - lib/boilerpipe-1.1.0.jar
53
+ - lib/commons-codec-1.5.jar
54
+ - lib/commons-compress-1.4.1.jar
55
+ - lib/commons-logging-1.1.1.jar
56
+ - lib/dom4j-1.6.1.jar
57
+ - lib/fontbox-1.7.0.jar
58
+ - lib/isoparser-1.0-RC-1.jar
59
+ - lib/jempbox-1.7.0.jar
60
+ - lib/juniversalchardet-1.0.3.jar
61
+ - lib/metadata-extractor-2.4.0-beta-1.jar
62
+ - lib/pdfbox-1.7.0.jar
63
+ - lib/poi-3.8.jar
64
+ - lib/poi-ooxml-3.8.jar
65
+ - lib/poi-ooxml-schemas-3.8.jar
66
+ - lib/poi-scratchpad-3.8.jar
67
+ - lib/rika.rb
68
+ - lib/rika/version.rb
69
+ - lib/rome-0.9.jar
70
+ - lib/tagsoup-1.2.1.jar
71
+ - lib/tika-core-1.2.jar
72
+ - lib/tika-parsers-1.2.jar
73
+ - lib/vorbis-java-core-0.1-tests.jar
74
+ - lib/vorbis-java-core-0.1.jar
75
+ - lib/vorbis-java-tika-0.1.jar
76
+ - lib/xmlbeans-2.3.0.jar
77
+ - lib/xz-1.0.jar
78
+ - rika.gemspec
79
+ - spec/fixtures/document.doc
80
+ - spec/fixtures/document.docx
81
+ - spec/fixtures/document.pdf
82
+ - spec/fixtures/image.jpg
83
+ - spec/fixtures/text_file.txt
84
+ - spec/rika_spec.rb
85
+ - spec/spec_helper.rb
86
+ homepage: https://github.com/ricn/rika
87
+ licenses: []
88
+ post_install_message:
89
+ rdoc_options: []
90
+ require_paths:
91
+ - lib
92
+ required_ruby_version: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ! '>='
95
+ - !ruby/object:Gem::Version
96
+ version: !binary |-
97
+ MA==
98
+ none: false
99
+ required_rubygems_version: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ! '>='
102
+ - !ruby/object:Gem::Version
103
+ version: !binary |-
104
+ MA==
105
+ none: false
106
+ requirements: []
107
+ rubyforge_project:
108
+ rubygems_version: 1.8.24
109
+ signing_key:
110
+ specification_version: 3
111
+ summary: A JRuby wrapper for Apache Tika to extract text and metadata from various file formats.
112
+ test_files:
113
+ - spec/fixtures/document.doc
114
+ - spec/fixtures/document.docx
115
+ - spec/fixtures/document.pdf
116
+ - spec/fixtures/image.jpg
117
+ - spec/fixtures/text_file.txt
118
+ - spec/rika_spec.rb
119
+ - spec/spec_helper.rb