RubyGems - rika - Versions diffs - 1.1.1-java → 1.11.1-java - Mend

rika 1.1.1-java → 1.11.1-java

Files changed (64) hide show

checksums.yaml +7 -0
data/.gitignore +1 -0
data/.travis.yml +3 -3
data/README.md +82 -40
data/RELEASE_NOTES.md +17 -0
data/Rakefile +1 -1
data/java-lib/tika-app-1.24.1.jar +0 -0
data/lib/rika.rb +18 -93
data/lib/rika/parser.rb +90 -0
data/lib/rika/version.rb +1 -1
data/pom.xml +4 -4
data/rika.gemspec +9 -7
data/rika_helper.rb +38 -0
data/spec/fixtures/de.txt +21 -1
data/spec/fixtures/document.doc +0 -0
data/spec/fixtures/document.docx +0 -0
data/spec/fixtures/document.pdf +0 -0
data/spec/fixtures/en.txt +23 -1
data/spec/fixtures/es.txt +21 -1
data/spec/fixtures/fr.txt +23 -1
data/spec/fixtures/ru.txt +21 -1
data/spec/fixtures/text_file.txt +23 -1
data/spec/fixtures/text_file_without_extension +23 -1
data/spec/rika_spec.rb +153 -101
data/spec/spec_helper.rb +4 -3
metadata +36 -76
data/spec/fixtures/over_100k_file.txt +0 -1241
data/target/dependency/apache-mime4j-core-0.7.2.jar +0 -0
data/target/dependency/apache-mime4j-dom-0.7.2.jar +0 -0
data/target/dependency/asm-3.1.jar +0 -0
data/target/dependency/aspectjrt-1.6.11.jar +0 -0
data/target/dependency/bcmail-jdk15-1.45.jar +0 -0
data/target/dependency/bcprov-jdk15-1.45.jar +0 -0
data/target/dependency/boilerpipe-1.1.0.jar +0 -0
data/target/dependency/commons-codec-1.5.jar +0 -0
data/target/dependency/commons-compress-1.4.1.jar +0 -0
data/target/dependency/commons-logging-1.1.1.jar +0 -0
data/target/dependency/dom4j-1.6.1.jar +0 -0
data/target/dependency/fontbox-1.7.1.jar +0 -0
data/target/dependency/geronimo-stax-api_1.0_spec-1.0.1.jar +0 -0
data/target/dependency/isoparser-1.0-RC-1.jar +0 -0
data/target/dependency/jdom-1.0.jar +0 -0
data/target/dependency/jempbox-1.7.1.jar +0 -0
data/target/dependency/juniversalchardet-1.0.3.jar +0 -0
data/target/dependency/metadata-extractor-2.6.2.jar +0 -0
data/target/dependency/netcdf-4.2-min.jar +0 -0
data/target/dependency/pdfbox-1.7.1.jar +0 -0
data/target/dependency/poi-3.8.jar +0 -0
data/target/dependency/poi-ooxml-3.8.jar +0 -0
data/target/dependency/poi-ooxml-schemas-3.8.jar +0 -0
data/target/dependency/poi-scratchpad-3.8.jar +0 -0
data/target/dependency/rome-0.9.jar +0 -0
data/target/dependency/slf4j-api-1.5.6.jar +0 -0
data/target/dependency/tagsoup-1.2.1.jar +0 -0
data/target/dependency/tika-core-1.3.jar +0 -0
data/target/dependency/tika-parsers-1.3.jar +0 -0
data/target/dependency/vorbis-java-core-0.1-tests.jar +0 -0
data/target/dependency/vorbis-java-core-0.1.jar +0 -0
data/target/dependency/vorbis-java-tika-0.1.jar +0 -0
data/target/dependency/xercesImpl-2.8.1.jar +0 -0
data/target/dependency/xml-apis-1.3.03.jar +0 -0
data/target/dependency/xmlbeans-2.3.0.jar +0 -0
data/target/dependency/xmpcore-5.1.2.jar +0 -0
data/target/dependency/xz-1.0.jar +0 -0

checksums.yaml ADDED

@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: 2964b22b0e32e770c6ace90cf6b3ad1b05a54899b7838696c037c95645f1e73a
+  data.tar.gz: 3557bba0a54a62f00c9c4c148be307cc7132d806fb13178deb95dbb8f566eb33
+SHA512:
+  metadata.gz: aae34480ff9bf6ee7e9a00221a27fb6780cc60c8425644bf11bbcbf9875d2684a964a3bd88fccc28cc9c42b20d075548d531d485dada7ffd50a3d3eddc83294e
+  data.tar.gz: 530de9844daa28dddb9b149a0a671eaaa84d6e4dfc4aa840af609121e89a8d2a6a5f2f70e78523e9d62c0eef6472291b50e44a0899c9e464057b76dc89cfed3c

data/.gitignore CHANGED

@@ -12,6 +12,7 @@ lib/bundler/man
 pkg
 rdoc
 spec/reports
+target/
 test/tmp
 test/version_tmp
 tmp

data/.travis.yml CHANGED

@@ -1,7 +1,7 @@
 language: ruby
 rvm:
-  - jruby-19mode
-  - jruby-head
+  - jruby-9.2.12.0
 notifications:
   recipients:
-    - ricny046@gmail.com
+    - ricny046@gmail.com
+    - keithrbennett@gmail.com

data/README.md CHANGED

@@ -1,39 +1,31 @@
-# Rika
-A JRuby wrapper for Apache Tika to extract text and metadata from various file formats.
-More information about Apache Tika can be found here: http://tika.apache.org/
-[![Code Climate](https://codeclimate.com/github/ricn/rika.png)](https://codeclimate.com/github/ricn/rika)
-[![Build Status](https://travis-ci.org/ricn/rika.png?branch=master)](https://travis-ci.org/ricn/rika)
-## Installation
-Add this line to your application's Gemfile:
-    gem 'rika'
+# Rika
-Remember that this gem only works on JRuby.
+Rika is a [JRuby](https://www.jruby.org) wrapper for the [Apache Tika](http://tika.apache.org/) Java library, which extracts text and metadata from files and resources of [many different formats](https://tika.apache.org/1.24.1/formats.html).
-And then execute:
+_Caution: This gem only works with [JRuby](https://www.jruby.org)._
-    $ bundle
+Rika currently supports some basic and commonly used functions of Tika. Future development may add Ruby support for more Tika functionality, and perhaps a command line interface as well. See the [Other Tika Resources](#other-tika-resources) section for alternatives to Rika that may suit more demanding needs.
-Or install it yourself as:
-    $ gem install rika
+[![Code Climate](https://codeclimate.com/github/keithrbennett/rika.png)](https://codeclimate.com/github/keithrbennett/rika)
+[![Build Status](https://travis-ci.org/keithrbennett/rika.png?branch=master)](https://travis-ci.org/keithrbennett/rika)
 ## Usage
-For a quick start with the simplest use cases, the following functions
-are provided to get what you need in a single function call, for your convenience:
+For a quick start with the simplest use cases, the following functions are provided to get what you need in a single function call, for your convenience:
 ```ruby
 require 'rika'
-content           = Rika.parse_content('document.pdf')    # string containing all content text
-metadata          = Rika.parse_metadata('document.pdf')   # hash containing the document metadata
-content, metadata = Rika.parse_content_and_metadata('document.pdf')   # both of the above
+content           = Rika.parse_content('x.pdf')    # string containing all content text
+metadata          = Rika.parse_metadata('x.pdf')   # hash containing the document metadata
+content, metadata = Rika.parse_content_and_metadata('x.pdf')   # both of the above
+```
+A URL can be used instead of a filespec wherever a data source is specified:
+```ruby
+content, metadata = Rika.parse_content_and_metadata('https://github.com/keithrbennett/rika')
 ```
 For other use cases and finer control, you can work directly with the Rika::Parser object:
@@ -41,43 +33,93 @@ For other use cases and finer control, you can work directly with the Rika::Pars
 ```ruby
 require 'rika'
-parser = Rika::Parser.new('document.pdf')
+parser = Rika::Parser.new('x.pdf')
 # Return the content of the document:
 parser.content
-# Return the media type for the document:
-parser.media_type
-=> "application/pdf"
-# Return the metadata field title if it exists:
-parser.metadata["title"] if parser.metadata_exists?("title")
+# Return the metadata of the document:
+parser.metadata
-# Return all the available metadata keys that can be read from the document
-parser.available_metadata
+# Return the media type for the document, e.g. "application/pdf":
+parser.media_type
 # Return only the first 10000 chars of the content:
-parser = Rika::Parser.new('document.pdf', 10000)
+parser = Rika::Parser.new('x.pdf', 10000)
 parser.content # 10000 first chars returned
 # Return content from URL
-parser = Rika::Parser.new('http://riakhandbook.com/sample.pdf', 200)
+parser = Rika::Parser.new('http://example.com/x.pdf', 200)
 parser.content
 # Return the language for the content
-parser = parser = Rika::Parser.new('german document.pdf')
+parser = Rika::Parser.new('german-document.pdf')
 parser.language
 => "de"
-# Check whether the langugage identification is certain enough to be trusted
+# Check whether the language identification is certain enough to be trusted
 parser.language_is_reasonably_certain?
 ```
+#### Simple Command Line Use
+Since Ruby supports the `-r` option to require a library, and the `-e` option to evaluate a string of code, you can easily do simple parsing on the command line, such as:
+```
+ruby -r rika -e 'puts Rika.parse_content("x.pdf")'
+```
+You could also parse the metadata and output it as JSON as follows:
+```
+ruby -r rika -r json -e 'puts Rika.parse_metadata("x.pdf").to_json'
+```
+If you want to get both content and metadata in JSON format, this would do that:
+```
+ruby -r rika -r json -e 'c,m = Rika.parse_content_and_metadata("x.pdf"); puts({ c: c, m: m }.to_json)'
+```
+Using the [rexe](https://github.com/keithrbennett/rexe) gem, that can be made much more concise:
+```
+rexe -r rika -oj 'c,m = Rika.parse_content_and_metadata("x.pdf"); { c: c, m: m }'
+```
+...and changing the `-oj` option gives you access to other output formats such as "Pretty JSON", YAML, and AwesomePrint (a very human readable format).
+## Installation
+Add this line to your application's Gemfile. Use `gem` or `jgem` depending on your JRuby installation:
+    gem 'rika' # or: jgem 'rika'
+And then execute:
+    $ bundle
+Or install it yourself as:
+    $ gem install rika  # or: jgem install rika
+## Other Tika Resources
+* For more sophisticated use of Tika, you can use the Tika jar file directly in your JRuby code. After installing the `rika` gem, the Tika jar file will be located in `$GEM_HOME/gems/rika-[rika-version]-java/java-lib/tika-app-[tika-version].jar`.
+* Tika also provides another jar file containing a RESTful server that you can run on the command line. You can download this server jar from http://tika.apache.org/download.html.
+ See the "Running the Tika Server as a Jar file" section of https://cwiki.apache.org/confluence/display/TIKA/TikaServer for more information.
+* @chrismattmann and others have provided a [Python library and CLI](https://github.com/chrismattmann/tika-python) that interfaces with the Tika server.
+* A general Tika wiki is at https://cwiki.apache.org/confluence/display/tika.
 ## Credits
-The following people have contributed ideas, documentation, or code to Rika:
-* Keith Bennett
-* Richard Nyström
+Richard Nyström (@ricn) is the original author of Rika, but has not been able to maintain it since 2015. In July 2020, Richard transferred the project to Keith Bennett (@keithrbennett), who had made some contributions back in 2013.
 ## Contributing

data/RELEASE_NOTES.md ADDED

@@ -0,0 +1,17 @@
+## Release Notes
+#### v1.11.1
+* Add Apache-2.0 license to gemspec.
+#### v1.11.0
+* Replace 2015 Tika jar files w/2020 tika-app-1.24.1.jar.
+* Handover of maintainer status from @ricn to @keithrbennett.
+* Add rika_helper.rb to provide abbreviated method names for interactive use w/pry, etc.
+* Extract parser class to its own file.
+* Various cleanup and refactoring.
+* Improve README.md documentation.
+* Tested successfully on Java 14.
+* Move Tika jar file from /target/dependency to /java-lib.

data/Rakefile CHANGED

@@ -8,4 +8,4 @@ task :default => :spec
 desc 'Download jars'
 task :download_jars do
 	system "mvn dependency:copy-dependencies"
- end
+end

data/java-lib/tika-app-1.24.1.jar ADDED

Binary file

data/lib/rika.rb CHANGED

@@ -4,18 +4,15 @@ raise "You need to run JRuby to use Rika" unless RUBY_PLATFORM =~ /java/
 require "rika/version"
 require 'uri'
-require 'net/http'
-require 'java'
+require 'open-uri'
+require_relative 'rika/parser'
+require_relative '../java-lib/tika-app-1.24.1.jar'
-Dir[File.join(File.dirname(__FILE__), "../target/dependency/*.jar")].each do |jar|
-  require jar
-end
-# Heavily based on the Apache Tika API: http://tika.apache.org/1.3/api/org/apache/tika/Tika.html
 module Rika
   import org.apache.tika.metadata.Metadata
   import org.apache.tika.Tika
   import org.apache.tika.language.LanguageIdentifier
+  import org.apache.tika.detect.DefaultDetector
   import java.io.FileInputStream
   import java.net.URL
@@ -24,95 +21,23 @@ module Rika
     [parser.content, parser.metadata]
   end
-  def self.parse_content(file_location, max_content_length = -1)
-    parser = Parser.new(file_location, max_content_length)
-    parser.content
+  def self.parse_content_and_metadata_as_hash(file_location, max_content_length = -1)
+    content, metadata = parse_content_and_metadata(file_location, max_content_length)
+    { content: content, metadata: metadata }
   end
-  def self.parse_metadata(file_location)
-    parser = Parser.new(file_location, 0)
-    parser.metadata
+  def self.parse_content(file_location, max_content_length = -1)
+    Parser.new(file_location, max_content_length).content
   end
-  class Parser
-    def initialize(file_location, max_content_length = -1)
-      @uri = file_location
-      @tika = Tika.new
-      @tika.set_max_string_length(max_content_length)
-      @metadata_java = Metadata.new
-      @metadata_ruby = nil
-      @input_type = get_input_type
-    end
-    def content
-      self.parse
-      @content
-    end
-    def metadata
-      unless @metadata_ruby
-        self.parse
-        @metadata_ruby = {}
-        @metadata_java.names.each do |name|
-          @metadata_ruby[name] = @metadata_java.get(name)
-        end
-      end
-      @metadata_ruby
-    end
-    def media_type
-      @media_type ||= @tika.detect(input_stream)
-    end
-    def available_metadata
-      metadata.keys
-    end
-    def metadata_exists?(name)
-      metadata[name] != nil
-    end
-    def file?
-      @input_type == :file
-    end
-    def language
-      @lang ||= LanguageIdentifier.new(content)
-      @lang.language
-    end
-    def language_is_reasonably_certain?
-      @lang ||= LanguageIdentifier.new(content)
-      @lang.is_reasonably_certain
-    end
-    protected
-    def parse
-      @content ||= @tika.parse_to_string(input_stream, @metadata_java).to_s.strip
-    end
-    def get_input_type
-      if File.exists?(@uri) && File.directory?(@uri) == false
-        :file
-      elsif URI(@uri).scheme == "http" && Net::HTTP.get_response(URI(@uri)).is_a?(Net::HTTPSuccess)
-        :http
-      else
-        raise IOError, "Input (#{@uri}) is neither file nor http."
-      end
-    end
-    def input_stream
-      if file?
-        FileInputStream.new(java.io.File.new(@uri))
-      else # :http
-        URL.new(@uri).open_stream
-      end
-    end
+  # Regarding max_content_length, the default is set at 0 to save unnecessary processing,
+  # since the content is being ignored. However, the PDF metadata "pdf:unmappedUnicodeCharsPerPage"
+  # and "pdf:charsPerPage" will be absent if the max_content_length is 0, and
+  # otherwise may differ depending on
+  # the number of characters read.
+  def self.parse_metadata(file_location, max_content_length = 0)
+    Parser.new(file_location, max_content_length).metadata
   end
 end

data/lib/rika/parser.rb ADDED

@@ -0,0 +1,90 @@
+module Rika
+  class Parser
+    attr_reader :data_source, :tika, :metadata_java, :metadata_ruby, :input_type
+    def initialize(data_source, max_content_length = -1, detector = DefaultDetector.new)
+      @data_source = data_source
+      @tika = Tika.new(detector)
+      @tika.set_max_string_length(max_content_length)
+      @metadata_java = nil
+      @metadata_ruby = nil
+      @input_type = get_input_type
+    end
+    def content
+      parse
+      @content
+    end
+    def metadata
+      unless @metadata_ruby
+        parse
+        @metadata_ruby = metadata_java.names.each_with_object({}) do |name, m_ruby|
+          m_ruby[name] = metadata_java.get(name)
+        end
+      end
+      @metadata_ruby
+    end
+    def media_type
+      @media_type ||= file? \
+          ? tika.detect(java.io.File.new(data_source)) \
+          : tika.detect(input_stream)
+    end
+    # @deprecated
+    def available_metadata
+      metadata.keys
+    end
+    # @deprecated
+    def metadata_exists?(name)
+      metadata[name] != nil
+    end
+    def language
+      @lang ||= LanguageIdentifier.new(content)
+      @lang.language
+    end
+    # @deprecated
+    # https://tika.apache.org/1.9/api/org/apache/tika/language/LanguageIdentifier.html#isReasonablyCertain()
+    # says: WARNING: Will never return true for small amount of input texts.
+    # https://tika.apache.org/1.19/api/org/apache/tika/language/LanguageIdentifier.html
+    # indicates that the LanguageIdentifier class used in this implementation is deprecated.
+    # TODO: More research needed to see if an alternate implementation can be used.
+    def language_is_reasonably_certain?
+      @lang ||= LanguageIdentifier.new(content)
+      @lang.is_reasonably_certain
+    end
+    def parse
+      unless @content
+        @metadata_java = Metadata.new
+        @content = tika.parse_to_string(input_stream, @metadata_java).to_s.strip
+      end
+    end
+    private def get_input_type
+      if File.file?(data_source)
+        :file
+      elsif URI(data_source).is_a?(URI::HTTP) && URI.open(data_source)
+        :http
+      else
+        raise IOError, "Input (#{data_source}) is not an available file or HTTP resource."
+      end
+    end
+    private def input_stream
+      file? \
+          ? FileInputStream.new(java.io.File.new(data_source)) \
+          : URL.new(data_source).open_stream
+    end
+    private def file?
+      input_type == :file
+    end
+  end
+end

data/lib/rika/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Rika
-  VERSION = "1.1.1"
+  VERSION = "1.11.1"
 end