rika 1.6.0-java → 1.11.1-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. checksums.yaml +5 -5
  2. data/.gitignore +1 -0
  3. data/.travis.yml +3 -3
  4. data/README.md +82 -40
  5. data/RELEASE_NOTES.md +17 -0
  6. data/java-lib/tika-app-1.24.1.jar +0 -0
  7. data/lib/rika.rb +17 -96
  8. data/lib/rika/parser.rb +90 -0
  9. data/lib/rika/version.rb +1 -1
  10. data/pom.xml +2 -2
  11. data/rika.gemspec +9 -7
  12. data/rika_helper.rb +38 -0
  13. data/spec/fixtures/de.txt +21 -1
  14. data/spec/fixtures/document.doc +0 -0
  15. data/spec/fixtures/document.docx +0 -0
  16. data/spec/fixtures/document.pdf +0 -0
  17. data/spec/fixtures/en.txt +23 -1
  18. data/spec/fixtures/es.txt +21 -1
  19. data/spec/fixtures/fr.txt +23 -1
  20. data/spec/fixtures/ru.txt +21 -1
  21. data/spec/fixtures/text_file.txt +23 -1
  22. data/spec/fixtures/text_file_without_extension +23 -1
  23. data/spec/rika_spec.rb +145 -102
  24. data/spec/spec_helper.rb +4 -3
  25. metadata +31 -66
  26. data/spec/fixtures/over_100k_file.txt +0 -1241
  27. data/target/dependency/apache-mime4j-core-0.7.2.jar +0 -0
  28. data/target/dependency/apache-mime4j-dom-0.7.2.jar +0 -0
  29. data/target/dependency/asm-debug-all-4.1.jar +0 -0
  30. data/target/dependency/aspectjrt-1.8.0.jar +0 -0
  31. data/target/dependency/bcmail-jdk15-1.45.jar +0 -0
  32. data/target/dependency/bcprov-jdk15-1.45.jar +0 -0
  33. data/target/dependency/boilerpipe-1.1.0.jar +0 -0
  34. data/target/dependency/commons-codec-1.9.jar +0 -0
  35. data/target/dependency/commons-compress-1.8.1.jar +0 -0
  36. data/target/dependency/commons-httpclient-3.1.jar +0 -0
  37. data/target/dependency/commons-logging-1.1.1.jar +0 -0
  38. data/target/dependency/fontbox-1.8.6.jar +0 -0
  39. data/target/dependency/isoparser-1.0.2.jar +0 -0
  40. data/target/dependency/java-libpst-0.8.1.jar +0 -0
  41. data/target/dependency/jcip-annotations-1.0.jar +0 -0
  42. data/target/dependency/jdom-1.0.jar +0 -0
  43. data/target/dependency/jempbox-1.8.6.jar +0 -0
  44. data/target/dependency/jhighlight-1.0.jar +0 -0
  45. data/target/dependency/jmatio-1.0.jar +0 -0
  46. data/target/dependency/juniversalchardet-1.0.3.jar +0 -0
  47. data/target/dependency/metadata-extractor-2.6.2.jar +0 -0
  48. data/target/dependency/netcdf-4.2.20.jar +0 -0
  49. data/target/dependency/pdfbox-1.8.6.jar +0 -0
  50. data/target/dependency/poi-3.11-beta2.jar +0 -0
  51. data/target/dependency/poi-ooxml-3.11-beta2.jar +0 -0
  52. data/target/dependency/poi-ooxml-schemas-3.11-beta2.jar +0 -0
  53. data/target/dependency/poi-scratchpad-3.11-beta2.jar +0 -0
  54. data/target/dependency/rome-1.0.jar +0 -0
  55. data/target/dependency/slf4j-api-1.6.1.jar +0 -0
  56. data/target/dependency/tagsoup-1.2.1.jar +0 -0
  57. data/target/dependency/tika-core-1.6.jar +0 -0
  58. data/target/dependency/tika-parsers-1.6.jar +0 -0
  59. data/target/dependency/unidataCommon-4.2.20.jar +0 -0
  60. data/target/dependency/vorbis-java-core-0.6.jar +0 -0
  61. data/target/dependency/vorbis-java-tika-0.6.jar +0 -0
  62. data/target/dependency/xercesImpl-2.8.1.jar +0 -0
  63. data/target/dependency/xml-apis-1.3.03.jar +0 -0
  64. data/target/dependency/xmlbeans-2.6.0.jar +0 -0
  65. data/target/dependency/xmpcore-5.1.2.jar +0 -0
  66. data/target/dependency/xz-1.5.jar +0 -0
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 077d41fc4797ad2074f4acdd6df554cb7221ef01
4
- data.tar.gz: 7f1a591fe8bad0e68ca8173e1d8d7690b16341db
2
+ SHA256:
3
+ metadata.gz: 2964b22b0e32e770c6ace90cf6b3ad1b05a54899b7838696c037c95645f1e73a
4
+ data.tar.gz: 3557bba0a54a62f00c9c4c148be307cc7132d806fb13178deb95dbb8f566eb33
5
5
  SHA512:
6
- metadata.gz: 07346daa2e40986329f5b19e85d12e02b9a6cb802249d9a303aa86022e6b7515f309c3b411338bfe1d82e5bb27e2c4e5de0a6cdb18221e80b8c414c5bb5deaa2
7
- data.tar.gz: c26125c52c0c0411aa8c5dff2f7f0f81e28d0d63f4b241ca6ef47d5ea841c50576c5c40ce47a072c31ec7f16a6312510ab194c6f47499847b323f4bd80c29c68
6
+ metadata.gz: aae34480ff9bf6ee7e9a00221a27fb6780cc60c8425644bf11bbcbf9875d2684a964a3bd88fccc28cc9c42b20d075548d531d485dada7ffd50a3d3eddc83294e
7
+ data.tar.gz: 530de9844daa28dddb9b149a0a671eaaa84d6e4dfc4aa840af609121e89a8d2a6a5f2f70e78523e9d62c0eef6472291b50e44a0899c9e464057b76dc89cfed3c
data/.gitignore CHANGED
@@ -12,6 +12,7 @@ lib/bundler/man
12
12
  pkg
13
13
  rdoc
14
14
  spec/reports
15
+ target/
15
16
  test/tmp
16
17
  test/version_tmp
17
18
  tmp
@@ -1,7 +1,7 @@
1
1
  language: ruby
2
2
  rvm:
3
- - jruby-19mode
4
- - jruby-head
3
+ - jruby-9.2.12.0
5
4
  notifications:
6
5
  recipients:
7
- - ricny046@gmail.com
6
+ - ricny046@gmail.com
7
+ - keithrbennett@gmail.com
data/README.md CHANGED
@@ -1,39 +1,31 @@
1
- # Rika
2
-
3
- A JRuby wrapper for Apache Tika to extract text and metadata from various file formats.
4
-
5
- More information about Apache Tika can be found here: http://tika.apache.org/
6
-
7
- [![Code Climate](https://codeclimate.com/github/ricn/rika.png)](https://codeclimate.com/github/ricn/rika)
8
- [![Build Status](https://travis-ci.org/ricn/rika.png?branch=master)](https://travis-ci.org/ricn/rika)
9
-
10
- ## Installation
11
-
12
- Add this line to your application's Gemfile:
13
1
 
14
- gem 'rika'
2
+ # Rika
15
3
 
16
- Remember that this gem only works on JRuby.
4
+ Rika is a [JRuby](https://www.jruby.org) wrapper for the [Apache Tika](http://tika.apache.org/) Java library, which extracts text and metadata from files and resources of [many different formats](https://tika.apache.org/1.24.1/formats.html).
17
5
 
18
- And then execute:
6
+ _Caution: This gem only works with [JRuby](https://www.jruby.org)._
19
7
 
20
- $ bundle
8
+ Rika currently supports some basic and commonly used functions of Tika. Future development may add Ruby support for more Tika functionality, and perhaps a command line interface as well. See the [Other Tika Resources](#other-tika-resources) section for alternatives to Rika that may suit more demanding needs.
21
9
 
22
- Or install it yourself as:
23
-
24
- $ gem install rika
10
+ [![Code Climate](https://codeclimate.com/github/keithrbennett/rika.png)](https://codeclimate.com/github/keithrbennett/rika)
11
+ [![Build Status](https://travis-ci.org/keithrbennett/rika.png?branch=master)](https://travis-ci.org/keithrbennett/rika)
25
12
 
26
13
  ## Usage
27
14
 
28
- For a quick start with the simplest use cases, the following functions
29
- are provided to get what you need in a single function call, for your convenience:
15
+ For a quick start with the simplest use cases, the following functions are provided to get what you need in a single function call, for your convenience:
30
16
 
31
17
  ```ruby
32
18
  require 'rika'
33
19
 
34
- content = Rika.parse_content('document.pdf') # string containing all content text
35
- metadata = Rika.parse_metadata('document.pdf') # hash containing the document metadata
36
- content, metadata = Rika.parse_content_and_metadata('document.pdf') # both of the above
20
+ content = Rika.parse_content('x.pdf') # string containing all content text
21
+ metadata = Rika.parse_metadata('x.pdf') # hash containing the document metadata
22
+ content, metadata = Rika.parse_content_and_metadata('x.pdf') # both of the above
23
+ ```
24
+
25
+ A URL can be used instead of a filespec wherever a data source is specified:
26
+
27
+ ```ruby
28
+ content, metadata = Rika.parse_content_and_metadata('https://github.com/keithrbennett/rika')
37
29
  ```
38
30
 
39
31
  For other use cases and finer control, you can work directly with the Rika::Parser object:
@@ -41,43 +33,93 @@ For other use cases and finer control, you can work directly with the Rika::Pars
41
33
  ```ruby
42
34
  require 'rika'
43
35
 
44
- parser = Rika::Parser.new('document.pdf')
36
+ parser = Rika::Parser.new('x.pdf')
45
37
 
46
38
  # Return the content of the document:
47
39
  parser.content
48
40
 
49
- # Return the media type for the document:
50
- parser.media_type
51
- => "application/pdf"
52
-
53
- # Return the metadata field title if it exists:
54
- parser.metadata["title"] if parser.metadata_exists?("title")
41
+ # Return the metadata of the document:
42
+ parser.metadata
55
43
 
56
- # Return all the available metadata keys that can be read from the document
57
- parser.available_metadata
44
+ # Return the media type for the document, e.g. "application/pdf":
45
+ parser.media_type
58
46
 
59
47
  # Return only the first 10000 chars of the content:
60
- parser = Rika::Parser.new('document.pdf', 10000)
48
+ parser = Rika::Parser.new('x.pdf', 10000)
61
49
  parser.content # 10000 first chars returned
62
50
 
63
51
  # Return content from URL
64
- parser = Rika::Parser.new('http://riakhandbook.com/sample.pdf', 200)
52
+ parser = Rika::Parser.new('http://example.com/x.pdf', 200)
65
53
  parser.content
66
54
 
67
55
  # Return the language for the content
68
- parser = parser = Rika::Parser.new('german document.pdf')
56
+ parser = Rika::Parser.new('german-document.pdf')
69
57
  parser.language
70
58
  => "de"
71
59
 
72
- # Check whether the langugage identification is certain enough to be trusted
60
+ # Check whether the language identification is certain enough to be trusted
73
61
  parser.language_is_reasonably_certain?
74
62
 
75
63
  ```
76
64
 
65
+ #### Simple Command Line Use
66
+
67
+ Since Ruby supports the `-r` option to require a library, and the `-e` option to evaluate a string of code, you can easily do simple parsing on the command line, such as:
68
+
69
+ ```
70
+ ruby -r rika -e 'puts Rika.parse_content("x.pdf")'
71
+ ```
72
+
73
+ You could also parse the metadata and output it as JSON as follows:
74
+
75
+ ```
76
+ ruby -r rika -r json -e 'puts Rika.parse_metadata("x.pdf").to_json'
77
+ ```
78
+
79
+ If you want to get both content and metadata in JSON format, this would do that:
80
+
81
+ ```
82
+ ruby -r rika -r json -e 'c,m = Rika.parse_content_and_metadata("x.pdf"); puts({ c: c, m: m }.to_json)'
83
+ ```
84
+
85
+ Using the [rexe](https://github.com/keithrbennett/rexe) gem, that can be made much more concise:
86
+
87
+ ```
88
+ rexe -r rika -oj 'c,m = Rika.parse_content_and_metadata("x.pdf"); { c: c, m: m }'
89
+ ```
90
+
91
+ ...and changing the `-oj` option gives you access to other output formats such as "Pretty JSON", YAML, and AwesomePrint (a very human readable format).
92
+
93
+
94
+ ## Installation
95
+
96
+ Add this line to your application's Gemfile. Use `gem` or `jgem` depending on your JRuby installation:
97
+
98
+ gem 'rika' # or: jgem 'rika'
99
+
100
+ And then execute:
101
+
102
+ $ bundle
103
+
104
+ Or install it yourself as:
105
+
106
+ $ gem install rika # or: jgem install rika
107
+
108
+ ## Other Tika Resources
109
+
110
+ * For more sophisticated use of Tika, you can use the Tika jar file directly in your JRuby code. After installing the `rika` gem, the Tika jar file will be located in `$GEM_HOME/gems/rika-[rika-version]-java/java-lib/tika-app-[tika-version].jar`.
111
+
112
+ * Tika also provides another jar file containing a RESTful server that you can run on the command line. You can download this server jar from http://tika.apache.org/download.html.
113
+ See the "Running the Tika Server as a Jar file" section of https://cwiki.apache.org/confluence/display/TIKA/TikaServer for more information.
114
+
115
+ * @chrismattmann and others have provided a [Python library and CLI](https://github.com/chrismattmann/tika-python) that interfaces with the Tika server.
116
+
117
+ * A general Tika wiki is at https://cwiki.apache.org/confluence/display/tika.
118
+
119
+
77
120
  ## Credits
78
- The following people have contributed ideas, documentation, or code to Rika:
79
- * Keith Bennett
80
- * Richard Nyström
121
+
122
+ Richard Nyström (@ricn) is the original author of Rika, but has not been able to maintain it since 2015. In July 2020, Richard transferred the project to Keith Bennett (@keithrbennett), who had made some contributions back in 2013.
81
123
 
82
124
  ## Contributing
83
125
 
@@ -0,0 +1,17 @@
1
+ ## Release Notes
2
+
3
+ #### v1.11.1
4
+
5
+ * Add Apache-2.0 license to gemspec.
6
+
7
+
8
+ #### v1.11.0
9
+
10
+ * Replace 2015 Tika jar files w/2020 tika-app-1.24.1.jar.
11
+ * Handover of maintainer status from @ricn to @keithrbennett.
12
+ * Add rika_helper.rb to provide abbreviated method names for interactive use w/pry, etc.
13
+ * Extract parser class to its own file.
14
+ * Various cleanup and refactoring.
15
+ * Improve README.md documentation.
16
+ * Tested successfully on Java 14.
17
+ * Move Tika jar file from /target/dependency to /java-lib.
@@ -4,14 +4,10 @@ raise "You need to run JRuby to use Rika" unless RUBY_PLATFORM =~ /java/
4
4
 
5
5
  require "rika/version"
6
6
  require 'uri'
7
- require 'net/http'
8
- require 'java'
7
+ require 'open-uri'
8
+ require_relative 'rika/parser'
9
+ require_relative '../java-lib/tika-app-1.24.1.jar'
9
10
 
10
- Dir[File.join(File.dirname(__FILE__), "../target/dependency/*.jar")].each do |jar|
11
- require jar
12
- end
13
-
14
- # Heavily based on the Apache Tika API: http://tika.apache.org/1.5/api/org/apache/tika/Tika.html
15
11
  module Rika
16
12
  import org.apache.tika.metadata.Metadata
17
13
  import org.apache.tika.Tika
@@ -25,98 +21,23 @@ module Rika
25
21
  [parser.content, parser.metadata]
26
22
  end
27
23
 
28
- def self.parse_content(file_location, max_content_length = -1)
29
- parser = Parser.new(file_location, max_content_length)
30
- parser.content
24
+ def self.parse_content_and_metadata_as_hash(file_location, max_content_length = -1)
25
+ content, metadata = parse_content_and_metadata(file_location, max_content_length)
26
+ { content: content, metadata: metadata }
31
27
  end
32
28
 
33
- def self.parse_metadata(file_location)
34
- parser = Parser.new(file_location, 0)
35
- parser.metadata
29
+ def self.parse_content(file_location, max_content_length = -1)
30
+ Parser.new(file_location, max_content_length).content
36
31
  end
37
32
 
38
- class Parser
39
-
40
- def initialize(file_location, max_content_length = -1, detector = DefaultDetector.new)
41
- @uri = file_location
42
- @tika = Tika.new(detector)
43
- @tika.set_max_string_length(max_content_length)
44
- @metadata_java = Metadata.new
45
- @metadata_ruby = nil
46
- @input_type = get_input_type
47
- end
48
-
49
- def content
50
- self.parse
51
- @content
52
- end
53
-
54
- def metadata
55
- unless @metadata_ruby
56
- self.parse
57
- @metadata_ruby = {}
58
-
59
- @metadata_java.names.each do |name|
60
- @metadata_ruby[name] = @metadata_java.get(name)
61
- end
62
- end
63
- @metadata_ruby
64
- end
65
-
66
- def media_type
67
- if file?
68
- @media_type ||= @tika.detect(java.io.File.new(@uri))
69
- else
70
- @media_type ||= @tika.detect(input_stream)
71
- end
72
- end
73
-
74
- def available_metadata
75
- metadata.keys
76
- end
77
-
78
- def metadata_exists?(name)
79
- metadata[name] != nil
80
- end
81
-
82
- def file?
83
- @input_type == :file
84
- end
85
-
86
- def language
87
- @lang ||= LanguageIdentifier.new(content)
88
-
89
- @lang.language
90
- end
91
-
92
- def language_is_reasonably_certain?
93
- @lang ||= LanguageIdentifier.new(content)
94
-
95
- @lang.is_reasonably_certain
96
- end
97
-
98
- protected
99
-
100
- def parse
101
- @content ||= @tika.parse_to_string(input_stream, @metadata_java).to_s.strip
102
- end
103
-
104
- def get_input_type
105
- if File.exists?(@uri) && File.directory?(@uri) == false
106
- :file
107
- elsif URI(@uri).scheme == "http" && Net::HTTP.get_response(URI(@uri)).is_a?(Net::HTTPSuccess)
108
- :http
109
- else
110
- raise IOError, "Input (#{@uri}) is neither file nor http."
111
- end
112
- end
113
-
114
- def input_stream
115
- if file?
116
- FileInputStream.new(java.io.File.new(@uri))
117
- else # :http
118
- URL.new(@uri).open_stream
119
- end
120
- end
33
+ # Regarding max_content_length, the default is set at 0 to save unnecessary processing,
34
+ # since the content is being ignored. However, the PDF metadata "pdf:unmappedUnicodeCharsPerPage"
35
+ # and "pdf:charsPerPage" will be absent if the max_content_length is 0, and will be
36
+ # present, but may differ depending on
37
+ # the number of characters read.
38
+ def self.parse_metadata(file_location, max_content_length = 0)
39
+ Parser.new(file_location, max_content_length).metadata
121
40
  end
122
41
  end
42
+
43
+
@@ -0,0 +1,90 @@
1
+ module Rika
2
+ class Parser
3
+
4
+ attr_reader :data_source, :tika, :metadata_java, :metadata_ruby, :input_type
5
+
6
+ def initialize(data_source, max_content_length = -1, detector = DefaultDetector.new)
7
+ @data_source = data_source
8
+ @tika = Tika.new(detector)
9
+ @tika.set_max_string_length(max_content_length)
10
+ @metadata_java = nil
11
+ @metadata_ruby = nil
12
+ @input_type = get_input_type
13
+ end
14
+
15
+ def content
16
+ parse
17
+ @content
18
+ end
19
+
20
+ def metadata
21
+ unless @metadata_ruby
22
+ parse
23
+ @metadata_ruby = metadata_java.names.each_with_object({}) do |name, m_ruby|
24
+ m_ruby[name] = metadata_java.get(name)
25
+ end
26
+ end
27
+ @metadata_ruby
28
+ end
29
+
30
+ def media_type
31
+ @media_type ||= file? \
32
+ ? tika.detect(java.io.File.new(data_source)) \
33
+ : tika.detect(input_stream)
34
+ end
35
+
36
+ # @deprecated
37
+ def available_metadata
38
+ metadata.keys
39
+ end
40
+
41
+ # @deprecated
42
+ def metadata_exists?(name)
43
+ metadata[name] != nil
44
+ end
45
+
46
+ def language
47
+ @lang ||= LanguageIdentifier.new(content)
48
+ @lang.language
49
+ end
50
+
51
+ # @deprecated
52
+ # https://tika.apache.org/1.9/api/org/apache/tika/language/LanguageIdentifier.html#isReasonablyCertain()
53
+ # says: WARNING: Will never return true for small amount of input texts.
54
+ # https://tika.apache.org/1.19/api/org/apache/tika/language/LanguageIdentifier.html
55
+ # indicated that the LanguageIdentifier class used in this implementation is deprecated.
56
+ # TODO: More research needed to see if an alternate implementation can be used.
57
+ def language_is_reasonably_certain?
58
+ @lang ||= LanguageIdentifier.new(content)
59
+ @lang.is_reasonably_certain
60
+ end
61
+
62
+
63
+ def parse
64
+ unless @content
65
+ @metadata_java = Metadata.new
66
+ @content = tika.parse_to_string(input_stream, @metadata_java).to_s.strip
67
+ end
68
+ end
69
+
70
+ private def get_input_type
71
+ if File.file?(data_source)
72
+ :file
73
+ elsif URI(data_source).is_a?(URI::HTTP) && URI.open(data_source)
74
+ :http
75
+ else
76
+ raise IOError, "Input (#{data_source}) is not an available file or HTTP resource."
77
+ end
78
+ end
79
+
80
+ private def input_stream
81
+ file? \
82
+ ? FileInputStream.new(java.io.File.new(data_source)) \
83
+ : URL.new(data_source).open_stream
84
+ end
85
+
86
+ private def file?
87
+ input_type == :file
88
+ end
89
+ end
90
+ end
@@ -1,3 +1,3 @@
1
1
  module Rika
2
- VERSION = "1.6.0"
2
+ VERSION = "1.11.1"
3
3
  end
data/pom.xml CHANGED
@@ -12,8 +12,8 @@
12
12
  <dependencies>
13
13
  <dependency>
14
14
  <groupId>org.apache.tika</groupId>
15
- <artifactId>tika-parsers</artifactId>
16
- <version>1.6</version>
15
+ <artifactId>tika-app</artifactId>
16
+ <version>1.24</version>
17
17
  <scope>test</scope>
18
18
  </dependency>
19
19
  </dependencies>