rika 1.1.1-java → 1.11.1-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +1 -0
- data/.travis.yml +3 -3
- data/README.md +82 -40
- data/RELEASE_NOTES.md +17 -0
- data/Rakefile +1 -1
- data/java-lib/tika-app-1.24.1.jar +0 -0
- data/lib/rika.rb +18 -93
- data/lib/rika/parser.rb +90 -0
- data/lib/rika/version.rb +1 -1
- data/pom.xml +4 -4
- data/rika.gemspec +9 -7
- data/rika_helper.rb +38 -0
- data/spec/fixtures/de.txt +21 -1
- data/spec/fixtures/document.doc +0 -0
- data/spec/fixtures/document.docx +0 -0
- data/spec/fixtures/document.pdf +0 -0
- data/spec/fixtures/en.txt +23 -1
- data/spec/fixtures/es.txt +21 -1
- data/spec/fixtures/fr.txt +23 -1
- data/spec/fixtures/ru.txt +21 -1
- data/spec/fixtures/text_file.txt +23 -1
- data/spec/fixtures/text_file_without_extension +23 -1
- data/spec/rika_spec.rb +153 -101
- data/spec/spec_helper.rb +4 -3
- metadata +36 -76
- data/spec/fixtures/over_100k_file.txt +0 -1241
- data/target/dependency/apache-mime4j-core-0.7.2.jar +0 -0
- data/target/dependency/apache-mime4j-dom-0.7.2.jar +0 -0
- data/target/dependency/asm-3.1.jar +0 -0
- data/target/dependency/aspectjrt-1.6.11.jar +0 -0
- data/target/dependency/bcmail-jdk15-1.45.jar +0 -0
- data/target/dependency/bcprov-jdk15-1.45.jar +0 -0
- data/target/dependency/boilerpipe-1.1.0.jar +0 -0
- data/target/dependency/commons-codec-1.5.jar +0 -0
- data/target/dependency/commons-compress-1.4.1.jar +0 -0
- data/target/dependency/commons-logging-1.1.1.jar +0 -0
- data/target/dependency/dom4j-1.6.1.jar +0 -0
- data/target/dependency/fontbox-1.7.1.jar +0 -0
- data/target/dependency/geronimo-stax-api_1.0_spec-1.0.1.jar +0 -0
- data/target/dependency/isoparser-1.0-RC-1.jar +0 -0
- data/target/dependency/jdom-1.0.jar +0 -0
- data/target/dependency/jempbox-1.7.1.jar +0 -0
- data/target/dependency/juniversalchardet-1.0.3.jar +0 -0
- data/target/dependency/metadata-extractor-2.6.2.jar +0 -0
- data/target/dependency/netcdf-4.2-min.jar +0 -0
- data/target/dependency/pdfbox-1.7.1.jar +0 -0
- data/target/dependency/poi-3.8.jar +0 -0
- data/target/dependency/poi-ooxml-3.8.jar +0 -0
- data/target/dependency/poi-ooxml-schemas-3.8.jar +0 -0
- data/target/dependency/poi-scratchpad-3.8.jar +0 -0
- data/target/dependency/rome-0.9.jar +0 -0
- data/target/dependency/slf4j-api-1.5.6.jar +0 -0
- data/target/dependency/tagsoup-1.2.1.jar +0 -0
- data/target/dependency/tika-core-1.3.jar +0 -0
- data/target/dependency/tika-parsers-1.3.jar +0 -0
- data/target/dependency/vorbis-java-core-0.1-tests.jar +0 -0
- data/target/dependency/vorbis-java-core-0.1.jar +0 -0
- data/target/dependency/vorbis-java-tika-0.1.jar +0 -0
- data/target/dependency/xercesImpl-2.8.1.jar +0 -0
- data/target/dependency/xml-apis-1.3.03.jar +0 -0
- data/target/dependency/xmlbeans-2.3.0.jar +0 -0
- data/target/dependency/xmpcore-5.1.2.jar +0 -0
- data/target/dependency/xz-1.0.jar +0 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 2964b22b0e32e770c6ace90cf6b3ad1b05a54899b7838696c037c95645f1e73a
|
4
|
+
data.tar.gz: 3557bba0a54a62f00c9c4c148be307cc7132d806fb13178deb95dbb8f566eb33
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: aae34480ff9bf6ee7e9a00221a27fb6780cc60c8425644bf11bbcbf9875d2684a964a3bd88fccc28cc9c42b20d075548d531d485dada7ffd50a3d3eddc83294e
|
7
|
+
data.tar.gz: 530de9844daa28dddb9b149a0a671eaaa84d6e4dfc4aa840af609121e89a8d2a6a5f2f70e78523e9d62c0eef6472291b50e44a0899c9e464057b76dc89cfed3c
|
data/.gitignore
CHANGED
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -1,39 +1,31 @@
|
|
1
|
-
# Rika
|
2
|
-
|
3
|
-
A JRuby wrapper for Apache Tika to extract text and metadata from various file formats.
|
4
|
-
|
5
|
-
More information about Apache Tika can be found here: http://tika.apache.org/
|
6
|
-
|
7
|
-
[![Code Climate](https://codeclimate.com/github/ricn/rika.png)](https://codeclimate.com/github/ricn/rika)
|
8
|
-
[![Build Status](https://travis-ci.org/ricn/rika.png?branch=master)](https://travis-ci.org/ricn/rika)
|
9
|
-
|
10
|
-
## Installation
|
11
|
-
|
12
|
-
Add this line to your application's Gemfile:
|
13
1
|
|
14
|
-
|
2
|
+
# Rika
|
15
3
|
|
16
|
-
|
4
|
+
Rika is a [JRuby](https://www.jruby.org) wrapper for the [Apache Tika](http://tika.apache.org/) Java library, which extracts text and metadata from files and resources of [many different formats](https://tika.apache.org/1.24.1/formats.html).
|
17
5
|
|
18
|
-
|
6
|
+
_Caution: This gem only works with [JRuby](https://www.jruby.org)._
|
19
7
|
|
20
|
-
|
8
|
+
Rika currently supports some basic and commonly used functions of Tika. Future development may add Ruby support for more Tika functionality, and perhaps a command line interface as well. See the [Other Tika Resources](#other-tika-resources) section for alternatives to Rika that may suit more demanding needs.
|
21
9
|
|
22
|
-
|
23
|
-
|
24
|
-
$ gem install rika
|
10
|
+
[![Code Climate](https://codeclimate.com/github/keithrbennett/rika.png)](https://codeclimate.com/github/keithrbennett/rika)
|
11
|
+
[![Build Status](https://travis-ci.org/keithrbennett/rika.png?branch=master)](https://travis-ci.org/keithrbennett/rika)
|
25
12
|
|
26
13
|
## Usage
|
27
14
|
|
28
|
-
For a quick start with the simplest use cases, the following functions
|
29
|
-
are provided to get what you need in a single function call, for your convenience:
|
15
|
+
For a quick start with the simplest use cases, the following functions are provided to get what you need in a single function call, for your convenience:
|
30
16
|
|
31
17
|
```ruby
|
32
18
|
require 'rika'
|
33
19
|
|
34
|
-
content = Rika.parse_content('
|
35
|
-
metadata = Rika.parse_metadata('
|
36
|
-
content, metadata = Rika.parse_content_and_metadata('
|
20
|
+
content = Rika.parse_content('x.pdf') # string containing all content text
|
21
|
+
metadata = Rika.parse_metadata('x.pdf') # hash containing the document metadata
|
22
|
+
content, metadata = Rika.parse_content_and_metadata('x.pdf') # both of the above
|
23
|
+
```
|
24
|
+
|
25
|
+
A URL can be used instead of a filespec wherever a data source is specified:
|
26
|
+
|
27
|
+
```ruby
|
28
|
+
content, metadata = Rika.parse_content_and_metadata('https://github.com/keithrbennett/rika')
|
37
29
|
```
|
38
30
|
|
39
31
|
For other use cases and finer control, you can work directly with the Rika::Parser object:
|
@@ -41,43 +33,93 @@ For other use cases and finer control, you can work directly with the Rika::Pars
|
|
41
33
|
```ruby
|
42
34
|
require 'rika'
|
43
35
|
|
44
|
-
parser = Rika::Parser.new('
|
36
|
+
parser = Rika::Parser.new('x.pdf')
|
45
37
|
|
46
38
|
# Return the content of the document:
|
47
39
|
parser.content
|
48
40
|
|
49
|
-
# Return the
|
50
|
-
parser.
|
51
|
-
=> "application/pdf"
|
52
|
-
|
53
|
-
# Return the metadata field title if it exists:
|
54
|
-
parser.metadata["title"] if parser.metadata_exists?("title")
|
41
|
+
# Return the metadata of the document:
|
42
|
+
parser.metadata
|
55
43
|
|
56
|
-
# Return
|
57
|
-
parser.
|
44
|
+
# Return the media type for the document, e.g. "application/pdf":
|
45
|
+
parser.media_type
|
58
46
|
|
59
47
|
# Return only the first 10000 chars of the content:
|
60
|
-
parser = Rika::Parser.new('
|
48
|
+
parser = Rika::Parser.new('x.pdf', 10000)
|
61
49
|
parser.content # 10000 first chars returned
|
62
50
|
|
63
51
|
# Return content from URL
|
64
|
-
parser = Rika::Parser.new('http://
|
52
|
+
parser = Rika::Parser.new('http://example.com/x.pdf', 200)
|
65
53
|
parser.content
|
66
54
|
|
67
55
|
# Return the language for the content
|
68
|
-
parser =
|
56
|
+
parser = Rika::Parser.new('german-document.pdf')
|
69
57
|
parser.language
|
70
58
|
=> "de"
|
71
59
|
|
72
|
-
# Check whether the
|
60
|
+
# Check whether the language identification is certain enough to be trusted
|
73
61
|
parser.language_is_reasonably_certain?
|
74
62
|
|
75
63
|
```
|
76
64
|
|
65
|
+
#### Simple Command Line Use
|
66
|
+
|
67
|
+
Since Ruby supports the `-r` option to require a library, and the `-e` option to evaluate a string of code, you can easily do simple parsing on the command line, such as:
|
68
|
+
|
69
|
+
```
|
70
|
+
ruby -r rika -e 'puts Rika.parse_content("x.pdf")'
|
71
|
+
```
|
72
|
+
|
73
|
+
You could also parse the metadata and output it as JSON as follows:
|
74
|
+
|
75
|
+
```
|
76
|
+
ruby -r rika -r json -e 'puts Rika.parse_metadata("x.pdf").to_json'
|
77
|
+
```
|
78
|
+
|
79
|
+
If you want to get both content and metadata in JSON format, this would do that:
|
80
|
+
|
81
|
+
```
|
82
|
+
ruby -r rika -r json -e 'c,m = Rika.parse_content_and_metadata("x.pdf"); puts({ c: c, m: m }.to_json)'
|
83
|
+
```
|
84
|
+
|
85
|
+
Using the [rexe](https://github.com/keithrbennett/rexe) gem, that can be made much more concise:
|
86
|
+
|
87
|
+
```
|
88
|
+
rexe -r rika -oj 'c,m = Rika.parse_content_and_metadata("x.pdf"); { c: c, m: m }'
|
89
|
+
```
|
90
|
+
|
91
|
+
...and changing the `-oj` option gives you access to other output formats such as "Pretty JSON", YAML, and AwesomePrint (a very human readable format).
|
92
|
+
|
93
|
+
|
94
|
+
## Installation
|
95
|
+
|
96
|
+
Add this line to your application's Gemfile. Use `gem` or `jgem` depending on your JRuby installation:
|
97
|
+
|
98
|
+
gem 'rika' # or: jgem 'rika'
|
99
|
+
|
100
|
+
And then execute:
|
101
|
+
|
102
|
+
$ bundle
|
103
|
+
|
104
|
+
Or install it yourself as:
|
105
|
+
|
106
|
+
$ gem install rika # or: jgem install rika
|
107
|
+
|
108
|
+
## Other Tika Resources
|
109
|
+
|
110
|
+
* For more sophisticated use of Tika, you can use the Tika jar file directly in your JRuby code. After installing the `rika` gem, the Tika jar file will be located in `$GEM_HOME/gems/rika-[rika-version]-java/java-lib/tika-app-[tika-version].jar`.
|
111
|
+
|
112
|
+
* Tika also provides another jar file containing a RESTful server that you can run on the command line. You can download this server jar from http://tika.apache.org/download.html.
|
113
|
+
See the "Running the Tika Server as a Jar file" section of https://cwiki.apache.org/confluence/display/TIKA/TikaServer for more information.
|
114
|
+
|
115
|
+
* @chrismattmann and others have provided a [Python library and CLI](https://github.com/chrismattmann/tika-python) that interfaces with the Tika server.
|
116
|
+
|
117
|
+
* A general Tika wiki is at https://cwiki.apache.org/confluence/display/tika.
|
118
|
+
|
119
|
+
|
77
120
|
## Credits
|
78
|
-
|
79
|
-
|
80
|
-
* Richard Nyström
|
121
|
+
|
122
|
+
Richard Nyström (@ricn) is the original author of Rika, but has not been able to maintain it since 2015. In July 2020, Richard transferred the project to Keith Bennett (@keithrbennett), who had made some contributions back in 2013.
|
81
123
|
|
82
124
|
## Contributing
|
83
125
|
|
data/RELEASE_NOTES.md
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
## Release Notes
|
2
|
+
|
3
|
+
#### v1.11.1
|
4
|
+
|
5
|
+
* Add Apache-2.0 license to gemspec.
|
6
|
+
|
7
|
+
|
8
|
+
#### v1.11.0
|
9
|
+
|
10
|
+
* Replace 2015 Tika jar files w/2020 tika-app-1.24.1.jar.
|
11
|
+
* Handover of maintainer status from @ricn to @keithrbennett.
|
12
|
+
* Add rika_helper.rb to provide abbreviated method names for interactive use w/pry, etc.
|
13
|
+
* Extract parser class to its own file.
|
14
|
+
* Various cleanup and refactoring.
|
15
|
+
* Improve README.md documentation.
|
16
|
+
* Tested successfully on Java 14.
|
17
|
+
* Move Tika jar file from /target/dependency to /java-lib.
|
data/Rakefile
CHANGED
Binary file
|
data/lib/rika.rb
CHANGED
@@ -4,18 +4,15 @@ raise "You need to run JRuby to use Rika" unless RUBY_PLATFORM =~ /java/
|
|
4
4
|
|
5
5
|
require "rika/version"
|
6
6
|
require 'uri'
|
7
|
-
require '
|
8
|
-
|
7
|
+
require 'open-uri'
|
8
|
+
require_relative 'rika/parser'
|
9
|
+
require_relative '../java-lib/tika-app-1.24.1.jar'
|
9
10
|
|
10
|
-
Dir[File.join(File.dirname(__FILE__), "../target/dependency/*.jar")].each do |jar|
|
11
|
-
require jar
|
12
|
-
end
|
13
|
-
|
14
|
-
# Heavily based on the Apache Tika API: http://tika.apache.org/1.3/api/org/apache/tika/Tika.html
|
15
11
|
module Rika
|
16
12
|
import org.apache.tika.metadata.Metadata
|
17
13
|
import org.apache.tika.Tika
|
18
14
|
import org.apache.tika.language.LanguageIdentifier
|
15
|
+
import org.apache.tika.detect.DefaultDetector
|
19
16
|
import java.io.FileInputStream
|
20
17
|
import java.net.URL
|
21
18
|
|
@@ -24,95 +21,23 @@ module Rika
|
|
24
21
|
[parser.content, parser.metadata]
|
25
22
|
end
|
26
23
|
|
27
|
-
def self.
|
28
|
-
|
29
|
-
|
24
|
+
def self.parse_content_and_metadata_as_hash(file_location, max_content_length = -1)
|
25
|
+
content, metadata = parse_content_and_metadata(file_location, max_content_length)
|
26
|
+
{ content: content, metadata: metadata }
|
30
27
|
end
|
31
28
|
|
32
|
-
def self.
|
33
|
-
|
34
|
-
parser.metadata
|
29
|
+
def self.parse_content(file_location, max_content_length = -1)
|
30
|
+
Parser.new(file_location, max_content_length).content
|
35
31
|
end
|
36
32
|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
@metadata_java = Metadata.new
|
45
|
-
@metadata_ruby = nil
|
46
|
-
@input_type = get_input_type
|
47
|
-
end
|
48
|
-
|
49
|
-
def content
|
50
|
-
self.parse
|
51
|
-
@content
|
52
|
-
end
|
53
|
-
|
54
|
-
def metadata
|
55
|
-
unless @metadata_ruby
|
56
|
-
self.parse
|
57
|
-
@metadata_ruby = {}
|
58
|
-
|
59
|
-
@metadata_java.names.each do |name|
|
60
|
-
@metadata_ruby[name] = @metadata_java.get(name)
|
61
|
-
end
|
62
|
-
end
|
63
|
-
@metadata_ruby
|
64
|
-
end
|
65
|
-
|
66
|
-
def media_type
|
67
|
-
@media_type ||= @tika.detect(input_stream)
|
68
|
-
end
|
69
|
-
|
70
|
-
def available_metadata
|
71
|
-
metadata.keys
|
72
|
-
end
|
73
|
-
|
74
|
-
def metadata_exists?(name)
|
75
|
-
metadata[name] != nil
|
76
|
-
end
|
77
|
-
|
78
|
-
def file?
|
79
|
-
@input_type == :file
|
80
|
-
end
|
81
|
-
|
82
|
-
def language
|
83
|
-
@lang ||= LanguageIdentifier.new(content)
|
84
|
-
|
85
|
-
@lang.language
|
86
|
-
end
|
87
|
-
|
88
|
-
def language_is_reasonably_certain?
|
89
|
-
@lang ||= LanguageIdentifier.new(content)
|
90
|
-
|
91
|
-
@lang.is_reasonably_certain
|
92
|
-
end
|
93
|
-
|
94
|
-
protected
|
95
|
-
|
96
|
-
def parse
|
97
|
-
@content ||= @tika.parse_to_string(input_stream, @metadata_java).to_s.strip
|
98
|
-
end
|
99
|
-
|
100
|
-
def get_input_type
|
101
|
-
if File.exists?(@uri) && File.directory?(@uri) == false
|
102
|
-
:file
|
103
|
-
elsif URI(@uri).scheme == "http" && Net::HTTP.get_response(URI(@uri)).is_a?(Net::HTTPSuccess)
|
104
|
-
:http
|
105
|
-
else
|
106
|
-
raise IOError, "Input (#{@uri}) is neither file nor http."
|
107
|
-
end
|
108
|
-
end
|
109
|
-
|
110
|
-
def input_stream
|
111
|
-
if file?
|
112
|
-
FileInputStream.new(java.io.File.new(@uri))
|
113
|
-
else # :http
|
114
|
-
URL.new(@uri).open_stream
|
115
|
-
end
|
116
|
-
end
|
33
|
+
# Regarding max_content_length, the default is set at 0 to save unnecessary processing,
|
34
|
+
# since the content is being ignored. However, the PDF metadata "pdf:unmappedUnicodeCharsPerPage"
|
35
|
+
# and "pdf:charsPerPage" will be absent if the max_content_length is 0, and
|
36
|
+
# may differ depending on
|
37
|
+
# the number of characters read.
|
38
|
+
def self.parse_metadata(file_location, max_content_length = 0)
|
39
|
+
Parser.new(file_location, max_content_length).metadata
|
117
40
|
end
|
118
41
|
end
|
42
|
+
|
43
|
+
|
data/lib/rika/parser.rb
ADDED
@@ -0,0 +1,90 @@
|
|
1
|
+
module Rika
|
2
|
+
class Parser
|
3
|
+
|
4
|
+
attr_reader :data_source, :tika, :metadata_java, :metadata_ruby, :input_type
|
5
|
+
|
6
|
+
def initialize(data_source, max_content_length = -1, detector = DefaultDetector.new)
|
7
|
+
@data_source = data_source
|
8
|
+
@tika = Tika.new(detector)
|
9
|
+
@tika.set_max_string_length(max_content_length)
|
10
|
+
@metadata_java = nil
|
11
|
+
@metadata_ruby = nil
|
12
|
+
@input_type = get_input_type
|
13
|
+
end
|
14
|
+
|
15
|
+
def content
|
16
|
+
parse
|
17
|
+
@content
|
18
|
+
end
|
19
|
+
|
20
|
+
def metadata
|
21
|
+
unless @metadata_ruby
|
22
|
+
parse
|
23
|
+
@metadata_ruby = metadata_java.names.each_with_object({}) do |name, m_ruby|
|
24
|
+
m_ruby[name] = metadata_java.get(name)
|
25
|
+
end
|
26
|
+
end
|
27
|
+
@metadata_ruby
|
28
|
+
end
|
29
|
+
|
30
|
+
def media_type
|
31
|
+
@media_type ||= file? \
|
32
|
+
? tika.detect(java.io.File.new(data_source)) \
|
33
|
+
: tika.detect(input_stream)
|
34
|
+
end
|
35
|
+
|
36
|
+
# @deprecated
|
37
|
+
def available_metadata
|
38
|
+
metadata.keys
|
39
|
+
end
|
40
|
+
|
41
|
+
# @deprecated
|
42
|
+
def metadata_exists?(name)
|
43
|
+
metadata[name] != nil
|
44
|
+
end
|
45
|
+
|
46
|
+
def language
|
47
|
+
@lang ||= LanguageIdentifier.new(content)
|
48
|
+
@lang.language
|
49
|
+
end
|
50
|
+
|
51
|
+
# @deprecated
|
52
|
+
# https://tika.apache.org/1.9/api/org/apache/tika/language/LanguageIdentifier.html#isReasonablyCertain()
|
53
|
+
# says: WARNING: Will never return true for small amount of input texts.
|
54
|
+
# https://tika.apache.org/1.19/api/org/apache/tika/language/LanguageIdentifier.html
|
55
|
+
# indicated that the LanguageIdentifier class used in this implementation is deprecated.
|
56
|
+
# TODO: More research needed to see if an alternate implementation can be used.
|
57
|
+
def language_is_reasonably_certain?
|
58
|
+
@lang ||= LanguageIdentifier.new(content)
|
59
|
+
@lang.is_reasonably_certain
|
60
|
+
end
|
61
|
+
|
62
|
+
|
63
|
+
def parse
|
64
|
+
unless @content
|
65
|
+
@metadata_java = Metadata.new
|
66
|
+
@content = tika.parse_to_string(input_stream, @metadata_java).to_s.strip
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
private def get_input_type
|
71
|
+
if File.file?(data_source)
|
72
|
+
:file
|
73
|
+
elsif URI(data_source).is_a?(URI::HTTP) && URI.open(data_source)
|
74
|
+
:http
|
75
|
+
else
|
76
|
+
raise IOError, "Input (#{data_source}) is not an available file or HTTP resource."
|
77
|
+
end
|
78
|
+
end
|
79
|
+
|
80
|
+
private def input_stream
|
81
|
+
file? \
|
82
|
+
? FileInputStream.new(java.io.File.new(data_source)) \
|
83
|
+
: URL.new(data_source).open_stream
|
84
|
+
end
|
85
|
+
|
86
|
+
private def file?
|
87
|
+
input_type == :file
|
88
|
+
end
|
89
|
+
end
|
90
|
+
end
|
data/lib/rika/version.rb
CHANGED