rika 1.6.0-java → 1.11.1-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/.travis.yml +3 -3
- data/README.md +82 -40
- data/RELEASE_NOTES.md +17 -0
- data/java-lib/tika-app-1.24.1.jar +0 -0
- data/lib/rika.rb +17 -96
- data/lib/rika/parser.rb +90 -0
- data/lib/rika/version.rb +1 -1
- data/pom.xml +2 -2
- data/rika.gemspec +9 -7
- data/rika_helper.rb +38 -0
- data/spec/fixtures/de.txt +21 -1
- data/spec/fixtures/document.doc +0 -0
- data/spec/fixtures/document.docx +0 -0
- data/spec/fixtures/document.pdf +0 -0
- data/spec/fixtures/en.txt +23 -1
- data/spec/fixtures/es.txt +21 -1
- data/spec/fixtures/fr.txt +23 -1
- data/spec/fixtures/ru.txt +21 -1
- data/spec/fixtures/text_file.txt +23 -1
- data/spec/fixtures/text_file_without_extension +23 -1
- data/spec/rika_spec.rb +145 -102
- data/spec/spec_helper.rb +4 -3
- metadata +31 -66
- data/spec/fixtures/over_100k_file.txt +0 -1241
- data/target/dependency/apache-mime4j-core-0.7.2.jar +0 -0
- data/target/dependency/apache-mime4j-dom-0.7.2.jar +0 -0
- data/target/dependency/asm-debug-all-4.1.jar +0 -0
- data/target/dependency/aspectjrt-1.8.0.jar +0 -0
- data/target/dependency/bcmail-jdk15-1.45.jar +0 -0
- data/target/dependency/bcprov-jdk15-1.45.jar +0 -0
- data/target/dependency/boilerpipe-1.1.0.jar +0 -0
- data/target/dependency/commons-codec-1.9.jar +0 -0
- data/target/dependency/commons-compress-1.8.1.jar +0 -0
- data/target/dependency/commons-httpclient-3.1.jar +0 -0
- data/target/dependency/commons-logging-1.1.1.jar +0 -0
- data/target/dependency/fontbox-1.8.6.jar +0 -0
- data/target/dependency/isoparser-1.0.2.jar +0 -0
- data/target/dependency/java-libpst-0.8.1.jar +0 -0
- data/target/dependency/jcip-annotations-1.0.jar +0 -0
- data/target/dependency/jdom-1.0.jar +0 -0
- data/target/dependency/jempbox-1.8.6.jar +0 -0
- data/target/dependency/jhighlight-1.0.jar +0 -0
- data/target/dependency/jmatio-1.0.jar +0 -0
- data/target/dependency/juniversalchardet-1.0.3.jar +0 -0
- data/target/dependency/metadata-extractor-2.6.2.jar +0 -0
- data/target/dependency/netcdf-4.2.20.jar +0 -0
- data/target/dependency/pdfbox-1.8.6.jar +0 -0
- data/target/dependency/poi-3.11-beta2.jar +0 -0
- data/target/dependency/poi-ooxml-3.11-beta2.jar +0 -0
- data/target/dependency/poi-ooxml-schemas-3.11-beta2.jar +0 -0
- data/target/dependency/poi-scratchpad-3.11-beta2.jar +0 -0
- data/target/dependency/rome-1.0.jar +0 -0
- data/target/dependency/slf4j-api-1.6.1.jar +0 -0
- data/target/dependency/tagsoup-1.2.1.jar +0 -0
- data/target/dependency/tika-core-1.6.jar +0 -0
- data/target/dependency/tika-parsers-1.6.jar +0 -0
- data/target/dependency/unidataCommon-4.2.20.jar +0 -0
- data/target/dependency/vorbis-java-core-0.6.jar +0 -0
- data/target/dependency/vorbis-java-tika-0.6.jar +0 -0
- data/target/dependency/xercesImpl-2.8.1.jar +0 -0
- data/target/dependency/xml-apis-1.3.03.jar +0 -0
- data/target/dependency/xmlbeans-2.6.0.jar +0 -0
- data/target/dependency/xmpcore-5.1.2.jar +0 -0
- data/target/dependency/xz-1.5.jar +0 -0
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
|
-
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 2964b22b0e32e770c6ace90cf6b3ad1b05a54899b7838696c037c95645f1e73a
|
|
4
|
+
data.tar.gz: 3557bba0a54a62f00c9c4c148be307cc7132d806fb13178deb95dbb8f566eb33
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: aae34480ff9bf6ee7e9a00221a27fb6780cc60c8425644bf11bbcbf9875d2684a964a3bd88fccc28cc9c42b20d075548d531d485dada7ffd50a3d3eddc83294e
|
|
7
|
+
data.tar.gz: 530de9844daa28dddb9b149a0a671eaaa84d6e4dfc4aa840af609121e89a8d2a6a5f2f70e78523e9d62c0eef6472291b50e44a0899c9e464057b76dc89cfed3c
|
data/.gitignore
CHANGED
data/.travis.yml
CHANGED
data/README.md
CHANGED
|
@@ -1,39 +1,31 @@
|
|
|
1
|
-
# Rika
|
|
2
|
-
|
|
3
|
-
A JRuby wrapper for Apache Tika to extract text and metadata from various file formats.
|
|
4
|
-
|
|
5
|
-
More information about Apache Tika can be found here: http://tika.apache.org/
|
|
6
|
-
|
|
7
|
-
[](https://codeclimate.com/github/ricn/rika)
|
|
8
|
-
[](https://travis-ci.org/ricn/rika)
|
|
9
|
-
|
|
10
|
-
## Installation
|
|
11
|
-
|
|
12
|
-
Add this line to your application's Gemfile:
|
|
13
1
|
|
|
14
|
-
|
|
2
|
+
# Rika
|
|
15
3
|
|
|
16
|
-
|
|
4
|
+
Rika is a [JRuby](https://www.jruby.org) wrapper for the [Apache Tika](http://tika.apache.org/) Java library, which extracts text and metadata from files and resources of [many different formats](https://tika.apache.org/1.24.1/formats.html).
|
|
17
5
|
|
|
18
|
-
|
|
6
|
+
_Caution: This gem only works with [JRuby](https://www.jruby.org)._
|
|
19
7
|
|
|
20
|
-
|
|
8
|
+
Rika currently supports some basic and commonly used functions of Tika. Future development may add Ruby support for more Tika functionality, and perhaps a command line interface as well. See the [Other Tika Resources](#other-tika-resources) section for alternatives to Rika that may suit more demanding needs.
|
|
21
9
|
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
$ gem install rika
|
|
10
|
+
[](https://codeclimate.com/github/keithrbennett/rika)
|
|
11
|
+
[](https://travis-ci.org/keithrbennett/rika)
|
|
25
12
|
|
|
26
13
|
## Usage
|
|
27
14
|
|
|
28
|
-
For a quick start with the simplest use cases, the following functions
|
|
29
|
-
are provided to get what you need in a single function call, for your convenience:
|
|
15
|
+
For a quick start with the simplest use cases, the following functions are provided to get what you need in a single function call, for your convenience:
|
|
30
16
|
|
|
31
17
|
```ruby
|
|
32
18
|
require 'rika'
|
|
33
19
|
|
|
34
|
-
content = Rika.parse_content('
|
|
35
|
-
metadata = Rika.parse_metadata('
|
|
36
|
-
content, metadata = Rika.parse_content_and_metadata('
|
|
20
|
+
content = Rika.parse_content('x.pdf') # string containing all content text
|
|
21
|
+
metadata = Rika.parse_metadata('x.pdf') # hash containing the document metadata
|
|
22
|
+
content, metadata = Rika.parse_content_and_metadata('x.pdf') # both of the above
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
A URL can be used instead of a filespec wherever a data source is specified:
|
|
26
|
+
|
|
27
|
+
```ruby
|
|
28
|
+
content, metadata = Rika.parse_content_and_metadata('https://github.com/keithrbennett/rika')
|
|
37
29
|
```
|
|
38
30
|
|
|
39
31
|
For other use cases and finer control, you can work directly with the Rika::Parser object:
|
|
@@ -41,43 +33,93 @@ For other use cases and finer control, you can work directly with the Rika::Pars
|
|
|
41
33
|
```ruby
|
|
42
34
|
require 'rika'
|
|
43
35
|
|
|
44
|
-
parser = Rika::Parser.new('
|
|
36
|
+
parser = Rika::Parser.new('x.pdf')
|
|
45
37
|
|
|
46
38
|
# Return the content of the document:
|
|
47
39
|
parser.content
|
|
48
40
|
|
|
49
|
-
# Return the
|
|
50
|
-
parser.
|
|
51
|
-
=> "application/pdf"
|
|
52
|
-
|
|
53
|
-
# Return the metadata field title if it exists:
|
|
54
|
-
parser.metadata["title"] if parser.metadata_exists?("title")
|
|
41
|
+
# Return the metadata of the document:
|
|
42
|
+
parser.metadata
|
|
55
43
|
|
|
56
|
-
# Return
|
|
57
|
-
parser.
|
|
44
|
+
# Return the media type for the document, e.g. "application/pdf":
|
|
45
|
+
parser.media_type
|
|
58
46
|
|
|
59
47
|
# Return only the first 10000 chars of the content:
|
|
60
|
-
parser = Rika::Parser.new('
|
|
48
|
+
parser = Rika::Parser.new('x.pdf', 10000)
|
|
61
49
|
parser.content # 10000 first chars returned
|
|
62
50
|
|
|
63
51
|
# Return content from URL
|
|
64
|
-
parser = Rika::Parser.new('http://
|
|
52
|
+
parser = Rika::Parser.new('http://example.com/x.pdf', 200)
|
|
65
53
|
parser.content
|
|
66
54
|
|
|
67
55
|
# Return the language for the content
|
|
68
|
-
parser =
|
|
56
|
+
parser = Rika::Parser.new('german-document.pdf')
|
|
69
57
|
parser.language
|
|
70
58
|
=> "de"
|
|
71
59
|
|
|
72
|
-
# Check whether the
|
|
60
|
+
# Check whether the language identification is certain enough to be trusted
|
|
73
61
|
parser.language_is_reasonably_certain?
|
|
74
62
|
|
|
75
63
|
```
|
|
76
64
|
|
|
65
|
+
#### Simple Command Line Use
|
|
66
|
+
|
|
67
|
+
Since Ruby supports the `-r` option to require a library, and the `-e` option to evaluate a string of code, you can easily do simple parsing on the command line, such as:
|
|
68
|
+
|
|
69
|
+
```
|
|
70
|
+
ruby -r rika -e 'puts Rika.parse_content("x.pdf")'
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
You could also parse the metadata and output it as JSON as follows:
|
|
74
|
+
|
|
75
|
+
```
|
|
76
|
+
ruby -r rika -r json -e 'puts Rika.parse_metadata("x.pdf").to_json'
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
If you want to get both content and metadata in JSON format, this would do that:
|
|
80
|
+
|
|
81
|
+
```
|
|
82
|
+
ruby -r rika -r json -e 'c,m = Rika.parse_content_and_metadata("x.pdf"); puts({ c: c, m: m }.to_json)'
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
Using the [rexe](https://github.com/keithrbennett/rexe) gem, that can be made much more concise:
|
|
86
|
+
|
|
87
|
+
```
|
|
88
|
+
rexe -r rika -oj 'c,m = Rika.parse_content_and_metadata("x.pdf"); { c: c, m: m }'
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
...and changing the `-oj` option gives you access to other output formats such as "Pretty JSON", YAML, and AwesomePrint (a very human readable format).
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
## Installation
|
|
95
|
+
|
|
96
|
+
Add this line to your application's Gemfile. Use `gem` or `jgem` depending on your JRuby installation:
|
|
97
|
+
|
|
98
|
+
gem 'rika' # or: jgem 'rika'
|
|
99
|
+
|
|
100
|
+
And then execute:
|
|
101
|
+
|
|
102
|
+
$ bundle
|
|
103
|
+
|
|
104
|
+
Or install it yourself as:
|
|
105
|
+
|
|
106
|
+
$ gem install rika # or: jgem install rika
|
|
107
|
+
|
|
108
|
+
## Other Tika Resources
|
|
109
|
+
|
|
110
|
+
* For more sophisticated use of Tika, you can use the Tika jar file directly in your JRuby code. After installing the `rika` gem, the Tika jar file will be located in `$GEM_HOME/gems/rika-[rika-version]-java/java-lib/tika-app-[tika-version].jar`.
|
|
111
|
+
|
|
112
|
+
* Tika also provides another jar file containing a RESTful server that you can run on the command line. You can download this server jar from http://tika.apache.org/download.html.
|
|
113
|
+
See the "Running the Tika Server as a Jar file" section of https://cwiki.apache.org/confluence/display/TIKA/TikaServer for more information.
|
|
114
|
+
|
|
115
|
+
* @chrismattmann and others have provided a [Python library and CLI](https://github.com/chrismattmann/tika-python) that interfaces with the Tika server.
|
|
116
|
+
|
|
117
|
+
* A general Tika wiki is at https://cwiki.apache.org/confluence/display/tika.
|
|
118
|
+
|
|
119
|
+
|
|
77
120
|
## Credits
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
* Richard Nyström
|
|
121
|
+
|
|
122
|
+
Richard Nyström (@ricn) is the original author of Rika, but has not been able to maintain it since 2015. In July 2020, Richard transferred the project to Keith Bennett (@keithrbennett), who had made some contributions back in 2013.
|
|
81
123
|
|
|
82
124
|
## Contributing
|
|
83
125
|
|
data/RELEASE_NOTES.md
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
## Release Notes
|
|
2
|
+
|
|
3
|
+
#### v1.11.1
|
|
4
|
+
|
|
5
|
+
* Add Apache-2.0 license to gemspec.
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
#### v1.11.0
|
|
9
|
+
|
|
10
|
+
* Replace 2015 Tika jar files w/2020 tika-app-1.24.1.jar.
|
|
11
|
+
* Handover of maintainer status from @ricn to @keithrbennett.
|
|
12
|
+
* Add rika_helper.rb to provide abbreviated method names for interactive use w/pry, etc.
|
|
13
|
+
* Extract parser class to its own file.
|
|
14
|
+
* Various cleanup and refactoring.
|
|
15
|
+
* Improve README.md documentation.
|
|
16
|
+
* Tested successfully on Java 14.
|
|
17
|
+
* Move Tika jar file from /target/dependency to /java-lib.
|
|
Binary file
|
data/lib/rika.rb
CHANGED
|
@@ -4,14 +4,10 @@ raise "You need to run JRuby to use Rika" unless RUBY_PLATFORM =~ /java/
|
|
|
4
4
|
|
|
5
5
|
require "rika/version"
|
|
6
6
|
require 'uri'
|
|
7
|
-
require '
|
|
8
|
-
|
|
7
|
+
require 'open-uri'
|
|
8
|
+
require_relative 'rika/parser'
|
|
9
|
+
require_relative '../java-lib/tika-app-1.24.1.jar'
|
|
9
10
|
|
|
10
|
-
Dir[File.join(File.dirname(__FILE__), "../target/dependency/*.jar")].each do |jar|
|
|
11
|
-
require jar
|
|
12
|
-
end
|
|
13
|
-
|
|
14
|
-
# Heavily based on the Apache Tika API: http://tika.apache.org/1.5/api/org/apache/tika/Tika.html
|
|
15
11
|
module Rika
|
|
16
12
|
import org.apache.tika.metadata.Metadata
|
|
17
13
|
import org.apache.tika.Tika
|
|
@@ -25,98 +21,23 @@ module Rika
|
|
|
25
21
|
[parser.content, parser.metadata]
|
|
26
22
|
end
|
|
27
23
|
|
|
28
|
-
def self.
|
|
29
|
-
|
|
30
|
-
|
|
24
|
+
def self.parse_content_and_metadata_as_hash(file_location, max_content_length = -1)
|
|
25
|
+
content, metadata = parse_content_and_metadata(file_location, max_content_length)
|
|
26
|
+
{ content: content, metadata: metadata }
|
|
31
27
|
end
|
|
32
28
|
|
|
33
|
-
def self.
|
|
34
|
-
|
|
35
|
-
parser.metadata
|
|
29
|
+
def self.parse_content(file_location, max_content_length = -1)
|
|
30
|
+
Parser.new(file_location, max_content_length).content
|
|
36
31
|
end
|
|
37
32
|
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
@metadata_ruby = nil
|
|
46
|
-
@input_type = get_input_type
|
|
47
|
-
end
|
|
48
|
-
|
|
49
|
-
def content
|
|
50
|
-
self.parse
|
|
51
|
-
@content
|
|
52
|
-
end
|
|
53
|
-
|
|
54
|
-
def metadata
|
|
55
|
-
unless @metadata_ruby
|
|
56
|
-
self.parse
|
|
57
|
-
@metadata_ruby = {}
|
|
58
|
-
|
|
59
|
-
@metadata_java.names.each do |name|
|
|
60
|
-
@metadata_ruby[name] = @metadata_java.get(name)
|
|
61
|
-
end
|
|
62
|
-
end
|
|
63
|
-
@metadata_ruby
|
|
64
|
-
end
|
|
65
|
-
|
|
66
|
-
def media_type
|
|
67
|
-
if file?
|
|
68
|
-
@media_type ||= @tika.detect(java.io.File.new(@uri))
|
|
69
|
-
else
|
|
70
|
-
@media_type ||= @tika.detect(input_stream)
|
|
71
|
-
end
|
|
72
|
-
end
|
|
73
|
-
|
|
74
|
-
def available_metadata
|
|
75
|
-
metadata.keys
|
|
76
|
-
end
|
|
77
|
-
|
|
78
|
-
def metadata_exists?(name)
|
|
79
|
-
metadata[name] != nil
|
|
80
|
-
end
|
|
81
|
-
|
|
82
|
-
def file?
|
|
83
|
-
@input_type == :file
|
|
84
|
-
end
|
|
85
|
-
|
|
86
|
-
def language
|
|
87
|
-
@lang ||= LanguageIdentifier.new(content)
|
|
88
|
-
|
|
89
|
-
@lang.language
|
|
90
|
-
end
|
|
91
|
-
|
|
92
|
-
def language_is_reasonably_certain?
|
|
93
|
-
@lang ||= LanguageIdentifier.new(content)
|
|
94
|
-
|
|
95
|
-
@lang.is_reasonably_certain
|
|
96
|
-
end
|
|
97
|
-
|
|
98
|
-
protected
|
|
99
|
-
|
|
100
|
-
def parse
|
|
101
|
-
@content ||= @tika.parse_to_string(input_stream, @metadata_java).to_s.strip
|
|
102
|
-
end
|
|
103
|
-
|
|
104
|
-
def get_input_type
|
|
105
|
-
if File.exists?(@uri) && File.directory?(@uri) == false
|
|
106
|
-
:file
|
|
107
|
-
elsif URI(@uri).scheme == "http" && Net::HTTP.get_response(URI(@uri)).is_a?(Net::HTTPSuccess)
|
|
108
|
-
:http
|
|
109
|
-
else
|
|
110
|
-
raise IOError, "Input (#{@uri}) is neither file nor http."
|
|
111
|
-
end
|
|
112
|
-
end
|
|
113
|
-
|
|
114
|
-
def input_stream
|
|
115
|
-
if file?
|
|
116
|
-
FileInputStream.new(java.io.File.new(@uri))
|
|
117
|
-
else # :http
|
|
118
|
-
URL.new(@uri).open_stream
|
|
119
|
-
end
|
|
120
|
-
end
|
|
33
|
+
# Regarding max_content_length, the default is set at 0 to save unnecessary processing,
|
|
34
|
+
# since the content is being ignored. However, the PDF metadata "pdf:unmappedUnicodeCharsPerPage"
|
|
35
|
+
# and "pdf:charsPerPage" will be absent if the max_content_length is 0, and
|
|
36
|
+
# may differ depending on
|
|
37
|
+
# the number of characters read.
|
|
38
|
+
def self.parse_metadata(file_location, max_content_length = 0)
|
|
39
|
+
Parser.new(file_location, max_content_length).metadata
|
|
121
40
|
end
|
|
122
41
|
end
|
|
42
|
+
|
|
43
|
+
|
data/lib/rika/parser.rb
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
module Rika
|
|
2
|
+
class Parser
|
|
3
|
+
|
|
4
|
+
attr_reader :data_source, :tika, :metadata_java, :metadata_ruby, :input_type
|
|
5
|
+
|
|
6
|
+
def initialize(data_source, max_content_length = -1, detector = DefaultDetector.new)
|
|
7
|
+
@data_source = data_source
|
|
8
|
+
@tika = Tika.new(detector)
|
|
9
|
+
@tika.set_max_string_length(max_content_length)
|
|
10
|
+
@metadata_java = nil
|
|
11
|
+
@metadata_ruby = nil
|
|
12
|
+
@input_type = get_input_type
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
def content
|
|
16
|
+
parse
|
|
17
|
+
@content
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
def metadata
|
|
21
|
+
unless @metadata_ruby
|
|
22
|
+
parse
|
|
23
|
+
@metadata_ruby = metadata_java.names.each_with_object({}) do |name, m_ruby|
|
|
24
|
+
m_ruby[name] = metadata_java.get(name)
|
|
25
|
+
end
|
|
26
|
+
end
|
|
27
|
+
@metadata_ruby
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
def media_type
|
|
31
|
+
@media_type ||= file? \
|
|
32
|
+
? tika.detect(java.io.File.new(data_source)) \
|
|
33
|
+
: tika.detect(input_stream)
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
# @deprecated
|
|
37
|
+
def available_metadata
|
|
38
|
+
metadata.keys
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
# @deprecated
|
|
42
|
+
def metadata_exists?(name)
|
|
43
|
+
metadata[name] != nil
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
def language
|
|
47
|
+
@lang ||= LanguageIdentifier.new(content)
|
|
48
|
+
@lang.language
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
# @deprecated
|
|
52
|
+
# https://tika.apache.org/1.9/api/org/apache/tika/language/LanguageIdentifier.html#isReasonablyCertain()
|
|
53
|
+
# says: WARNING: Will never return true for small amount of input texts.
|
|
54
|
+
# https://tika.apache.org/1.19/api/org/apache/tika/language/LanguageIdentifier.html
|
|
55
|
+
# indicates that the LanguageIdentifier class used in this implementation is deprecated.
|
|
56
|
+
# TODO: More research needed to see if an alternate implementation can be used.
|
|
57
|
+
def language_is_reasonably_certain?
|
|
58
|
+
@lang ||= LanguageIdentifier.new(content)
|
|
59
|
+
@lang.is_reasonably_certain
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def parse
|
|
64
|
+
unless @content
|
|
65
|
+
@metadata_java = Metadata.new
|
|
66
|
+
@content = tika.parse_to_string(input_stream, @metadata_java).to_s.strip
|
|
67
|
+
end
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
private def get_input_type
|
|
71
|
+
if File.file?(data_source)
|
|
72
|
+
:file
|
|
73
|
+
elsif URI(data_source).is_a?(URI::HTTP) && URI.open(data_source)
|
|
74
|
+
:http
|
|
75
|
+
else
|
|
76
|
+
raise IOError, "Input (#{data_source}) is not an available file or HTTP resource."
|
|
77
|
+
end
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
private def input_stream
|
|
81
|
+
file? \
|
|
82
|
+
? FileInputStream.new(java.io.File.new(data_source)) \
|
|
83
|
+
: URL.new(data_source).open_stream
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
private def file?
|
|
87
|
+
input_type == :file
|
|
88
|
+
end
|
|
89
|
+
end
|
|
90
|
+
end
|
data/lib/rika/version.rb
CHANGED
data/pom.xml
CHANGED
|
@@ -12,8 +12,8 @@
|
|
|
12
12
|
<dependencies>
|
|
13
13
|
<dependency>
|
|
14
14
|
<groupId>org.apache.tika</groupId>
|
|
15
|
-
<artifactId>tika-
|
|
16
|
-
<version>1.
|
|
15
|
+
<artifactId>tika-app</artifactId>
|
|
16
|
+
<version>1.24</version>
|
|
17
17
|
<scope>test</scope>
|
|
18
18
|
</dependency>
|
|
19
19
|
</dependencies>
|