rika 1.11.1-java → 2.0.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +5 -4
- data/.rubocop.yml +49 -0
- data/Gemfile +12 -0
- data/README.md +213 -76
- data/RELEASE_NOTES.md +26 -0
- data/Rakefile +4 -7
- data/bin/rika +13 -0
- data/lib/rika/cli/args_parser.rb +131 -0
- data/lib/rika/cli/rika_command.rb +129 -0
- data/lib/rika/formatters.rb +39 -0
- data/lib/rika/parse_result.rb +34 -0
- data/lib/rika/parser.rb +64 -70
- data/lib/rika/tika_loader.rb +65 -0
- data/lib/rika/version.rb +3 -1
- data/lib/rika.rb +98 -27
- data/rika.gemspec +30 -17
- data/rika_helper.rb +14 -22
- data/spec/fixtures/image_jpg_without_extension +0 -0
- data/spec/fixtures/tiny.txt +1 -0
- data/spec/rika/cli/args_parser_spec.rb +117 -0
- data/spec/rika/cli/rika_command_spec.rb +120 -0
- data/spec/rika/formatters_spec.rb +23 -0
- data/spec/rika/parse_result_spec.rb +42 -0
- data/spec/rika/parser_spec.rb +304 -0
- data/spec/rika/rika_spec.rb +10 -0
- data/spec/rika/tika_loader_spec.rb +57 -0
- data/spec/spec_helper.rb +10 -3
- metadata +40 -49
- data/.travis.yml +0 -7
- data/java-lib/tika-app-1.24.1.jar +0 -0
- data/spec/fixtures/text_file_without_extension +0 -23
- data/spec/rika_spec.rb +0 -245
- /data/spec/fixtures/{text_file.txt → document.txt} +0 -0
@@ -0,0 +1,129 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'awesome_print'
|
4
|
+
require 'optparse'
|
5
|
+
require 'rika'
|
6
|
+
require 'rika/formatters'
|
7
|
+
require 'rika/cli/args_parser'
|
8
|
+
|
9
|
+
# This command line application enables the parsing of documents on the command line.
|
10
|
+
# Syntax is:
|
11
|
+
# rika [options] <file or url> [...file or url...]
|
12
|
+
# Run with -h or --help option for more details.
|
13
|
+
#
|
14
|
+
# Defaults to outputting both content (text) and metadata,
|
15
|
+
# but the -t and -m flags can be used to enable or suppress either.
|
16
|
+
# Supports output formats of JSON, Pretty JSON, YAML, Awesome Print, to_s, and inspect (see Formatters class).
|
17
|
+
class RikaCommand
  attr_reader :args, :help_text, :metadata_formatter, :options, :targets, :text_formatter

  # @param [Array<String>] args command line arguments; default to ARGV but may be overridden for testing
  def initialize(args = ARGV)
    # Work on a copy: the passed array may be frozen, and it is modified later
    # when options are parsed and removed, and when directories are removed.
    @args = args.dup
  end

  # Main method and entry point for this class' work.
  # Parses every target and writes the formatted results to stdout, either as one
  # array (when options[:as_array] is set) or one document at a time.
  # @return [nil]
  def call
    prepare
    report_and_exit_if_no_targets_specified

    if options[:as_array]
      puts result_array_output
    else
      targets.each { |target| puts single_document_output(target, parse_target(target)) }
    end

    nil
  end

  # Prepares to run the parse. This method is separate from #call so that it can be called from tests.
  # Populates options, targets and help_text from the arguments, then selects the formatters.
  # @return [void]
  private def prepare
    @options, @targets, @help_text = ArgsParser.call(args)
    set_output_formats
  end

  # Sets the output format(s) based on the command line options.
  # The first character of options[:format] selects the metadata formatter, the second the text formatter.
  # Exits with status 1 and an error message on stderr if either character is invalid.
  # @return [nil]
  private def set_output_formats
    format = options[:format]
    @metadata_formatter = Rika::Formatters.get(format[0])
    @text_formatter = Rika::Formatters.get(format[1])
    nil
  rescue KeyError
    $stderr.puts "Invalid format: #{format}\n\n"
    $stderr.puts help_text
    exit 1
  end

  # Parses a single target using the settings derived from the command line options.
  # @param [String] target the target document
  # @return [ParseResult] the parse result
  private def parse_target(target)
    Rika.parse(target, max_content_length: max_content_length, key_sort: options[:key_sort])
  end

  # Converts a ParseResult to a hash containing the selected pieces of data.
  # @param [ParseResult] result the parse result
  # @return [Hash] the hash containing the selected pieces of data
  private def result_hash(result)
    h = {}
    h['source'] = result.metadata['rika:data-source'] if options[:source]
    h['metadata'] = result.metadata if options[:metadata]
    h['text'] = result.content if options[:text]
    h
  end

  # Builds the string representation of the result of parsing a single document.
  # @param [String] target the target document
  # @param [ParseResult] result the parse result
  # @return [String] the string representation of the result of parsing a single document
  private def single_document_output(target, result)
    # When both parts are requested in the same JSON/YAML format, emit a single
    # document containing both rather than two separately formatted fragments.
    if options[:metadata] && options[:text] && %w[jj JJ yy].include?(options[:format])
      return metadata_formatter.call(result_hash(result))
    end

    # A plain mutable String is used (rather than StringIO) so that this file does
    # not depend on 'stringio' having been loaded by some other library.
    output = +''
    output << "Source: #{target}\n" if options[:source]
    output << "#{metadata_formatter.call(result.metadata)}\n" if options[:metadata]
    output << "#{text_formatter.call(result.content)}\n" if options[:text]
    output
  end

  # Parses the documents and builds the output of the parse as an array of hashes.
  # Outputting as an array necessitates that the metadata and text formatters be the same
  # (otherwise the output would be invalid, especially with JSON or YAML).
  # Therefore, the metadata formatter is arbitrarily selected to be used by both.
  # @return [String] the string representation of the result of parsing the documents
  private def result_array_output
    output_hashes = targets.map { |target| result_hash(parse_target(target)) }
    metadata_formatter.call(output_hashes)
  end

  # The command line does not offer a configurable maximum content length; the value
  # passed to Rika.parse is used only to enable (-1) or disable (0) the text output.
  # @return [Integer] -1 if text output is enabled, else 0
  private def max_content_length
    options[:text] ? -1 : 0
  end

  # Prints message and help and exits if no targets are specified.
  # The exit code is zero because this may not necessarily be an error, and we wouldn't want to
  # be the cause of aborting a script. The documents specified as input to this command may be
  # dynamically generated by a script, and the script may not want to abort if no documents are
  # generated.
  # @return [nil] or exits
  private def report_and_exit_if_no_targets_specified
    if targets.empty?
      $stderr.puts <<~MESSAGE

        No targets specified.

        #{help_text}
      MESSAGE
      exit 0
    end
    nil
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'json'
|
4
|
+
require 'yaml'
|
5
|
+
require 'awesome_print'
|
6
|
+
|
7
|
+
module Rika
  # This class manages the formatters used to format the output of the Rika command line application.
  # Each formatter is a lambda that takes the object to format and returns its formatted form.
  class Formatters
    AWESOME_PRINT_FORMATTER = ->(object) { object.ai }
    INSPECT_FORMATTER       = ->(object) { object.inspect }
    JSON_FORMATTER          = ->(object) { object.to_json }
    PRETTY_JSON_FORMATTER   = ->(object) { JSON.pretty_generate(object) }
    TO_S_FORMATTER          = ->(object) { object.to_s }
    YAML_FORMATTER          = ->(object) { object.to_yaml }

    # A hash of formatters, keyed by the format character.
    # The value is a lambda that takes the object to be formatted as a parameter.
    # @return [Hash] the hash of formatters
    FORMATTER_LOOKUP_TABLE = {
      'a' => AWESOME_PRINT_FORMATTER,
      'i' => INSPECT_FORMATTER,
      'j' => JSON_FORMATTER,
      'J' => PRETTY_JSON_FORMATTER,
      't' => TO_S_FORMATTER,
      'y' => YAML_FORMATTER
    }.freeze

    # The format characters accepted by .get. Frozen so that this constant cannot
    # be mutated out of sync with the (frozen) lookup table it is derived from.
    # @return [Array<String>] the valid format characters
    VALID_OPTION_CHARS = FORMATTER_LOOKUP_TABLE.keys.freeze

    # Gets the formatter lambda for the given option character.
    # @param [String] option_char the option character
    # @return [Lambda] the formatter lambda
    # @raise [KeyError] if the option character is invalid
    def self.get(option_char)
      FORMATTER_LOOKUP_TABLE.fetch(option_char)
    end
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Rika
  # Value object holding everything produced by parsing one document.
  # Instances are built with keyword arguments, one per member.
  ParseResult = Struct.new(
    *%i[content metadata metadata_java content_type language input_type data_source max_content_length],
    keyword_init: true
  ) do
    # 'text' is offered as a synonym for 'content', since it is often the more natural word.
    alias_method :text, :content

    # @return [Boolean] true if, and only if, input is a file
    def file?
      input_type == :file
    end

    # @return [Boolean] true if, and only if, input is HTTP
    def http?
      input_type == :http
    end

    # @return [Hash] hash with keys :content and :metadata holding this instance's values
    def content_and_metadata_hash
      to_h.slice(:content, :metadata)
    end
  end
end
|
data/lib/rika/parser.rb
CHANGED
@@ -1,90 +1,84 @@
|
|
1
|
-
|
2
|
-
class Parser
|
1
|
+
# frozen_string_literal: true
|
3
2
|
|
4
|
-
|
3
|
+
require 'uri'
|
4
|
+
require_relative 'parse_result'
|
5
5
|
|
6
|
-
|
6
|
+
module Rika
|
7
|
+
# Parses a document and returns a ParseResult.
|
8
|
+
# This class is intended to be used only by the Rika module, not by users of the gem,
|
9
|
+
# who should instead call `Rika.parse`.
|
10
|
+
class Parser
|
11
|
+
# Stores the parse settings, classifies the data source, and creates the Tika instance.
# @param [String] data_source file path or HTTP(s) URL
# @param [Boolean] key_sort whether to sort the keys in the metadata hash, defaults to true
# @param [Integer] max_content_length maximum content length to return, defaults to all
# @param [Detector] detector Tika detector, defaults to DefaultDetector
# @raise [IOError] if the data source is neither a file nor an HTTP resource
def initialize(data_source, key_sort: true, max_content_length: -1, detector: DefaultDetector.new)
  @data_source = data_source
  # Classified up front so that an unusable data source is rejected at construction time.
  @input_type = data_source_input_type
  @key_sort = key_sort
  @max_content_length = max_content_length
  @detector = detector
  @tika = Tika.new(detector)
end
|
35
23
|
|
36
|
-
#
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
@lang ||= LanguageIdentifier.new(content)
|
48
|
-
@lang.language
|
49
|
-
end
|
24
|
+
# Parses the configured data source.
# Besides the metadata reported by Tika, the metadata gains two entries added here:
# 'rika:language' (the detected language of the content) and 'rika:data-source'.
# @return [ParseResult] parse result
def parse
  @tika.set_max_string_length(@max_content_length)
  metadata_java = Metadata.new

  content = with_input_stream do |stream|
    @tika.parse_to_string(stream, metadata_java)
  end

  language = Rika.language(content)
  metadata_java.set('rika:language', language)
  metadata_java.set('rika:data-source', @data_source)

  metadata = metadata_java_to_ruby(metadata_java)
  # Sorting is case-insensitive so that keys differing only in case are adjacent.
  metadata = metadata.sort_by { |key, _value| key.downcase }.to_h if @key_sort

  ParseResult.new(
    content: content,
    metadata: metadata,
    metadata_java: metadata_java,
    content_type: metadata['Content-Type'],
    language: language,
    input_type: @input_type,
    data_source: @data_source,
    max_content_length: @max_content_length
  )
end
|
61
47
|
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
48
|
+
# Copies every name/value pair of a Tika Java metadata instance into a Ruby hash.
# @param [Metadata] metadata_java a Tika Java metadata instance populated by the parse and added to by this class
# @return [Hash] a Ruby hash containing the data of the Java Metadata instance
private def metadata_java_to_ruby(metadata_java)
  ruby_hash = {}
  metadata_java.names.each { |name| ruby_hash[name] = metadata_java.get(name) }
  ruby_hash
end
|
69
55
|
|
70
|
-
|
71
|
-
|
56
|
+
# Classifies the data source as an existing file or an HTTP(S) URL.
# @return [Symbol] input type (currently only :file and :http are supported)
# @raise [IOError] if input is not a file or HTTP resource
private def data_source_input_type
  return :file if File.file?(@data_source)

  begin
    return :http if URI(@data_source).is_a?(URI::HTTP)
  rescue URI::InvalidURIError
    # A string that cannot even be parsed as a URI (e.g. the name of a missing file
    # that contains a space) is simply not a usable input; fall through so that the
    # caller gets the documented IOError rather than a URI parsing error.
  end

  raise IOError, "Input (#{@data_source}) is not an available file or HTTP resource."
end
|
79
67
|
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
68
|
+
# Opens an input stream on the configured resource (a file stream for :file input,
# otherwise a URL stream), yields it to the passed block, and closes it afterwards,
# whether or not the block raises.
# @return [Object] the value returned by the passed code block
private def with_input_stream
  input_stream =
    case @input_type
    when :file then FileInputStream.new(java.io.File.new(@data_source))
    else URL.new(@data_source).open_stream
    end

  yield input_stream
ensure
  # input_stream is nil here if opening the stream itself failed.
  input_stream.close if input_stream.respond_to?(:close)
end
|
89
83
|
end
|
90
84
|
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Requires the Tika jar file from the location specified
|
4
|
+
# in the TIKA_JAR_FILESPEC environment variable.
|
5
|
+
|
6
|
+
module Rika
  # Loads the Apache Tika Java jar file named by the TIKA_JAR_FILESPEC environment variable.
  # It is not intended to be instantiated. Instead, call the only public class method, `require_tika`.
  class TikaLoader
    # Requires the Tika jar file.
    # @return [String] absolute filespec of loaded Tika jar file
    # @raise [TikaLoadError] if the filespec was not specified or the file cannot be loaded
    def self.require_tika
      filespec = specified_tika_filespec
      absolute_filespec = File.absolute_path(filespec)

      begin
        require absolute_filespec
      rescue LoadError
        lines = ["Unable to load Tika jar file from '#{filespec}'."]
        # Only mention the absolute form when it adds information.
        lines << "Absolute filespec is '#{absolute_filespec}'." unless filespec == absolute_filespec
        raise TikaLoadError, lines.join("\n")
      end

      absolute_filespec
    end

    # Gets the Tika jar filespec from the TIKA_JAR_FILESPEC environment variable.
    #
    # @return [String] Tika jar filespec from env var TIKA_JAR_FILESPEC
    # @raise [TikaLoadError] if the variable is unset, empty, or contains only whitespace
    private_class_method def self.specified_tika_filespec
      filespec = ENV['TIKA_JAR_FILESPEC']
      return filespec unless filespec.nil? || filespec.strip.empty?

      raise TikaLoadError, 'Environment variable TIKA_JAR_FILESPEC is not set.'
    end

    # Formats an error message for printing to stderr: the message framed above and
    # below by a 79-character banner of exclamation marks, with surrounding blank lines.
    #
    # @param [String] message the error message
    # @return [String] the formatted error message
    private_class_method def self.formatted_error_message(message)
      banner = '!' * 79
      "\n#{banner}\n#{message}\n#{banner}\n\n"
    end

    # Prints an error message to stderr and exits with a non-zero exit code.
    # @param [String] message the error message
    private_class_method def self.print_message_and_exit(message)
      warn formatted_error_message(message)
      exit 1
    end
  end

  # This error class reports the inability to load the Tika jar file.
  class TikaLoadError < RuntimeError; end
end
|
data/lib/rika/version.rb
CHANGED
data/lib/rika.rb
CHANGED
@@ -1,43 +1,114 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
require '
|
7
|
-
require 'open-uri'
|
3
|
+
# This file is the top level file for the Rika gem.
|
4
|
+
# It requires the other files in the gem and provides the top level API.
|
5
|
+
# It also provides the top level module for the gem.
|
6
|
+
require 'rika/version'
|
8
7
|
require_relative 'rika/parser'
|
9
|
-
require_relative '
|
8
|
+
require_relative 'rika/tika_loader'
|
10
9
|
|
10
|
+
# The top level module for the Rika gem.
|
11
11
|
module Rika
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
12
|
+
PROJECT_URL = 'https://github.com/keithrbennett/rika'
|
13
|
+
|
14
|
+
# Loads the Tika jar file and imports the needed Java classes.
# Does the work only once; subsequent calls return immediately.
# @return [Module] the Rika module, for chaining (also when already initialized)
# @raise [RuntimeError] if not running under JRuby
def self.init
  # Return self (not nil) so that chaining works on every call, not just the first.
  return self if @initialized

  Rika.raise_unless_jruby

  Rika::TikaLoader.require_tika
  import java.io.FileInputStream
  import java.net.URL
  import org.apache.tika.Tika
  import org.apache.tika.detect.DefaultDetector
  import org.apache.tika.io.TikaInputStream
  import org.apache.tika.langdetect.optimaize.OptimaizeLangDetector
  import org.apache.tika.language.detect.LanguageDetector
  import org.apache.tika.language.detect.LanguageResult
  import org.apache.tika.metadata.Metadata

  @initialized = true
  self
end
|
35
|
+
|
36
|
+
# Gets a ParseResult from parsing a document.
#
# @param [String] data_source file path or HTTP(s) URL
# @param [Boolean] key_sort whether to sort the keys in the metadata hash, defaults to true
# @param [Integer] max_content_length maximum content length to return, defaults to all
# @param [Detector, nil] detector Tika detector; when nil (the default) a DefaultDetector is created
# @return [ParseResult]
def self.parse(data_source, key_sort: true, max_content_length: -1, detector: nil)
  init
  # Default argument expressions are evaluated before the method body runs, i.e. before
  # `init` has imported DefaultDetector. The default detector is therefore created here
  # rather than in the signature, so that a first call without a detector works.
  detector ||= DefaultDetector.new
  Parser.new(data_source, key_sort: key_sort, max_content_length: max_content_length, detector: detector).parse
end
|
18
48
|
|
19
|
-
|
20
|
-
|
21
|
-
|
49
|
+
# Reports the implementation version recorded for the Java package of the loaded Tika class.
# @return [String] version of loaded Tika jar file
def self.tika_version
  init
  tika_package = Tika.java_class.package
  tika_package.implementation_version
end
|
23
54
|
|
24
|
-
|
25
|
-
|
26
|
-
|
55
|
+
# Detects the language of the passed text using the shared Tika language detector.
# @param [String] text text to detect language of
# @return [String] language of passed text, as 2-character ISO 639-1 code
def self.language(text)
  init
  detection = tika_language_detector.detect(text.to_java_string)
  detection.get_language
end
|
28
61
|
|
29
|
-
|
30
|
-
|
62
|
+
# @param [String] data_source file path or HTTP URL
# @param [Integer] max_content_length maximum content length, passed through to `parse`
# @return [Array<String,Hash>] two-element array: content, then metadata, of the resource at the specified location
#
# @deprecated Instead, get a ParseResult and access the content and metadata fields.
def self.parse_content_and_metadata(data_source, max_content_length: -1)
  init
  parsed = parse(data_source, max_content_length: max_content_length)
  %i[content metadata].map { |field| parsed.public_send(field) }
end
|
71
|
+
|
72
|
+
# @param [String] data_source file path or HTTP URL
# @param [Integer] max_content_length maximum content length, passed through to `parse`
# @return [Hash] hash with keys :content and :metadata for the resource at the specified location
#
# @deprecated Instead, use a ParseResult or its to_h method.
def self.parse_content_and_metadata_as_hash(data_source, max_content_length: -1)
  init
  parsed = parse(data_source, max_content_length: max_content_length)
  {
    content: parsed.content,
    metadata: parsed.metadata
  }
end
|
81
|
+
|
82
|
+
# @param [String] data_source file path or HTTP URL
# @param [Integer] max_content_length maximum content length, passed through to `parse`
# @return [String] content of the resource at the specified location
#
# @deprecated Instead, get a ParseResult and access the content field
def self.parse_content(data_source, max_content_length: -1)
  init
  parsed = parse(data_source, max_content_length: max_content_length)
  parsed.content
end
|
32
90
|
|
33
91
|
# Regarding max_content_length: although the content is ignored here, the value still
# influences the metadata. The PDF metadata "pdf:unmappedUnicodeCharsPerPage"
# and "pdf:charsPerPage" will be absent if the max_content_length is 0, and otherwise may differ
# depending on the number of characters read. The default used by this method is -1.
#
# @param [String] data_source file path or HTTP URL
# @param [Integer] max_content_length maximum content length, passed through to `parse`
# @return [Hash] metadata of the resource at the specified location
#
# @deprecated Instead, get a ParseResult and access the metadata field
def self.parse_metadata(data_source, max_content_length: -1)
  init
  parsed = parse(data_source, max_content_length: max_content_length)
  parsed.metadata
end
|
41
|
-
end
|
42
101
|
|
102
|
+
# Lazily creates, and thereafter reuses, the language detector: the object returned by
# calling loadModels on a new OptimaizeLangDetector.
# @return [Object] the memoized Tika language detector
def self.tika_language_detector
  init
  return @tika_language_detector if @tika_language_detector

  @tika_language_detector = OptimaizeLangDetector.new.loadModels
end
|
43
107
|
|
108
|
+
# Raise an error if not running under JRuby (detected by 'java' appearing in RUBY_PLATFORM).
# @return [nil] when running under JRuby
# @raise [RuntimeError] when not running under JRuby
def self.raise_unless_jruby
  return if RUBY_PLATFORM.include?('java')

  raise "\n\n\nRika can only be run with JRuby! It needs access to the Java Virtual Machine.\n\n\n"
end
|
114
|
+
end
|
data/rika.gemspec
CHANGED
@@ -1,23 +1,36 @@
|
|
1
|
-
#
|
2
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'English'
|
4
|
+
|
5
|
+
lib = File.expand_path('lib', __dir__)
|
3
6
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
7
|
require 'rika/version'
|
5
8
|
|
6
9
|
# Gem specification for rika, a java-platform (JRuby) gem.
Gem::Specification.new do |spec|
  # Identity
  spec.name        = 'rika'
  spec.version     = Rika::VERSION
  spec.platform    = 'java'
  spec.license     = 'Apache-2.0'
  spec.homepage    = 'https://github.com/keithrbennett/rika'
  spec.authors     = ['Richard Nyström', 'Keith Bennett']
  spec.email       = ['ricny046@gmail.com', 'keithrbennett@gmail.com']
  spec.summary     = 'A JRuby wrapper for Apache Tika to extract text and metadata from files of various formats.'
  spec.description = 'A JRuby wrapper for Apache Tika to extract text and metadata from files of various formats.'

  # Contents: every file tracked by git; executables are the basenames of those under bin/.
  spec.files         = `git ls-files`.split($INPUT_RECORD_SEPARATOR)
  spec.executables   = spec.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  spec.require_paths = ['lib']

  spec.add_dependency 'awesome_print'
  spec.metadata['rubygems_mfa_required'] = 'true'

  # NOTE: I am excluding the Ruby version constraint because this gem runs only in JRuby, and I don't know the
  # minimum version requirement, and don't want to exclude use of any versions that might work.

  spec.post_install_message = <<~MESSAGE

    Using the rika gem requires that you:
    1) download the Apache Tika tika-app jar file from https://tika.apache.org/download.html
    2) place it somewhere accessible to the running application
    3) specify its location in the TIKA_JAR_FILESPEC environment variable

  MESSAGE
end
|