rika 1.11.1-java → 2.0.0-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +5 -4
- data/.rubocop.yml +49 -0
- data/Gemfile +12 -0
- data/README.md +213 -76
- data/RELEASE_NOTES.md +26 -0
- data/Rakefile +4 -7
- data/bin/rika +13 -0
- data/lib/rika/cli/args_parser.rb +131 -0
- data/lib/rika/cli/rika_command.rb +129 -0
- data/lib/rika/formatters.rb +39 -0
- data/lib/rika/parse_result.rb +34 -0
- data/lib/rika/parser.rb +64 -70
- data/lib/rika/tika_loader.rb +65 -0
- data/lib/rika/version.rb +3 -1
- data/lib/rika.rb +98 -27
- data/rika.gemspec +30 -17
- data/rika_helper.rb +14 -22
- data/spec/fixtures/image_jpg_without_extension +0 -0
- data/spec/fixtures/tiny.txt +1 -0
- data/spec/rika/cli/args_parser_spec.rb +117 -0
- data/spec/rika/cli/rika_command_spec.rb +120 -0
- data/spec/rika/formatters_spec.rb +23 -0
- data/spec/rika/parse_result_spec.rb +42 -0
- data/spec/rika/parser_spec.rb +304 -0
- data/spec/rika/rika_spec.rb +10 -0
- data/spec/rika/tika_loader_spec.rb +57 -0
- data/spec/spec_helper.rb +10 -3
- metadata +40 -49
- data/.travis.yml +0 -7
- data/java-lib/tika-app-1.24.1.jar +0 -0
- data/spec/fixtures/text_file_without_extension +0 -23
- data/spec/rika_spec.rb +0 -245
- /data/spec/fixtures/{text_file.txt → document.txt} +0 -0
@@ -0,0 +1,129 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'awesome_print'
|
4
|
+
require 'optparse'
|
5
|
+
require 'rika'
|
6
|
+
require 'rika/formatters'
|
7
|
+
require 'rika/cli/args_parser'
|
8
|
+
|
9
|
+
# This command line application enables the parsing of documents on the command line.
|
10
|
+
# Syntax is:
|
11
|
+
# rika [options] <file or url> [...file or url...]
|
12
|
+
# Run with -h or --help option for more details.
|
13
|
+
#
|
14
|
+
# Defaults to outputting both content (text) and metadata,
|
15
|
+
# but the -t and -m flags can be used to enable or suppress either.
|
16
|
+
# Supports output formats of JSON, Pretty JSON, YAML, Awesome Print, to_s, and inspect (see Formatters class).
|
17
|
+
class RikaCommand
  attr_reader :args, :help_text, :metadata_formatter, :options, :targets, :text_formatter

  # Format option values for which a document's selected data is emitted as a single hash
  # through the metadata formatter (provided both metadata and text output are enabled).
  SINGLE_HASH_FORMATS = %w[jj JJ yy].freeze

  # @param [Array<String>] args command line arguments; default to ARGV but may be overridden for testing
  def initialize(args = ARGV)
    # Work on a copy: the array is modified later (options and directories are removed from it),
    # and the array passed in may be frozen.
    @args = args.dup
  end

  # Main method and entry point for this class' work.
  # Parses the command line, then parses every target and prints the formatted result to stdout,
  # either as one array of hashes (:as_array option) or document by document.
  # @return [nil]
  def call
    prepare
    report_and_exit_if_no_targets_specified
    if options[:as_array]
      puts result_array_output
    else
      targets.each { |target| puts single_document_output(target, parse_target(target)) }
    end
    nil
  end

  # Prepares to run the parse. This method is separate from #call so that it can be called from tests.
  # Populates options, targets and help_text from ArgsParser, then selects the formatters.
  # @return [void]
  private def prepare
    @options, @targets, @help_text = ArgsParser.call(args)
    set_output_formats
  end

  # Sets the output format(s) based on the command line options.
  # The first character of the :format option selects the metadata formatter, the second the text formatter.
  # Exits with error message (exit code 1) if format is invalid.
  # @return [void]
  private def set_output_formats
    format = options[:format]
    @metadata_formatter = Rika::Formatters.get(format[0])
    @text_formatter     = Rika::Formatters.get(format[1])
    nil
  rescue KeyError
    $stderr.puts "Invalid format: #{format}\n\n"
    $stderr.puts help_text
    exit 1
  end

  # Parses a single target using the options in effect.
  # @param [String] target the target document
  # @return [ParseResult] the parse result
  private def parse_target(target)
    Rika.parse(target, max_content_length: max_content_length, key_sort: options[:key_sort])
  end

  # Converts a ParseResult to a hash containing the selected pieces of data.
  # @param [ParseResult] result the parse result
  # @return [Hash] the hash containing the selected pieces of data
  private def result_hash(result)
    h = {}
    h['source']   = result.metadata['rika:data-source'] if options[:source]
    h['metadata'] = result.metadata                     if options[:metadata]
    h['text']     = result.content                      if options[:text]
    h
  end

  # Builds the string representation of the result of parsing a single document
  # @param [String] target the target document
  # @param [ParseResult] result the parse result
  # @return [String] the string representation of the result of parsing a single document
  private def single_document_output(target, result)
    if options[:metadata] && options[:text] && SINGLE_HASH_FORMATS.include?(options[:format])
      return metadata_formatter.(result_hash(result))
    end

    # A plain mutable string is used (rather than StringIO) so that this file does not depend on
    # 'stringio' having been loaded by some other library. #to_s mirrors what StringIO#<< did.
    output = +''
    output << "Source: #{target}\n" if options[:source]
    output << metadata_formatter.(result.metadata).to_s << "\n" if options[:metadata]
    output << text_formatter.(result.content).to_s << "\n" if options[:text]
    output
  end

  # Parses the documents and outputs the result of the parse to stdout as an array of hashes.
  # Outputting as an array necessitates that the metadata and text formatters be the same
  # (otherwise the output would be invalid, especially with JSON or YAML).
  # Therefore, the metadata formatter is arbitrarily selected to be used by both.
  # @return [String] the string representation of the result of parsing the documents
  private def result_array_output
    output_hashes = targets.map { |target| result_hash(parse_target(target)) }
    metadata_formatter.call(output_hashes)
  end

  # Tika offers a max_content_length option, but it is not exposed in Rika.
  # Instead it is used only to enable or disable the entire text output.
  # @return [Integer] -1 when text output is enabled, 0 when it is suppressed
  private def max_content_length
    options[:text] ? -1 : 0
  end

  # Prints message and help and exits if no targets are specified.
  # The exit code is zero because this may not necessarily be an error, and we wouldn't want to
  # be the cause of aborting a script. The documents specified as input to this command may be
  # dynamically generated by a script, and the script may not want to abort if no documents are
  # generated.
  # @return [void] or exits
  private def report_and_exit_if_no_targets_specified
    return unless targets.empty?

    $stderr.puts <<~MESSAGE

      No targets specified.

      #{help_text}
    MESSAGE
    exit 0
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'json'
|
4
|
+
require 'yaml'
|
5
|
+
require 'awesome_print'
|
6
|
+
|
7
|
+
module Rika
  # This class manages the formatters used to format the output of the Rika command line application.
  # Each formatter is a lambda that takes the object to be formatted as its only parameter.
  class Formatters
    AWESOME_PRINT_FORMATTER = ->(object) { object.ai }
    INSPECT_FORMATTER       = ->(object) { object.inspect }
    JSON_FORMATTER          = ->(object) { object.to_json }
    PRETTY_JSON_FORMATTER   = ->(object) { JSON.pretty_generate(object) }
    TO_S_FORMATTER          = ->(object) { object.to_s }
    YAML_FORMATTER          = ->(object) { object.to_yaml }

    # A hash of formatters, keyed by the format character.
    # The value is a lambda that takes the object to be formatted as a parameter.
    # @return [Hash] the hash of formatters
    FORMATTER_LOOKUP_TABLE = {
      'a' => AWESOME_PRINT_FORMATTER,
      'i' => INSPECT_FORMATTER,
      'j' => JSON_FORMATTER,
      'J' => PRETTY_JSON_FORMATTER,
      't' => TO_S_FORMATTER,
      'y' => YAML_FORMATTER
    }.freeze

    # The format characters accepted by .get. Frozen so that the constant cannot be mutated by accident.
    VALID_OPTION_CHARS = FORMATTER_LOOKUP_TABLE.keys.freeze

    # Gets the formatter lambda for the given option character.
    # @param [String] option_char the option character
    # @return [Lambda] the formatter lambda
    # @raise [KeyError] if the option character is invalid
    def self.get(option_char)
      FORMATTER_LOOKUP_TABLE.fetch(option_char)
    end
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Rika
  # Holds everything produced by parsing one document.
  # Instances are created with keyword arguments, one per member.
  ParseResult = Struct.new(
    :content, :metadata, :metadata_java, :content_type,
    :language, :input_type, :data_source, :max_content_length,
    keyword_init: true
  ) do
    # 'text' is offered as a reader synonym for 'content', since it often reads more naturally.
    alias_method :text, :content

    # @return [Boolean] whether the input type is :file
    def file?
      input_type == :file
    end

    # @return [Boolean] whether the input type is :http
    def http?
      input_type == :http
    end

    # @return [Hash] the content and metadata members only, keyed by :content and :metadata
    def content_and_metadata_hash
      to_h.slice(:content, :metadata)
    end
  end
end
|
data/lib/rika/parser.rb
CHANGED
@@ -1,90 +1,84 @@
|
|
1
|
-
|
2
|
-
class Parser
|
1
|
+
# frozen_string_literal: true
|
3
2
|
|
4
|
-
|
3
|
+
require 'uri'
|
4
|
+
require_relative 'parse_result'
|
5
5
|
|
6
|
-
|
6
|
+
module Rika
  # Parses a document and returns a ParseResult.
  # This class is intended to be used only by the Rika module, not by users of the gem,
  # who should instead call `Rika.parse`.
  #
  # The Java classes referenced here (Tika, DefaultDetector, Metadata, FileInputStream, URL)
  # are not imported by this class itself; they must already be available when it is used.
  class Parser
    # @param [String] data_source file path or HTTP(s) URL
    # @param [Boolean] key_sort whether to sort the keys in the metadata hash, defaults to true
    # @param [Integer] max_content_length maximum content length to return, defaults to all
    # @param [Detector] detector Tika detector, defaults to DefaultDetector
    # @raise [IOError] if data_source is neither an existing file nor an HTTP(s) URL
    def initialize(data_source, key_sort: true, max_content_length: -1, detector: DefaultDetector.new)
      @data_source = data_source
      @key_sort = key_sort
      @max_content_length = max_content_length
      @detector = detector
      @input_type = data_source_input_type
      @tika = Tika.new(@detector)
    end

    # Entry point method for parsing a document.
    # Parses the data source to a string, detects its language with Rika.language, adds the
    # 'rika:language' and 'rika:data-source' entries to the metadata, and converts the metadata
    # to a Ruby hash (sorted case-insensitively by key if key_sort is enabled).
    # @return [ParseResult] parse result
    def parse
      metadata_java = Metadata.new
      @tika.set_max_string_length(@max_content_length)
      content = with_input_stream { |stream| @tika.parse_to_string(stream, metadata_java) }
      language = Rika.language(content)
      metadata_java.set('rika:language', language)
      metadata_java.set('rika:data-source', @data_source)
      metadata = metadata_java_to_ruby(metadata_java)
      metadata = metadata.sort_by { |key, _value| key.downcase }.to_h if @key_sort

      ParseResult.new(
        content: content,
        metadata: metadata,
        metadata_java: metadata_java,
        content_type: metadata['Content-Type'],
        language: language,
        input_type: @input_type,
        data_source: @data_source,
        max_content_length: @max_content_length
      )
    end

    # @param [Metadata] metadata_java a Tika Java metadata instance populated by the parse and added to by this class
    # @return [Hash] a Ruby hash containing the data of the Java Metadata instance
    private def metadata_java_to_ruby(metadata_java)
      metadata_java.names.each_with_object({}) do |name, m_ruby|
        m_ruby[name] = metadata_java.get(name)
      end
    end

    # @return [Symbol] input type (currently only :file and :http are supported)
    # @raise [IOError] if input is not a file or HTTP resource
    private def data_source_input_type
      return :file if File.file?(@data_source)
      return :http if http_url?(@data_source)

      raise IOError, "Input (#{@data_source}) is not an available file or HTTP resource."
    end

    # Strings that cannot be parsed as a URI at all (e.g. a missing file path containing a space)
    # are reported as "not HTTP" so that the caller raises the documented IOError
    # rather than leaking URI::InvalidURIError.
    # @param [String] string the candidate URL
    # @return [Boolean] true if the string parses as an HTTP or HTTPS URI
    private def http_url?(string)
      URI(string).is_a?(URI::HTTP)
    rescue URI::InvalidURIError
      false
    end

    # * Creates and opens an input stream from the configured resource.
    # * Yields that stream to the passed code block.
    # * Then closes the stream.
    # @return [Object] the value returned by the passed code block
    private def with_input_stream
      input_stream =
        if @input_type == :file
          FileInputStream.new(java.io.File.new(@data_source))
        else
          URL.new(@data_source).open_stream
        end
      yield input_stream
    ensure
      # input_stream is nil here if opening the stream raised, in which case there is nothing to close.
      input_stream.close if input_stream.respond_to?(:close)
    end
  end
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Requires the Tika jar file, either from the default location (packaged with this gem)
|
4
|
+
# or from an override specified in the TIKA_JAR_FILESPEC environment variable.
|
5
|
+
|
6
|
+
module Rika
  # This class handles the loading of the Apache Tika Java jar file.
  # It is not intended to be instantiated. Instead, call the only public class method, `require_tika`.
  class TikaLoader
    # Requires the Tika jar file whose location is given by the TIKA_JAR_FILESPEC environment variable.
    # @return [String] absolute filespec of loaded Tika jar file
    # @raise [TikaLoadError] if the Tika jar file was not specified or cannot be loaded
    def self.require_tika
      tika_jar_filespec = specified_tika_filespec

      begin
        abs_tika_jar_filespec = File.absolute_path(tika_jar_filespec)
        require abs_tika_jar_filespec
        abs_tika_jar_filespec
      rescue LoadError
        message = "Unable to load Tika jar file from '#{tika_jar_filespec}'."
        # Show the absolute filespec too when it adds information, i.e. when a relative one was given.
        if tika_jar_filespec != abs_tika_jar_filespec
          message += "\nAbsolute filespec is '#{abs_tika_jar_filespec}'."
        end
        raise TikaLoadError, message
      end
    end

    # Gets the Tika jar filespec from the TIKA_JAR_FILESPEC environment variable,
    # and raises an error if it is not set (or contains only whitespace).
    # Surrounding whitespace is removed from the returned value, so that the value that is
    # validated is the same value that is later used to load the jar file.
    #
    # @return [String] Tika jar filespec from env var TIKA_JAR_FILESPEC
    # @raise [TikaLoadError] if the Tika jar file was not specified
    private_class_method def self.specified_tika_filespec
      tika_jar_filespec = ENV['TIKA_JAR_FILESPEC'].to_s.strip
      raise(TikaLoadError, 'Environment variable TIKA_JAR_FILESPEC is not set.') if tika_jar_filespec.empty?

      tika_jar_filespec
    end

    # Formats an error message for printing to stderr, framing it above and below
    # with a fixed-width banner line of 79 exclamation marks.
    #
    # @param [String] message the error message
    # @return [String] the formatted error message
    private_class_method def self.formatted_error_message(message)
      banner = '!' * 79
      <<~MESSAGE

        #{banner}
        #{message}
        #{banner}

      MESSAGE
    end

    # Prints an error message to stderr and exits with a non-zero exit code.
    # @param [String] message the error message
    private_class_method def self.print_message_and_exit(message)
      warn formatted_error_message(message)
      exit 1
    end
  end

  # This error class reports the inability to load the Tika jar file.
  class TikaLoadError < RuntimeError; end
end
|
data/lib/rika/version.rb
CHANGED
data/lib/rika.rb
CHANGED
@@ -1,43 +1,114 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
require '
|
7
|
-
require 'open-uri'
|
3
|
+
# This file is the top level file for the Rika gem.
|
4
|
+
# It requires the other files in the gem and provides the top level API.
|
5
|
+
# It also provides the top level module for the gem.
|
6
|
+
require 'rika/version'
|
8
7
|
require_relative 'rika/parser'
|
9
|
-
require_relative '
|
8
|
+
require_relative 'rika/tika_loader'
|
10
9
|
|
10
|
+
# The top level module for the Rika gem.
|
11
11
|
module Rika
  PROJECT_URL = 'https://github.com/keithrbennett/rika'

  # Loads the Tika jar file and imports the needed Java classes.
  # Does this work only once; subsequent calls return immediately.
  # @return [Module] the Rika module, for chaining (on every call, including repeated ones)
  def self.init
    return self if @initialized

    Rika.raise_unless_jruby

    Rika::TikaLoader.require_tika
    import java.io.FileInputStream
    import java.net.URL
    import org.apache.tika.Tika
    import org.apache.tika.detect.DefaultDetector
    import org.apache.tika.io.TikaInputStream
    import org.apache.tika.langdetect.optimaize.OptimaizeLangDetector
    import org.apache.tika.language.detect.LanguageDetector
    import org.apache.tika.language.detect.LanguageResult
    import org.apache.tika.metadata.Metadata

    @initialized = true
    self
  end

  # Gets a ParseResult from parsing a document.
  #
  # @param [String] data_source file path or HTTP(s) URL
  # @param [Boolean] key_sort whether to sort the keys in the metadata hash, defaults to true
  # @param [Integer] max_content_length maximum content length to return, defaults to all
  # @param [Detector] detector Tika detector, defaults to a new DefaultDetector
  # @return [ParseResult]
  def self.parse(data_source, key_sort: true, max_content_length: -1, detector: nil)
    init
    # The default detector is created here rather than in the parameter list because parameter
    # defaults are evaluated before the method body, i.e. before `init` has imported DefaultDetector.
    detector ||= DefaultDetector.new
    parser = Parser.new(data_source, key_sort: key_sort, max_content_length: max_content_length, detector: detector)
    parser.parse
  end

  # @return [String] version of loaded Tika jar file
  def self.tika_version
    init
    Tika.java_class.package.implementation_version
  end

  # @param [String] text text to detect language of
  # @return [String] language of passed text, as 2-character ISO 639-1 code
  def self.language(text)
    init
    tika_language_detector.detect(text.to_java_string).get_language
  end

  # @param [String] data_source file path or HTTP URL
  # @return [Array<String,Hash>] content and metadata of file at specified location
  #
  # @deprecated Instead, get a ParseResult and access the content and metadata fields.
  def self.parse_content_and_metadata(data_source, max_content_length: -1)
    result = parse(data_source, max_content_length: max_content_length)
    [result.content, result.metadata]
  end

  # @param [String] data_source file path or HTTP URL
  # @return [Hash] content and metadata of file at specified location
  #
  # @deprecated Instead, use a ParseResult or its to_h method.
  def self.parse_content_and_metadata_as_hash(data_source, max_content_length: -1)
    result = parse(data_source, max_content_length: max_content_length)
    { content: result.content, metadata: result.metadata }
  end

  # @param [String] data_source file path or HTTP URL
  # @return [String] content of the resource at specified location
  #
  # @deprecated Instead, get a ParseResult and access the content field
  def self.parse_content(data_source, max_content_length: -1)
    parse(data_source, max_content_length: max_content_length).content
  end

  # Regarding max_content_length, the default is -1 (no limit). A value of 0 can be passed to save
  # unnecessary processing, since the content is being ignored. However, the PDF metadata
  # "pdf:unmappedUnicodeCharsPerPage" and "pdf:charsPerPage" will be absent if the max_content_length
  # is 0, and otherwise may differ depending on the number of characters read.
  #
  # @param [String] data_source file path or HTTP URL
  # @return [Hash] metadata of the resource at specified location
  #
  # @deprecated Instead, get a ParseResult and access the metadata field
  def self.parse_metadata(data_source, max_content_length: -1)
    parse(data_source, max_content_length: max_content_length).metadata
  end

  # @return [Object] the Tika language detector (an OptimaizeLangDetector with its models loaded),
  #   created on first use and reused afterwards
  def self.tika_language_detector
    init
    @tika_language_detector ||= OptimaizeLangDetector.new.loadModels
  end

  # Raise an error if not running under JRuby.
  # @return [nil] if running under JRuby
  # @raise [RuntimeError] if RUBY_PLATFORM does not contain 'java'
  def self.raise_unless_jruby
    return if RUBY_PLATFORM.match?(/java/)

    raise "\n\n\nRika can only be run with JRuby! It needs access to the Java Virtual Machine.\n\n\n"
  end
end
|
data/rika.gemspec
CHANGED
@@ -1,23 +1,36 @@
|
|
1
|
-
#
|
2
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'English'
|
4
|
+
|
5
|
+
lib = File.expand_path('lib', __dir__)
|
3
6
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
7
|
require 'rika/version'
|
5
8
|
|
6
9
|
Gem::Specification.new do |spec|
  # The same one-line blurb serves as both the description and the summary.
  blurb = 'A JRuby wrapper for Apache Tika to extract text and metadata from files of various formats.'

  # Identity
  spec.name        = 'rika'
  spec.version     = Rika::VERSION
  spec.authors     = ['Richard Nyström', 'Keith Bennett']
  spec.email       = ['ricny046@gmail.com', 'keithrbennett@gmail.com']
  spec.description = blurb
  spec.summary     = blurb
  spec.homepage    = 'https://github.com/keithrbennett/rika'

  # Contents: every file tracked by git; executables are the files found under bin/.
  tracked_files = `git ls-files`.split($INPUT_RECORD_SEPARATOR)
  spec.files         = tracked_files
  spec.executables   = spec.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  spec.require_paths = ['lib']

  spec.add_dependency 'awesome_print'
  spec.platform = 'java'
  spec.license  = 'Apache-2.0'
  spec.metadata['rubygems_mfa_required'] = 'true'

  # NOTE: I am excluding the Ruby version constraint because this gem runs only in JRuby, and I don't know the
  # minimum version requirement, and don't want to exclude use of any versions that might work.

  spec.post_install_message = <<~MESSAGE

    Using the rika gem requires that you:
    1) download the Apache Tika tika-app jar file from https://tika.apache.org/download.html
    2) place it somewhere accessible to the running application
    3) specify its location in the TIKA_JAR_FILESPEC environment variable

  MESSAGE
end
|