rika 1.11.1-java → 2.0.0-java

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,129 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'awesome_print'
4
+ require 'optparse'
5
+ require 'rika'
6
+ require 'rika/formatters'
7
+ require 'rika/cli/args_parser'
8
+
9
# Command line application for parsing documents with Rika.
# Syntax is:
#   rika [options] <file or url> [...file or url...]
# Run with -h or --help option for more details.
#
# By default both the content (text) and the metadata are output;
# the -t and -m flags can be used to enable or suppress either.
# Supported output formats are JSON, Pretty JSON, YAML, Awesome Print, to_s, and inspect
# (see the Formatters class).
class RikaCommand
  attr_reader :args, :help_text, :metadata_formatter, :options, :targets, :text_formatter

  # @param [Array<String>] args command line arguments; defaults to ARGV but may be overridden for testing
  def initialize(args = ARGV)
    # Work on a copy: the array is modified later (parsed options and directories are removed from it),
    # so a possibly frozen caller-supplied array must not be used directly.
    @args = args.dup
  end

  # Main method and entry point for this class' work.
  # Prints either one array containing all results, or one output section per target.
  # @return [nil]
  def call
    prepare
    report_and_exit_if_no_targets_specified
    if options[:as_array]
      puts result_array_output
    else
      targets.each { |target| puts single_document_output(target, parse_target(target)) }
    end
    nil
  end

  private

  # Prepares to run the parse. Kept separate from #call so that it can be called from tests.
  # @return [void]
  def prepare
    @options, @targets, @help_text = ArgsParser.call(args)
    set_output_formats
  end

  # Looks up the metadata and text formatters from the first and second characters
  # of the format option. Prints an error plus the help text and exits with status 1
  # if either character is not a known format.
  # @return [void]
  def set_output_formats
    format = options[:format]
    metadata_char = format[0]
    text_char = format[1]
    @metadata_formatter = Rika::Formatters.get(metadata_char)
    @text_formatter = Rika::Formatters.get(text_char)
    nil
  rescue KeyError
    $stderr.puts "Invalid format: #{format}\n\n"
    $stderr.puts help_text
    exit 1
  end

  # Parses a single target using the options currently in effect.
  # @param [String] target file path or URL to parse
  # @return [ParseResult] the parse result
  def parse_target(target)
    Rika.parse(target, max_content_length: max_content_length, key_sort: options[:key_sort])
  end

  # Converts a ParseResult to a hash containing only the pieces of data selected by the options.
  # @param [ParseResult] result the parse result
  # @return [Hash] the hash containing the selected pieces of data
  def result_hash(result)
    {}.tap do |hash|
      hash['source'] = result.metadata['rika:data-source'] if options[:source]
      hash['metadata'] = result.metadata if options[:metadata]
      hash['text'] = result.content if options[:text]
    end
  end

  # Builds the string representation of the result of parsing a single document.
  # @param [String] target the target document
  # @param [ParseResult] result the parse result
  # @return [String] the string representation of the result of parsing a single document
  def single_document_output(target, result)
    # When both parts are requested in the same structured format, emit one combined document.
    combined = options[:metadata] && options[:text] && %w[jj JJ yy].include?(options[:format])
    return metadata_formatter.(result_hash(result)) if combined

    sections = []
    sections << "Source: #{target}\n" if options[:source]
    sections << "#{metadata_formatter.(result.metadata)}\n" if options[:metadata]
    sections << "#{text_formatter.(result.content)}\n" if options[:text]
    sections.join
  end

  # Parses all targets and formats the results as a single array of hashes.
  # Array output requires one formatter for the whole structure (otherwise the output would be
  # invalid, especially with JSON or YAML), so the metadata formatter is used for everything.
  # @return [String] the string representation of the result of parsing the documents
  def result_array_output
    metadata_formatter.call(targets.map { |target| result_hash(parse_target(target)) })
  end

  # The maximum content length is used here only as an on/off switch for text output:
  # -1 when text is requested, 0 when it is suppressed.
  # @return [Integer] -1 or 0
  def max_content_length
    if options[:text]
      -1
    else
      0
    end
  end

  # Prints a message and the help text, then exits, if no targets are specified.
  # The exit code is zero because this is not necessarily an error: the targets may be generated
  # dynamically by a script that should not be aborted merely because nothing was generated.
  # @return [void] or exits
  def report_and_exit_if_no_targets_specified
    return nil unless targets.empty?

    $stderr.puts <<~MESSAGE

      No targets specified.

      #{help_text}
    MESSAGE
    exit 0
  end
end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require 'yaml'
5
+ require 'awesome_print'
6
+
7
module Rika
  # Manages the formatters used to format the output of the Rika command line application.
  # Each formatter is a lambda that takes the object to format and returns its string representation.
  class Formatters
    # `ai` is the Awesome Print method; it is only available when awesome_print has been loaded.
    AWESOME_PRINT_FORMATTER = ->(object) { object.ai }
    INSPECT_FORMATTER       = ->(object) { object.inspect }
    JSON_FORMATTER          = ->(object) { object.to_json }
    PRETTY_JSON_FORMATTER   = ->(object) { JSON.pretty_generate(object) }
    TO_S_FORMATTER          = ->(object) { object.to_s }
    YAML_FORMATTER          = ->(object) { object.to_yaml }

    # A hash of formatters, keyed by the format character.
    # The value is a lambda that takes the object to be formatted as a parameter.
    # @return [Hash] the hash of formatters
    FORMATTER_LOOKUP_TABLE = {
      'a' => AWESOME_PRINT_FORMATTER,
      'i' => INSPECT_FORMATTER,
      'j' => JSON_FORMATTER,
      'J' => PRETTY_JSON_FORMATTER,
      't' => TO_S_FORMATTER,
      'y' => YAML_FORMATTER
    }.freeze

    # The format characters accepted by .get.
    # Frozen so that this public constant cannot be mutated and drift from the lookup table.
    VALID_OPTION_CHARS = FORMATTER_LOOKUP_TABLE.keys.freeze

    # Gets the formatter lambda for the given option character.
    # @param [String] option_char the option character
    # @return [Proc] the formatter lambda
    # @raise [KeyError] if the option character is invalid
    def self.get(option_char)
      FORMATTER_LOOKUP_TABLE.fetch(option_char)
    end
  end
end
@@ -0,0 +1,34 @@
1
+ # frozen_string_literal: true
2
+
3
module Rika
  # Value object holding everything produced by parsing a single document.
  # Instances are created with keyword arguments, one per member.
  ParseResult = Struct.new(
    :content, :metadata, :metadata_java, :content_type,
    :language, :input_type, :data_source, :max_content_length,
    keyword_init: true
  ) do
    # 'text' is offered as a synonym for 'content', since it often reads more naturally.
    alias_method :text, :content

    # @return [Boolean] true if, and only if, input is a file
    def file?
      input_type == :file
    end

    # @return [Boolean] true if, and only if, input is HTTP
    def http?
      input_type == :http
    end

    # @return [Hash] only the :content and :metadata members of this instance, as a hash
    def content_and_metadata_hash
      to_h.slice(:content, :metadata)
    end
  end
end
data/lib/rika/parser.rb CHANGED
@@ -1,90 +1,84 @@
1
- module Rika
2
- class Parser
1
+ # frozen_string_literal: true
3
2
 
4
- attr_reader :data_source, :tika, :metadata_java, :metadata_ruby, :input_type
3
+ require 'uri'
4
+ require_relative 'parse_result'
5
5
 
6
- def initialize(data_source, max_content_length = -1, detector = DefaultDetector.new)
6
+ module Rika
7
+ # Parses a document and returns a ParseResult.
8
+ # This class is intended to be used only by the Rika module, not by users of the gem,
9
+ # who should instead call `Rika.parse`.
10
+ class Parser
11
+ # @param [String] data_source file path or HTTP(s) URL
12
+ # @param [Boolean] key_sort whether to sort the keys in the metadata hash, defaults to true
13
+ # @param [Integer] max_content_length maximum content length to return, defaults to all
14
+ # @param [Detector] detector Tika detector, defaults to DefaultDetector
15
+ def initialize(data_source, key_sort: true, max_content_length: -1, detector: DefaultDetector.new)
7
16
  @data_source = data_source
8
- @tika = Tika.new(detector)
9
- @tika.set_max_string_length(max_content_length)
10
- @metadata_java = nil
11
- @metadata_ruby = nil
12
- @input_type = get_input_type
13
- end
14
-
15
- def content
16
- parse
17
- @content
18
- end
19
-
20
- def metadata
21
- unless @metadata_ruby
22
- parse
23
- @metadata_ruby = metadata_java.names.each_with_object({}) do |name, m_ruby|
24
- m_ruby[name] = metadata_java.get(name)
25
- end
26
- end
27
- @metadata_ruby
28
- end
29
-
30
- def media_type
31
- @media_type ||= file? \
32
- ? tika.detect(java.io.File.new(data_source)) \
33
- : tika.detect(input_stream)
17
+ @key_sort = key_sort
18
+ @max_content_length = max_content_length
19
+ @detector = detector
20
+ @input_type = data_source_input_type
21
+ @tika = Tika.new(@detector)
34
22
  end
35
23
 
36
- # @deprecated
37
- def available_metadata
38
- metadata.keys
39
- end
40
-
41
- # @deprecated
42
- def metadata_exists?(name)
43
- metadata[name] != nil
44
- end
45
-
46
- def language
47
- @lang ||= LanguageIdentifier.new(content)
48
- @lang.language
49
- end
24
+ # Entry point method for parsing a document
25
+ # @return [ParseResult] parse result
26
+ def parse
27
+ metadata_java = Metadata.new
28
+ @tika.set_max_string_length(@max_content_length)
29
+ content = with_input_stream { |stream| @tika.parse_to_string(stream, metadata_java) }
30
+ language = Rika.language(content)
31
+ metadata_java.set('rika:language', language)
32
+ metadata_java.set('rika:data-source', @data_source)
33
+ metadata = metadata_java_to_ruby(metadata_java)
34
+ metadata = metadata.sort_by { |key, _value| key.downcase }.to_h if @key_sort
50
35
 
51
- # @deprecated
52
- # https://tika.apache.org/1.9/api/org/apache/tika/language/LanguageIdentifier.html#isReasonablyCertain()
53
- # says: WARNING: Will never return true for small amount of input texts.
54
- # https://tika.apache.org/1.19/api/org/apache/tika/language/LanguageIdentifier.html
55
- # indicated that the LanguageIdentifier class used in this implementation is deprecated.
56
- # TODO: More research needed to see if an alternate implementation can be used.
57
- def language_is_reasonably_certain?
58
- @lang ||= LanguageIdentifier.new(content)
59
- @lang.is_reasonably_certain
36
+ ParseResult.new(
37
+ content: content,
38
+ metadata: metadata,
39
+ metadata_java: metadata_java,
40
+ content_type: metadata['Content-Type'],
41
+ language: language,
42
+ input_type: @input_type,
43
+ data_source: @data_source,
44
+ max_content_length: @max_content_length
45
+ )
60
46
  end
61
47
 
62
-
63
- def parse
64
- unless @content
65
- @metadata_java = Metadata.new
66
- @content = tika.parse_to_string(input_stream, @metadata_java).to_s.strip
48
# Copies the entries of a Tika metadata object into a plain Ruby hash,
# reading one value per name via the metadata object's `get` method.
# @param [Metadata] metadata_java a Tika Java metadata instance populated by the parse and added to by this class
# @return [Hash] a Ruby hash containing the data of the Java Metadata instance
private def metadata_java_to_ruby(metadata_java)
  name_value_pairs = metadata_java.names.map { |key| [key, metadata_java.get(key)] }
  name_value_pairs.to_h
end
69
55
 
70
- private def get_input_type
71
- if File.file?(data_source)
56
+ # @return [Symbol] input type (currently only :file and :http are supported)
57
+ # @raise [IOError] if input is not a file or HTTP resource
58
+ private def data_source_input_type
59
+ if File.file?(@data_source)
72
60
  :file
73
- elsif URI(data_source).is_a?(URI::HTTP) && URI.open(data_source)
61
+ elsif URI(@data_source).is_a?(URI::HTTP)
74
62
  :http
75
63
  else
76
- raise IOError, "Input (#{data_source}) is not an available file or HTTP resource."
64
+ raise IOError, "Input (#{@data_source}) is not an available file or HTTP resource."
77
65
  end
78
66
  end
79
67
 
80
- private def input_stream
81
- file? \
82
- ? FileInputStream.new(java.io.File.new(data_source)) \
83
- : URL.new(data_source).open_stream
84
- end
85
-
86
- private def file?
87
- input_type == :file
68
# Opens an input stream on the configured data source, yields it to the passed block,
# and closes the stream afterwards, even if the block raises.
# A :file input is opened with FileInputStream; any other input type is opened as a URL.
# @yieldparam stream the opened input stream
# @return [Object] the value returned by the passed code block
private def with_input_stream
  stream = case @input_type
           when :file then FileInputStream.new(java.io.File.new(@data_source))
           else URL.new(@data_source).open_stream
           end
  yield stream
ensure
  # stream is nil if opening failed, in which case there is nothing to close.
  stream.close if stream.respond_to?(:close)
end
89
83
  end
90
84
  end
@@ -0,0 +1,65 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Requires the Tika jar file, either from the default location (packaged with this gem)
4
+ # or from an override specified in the TIKA_JAR_FILESPEC environment variable.
5
+
6
module Rika
  # Loads the Apache Tika jar file named by the TIKA_JAR_FILESPEC environment variable.
  # It is not intended to be instantiated. Instead, call the only public class method, `require_tika`.
  class TikaLoader
    class << self
      # Requires the Tika jar file.
      # @return [String] absolute filespec of loaded Tika jar file
      # @raise [TikaLoadError] if the Tika jar file is not specified or cannot be loaded
      def require_tika
        filespec = specified_tika_filespec
        absolute_filespec = File.absolute_path(filespec)
        require absolute_filespec
        absolute_filespec
      rescue LoadError
        lines = ["Unable to load Tika jar file from '#{filespec}'."]
        # Only mention the absolute filespec when it adds information.
        lines << "Absolute filespec is '#{absolute_filespec}'." if filespec != absolute_filespec
        raise TikaLoadError, lines.join("\n")
      end

      private

      # Gets the Tika jar filespec from the TIKA_JAR_FILESPEC environment variable.
      # The value is returned exactly as set (it is not stripped).
      #
      # @return [String] Tika jar filespec from env var TIKA_JAR_FILESPEC
      # @raise [TikaLoadError] if the variable is unset, empty, or contains only whitespace
      def specified_tika_filespec
        filespec = ENV['TIKA_JAR_FILESPEC']
        raise(TikaLoadError, 'Environment variable TIKA_JAR_FILESPEC is not set.') if filespec.to_s.strip.empty?

        filespec
      end

      # Formats an error message for printing to stderr, framed by banner lines of 79 '!' characters.
      #
      # @param [String] message the error message
      # @return [String] the formatted error message
      def formatted_error_message(message)
        banner = '!' * 79
        "\n#{banner}\n#{message}\n#{banner}\n\n"
      end

      # Prints an error message to stderr and exits with a non-zero exit code.
      # @param [String] message the error message
      def print_message_and_exit(message)
        warn formatted_error_message(message)
        exit 1
      end
    end
  end

  # This error class reports the inability to load the Tika jar file.
  class TikaLoadError < RuntimeError
  end
end
data/lib/rika/version.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Rika
2
- VERSION = "1.11.1"
4
+ VERSION = '2.0.0'
3
5
  end
data/lib/rika.rb CHANGED
@@ -1,43 +1,114 @@
1
- # encoding: utf-8
1
+ # frozen_string_literal: true
2
2
 
3
- raise "You need to run JRuby to use Rika" unless RUBY_PLATFORM =~ /java/
4
-
5
- require "rika/version"
6
- require 'uri'
7
- require 'open-uri'
3
+ # This file is the top level file for the Rika gem.
4
+ # It requires the other files in the gem and provides the top level API.
5
+ # It also provides the top level module for the gem.
6
+ require 'rika/version'
8
7
  require_relative 'rika/parser'
9
- require_relative '../java-lib/tika-app-1.24.1.jar'
8
+ require_relative 'rika/tika_loader'
10
9
 
10
+ # The top level module for the Rika gem.
11
11
  module Rika
12
- import org.apache.tika.metadata.Metadata
13
- import org.apache.tika.Tika
14
- import org.apache.tika.language.LanguageIdentifier
15
- import org.apache.tika.detect.DefaultDetector
16
- import java.io.FileInputStream
17
- import java.net.URL
12
+ PROJECT_URL = 'https://github.com/keithrbennett/rika'
13
+
14
# Loads the Tika jar file and imports the needed Java classes.
# The work is done only once; subsequent calls are no-ops.
# @return [Module] the Rika module, for chaining (on the first and on every later call)
# @raise [RuntimeError] if not running under JRuby
def self.init
  # Return self (not nil) when already initialized so that chaining works on every call.
  return self if @initialized

  Rika.raise_unless_jruby

  Rika::TikaLoader.require_tika
  import java.io.FileInputStream
  import java.net.URL
  import org.apache.tika.Tika
  import org.apache.tika.detect.DefaultDetector
  import org.apache.tika.io.TikaInputStream
  import org.apache.tika.langdetect.optimaize.OptimaizeLangDetector
  import org.apache.tika.language.detect.LanguageDetector
  import org.apache.tika.language.detect.LanguageResult
  import org.apache.tika.metadata.Metadata

  # Mark as initialized only after everything above succeeded, so a failed attempt can be retried.
  @initialized = true
  self
end
35
+
36
# Gets a ParseResult from parsing a document.
#
# @param [String] data_source file path or HTTP(s) URL
# @param [Boolean] key_sort whether to sort the keys in the metadata hash, defaults to true
# @param [Integer] max_content_length maximum content length to return, defaults to all
# @param [Detector, nil] detector Tika detector; when nil (the default) a new DefaultDetector is used
# @return [ParseResult]
def self.parse(data_source, key_sort: true, max_content_length: -1, detector: nil)
  init
  # The default detector must be created only after init has run: default argument expressions are
  # evaluated before the method body, when the Java class DefaultDetector may not be imported yet.
  detector ||= DefaultDetector.new
  parser = Parser.new(data_source, key_sort: key_sort, max_content_length: max_content_length, detector: detector)
  parser.parse
end
18
48
 
19
- def self.parse_content_and_metadata(file_location, max_content_length = -1)
20
- parser = Parser.new(file_location, max_content_length)
21
- [parser.content, parser.metadata]
49
# Reads the implementation version from the Java package of the loaded Tika class.
# @return [String] version of loaded Tika jar file
def self.tika_version
  init
  tika_package = Tika.java_class.package
  tika_package.implementation_version
end
23
54
 
24
- def self.parse_content_and_metadata_as_hash(file_location, max_content_length = -1)
25
- content, metadata = parse_content_and_metadata(file_location, max_content_length)
26
- { content: content, metadata: metadata }
55
# Detects the language of the given text using the shared Tika language detector.
# @param [String] text text to detect language of
# @return [String] language of passed text, as 2-character ISO 639-1 code
def self.language(text)
  init
  detector = tika_language_detector
  detection = detector.detect(text.to_java_string)
  detection.get_language
end
28
61
 
29
- def self.parse_content(file_location, max_content_length = -1)
30
- Parser.new(file_location, max_content_length).content
62
# Parses a document and returns its content and metadata as a two-element array.
# @param [String] data_source file path or HTTP URL
# @param [Integer] max_content_length maximum content length to return, defaults to -1
# @return [Array<String,Hash>] content and metadata of file at specified location
#
# @deprecated Instead, get a ParseResult and access the content and metadata fields.
def self.parse_content_and_metadata(data_source, max_content_length: -1)
  init
  parsed = parse(data_source, max_content_length: max_content_length)
  %i[content metadata].map { |field| parsed.public_send(field) }
end
71
+
72
# Parses a document and returns its content and metadata in a hash with keys :content and :metadata.
# @param [String] data_source file path or HTTP URL
# @param [Integer] max_content_length maximum content length to return, defaults to -1
# @return [Hash] content and metadata of file at specified location
#
# @deprecated Instead, use a ParseResult or its to_h method.
def self.parse_content_and_metadata_as_hash(data_source, max_content_length: -1)
  init
  parsed = parse(data_source, max_content_length: max_content_length)
  %i[content metadata].map { |field| [field, parsed.public_send(field)] }.to_h
end
81
+
82
# Parses a document and returns only its content.
# @param [String] data_source file path or HTTP URL
# @param [Integer] max_content_length maximum content length to return, defaults to -1
# @return [String] content of the resource at the specified location
#
# @deprecated Instead, get a ParseResult and access the content field
def self.parse_content(data_source, max_content_length: -1)
  init
  parsed = parse(data_source, max_content_length: max_content_length)
  parsed.content
end
32
90
 
33
91
  # Regarding max_content_length: the default is -1 (no limit); passing 0 would save unnecessary processing,
34
92
  # since the content is being ignored. However, the PDF metadata "pdf:unmappedUnicodeCharsPerPage"
35
- # and "pdf:charsPerPage" will be absent if the max_content_length is 0, and will be
36
- # ]may differ depending on
37
- # the number of characters read.
38
- def self.parse_metadata(file_location, max_content_length = 0)
39
- Parser.new(file_location, max_content_length).metadata
93
+ # and "pdf:charsPerPage" will be absent if the max_content_length is 0, and otherwise may differ
94
+ # depending on the number of characters read.
95
+ #
96
+ # @deprecated Instead, get a ParseResult and access the metadata field
97
# @param [String] data_source file path or HTTP URL
# @param [Integer] max_content_length maximum content length to read, defaults to -1
# @return [Hash] metadata of the resource at the specified location
def self.parse_metadata(data_source, max_content_length: -1)
  init
  parsed = parse(data_source, max_content_length: max_content_length)
  parsed.metadata
end
41
- end
42
101
 
102
# Returns the language detector, creating it and loading its models on first use
# and reusing the same instance afterwards.
# @return [LanguageDetector] Tika language detector (an OptimaizeLangDetector with models loaded)
def self.tika_language_detector
  init
  @tika_language_detector = OptimaizeLangDetector.new.loadModels unless @tika_language_detector
  @tika_language_detector
end
43
107
 
108
# Raises an error unless the Ruby platform string identifies a Java (JRuby) runtime.
# @return [nil] if running under JRuby
# @raise [RuntimeError] if not running under JRuby
def self.raise_unless_jruby
  return if RUBY_PLATFORM.include?('java')

  raise "\n\n\nRika can only be run with JRuby! It needs access to the Java Virtual Machine.\n\n\n"
end
114
+ end
data/rika.gemspec CHANGED
@@ -1,23 +1,36 @@
1
- # -*- encoding: utf-8 -*-
2
- lib = File.expand_path('../lib', __FILE__)
1
+ # frozen_string_literal: true
2
+
3
+ require 'English'
4
+
5
+ lib = File.expand_path('lib', __dir__)
3
6
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
7
  require 'rika/version'
5
8
 
6
9
  Gem::Specification.new do |gem|
7
- gem.name = "rika"
10
+ gem.name = 'rika'
8
11
  gem.version = Rika::VERSION
9
- gem.authors = ["Richard Nyström", "Keith Bennett"]
10
- gem.email = ["ricny046@gmail.com", "keithrbennett@gmail.com"]
11
- gem.description = %q{ A JRuby wrapper for Apache Tika to extract text and metadata from files of various formats. }
12
- gem.summary = %q{ A JRuby wrapper for Apache Tika to extract text and metadata from files of various formats. }
13
- gem.homepage = "https://github.com/keithrbennett/rika"
14
- gem.files = `git ls-files`.split($/)
15
- gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
16
- gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
17
- gem.require_paths = ["lib"]
18
- gem.add_development_dependency "rspec", "~> 3.9"
19
- gem.add_development_dependency "rake", "~> 13.0"
20
- gem.platform = "java"
21
- gem.license = "Apache-2.0"
22
- end
12
+ gem.authors = ['Richard Nyström', 'Keith Bennett']
13
+ gem.email = ['ricny046@gmail.com', 'keithrbennett@gmail.com']
14
+ gem.description = 'A JRuby wrapper for Apache Tika to extract text and metadata from files of various formats.'
15
+ gem.summary = 'A JRuby wrapper for Apache Tika to extract text and metadata from files of various formats.'
16
+ gem.homepage = 'https://github.com/keithrbennett/rika'
17
+ gem.files = `git ls-files`.split($INPUT_RECORD_SEPARATOR)
18
+ gem.executables = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
19
+ gem.require_paths = ['lib']
20
+ gem.add_dependency 'awesome_print'
21
+ gem.platform = 'java'
22
+ gem.license = 'Apache-2.0'
23
+ gem.metadata['rubygems_mfa_required'] = 'true'
24
+
25
+ # NOTE: I am excluding the Ruby version constraint because this gem runs only in JRuby, and I don't know the
26
+ # minimum version requirement, and don't want to exclude use of any versions that might work.
23
27
 
28
+ gem.post_install_message = <<~MESSAGE
29
+
30
+ Using the rika gem requires that you:
31
+ 1) download the Apache Tika tika-app jar file from https://tika.apache.org/download.html
32
+ 2) place it somewhere accessible to the running application
33
+ 3) specify its location in the TIKA_JAR_FILESPEC environment variable
34
+
35
+ MESSAGE
36
+ end