rika 1.11.1-java → 2.0.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,129 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'awesome_print'
4
+ require 'optparse'
5
+ require 'rika'
6
+ require 'rika/formatters'
7
+ require 'rika/cli/args_parser'
8
+
9
+ # This command line application enables the parsing of documents on the command line.
10
+ # Syntax is:
11
+ # rika [options] <file or url> [...file or url...]
12
+ # Run with -h or --help option for more details.
13
+ #
14
+ # Defaults to outputting both content (text) and metadata,
15
+ # but the -t and -m flags can be used to enable or suppress either.
16
+ # Supports output formats of JSON, Pretty JSON, YAML, Awesome Print, to_s, and inspect (see Formatters class).
17
class RikaCommand
  attr_reader :args, :help_text, :metadata_formatter, :options, :targets, :text_formatter

  # @param [Array<String>] args command line arguments; ARGV by default, but may be overridden for testing
  def initialize(args = ARGV)
    # Work on a copy: the array is modified later (parsed options and directories are removed),
    # which would fail if the caller handed us a frozen array.
    @args = args.dup
  end

  # Main entry point: parses the command line, then parses each target and prints the result to stdout.
  # Exits early (via the helper methods below) on an invalid format or when no targets were given.
  # @return [nil]
  def call
    prepare
    report_and_exit_if_no_targets_specified

    if options[:as_array]
      puts result_array_output
    else
      targets.each { |target| puts single_document_output(target, parse_target(target)) }
    end
    nil
  end

  private

  # Parses the command line arguments and selects the output formatters.
  # Kept separate from #call so that the setup can be exercised on its own.
  # @return [void]
  def prepare
    @options, @targets, @help_text = ArgsParser.call(args)
    set_output_formats
  end

  # Selects the metadata and text formatters from the two characters of the format option.
  # Prints an error and the help text to stderr, then exits with code 1, if either character is invalid.
  # @return [void]
  def set_output_formats
    format = options[:format]
    metadata_char = format[0]
    text_char = format[1]
    @metadata_formatter = Rika::Formatters.get(metadata_char)
    @text_formatter = Rika::Formatters.get(text_char)
    nil
  rescue KeyError
    $stderr.puts "Invalid format: #{format}\n\n"
    $stderr.puts help_text
    exit 1
  end

  # Parses a single target using the options selected on the command line.
  # @param [String] target the file path or URL to parse
  # @return [ParseResult] the parse result
  def parse_target(target)
    Rika.parse(target, max_content_length: max_content_length, key_sort: options[:key_sort])
  end

  # Converts a ParseResult to a hash containing only the pieces of data selected by the options.
  # @param [ParseResult] result the parse result
  # @return [Hash] hash with any of the keys 'source', 'metadata' and 'text', in that order
  def result_hash(result)
    {}.tap do |selected|
      selected['source'] = result.metadata['rika:data-source'] if options[:source]
      selected['metadata'] = result.metadata if options[:metadata]
      selected['text'] = result.content if options[:text]
    end
  end

  # Builds the string representation of the result of parsing a single document.
  # @param [String] target the target document
  # @param [ParseResult] result the parse result
  # @return [String] the formatted output for this document
  def single_document_output(target, result)
    # When both parts are requested in the same JSON/YAML format, emit one combined document
    # rather than two separately formatted pieces.
    combined = options[:metadata] && options[:text] && %w[jj JJ yy].include?(options[:format])
    return metadata_formatter.(result_hash(result)) if combined

    buffer = StringIO.new
    buffer << "Source: #{target}\n" if options[:source]
    if options[:metadata]
      buffer << metadata_formatter.(result.metadata)
      buffer << "\n"
    end
    if options[:text]
      buffer << text_formatter.(result.content)
      buffer << "\n"
    end
    buffer.string
  end

  # Parses all targets and formats the results as a single array of hashes.
  # An array must be rendered by one formatter (mixing two would produce invalid output,
  # especially with JSON or YAML), so the metadata formatter is used for the whole array.
  # @return [String] the formatted array of per-document hashes
  def result_array_output
    metadata_formatter.call(targets.map { |target| result_hash(parse_target(target)) })
  end

  # The command line does not expose Tika's maximum content length as an option;
  # it is used only to switch text extraction fully on (-1) or off (0).
  # @return [Integer] -1 when text output is enabled, otherwise 0
  def max_content_length
    return -1 if options[:text]

    0
  end

  # Prints a message and the help text to stderr and exits if no targets are specified.
  # The exit code is zero because this is not necessarily an error: the list of documents may be
  # generated dynamically by a script, and that script may not want to abort when the list is empty.
  # @return [nil] when there is at least one target; otherwise the process exits
  def report_and_exit_if_no_targets_specified
    return unless targets.empty?

    $stderr.puts <<~MESSAGE

      No targets specified.

      #{help_text}
    MESSAGE
    exit 0
  end
end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require 'yaml'
5
+ require 'awesome_print'
6
+
7
module Rika
  # This class manages the formatters used to format the output of the Rika command line application.
  # Each formatter is a lambda that takes the object to be formatted and returns its string representation.
  class Formatters
    # Formats using the `ai` method (provided by the awesome_print gem required by this file).
    AWESOME_PRINT_FORMATTER = ->(object) { object.ai }
    INSPECT_FORMATTER       = ->(object) { object.inspect }
    JSON_FORMATTER          = ->(object) { object.to_json }
    PRETTY_JSON_FORMATTER   = ->(object) { JSON.pretty_generate(object) }
    TO_S_FORMATTER          = ->(object) { object.to_s }
    YAML_FORMATTER          = ->(object) { object.to_yaml }

    # A hash of formatters, keyed by the format character.
    # The value is a lambda that takes the object to be formatted as a parameter.
    # @return [Hash] the hash of formatters
    FORMATTER_LOOKUP_TABLE = {
      'a' => AWESOME_PRINT_FORMATTER,
      'i' => INSPECT_FORMATTER,
      'j' => JSON_FORMATTER,
      'J' => PRETTY_JSON_FORMATTER,
      't' => TO_S_FORMATTER,
      'y' => YAML_FORMATTER
    }.freeze

    # The characters accepted as format options. Frozen, like the lookup table it is derived from,
    # so that the list of valid characters cannot drift from the table at runtime.
    # @return [Array<String>] the valid option characters
    VALID_OPTION_CHARS = FORMATTER_LOOKUP_TABLE.keys.freeze

    # Gets the formatter lambda for the given option character.
    # @param [String] option_char the option character
    # @return [Proc] the formatter lambda
    # @raise [KeyError] if the option character is invalid
    def self.get(option_char)
      FORMATTER_LOOKUP_TABLE.fetch(option_char)
    end
  end
end
@@ -0,0 +1,34 @@
1
+ # frozen_string_literal: true
2
+
3
module Rika
  # Value object holding everything produced by parsing one document:
  # the extracted content, the metadata (as a Ruby hash and as the original Java object),
  # and details about the input that was parsed.
  ParseResult = Struct.new(
    *%i[content metadata metadata_java content_type language input_type data_source max_content_length],
    keyword_init: true
  ) do
    # 'text' is offered as a synonym for 'content', since it often reads more naturally.
    alias_method :text, :content

    # Defines the input type predicates:
    #   file? - true if, and only if, input is a file
    #   http? - true if, and only if, input is HTTP
    %i[file http].each do |type|
      define_method(:"#{type}?") { input_type == type }
    end

    # @return [Hash] the content and metadata of this instance, keyed by :content and :metadata
    def content_and_metadata_hash
      to_h.slice(:content, :metadata)
    end
  end
end
data/lib/rika/parser.rb CHANGED
@@ -1,90 +1,84 @@
1
- module Rika
2
- class Parser
1
+ # frozen_string_literal: true
3
2
 
4
- attr_reader :data_source, :tika, :metadata_java, :metadata_ruby, :input_type
3
+ require 'uri'
4
+ require_relative 'parse_result'
5
5
 
6
- def initialize(data_source, max_content_length = -1, detector = DefaultDetector.new)
6
module Rika
  # Parses a document and returns a ParseResult.
  # This class is intended to be used only by the Rika module, not by users of the gem,
  # who should instead call `Rika.parse`.
  class Parser
    # @param [String] data_source file path or HTTP(s) URL
    # @param [Boolean] key_sort whether to sort the keys in the metadata hash, defaults to true
    # @param [Integer] max_content_length maximum content length to return, defaults to all
    # @param [Detector] detector Tika detector, defaults to DefaultDetector
    # @raise [IOError] if data_source is neither an existing file nor an HTTP(S) URL
    def initialize(data_source, key_sort: true, max_content_length: -1, detector: DefaultDetector.new)
      @data_source = data_source
      @key_sort = key_sort
      @max_content_length = max_content_length
      @detector = detector
      @input_type = data_source_input_type
      @tika = Tika.new(@detector)
    end

    # Entry point method for parsing a document.
    # Besides the metadata reported by Tika, the result's metadata contains the keys
    # 'rika:language' and 'rika:data-source', which are added here.
    # @return [ParseResult] parse result
    def parse
      metadata_java = Metadata.new
      @tika.set_max_string_length(@max_content_length)
      content = with_input_stream { |stream| @tika.parse_to_string(stream, metadata_java) }
      language = Rika.language(content)
      metadata_java.set('rika:language', language)
      metadata_java.set('rika:data-source', @data_source)
      metadata = metadata_java_to_ruby(metadata_java)
      # Sorting is case insensitive so that keys differing only in case end up adjacent.
      metadata = metadata.sort_by { |key, _value| key.downcase }.to_h if @key_sort

      ParseResult.new(
        content: content,
        metadata: metadata,
        metadata_java: metadata_java,
        content_type: metadata['Content-Type'],
        language: language,
        input_type: @input_type,
        data_source: @data_source,
        max_content_length: @max_content_length
      )
    end

    # @param [Metadata] metadata_java a Tika Java metadata instance populated by the parse and added to by this class
    # @return [Hash] a Ruby hash containing the data of the Java Metadata instance
    private def metadata_java_to_ruby(metadata_java)
      metadata_java.names.each_with_object({}) do |name, m_ruby|
        m_ruby[name] = metadata_java.get(name)
      end
    end

    # @return [Symbol] input type (currently only :file and :http are supported)
    # @raise [IOError] if input is not a file or HTTP resource
    private def data_source_input_type
      return :file if File.file?(@data_source)
      return :http if http_url?(@data_source)

      raise IOError, "Input (#{@data_source}) is not an available file or HTTP resource."
    end

    # Tests whether a string is an HTTP or HTTPS URL.
    # A string that cannot be parsed as a URI at all (e.g. a path containing spaces that does
    # not exist as a file) is simply "not an HTTP URL"; without this rescue the caller would get
    # a URI::InvalidURIError instead of the documented IOError.
    # @param [String] string the candidate URL
    # @return [Boolean] true if the string parses as an HTTP(S) URI
    private def http_url?(string)
      URI(string).is_a?(URI::HTTP)
    rescue URI::InvalidURIError
      false
    end

    # * Creates and opens an input stream from the configured resource.
    # * Yields that stream to the passed code block.
    # * Then closes the stream.
    # @return [Object] the value returned by the passed code block
    private def with_input_stream
      input_stream =
        if @input_type == :file
          FileInputStream.new(java.io.File.new(@data_source))
        else
          URL.new(@data_source).open_stream
        end
      yield input_stream
    ensure
      # input_stream is nil if opening the stream raised; there is nothing to close in that case.
      input_stream&.close
    end
  end
end
@@ -0,0 +1,65 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Requires the Tika jar file, either from the default location (packaged with this gem)
4
+ # or from an override specified in the TIKA_JAR_FILESPEC environment variable.
5
+
6
module Rika
  # This class handles the loading of the Apache Tika Java jar file.
  # It is not intended to be instantiated. Instead, call the only public class method, `require_tika`.
  class TikaLoader
    class << self
      # Requires the Tika jar file named by the TIKA_JAR_FILESPEC environment variable.
      # @return [String] absolute filespec of loaded Tika jar file
      # @raise [TikaLoadError] if the jar file was not specified or cannot be loaded
      def require_tika
        filespec = specified_tika_filespec
        absolute_filespec = File.absolute_path(filespec)
        require absolute_filespec
        absolute_filespec
      rescue LoadError
        # Report the filespec as the user gave it, and the absolute form too when that differs,
        # so that a relative filespec resolved against an unexpected directory is easy to spot.
        details = ["Unable to load Tika jar file from '#{filespec}'."]
        details << "Absolute filespec is '#{absolute_filespec}'." unless filespec == absolute_filespec
        raise TikaLoadError, details.join("\n")
      end

      private

      # Gets the Tika jar filespec from the TIKA_JAR_FILESPEC environment variable.
      #
      # @return [String] Tika jar filespec from env var TIKA_JAR_FILESPEC, exactly as set
      # @raise [TikaLoadError] if the variable is unset, empty, or contains only whitespace
      def specified_tika_filespec
        filespec = ENV['TIKA_JAR_FILESPEC']
        return filespec unless filespec.nil? || filespec.strip.empty?

        raise(TikaLoadError, 'Environment variable TIKA_JAR_FILESPEC is not set.')
      end

      # Formats an error message for printing to stderr, framed above and below
      # by a fixed-width (79 character) banner of exclamation marks.
      #
      # @param [String] message the error message
      # @return [String] the formatted error message
      def formatted_error_message(message)
        banner = '!' * 79
        "\n#{banner}\n#{message}\n#{banner}\n\n"
      end

      # Prints an error message to stderr and exits with a non-zero exit code.
      # @param [String] message the error message
      def print_message_and_exit(message)
        warn formatted_error_message(message)
        exit 1
      end
    end
  end

  # This error class reports the inability to load the Tika jar file.
  class TikaLoadError < RuntimeError; end
end
data/lib/rika/version.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Rika
2
- VERSION = "1.11.1"
4
+ VERSION = '2.0.0'
3
5
  end
data/lib/rika.rb CHANGED
@@ -1,43 +1,114 @@
1
- # encoding: utf-8
1
+ # frozen_string_literal: true
2
2
 
3
- raise "You need to run JRuby to use Rika" unless RUBY_PLATFORM =~ /java/
4
-
5
- require "rika/version"
6
- require 'uri'
7
- require 'open-uri'
3
+ # This file is the top level file for the Rika gem.
4
+ # It requires the other files in the gem and provides the top level API.
5
+ # It also provides the top level module for the gem.
6
+ require 'rika/version'
8
7
  require_relative 'rika/parser'
9
- require_relative '../java-lib/tika-app-1.24.1.jar'
8
+ require_relative 'rika/tika_loader'
10
9
 
10
+ # The top level module for the Rika gem.
11
11
module Rika
  PROJECT_URL = 'https://github.com/keithrbennett/rika'

  # Loads the Tika jar file and imports the needed Java classes.
  # Only the first call does any work; later calls return immediately.
  # @return [Module] the Rika module, for chaining (on every call, not only the first)
  # @raise [RuntimeError] if not running under JRuby
  def self.init
    return self if @initialized

    Rika.raise_unless_jruby

    Rika::TikaLoader.require_tika
    import java.io.FileInputStream
    import java.net.URL
    import org.apache.tika.Tika
    import org.apache.tika.detect.DefaultDetector
    import org.apache.tika.io.TikaInputStream
    import org.apache.tika.langdetect.optimaize.OptimaizeLangDetector
    import org.apache.tika.language.detect.LanguageDetector
    import org.apache.tika.language.detect.LanguageResult
    import org.apache.tika.metadata.Metadata

    @initialized = true
    self
  end

  # Gets a ParseResult from parsing a document.
  #
  # @param [String] data_source file path or HTTP(s) URL
  # @param [Boolean] key_sort whether to sort the keys in the metadata hash, defaults to true
  # @param [Integer] max_content_length maximum content length to return, defaults to all
  # @param [Detector] detector Tika detector, defaults to a new DefaultDetector
  # @return [ParseResult]
  def self.parse(data_source, key_sort: true, max_content_length: -1, detector: nil)
    init
    # The default detector is created here rather than in the parameter list, because default
    # argument expressions are evaluated before the method body runs, i.e. before `init` has
    # imported the DefaultDetector class.
    detector ||= DefaultDetector.new
    parser = Parser.new(data_source, key_sort: key_sort, max_content_length: max_content_length, detector: detector)
    parser.parse
  end

  # @return [String] version of loaded Tika jar file
  def self.tika_version
    init
    Tika.java_class.package.implementation_version
  end

  # @param [String] text text to detect language of
  # @return [String] language of passed text, as 2-character ISO 639-1 code
  def self.language(text)
    init
    tika_language_detector.detect(text.to_java_string).get_language
  end

  # @param [String] data_source file path or HTTP URL
  # @param [Integer] max_content_length maximum content length to return, defaults to all
  # @return [Array<String,Hash>] content and metadata of file at specified location
  #
  # @deprecated Instead, get a ParseResult and access the content and metadata fields.
  def self.parse_content_and_metadata(data_source, max_content_length: -1)
    result = parse(data_source, max_content_length: max_content_length)
    [result.content, result.metadata]
  end

  # @param [String] data_source file path or HTTP URL
  # @param [Integer] max_content_length maximum content length to return, defaults to all
  # @return [Hash] content and metadata of file at specified location
  #
  # @deprecated Instead, use a ParseResult or its to_h method.
  def self.parse_content_and_metadata_as_hash(data_source, max_content_length: -1)
    result = parse(data_source, max_content_length: max_content_length)
    { content: result.content, metadata: result.metadata }
  end

  # @param [String] data_source file path or HTTP URL
  # @param [Integer] max_content_length maximum content length to return, defaults to all
  # @return [String] content of the resource at the specified location
  #
  # @deprecated Instead, get a ParseResult and access the content field
  def self.parse_content(data_source, max_content_length: -1)
    parse(data_source, max_content_length: max_content_length).content
  end

  # Regarding max_content_length, the default is -1 (no limit). Passing 0 saves unnecessary
  # processing, since the content is being ignored. However, the PDF metadata
  # "pdf:unmappedUnicodeCharsPerPage" and "pdf:charsPerPage" will be absent if the
  # max_content_length is 0, and otherwise may differ depending on the number of characters read.
  #
  # @param [String] data_source file path or HTTP URL
  # @param [Integer] max_content_length maximum content length to read, defaults to all
  # @return [Hash] metadata of the resource at the specified location
  #
  # @deprecated Instead, get a ParseResult and access the metadata field
  def self.parse_metadata(data_source, max_content_length: -1)
    parse(data_source, max_content_length: max_content_length).metadata
  end

  # The detector is created on first use and then reused.
  # @return [LanguageDetector] Tika language detector with its models loaded
  def self.tika_language_detector
    init
    @tika_language_detector ||= OptimaizeLangDetector.new.loadModels
  end

  # Raise an error if not running under JRuby.
  # @return [nil] when running under JRuby
  # @raise [RuntimeError] when running under any other Ruby implementation
  def self.raise_unless_jruby
    return if RUBY_PLATFORM.match?(/java/)

    raise "\n\n\nRika can only be run with JRuby! It needs access to the Java Virtual Machine.\n\n\n"
  end
end
data/rika.gemspec CHANGED
@@ -1,23 +1,36 @@
1
- # -*- encoding: utf-8 -*-
2
- lib = File.expand_path('../lib', __FILE__)
1
+ # frozen_string_literal: true
2
+
3
+ require 'English'
4
+
5
+ lib = File.expand_path('lib', __dir__)
3
6
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
7
  require 'rika/version'
5
8
 
6
9
  Gem::Specification.new do |gem|
7
- gem.name = "rika"
10
+ gem.name = 'rika'
8
11
  gem.version = Rika::VERSION
9
- gem.authors = ["Richard Nyström", "Keith Bennett"]
10
- gem.email = ["ricny046@gmail.com", "keithrbennett@gmail.com"]
11
- gem.description = %q{ A JRuby wrapper for Apache Tika to extract text and metadata from files of various formats. }
12
- gem.summary = %q{ A JRuby wrapper for Apache Tika to extract text and metadata from files of various formats. }
13
- gem.homepage = "https://github.com/keithrbennett/rika"
14
- gem.files = `git ls-files`.split($/)
15
- gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
16
- gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
17
- gem.require_paths = ["lib"]
18
- gem.add_development_dependency "rspec", "~> 3.9"
19
- gem.add_development_dependency "rake", "~> 13.0"
20
- gem.platform = "java"
21
- gem.license = "Apache-2.0"
22
- end
12
+ gem.authors = ['Richard Nyström', 'Keith Bennett']
13
+ gem.email = ['ricny046@gmail.com', 'keithrbennett@gmail.com']
14
+ gem.description = 'A JRuby wrapper for Apache Tika to extract text and metadata from files of various formats.'
15
+ gem.summary = 'A JRuby wrapper for Apache Tika to extract text and metadata from files of various formats.'
16
+ gem.homepage = 'https://github.com/keithrbennett/rika'
17
+ gem.files = `git ls-files`.split($INPUT_RECORD_SEPARATOR)
18
+ gem.executables = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
19
+ gem.require_paths = ['lib']
20
+ gem.add_dependency 'awesome_print'
21
+ gem.platform = 'java'
22
+ gem.license = 'Apache-2.0'
23
+ gem.metadata['rubygems_mfa_required'] = 'true'
24
+
25
+ # NOTE: I am excluding the Ruby version constraint because this gem runs only in JRuby, and I don't know the
26
+ # minimum version requirement, and don't want to exclude use of any versions that might work.
23
27
 
28
+ gem.post_install_message = <<~MESSAGE
29
+
30
+ Using the rika gem requires that you:
31
+ 1) download the Apache Tika tika-app jar file from https://tika.apache.org/download.html
32
+ 2) place it somewhere accessible to the running application
33
+ 3) specify its location in the TIKA_JAR_FILESPEC environment variable
34
+
35
+ MESSAGE
36
+ end