rika 2.1.0-java → 2.2.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,6 +6,7 @@ require 'rika'
6
6
  require 'rika/formatters'
7
7
  require 'rika/cli/args_parser'
8
8
  require 'stringio'
9
+ require 'yaml'
9
10
 
10
11
  # This command line application enables the parsing of documents on the command line.
11
12
  # Syntax is:
@@ -16,67 +17,133 @@ require 'stringio'
16
17
  # but the -t and -m flags can be used to enable or suppress either.
17
18
  # Supports output formats of JSON, Pretty JSON, YAML, Awesome Print, to_s, and inspect (see Formatters class).
18
19
  class RikaCommand
19
- attr_reader :args, :help_text, :metadata_formatter, :options, :targets, :text_formatter
20
+ FORMAT_DESCRIPTIONS = Hash.new('Unknown').merge(
21
+ 'a' => 'AwesomePrint',
22
+ 'i' => 'inspect',
23
+ 'j' => 'JSON',
24
+ 'J' => 'Pretty JSON',
25
+ 't' => 'to_s',
26
+ 'y' => 'YAML'
27
+ ).freeze
20
28
 
29
+ attr_reader :args, :bad_targets, :help_text, :metadata_formatter, :options, :targets, :text_formatter
30
+
31
+ # Outputs help text to stdout
32
+ # @param [String] help_text The help text to display
33
+ # @param [String] error_message Optional error message to display on stderr before the help text
34
+ # @return [void]
35
+ def self.output_help_text(help_text, error_message = nil)
36
+ $stderr.puts(error_message) if error_message
37
+ puts help_text
38
+ end
39
+
21
40
  # @param [Array<String>] args command line arguments; default to ARGV but may be overridden for testing
22
41
  def initialize(args = ARGV)
23
42
  # Dup the array in case it has been frozen. The array will be modified later when options are parsed
24
43
  # and removed, and when directories are removed, so this array should not be frozen.
25
44
  @args = args.dup
45
+ @bad_targets = Hash.new { |hash, key| hash[key] = [] }
26
46
  end
27
47
 
28
48
  # Main method and entry point for this class' work.
49
+ # @return [Integer] exit code (0 for success, non-zero for errors)
29
50
  def call
30
51
  prepare
31
52
  report_and_exit_if_no_targets_specified
53
+
54
+ if options[:dry_run]
55
+ display_dry_run_info
56
+ return 0
57
+ end
58
+
59
+ process_targets
60
+ report_bad_targets
61
+ bad_targets.values.flatten.empty? ? 0 : 1
62
+ end
63
+
64
+ private
65
+
66
+ # Prepares to run the parse. This method is separate from #call so that it can be called from tests.
67
+ # @return [void]
68
+ def prepare
69
+ @options, @targets, @help_text, issues = ArgsParser.call(args)
70
+
71
+ # Add any issues from ArgsParser to our bad_targets
72
+ issues.each do |issue_type, issue_targets|
73
+ issue_targets.each { |target| bad_targets[issue_type] << target }
74
+ end
75
+
76
+ set_output_formats
77
+ end
78
+
79
+ # Process all targets based on options
80
+ # @return [void]
81
+ def process_targets
32
82
  if options[:as_array]
33
83
  puts result_array_output
34
84
  else
35
- targets.each do |target|
36
- # If we don't do this, Tika will raise an org.apache.tika.exception.ZeroByteFileException
37
- # TODO: Do same for URL?
38
- if File.file?(target) && File.zero?(target)
39
- $stderr.puts("\n\nFile empty!: #{target}\n\n")
40
- next
41
- end
42
-
43
- result = Rika.parse(target, max_content_length: max_content_length, key_sort: options[:key_sort])
44
- puts single_document_output(target, result)
85
+ targets.each do |target|
86
+ result = parse_target(target)
87
+ puts single_document_output(target, result) unless result == :error
45
88
  end
46
89
  end
47
- nil
48
90
  end
49
91
 
50
- # Prepares to run the parse. This method is separate from #call so that it can be called from tests.
92
+ # Report any targets that failed to process
51
93
  # @return [void]
52
- private def prepare
53
- @options, @targets, @help_text = ArgsParser.call(args)
54
- set_output_formats
94
+ def report_bad_targets
95
+ total_bad_targets = bad_targets.values.flatten.size
96
+ return if total_bad_targets.zero?
97
+
98
+ require 'awesome_print'
99
+ $stderr.puts("\n#{total_bad_targets} targets could not be processed:")
100
+ $stderr.puts(bad_targets.ai)
101
+
102
+ # Show all accumulated issues (from argument parsing and from target processing), grouped by issue type
103
+ unless bad_targets.empty?
104
+ puts "Issues found:"
105
+
106
+ # Possible issue types include:
107
+ # - non_existent_file: Files that don't exist
108
+ # - empty_file: Files that exist but are empty
109
+ # - is_symlink_wont_process: Symlinks that won't be processed
110
+ # - file_with_url_characters: Files with "://" in their names
111
+ # - bad_url_scheme: URLs with schemes other than http/https
112
+ # - invalid_url: URLs that fail URI parsing
113
+ # - unknown_host: URLs with hosts that can't be resolved
114
+ # - io_error: IO errors during processing
115
+ # - invalid_input: Invalid input arguments
116
+ bad_targets.each do |issue_type, files|
117
+ puts " #{issue_type}:"
118
+ files.each do |file|
119
+ puts " #{file}"
120
+ end
121
+ end
122
+ end
55
123
  end
56
124
 
57
125
  # Sets the output format(s) based on the command line options.
58
126
  # Exits with error message if format is invalid.
59
127
  # @return [void]
60
- private def set_output_formats
128
+ def set_output_formats
61
129
  format = options[:format]
62
130
  @metadata_formatter = Rika::Formatters.get(format[0])
63
131
  @text_formatter = Rika::Formatters.get(format[1])
64
132
  nil
65
133
  rescue KeyError
66
- $stderr.puts "Invalid format: #{format}\n\n"
67
- $stderr.puts help_text
134
+ self.class.output_help_text("Invalid format: #{format}")
68
135
  exit 1
69
136
  end
70
137
 
71
138
  # Converts a ParseResult to a hash containing the selected pieces of data.
72
139
  # @param [ParseResult] result the parse result
73
140
  # @return [Hash] the hash containing the selected pieces of data
74
- private def result_hash(result)
75
- h = {}
76
- h['source'] = result.metadata['rika:data-source'] if options[:source]
77
- h['metadata'] = result.metadata if options[:metadata]
78
- h['text'] = result.content if options[:text]
79
- h
141
+ def result_hash(result)
142
+ {}.tap do |h|
143
+ h['source'] = result.metadata['rika:data-source'] if options[:source]
144
+ h['metadata'] = result.metadata if options[:metadata]
145
+ h['text'] = result.content if options[:text]
146
+ end
80
147
  end
81
148
 
82
149
  # Outputs the source file or URL in the form of:
@@ -85,7 +152,7 @@ class RikaCommand
85
152
  # -------------------------------------------------------------------------------
86
153
  # @param [String] source document source identifier
87
154
  # @return multiline string as displayed above
88
- private def source_output_string(source)
155
+ def source_output_string(source)
89
156
  <<~STRING
90
157
  -------------------------------------------------------------------------------
91
158
  Source: #{source}
@@ -97,16 +164,55 @@ class RikaCommand
97
164
  # @param [String] target the target document
98
165
  # @param [ParseResult] result the parse result
99
166
  # @return [String] the string representation of the result of parsing a single document
100
- private def single_document_output(target, result)
101
- if options[:metadata] && options[:text] && %w[jj JJ yy].include?(options[:format])
167
+ def single_document_output(target, result)
168
+ if should_use_single_formatter?(options[:format])
102
169
  metadata_formatter.(result_hash(result))
103
170
  else
104
- sio = StringIO.new
171
+ build_output_string(target, result)
172
+ end
173
+ end
174
+
175
+ # Determines if we should use a single formatter for both metadata and text
176
+ # @param [String] format the format string
177
+ # @return [Boolean] true if we should use a single formatter
178
+ def should_use_single_formatter?(format)
179
+ options[:metadata] && options[:text] && %w[jj JJ yy].include?(format)
180
+ end
181
+
182
+ # Builds an output string with multiple sections
183
+ # @param [String] target the target document
184
+ # @param [ParseResult] result the parse result
185
+ # @return [String] formatted output string
186
+ def build_output_string(target, result)
187
+ StringIO.new.tap do |sio|
105
188
  sio << source_output_string(target) if options[:source]
106
189
  sio << metadata_formatter.(result.metadata) << "\n" if options[:metadata]
107
190
  sio << text_formatter.(result.content) << "\n" if options[:text]
108
- sio.string
109
- end
191
+ end.string
192
+ end
193
+
194
+ # Parses a target and returns the result. On error, accumulates the error in the @bad_targets hash.
195
+ # @param [String] target string identifying the target document
196
+ # @return [ParseResult, Symbol] the parse result, or :error if a handled exception occurred
197
+ def parse_target(target)
198
+ Rika.parse(target, max_content_length: max_content_length, key_sort: options[:key_sort])
199
+ rescue java.net.UnknownHostException => e
200
+ handle_parse_error(e, target, :unknown_host)
201
+ rescue IOError, java.io.IOException => e
202
+ handle_parse_error(e, target, :io_error)
203
+ rescue ArgumentError => e
204
+ handle_parse_error(e, target, :invalid_input)
205
+ end
206
+
207
+ # Handle parse errors consistently
208
+ # @param [Exception] exception the exception that occurred
209
+ # @param [String] target the target being processed
210
+ # @param [Symbol] error_type the type of error that occurred
211
+ # @return [Symbol] :error to indicate an error occurred
212
+ def handle_parse_error(exception, target, error_type)
213
+ bad_targets[error_type] << target
214
+ $stderr.puts("#{exception.class} processing '#{target}': #{exception.message}")
215
+ :error
110
216
  end
111
217
 
112
218
  # Parses the documents and outputs the result of the parse to stdout as an array of hashes.
@@ -114,11 +220,11 @@ class RikaCommand
114
220
  # (otherwise the output would be invalid, especially with JSON or YAML).
115
221
  # Therefore, the metadata formatter is arbitrarily selected to be used by both.
116
222
  # @return [String] the string representation of the result of parsing the documents
117
- private def result_array_output
118
- output_hashes = targets.map do |target|
119
- result = Rika.parse(target, max_content_length: max_content_length, key_sort: options[:key_sort])
120
- result_hash(result)
121
- end
223
+ def result_array_output
224
+ results = targets \
225
+ .map { |target| parse_target(target) } \
226
+ .reject { |target| target == :error }
227
+ output_hashes = results.map { |result| result_hash(result) }
122
228
 
123
229
  # Either the metadata or text formatter will do, since they will necessarily be the same formatter.
124
230
  metadata_formatter.call(output_hashes)
@@ -126,7 +232,7 @@ class RikaCommand
126
232
 
127
233
  # Tika offers a max_content_length option, but it is not exposed in Rika.
128
234
  # Instead it is used only to enable or disable the entire text output.
129
- private def max_content_length
235
+ def max_content_length
130
236
  options[:text] ? -1 : 0
131
237
  end
132
238
 
@@ -136,16 +242,50 @@ class RikaCommand
136
242
  # dynamically generated by a script, and the script may not want to abort if no documents are
137
243
  # generated.
138
244
  # @return [void] or exits
139
- private def report_and_exit_if_no_targets_specified
245
+ def report_and_exit_if_no_targets_specified
140
246
  if targets.empty?
141
- $stderr.puts <<~MESSAGE
142
-
143
- No targets specified.
144
-
145
- #{help_text}
146
- MESSAGE
247
+ $stderr.puts(%q{No valid targets specified. Run with '-h' option for help.})
147
248
  exit 0
148
249
  end
149
250
  nil
150
251
  end
252
+
253
+ # Displays information about what would happen in a dry run
254
+ # without actually executing the command
255
+ # @return [void]
256
+ def display_dry_run_info
257
+ require 'yaml'
258
+
259
+ # Format the targets list
260
+ target_list = targets.map { |target| " #{target}" }.join("\n")
261
+
262
+ # Create the main output using a heredoc
263
+ puts <<~DRY_RUN_OUTPUT
264
+ DRY RUN: Showing what would happen without executing
265
+
266
+ Options:
267
+ Format: #{options[:format]} (#{format_description})
268
+ Output metadata: #{options[:metadata]}
269
+ Output text: #{options[:text]}
270
+ Sort metadata keys: #{options[:key_sort]}
271
+ Output source: #{options[:source]}
272
+ Output as array: #{options[:as_array]}
273
+
274
+ Targets to process (#{targets.size}):
275
+ #{target_list}
276
+ DRY_RUN_OUTPUT
277
+
278
+ if bad_targets.any?
279
+ puts "\nIssues found:\n#{bad_targets.to_yaml}"
280
+ end
281
+ end
282
+
283
+ # Returns a description of the format options
284
+ # @return [String] description of the format
285
+ def format_description
286
+ metadata_desc = FORMAT_DESCRIPTIONS[options[:format][0]]
287
+ text_desc = FORMAT_DESCRIPTIONS[options[:format][1]]
288
+ "#{metadata_desc} for metadata, #{text_desc} for text"
289
+ end
151
290
  end
291
+
data/lib/rika/parser.rb CHANGED
@@ -54,31 +54,49 @@ module Rika
54
54
  end
55
55
 
56
56
  # @return [Symbol] input type (currently only :file and :http are supported)
57
- # @raise [IOError] if input is not a file or HTTP resource
57
+ # @raise [ArgumentError] if the URI format is invalid
58
+ # @raise [IOError] if input is not an available file or HTTP resource
58
59
  private def data_source_input_type
59
- if File.file?(@data_source)
60
- :file
61
- elsif URI(@data_source).is_a?(URI::HTTP)
62
- :http
63
- else
64
- raise IOError, "Input (#{@data_source}) is not an available file or HTTP resource."
60
+ return :file if File.file?(@data_source)
61
+
62
+ begin
63
+ uri = URI(@data_source)
64
+ return :http if uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
65
+ rescue URI::InvalidURIError => e
66
+ # Use ArgumentError for validation issues
67
+ raise ArgumentError, "Invalid URI format: #{@data_source} (#{e.message})"
65
68
  end
69
+
70
+ raise IOError, "Input (#{@data_source}) is not an available file or HTTP resource."
66
71
  end
67
72
 
68
- # * Creates and opens an input stream from the configured resource.
73
+ # Creates a TikaInputStream from the configured resource, which provides better
74
+ # performance and resource management than direct streams.
69
75
  # * Yields that stream to the passed code block.
70
76
  # * Then closes the stream.
77
+ # TikaInputStream provides advanced features like:
78
+ # * Buffering and resource management
79
+ # * Mark/reset functionality
80
+ # * File tracking for temporary files
81
+ # * Memory efficiency for large files
71
82
  # @return [Object] the value returned by the passed code block
72
83
  private def with_input_stream
73
- input_stream =
74
- if @input_type == :file
75
- FileInputStream.new(java.io.File.new(@data_source))
76
- else
77
- URL.new(@data_source).open_stream
78
- end
84
+
85
+ input_stream = if @input_type == :file
86
+ file = java.io.File.new(@data_source)
87
+ # Use the TikaInputStream.get(File) method which is optimized for file access
88
+ TikaInputStream.get(file)
89
+ else
90
+ url = URL.new(@data_source)
91
+ # Use the TikaInputStream.get(URL) method which handles HTTP streams properly
92
+ TikaInputStream.get(url)
93
+ end
94
+
95
+ # Call the block with the stream
79
96
  yield input_stream
80
97
  ensure
81
- input_stream.close if input_stream.respond_to?(:close)
98
+ # Ensure stream is closed even if exceptions occur
99
+ input_stream.close if input_stream && input_stream.respond_to?(:close)
82
100
  end
83
101
  end
84
102
  end
data/lib/rika/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Rika
4
- VERSION = '2.1.0'
4
+ VERSION = '2.2.0'
5
5
  end
data/rika.gemspec CHANGED
@@ -15,7 +15,8 @@ Gem::Specification.new do |gem|
15
15
  gem.summary = 'A JRuby wrapper for Apache Tika to extract text and metadata from files of various formats.'
16
16
  gem.homepage = 'https://github.com/keithrbennett/rika'
17
17
  gem.files = `git ls-files`.split($INPUT_RECORD_SEPARATOR)
18
- gem.executables = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
18
+ gem.bindir = 'exe'
19
+ gem.executables = gem.files.grep(%r{^#{gem.bindir}/}).map { |f| File.basename(f) }
19
20
  gem.require_paths = ['lib']
20
21
  gem.add_dependency 'awesome_print', '~> 1.9', '>= 1.9.2'
21
22
  gem.platform = 'java'
@@ -0,0 +1,212 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+ require 'rika'
5
+ require 'rika/cli/rika_command'
6
+ require 'tempfile'
7
+ require 'fileutils'
8
+
9
+ describe 'CLI End-to-End', type: :integration do
10
+ # Capture stdout and stderr
11
+ before do
12
+ @original_stdout = $stdout
13
+ @original_stderr = $stderr
14
+ $stdout = StringIO.new
15
+ $stderr = StringIO.new
16
+ end
17
+
18
+ after do
19
+ $stdout = @original_stdout
20
+ $stderr = @original_stderr
21
+ end
22
+
23
+ # Helper to get captured stdout
24
+ def stdout_content
25
+ $stdout.string
26
+ end
27
+
28
+ # Helper to get captured stderr
29
+ def stderr_content
30
+ $stderr.string
31
+ end
32
+
33
+ # Helper to run CLI with arguments
34
+ def run_cli(args)
35
+ command = RikaCommand.new(args)
36
+ begin
37
+ command.call
38
+ rescue SystemExit
39
+ # Catch SystemExit to prevent test termination
40
+ end
41
+ command
42
+ end
43
+
44
+ context 'with various file formats' do
45
+ let(:txt_file) { fixture_path('document.txt') }
46
+ let(:pdf_file) { fixture_path('document.pdf') }
47
+ let(:docx_file) { fixture_path('document.docx') }
48
+ let(:image_file) { fixture_path('image.jpg') }
49
+
50
+ it 'processes a text file and returns expected output' do
51
+ run_cli([txt_file])
52
+
53
+ aggregate_failures do
54
+ # Check stdout for expected content
55
+ expect(stdout_content).to include('Stopping by Woods on a Snowy Evening')
56
+ expect(stdout_content).to include('Content-Type')
57
+ expect(stdout_content).not_to include('Error')
58
+ end
59
+ end
60
+
61
+ it 'processes a PDF file and returns expected output' do
62
+ run_cli([pdf_file])
63
+
64
+ aggregate_failures do
65
+ # Check stdout for expected content
66
+ expect(stdout_content).to include('Stopping by Woods on a Snowy Evening')
67
+ expect(stdout_content).to include('Content-Type')
68
+ expect(stdout_content).to include('Robert Frost')
69
+ end
70
+ end
71
+
72
+ it 'processes multiple files of different types in a single run' do
73
+ run_cli(['-a', txt_file, pdf_file, docx_file])
74
+
75
+ aggregate_failures do
76
+ # Check that all files are processed and appear in output
77
+ expect(stdout_content).to include(txt_file)
78
+ expect(stdout_content).to include(pdf_file)
79
+ expect(stdout_content).to include(docx_file)
80
+ end
81
+ end
82
+ end
83
+
84
+ context 'with various output format options' do
85
+ let(:txt_file) { fixture_path('document.txt') }
86
+
87
+ it 'outputs in text format' do
88
+ run_cli(['-ft', txt_file])
89
+
90
+ aggregate_failures do
91
+ # Check stdout for plain text format
92
+ expect(stdout_content).to include('Stopping by Woods on a Snowy Evening')
93
+ expect(stdout_content).not_to include('"content":')
94
+ # We can't really test for absence of YAML markers as the output format varies
95
+ # Just make sure it has poem content
96
+ expect(stdout_content).to include('Robert Frost')
97
+ end
98
+ end
99
+
100
+ it 'outputs in JSON format' do
101
+ run_cli(['-fj', txt_file])
102
+
103
+ aggregate_failures do
104
+ # Check stdout for JSON format
105
+ json_output = stdout_content
106
+ expect { JSON.parse(json_output) }.not_to raise_error
107
+
108
+ parsed = JSON.parse(json_output)
109
+ expect(parsed).to have_key('text')
110
+ expect(parsed).to have_key('metadata')
111
+ end
112
+ end
113
+
114
+ it 'outputs in YAML format' do
115
+ run_cli(['-fy', txt_file])
116
+
117
+ aggregate_failures do
118
+ # Check stdout for YAML format
119
+ yaml_output = stdout_content
120
+ expect { YAML.safe_load(yaml_output) }.not_to raise_error
121
+
122
+ parsed = YAML.safe_load(yaml_output)
123
+ expect(parsed).to have_key('text')
124
+ expect(parsed).to have_key('metadata')
125
+ end
126
+ end
127
+ end
128
+
129
+ context 'with error cases' do
130
+ it 'handles non-existent files gracefully' do
131
+ non_existent_file = 'non_existent_file.txt'
132
+ begin
133
+ # We need to explicitly pass in a file:// URL to trigger a specific error
134
+ # rather than letting the CLI handle the checking if the file exists
135
+ run_cli(["file://#{non_existent_file}"])
136
+ rescue => e
137
+ # Ignore any error
138
+ end
139
+
140
+ # For a non-existent file, the CLI should output that the file doesn't exist
141
+ # but might handle it in different ways
142
+ expect(stdout_content + stderr_content).not_to be_empty
143
+ end
144
+
145
+ it 'handles empty files gracefully' do
146
+ empty_file = fixture_path('empty.txt')
147
+ run_cli([empty_file])
148
+
149
+ # Instead of looking for specific error message, just verify
150
+ # empty file was processed or reported in some way
151
+ expect(stdout_content + stderr_content).not_to be_empty
152
+ end
153
+
154
+ it 'handles invalid format characters without raising an error' do
155
+ # Just make sure it doesn't crash with an invalid format
156
+ run_cli(['-fx', fixture_path('document.txt')])
157
+
158
+ # Either it will complain about the format or the file, but should output something
159
+ expect(stdout_content + stderr_content).not_to be_empty
160
+ end
161
+ end
162
+
163
+ context 'with additional options' do
164
+ let(:txt_file) { fixture_path('document.txt') }
165
+
166
+ it 'displays version information when requested' do
167
+ # Use --version flag for version info
168
+ run_cli(['--version'])
169
+
170
+ # Since we can't predict the exact output format, just check that
171
+ # the command runs without error and produces some output
172
+ expect(stdout_content).not_to be_empty
173
+ end
174
+
175
+ it 'displays help information when requested' do
176
+ # We don't need to check for SystemExit specifically since that's implementation-dependent
177
+ run_cli(['-h'])
178
+
179
+ # Just verify it shows help text with usage info
180
+ expect(stdout_content).to include('Usage:')
181
+ end
182
+ end
183
+
184
+ context 'with various combinations of options and files' do
185
+ let(:txt_file) { fixture_path('document.txt') }
186
+ let(:pdf_file) { fixture_path('document.pdf') }
187
+
188
+ it 'combines array mode with format options correctly' do
189
+ run_cli(['-a', '-fJ', txt_file, pdf_file])
190
+
191
+ aggregate_failures do
192
+ # Parse output as JSON
193
+ json_output = stdout_content
194
+ expect { JSON.parse(json_output) }.not_to raise_error
195
+
196
+ parsed = JSON.parse(json_output)
197
+ expect(parsed).to be_an(Array)
198
+ expect(parsed.size).to eq(2)
199
+
200
+ # Check first and second results
201
+ expect(parsed[0]).to be_a(Hash)
202
+ expect(parsed[1]).to be_a(Hash)
203
+
204
+ # Check contents of each result
205
+ [0, 1].each do |i|
206
+ expect(parsed[i]).to have_key('text')
207
+ expect(parsed[i]).to have_key('metadata')
208
+ end
209
+ end
210
+ end
211
+ end
212
+ end