rika 2.1.0-java → 2.2.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +10 -7
- data/.rspec +1 -1
- data/README.md +58 -11
- data/RELEASE_NOTES.md +12 -0
- data/{bin → exe}/rika +1 -1
- data/lib/rika/cli/args_parser.rb +124 -26
- data/lib/rika/cli/rika_command.rb +184 -44
- data/lib/rika/parser.rb +33 -15
- data/lib/rika/version.rb +1 -1
- data/rika.gemspec +2 -1
- data/spec/integration/cli_end_to_end_spec.rb +212 -0
- data/spec/integration/document_processing_spec.rb +193 -0
- data/spec/integration/web_url_processing_spec.rb +252 -0
- data/spec/rika/cli/args_parser/boolean_options_spec.rb +136 -0
- data/spec/rika/cli/args_parser/environment_options_spec.rb +115 -0
- data/spec/rika/cli/args_parser/format_options_spec.rb +143 -0
- data/spec/rika/cli/{args_parser_spec.rb → args_parser/main_spec.rb} +63 -14
- data/spec/rika/cli/args_parser/url_filespec_spec.rb +134 -0
- data/spec/rika/cli/rika_command_spec.rb +81 -13
- metadata +12 -5
@@ -6,6 +6,7 @@ require 'rika'
|
|
6
6
|
require 'rika/formatters'
|
7
7
|
require 'rika/cli/args_parser'
|
8
8
|
require 'stringio'
|
9
|
+
require 'yaml'
|
9
10
|
|
10
11
|
# This command line application enables the parsing of documents on the command line.
|
11
12
|
# Syntax is:
|
@@ -16,67 +17,133 @@ require 'stringio'
|
|
16
17
|
# but the -t and -m flags can be used to enable or suppress either.
|
17
18
|
# Supports output formats of JSON, Pretty JSON, YAML, Awesome Print, to_s, and inspect (see Formatters class).
|
18
19
|
class RikaCommand
|
19
|
-
|
20
|
+
FORMAT_DESCRIPTIONS = Hash.new('Unknown').merge(
|
21
|
+
'a' => 'AwesomePrint',
|
22
|
+
'i' => 'inspect',
|
23
|
+
'j' => 'JSON',
|
24
|
+
'J' => 'Pretty JSON',
|
25
|
+
't' => 'to_s',
|
26
|
+
'y' => 'YAML'
|
27
|
+
).freeze
|
20
28
|
|
29
|
+
attr_reader :args, :bad_targets, :help_text, :metadata_formatter, :options, :targets, :text_formatter
|
30
|
+
|
31
|
+
# Prints the help text on stdout; when an error message is supplied it is
# written to stderr first.
# @param [String] help_text the help text to print
# @param [String, nil] error_message optional message written to stderr before the help text
# @return [void]
def self.output_help_text(help_text, error_message = nil)
  error_message && $stderr.puts(error_message)
  $stdout.puts(help_text)
end
|
39
|
+
|
21
40
|
# Stores a private copy of the command line arguments and creates the
# collection in which targets that could not be processed are accumulated.
# @param [Array<String>] args command line arguments; default to ARGV but may be overridden for testing
def initialize(args = ARGV)
  # Unknown issue types start out with their own empty array, so callers can simply append.
  @bad_targets = Hash.new { |issues, issue_type| issues[issue_type] = [] }

  # Work on a copy: the caller's array may be frozen, and this one is modified later
  # when options are parsed and removed, and when directories are removed.
  @args = args.dup
end
|
27
47
|
|
28
48
|
# Entry point for this class' work: prepares the run, then either prints the
# dry-run summary or processes the targets and reports those that failed.
# @return [Integer] exit code (0 for success or dry run, 1 if any target could not be processed)
def call
  prepare
  report_and_exit_if_no_targets_specified

  if options[:dry_run]
    display_dry_run_info
    0
  else
    process_targets
    report_bad_targets
    failed_targets = bad_targets.values.flatten
    failed_targets.empty? ? 0 : 1
  end
end
|
63
|
+
|
64
|
+
private
|
65
|
+
|
66
|
+
# Parses the command line and sets up the formatters. Kept separate from #call
# so that tests can invoke it directly.
# @return [void]
def prepare
  @options, @targets, @help_text, parser_issues = ArgsParser.call(args)

  # Carry over the problems ArgsParser found so they are reported with the parse failures.
  parser_issues.each_pair do |issue_type, problem_targets|
    problem_targets.each do |problem_target|
      bad_targets[issue_type] << problem_target
    end
  end

  set_output_formats
end
|
78
|
+
|
79
|
+
# Parses every target and prints the output: a single combined array when
# the :as_array option is set, otherwise one document at a time.
# Targets whose parse failed (parse_target returned :error) produce no output here.
# @return [void]
def process_targets
  return puts(result_array_output) if options[:as_array]

  targets.each do |target|
    parsed = parse_target(target)
    next if parsed == :error

    puts single_document_output(target, parsed)
  end
end
|
49
91
|
|
50
|
-
#
|
92
|
+
# Reports, on stderr, every target that could not be processed, grouped by issue type.
# Nothing is written when there are no bad targets.
#
# The whole report goes to stderr so that it never mixes with the parsed
# document output written to stdout.
#
# Possible issue types include:
#   - non_existent_file: Files that don't exist
#   - empty_file: Files that exist but are empty
#   - is_symlink_wont_process: Symlinks that won't be processed
#   - file_with_url_characters: Files with "://" in their names
#   - bad_url_scheme: URLs with schemes other than http/https
#   - invalid_url: URLs that fail URI parsing
#   - unknown_host: URLs with hosts that can't be resolved
#   - io_error: IO errors during processing
#   - invalid_input: Invalid input arguments
# @return [void]
def report_bad_targets
  total_bad_targets = bad_targets.values.flatten.size
  return if total_bad_targets.zero?

  $stderr.puts("\n#{total_bad_targets} targets could not be processed:")
  bad_targets.each do |issue_type, failed_targets|
    # An issue type may be present with nothing recorded under it; skip its header.
    next if failed_targets.empty?

    $stderr.puts("  #{issue_type}:")
    failed_targets.each { |failed_target| $stderr.puts("    #{failed_target}") }
  end
  nil
end
|
56
124
|
|
57
125
|
# Sets the metadata and text formatters from the two characters of the :format option.
# If either character is not a known format, prints the error message (stderr)
# followed by the help text (stdout) and exits with status 1.
# @return [void]
def set_output_formats
  format = options[:format]
  @metadata_formatter = Rika::Formatters.get(format[0])
  @text_formatter = Rika::Formatters.get(format[1])
  nil
rescue KeyError
  # output_help_text takes the help text first and the error message second.
  self.class.output_help_text(help_text, "Invalid format: #{format}")
  exit 1
end
|
70
137
|
|
71
138
|
# Builds a hash holding only the pieces of a parse result selected by the
# :source, :metadata and :text options.
# @param [ParseResult] result the parse result
# @return [Hash] the hash containing the selected pieces of data
def result_hash(result)
  selected = {}
  selected['source'] = result.metadata['rika:data-source'] if options[:source]
  selected['metadata'] = result.metadata if options[:metadata]
  selected['text'] = result.content if options[:text]
  selected
end
|
81
148
|
|
82
149
|
# Outputs the source file or URL in the form of:
|
@@ -85,7 +152,7 @@ class RikaCommand
|
|
85
152
|
# -------------------------------------------------------------------------------
|
86
153
|
# @param [String] source document source identifier
|
87
154
|
# @return multiline string as displayed above
|
88
|
-
|
155
|
+
def source_output_string(source)
|
89
156
|
<<~STRING
|
90
157
|
-------------------------------------------------------------------------------
|
91
158
|
Source: #{source}
|
@@ -97,16 +164,55 @@ class RikaCommand
|
|
97
164
|
# @param [String] target the target document
|
98
165
|
# @param [ParseResult] result the parse result
|
99
166
|
# @return [String] the string representation of the result of parsing a single document
|
100
|
-
|
101
|
-
if
|
167
|
+
def single_document_output(target, result)
  # Unless one formatter covers both metadata and text, emit the sections separately.
  return build_output_string(target, result) unless should_use_single_formatter?(options[:format])

  # One structured document: format the combined hash with the metadata formatter.
  metadata_formatter.call(result_hash(result))
end
|
174
|
+
|
175
|
+
# Tells whether metadata and text should be rendered together by one formatter.
# That is the case when both are being output and the format string is one of
# 'jj', 'JJ' or 'yy'.
# @param [String] format the format string
# @return [Boolean] truthy if we should use a single formatter
def should_use_single_formatter?(format)
  metadata_and_text = options[:metadata] && options[:text]
  metadata_and_text && %w[jj JJ yy].include?(format)
end
|
181
|
+
|
182
|
+
# Assembles the output for one document from up to three sections, each included
# only when its option is set: the source header, the formatted metadata and the
# formatted text. The metadata and text sections are each followed by a newline.
# @param [String] target the target document
# @param [ParseResult] result the parse result
# @return [String] formatted output string
def build_output_string(target, result)
  buffer = StringIO.new

  buffer << source_output_string(target) if options[:source]

  if options[:metadata]
    buffer << metadata_formatter.call(result.metadata)
    buffer << "\n"
  end

  if options[:text]
    buffer << text_formatter.call(result.content)
    buffer << "\n"
  end

  buffer.string
end
|
193
|
+
|
194
|
+
# Parses a single target. Unknown-host, IO and argument errors are not propagated:
# they are handed to handle_parse_error, which records the target in bad_targets.
# @param [String] target string identifying the target document
# @return [ParseResult, Symbol] the parse result, or :error if one of the rescued errors occurred
def parse_target(target)
  parse_options = { max_content_length: max_content_length, key_sort: options[:key_sort] }
  Rika.parse(target, **parse_options)
rescue java.net.UnknownHostException => unknown_host_error
  # Must precede the IOException clause: UnknownHostException is itself an IOException.
  handle_parse_error(unknown_host_error, target, :unknown_host)
rescue IOError, java.io.IOException => io_error
  handle_parse_error(io_error, target, :io_error)
rescue ArgumentError => argument_error
  handle_parse_error(argument_error, target, :invalid_input)
end
|
206
|
+
|
207
|
+
# Records a failed target under its error type and writes a one-line
# description of the failure to stderr.
# @param [Exception] exception the exception that occurred
# @param [String] target the target being processed
# @param [Symbol] error_type the type of error that occurred
# @return [Symbol] :error to indicate an error occurred
def handle_parse_error(exception, target, error_type)
  bad_targets[error_type].push(target)
  description = "#{exception.class} processing '#{target}': #{exception.message}"
  $stderr.puts(description)
  :error
end
|
111
217
|
|
112
218
|
# Parses the documents and outputs the result of the parse to stdout as an array of hashes.
|
@@ -114,11 +220,11 @@ class RikaCommand
|
|
114
220
|
# (otherwise the output would be invalid, especially with JSON or YAML).
|
115
221
|
# Therefore, the metadata formatter is arbitrarily selected to be used by both.
|
116
222
|
# @return [String] the string representation of the result of parsing the documents
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
223
|
+
def result_array_output
|
224
|
+
results = targets \
|
225
|
+
.map { |target| parse_target(target) } \
|
226
|
+
.reject { |target| target == :error }
|
227
|
+
output_hashes = results.map { |result| result_hash(result) }
|
122
228
|
|
123
229
|
# Either the metadata or text formatter will do, since they will necessarily be the same formatter.
|
124
230
|
metadata_formatter.call(output_hashes)
|
@@ -126,7 +232,7 @@ class RikaCommand
|
|
126
232
|
|
127
233
|
# Tika offers a max_content_length option, but Rika does not expose it.
# It is used here only as a switch for the text output as a whole.
# @return [Integer] -1 when the :text option is set, otherwise 0
def max_content_length
  return -1 if options[:text]

  0
end
|
132
238
|
|
@@ -136,16 +242,50 @@ class RikaCommand
|
|
136
242
|
# dynamically generated by a script, and the script may not want to abort if no documents are
|
137
243
|
# generated.
|
138
244
|
# @return [void] or exits
|
139
|
-
|
245
|
+
def report_and_exit_if_no_targets_specified
  # Nothing to report when there is at least one target.
  return unless targets.empty?

  $stderr.puts(%q{No valid targets specified. Run with '-h' option for help.})
  exit 0
end
|
252
|
+
|
253
|
+
# Prints, to stdout, what a real run would do without doing it: the effective
# options, the targets that would be processed and, if any were recorded, the
# issues found so far (rendered as YAML).
# Relies on the file-level `require 'yaml'` for Hash#to_yaml.
# @return [void]
def display_dry_run_info
  # One line per target
  target_list = targets.map { |target| " #{target}" }.join("\n")

  puts <<~DRY_RUN_OUTPUT
    DRY RUN: Showing what would happen without executing

    Options:
    Format: #{options[:format]} (#{format_description})
    Output metadata: #{options[:metadata]}
    Output text: #{options[:text]}
    Sort metadata keys: #{options[:key_sort]}
    Output source: #{options[:source]}
    Output as array: #{options[:as_array]}

    Targets to process (#{targets.size}):
    #{target_list}
  DRY_RUN_OUTPUT

  puts "\nIssues found:\n#{bad_targets.to_yaml}" if bad_targets.any?
end
|
282
|
+
|
283
|
+
# Describes the two-character :format option in words, using FORMAT_DESCRIPTIONS
# for the metadata (first character) and text (second character) formats.
# @return [String] description of the format
def format_description
  metadata_desc, text_desc = [0, 1].map { |index| FORMAT_DESCRIPTIONS[options[:format][index]] }
  format('%s for metadata, %s for text', metadata_desc, text_desc)
end
|
151
290
|
end
|
291
|
+
|
data/lib/rika/parser.rb
CHANGED
@@ -54,31 +54,49 @@ module Rika
|
|
54
54
|
end
|
55
55
|
|
56
56
|
# Classifies the data source as an existing file or an HTTP(S) URL.
# @return [Symbol] input type (currently only :file and :http are supported)
# @raise [ArgumentError] if the data source cannot be parsed as a URI
# @raise [IOError] if input is not an available file or HTTP resource
private def data_source_input_type
  return :file if File.file?(@data_source)

  parsed_uri =
    begin
      URI(@data_source)
    rescue URI::InvalidURIError => e
      # Report malformed input as a validation error.
      raise ArgumentError, "Invalid URI format: #{@data_source} (#{e.message})"
    end

  # URI::HTTPS is a subclass of URI::HTTP, so this single check covers both schemes.
  return :http if parsed_uri.is_a?(URI::HTTP)

  raise IOError, "Input (#{@data_source}) is not an available file or HTTP resource."
end
|
67
72
|
|
68
|
-
#
|
73
|
+
# Opens a TikaInputStream on the configured data source, yields it to the
# passed code block, and closes it afterwards (also when the block raises).
# * A :file input type is opened from a java.io.File.
# * Any other input type is opened from a URL.
# @return [Object] the value returned by the passed code block
private def with_input_stream
  source = @input_type == :file ? java.io.File.new(@data_source) : URL.new(@data_source)
  input_stream = TikaInputStream.get(source)

  yield input_stream
ensure
  # input_stream is nil here if opening the source raised; nil does not respond to close.
  input_stream.close if input_stream.respond_to?(:close)
end
|
83
101
|
end
|
84
102
|
end
|
data/lib/rika/version.rb
CHANGED
data/rika.gemspec
CHANGED
@@ -15,7 +15,8 @@ Gem::Specification.new do |gem|
|
|
15
15
|
gem.summary = 'A JRuby wrapper for Apache Tika to extract text and metadata from files of various formats.'
|
16
16
|
gem.homepage = 'https://github.com/keithrbennett/rika'
|
17
17
|
gem.files = `git ls-files`.split($INPUT_RECORD_SEPARATOR)
|
18
|
-
gem.
|
18
|
+
gem.bindir = 'exe'
|
19
|
+
gem.executables = gem.files.grep(%r{^#{gem.bindir}/}).map { |f| File.basename(f) }
|
19
20
|
gem.require_paths = ['lib']
|
20
21
|
gem.add_dependency 'awesome_print', '~> 1.9', '>= 1.9.2'
|
21
22
|
gem.platform = 'java'
|
@@ -0,0 +1,212 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
require 'rika'
|
5
|
+
require 'rika/cli/rika_command'
|
6
|
+
require 'tempfile'
|
7
|
+
require 'fileutils'
|
8
|
+
|
9
|
+
describe 'CLI End-to-End', type: :integration do
|
10
|
+
# Capture stdout and stderr
|
11
|
+
before do
|
12
|
+
@original_stdout = $stdout
|
13
|
+
@original_stderr = $stderr
|
14
|
+
$stdout = StringIO.new
|
15
|
+
$stderr = StringIO.new
|
16
|
+
end
|
17
|
+
|
18
|
+
after do
|
19
|
+
$stdout = @original_stdout
|
20
|
+
$stderr = @original_stderr
|
21
|
+
end
|
22
|
+
|
23
|
+
# Helper to get captured stdout
|
24
|
+
def stdout_content
|
25
|
+
$stdout.string
|
26
|
+
end
|
27
|
+
|
28
|
+
# Helper to get captured stderr
|
29
|
+
def stderr_content
|
30
|
+
$stderr.string
|
31
|
+
end
|
32
|
+
|
33
|
+
# Helper that runs the CLI in-process with the given arguments.
# @param [Array<String>] args command line arguments
# @return [RikaCommand] the command instance that was run
def run_cli(args)
  RikaCommand.new(args).tap do |command|
    begin
      command.call
    rescue SystemExit
      # Swallowed on purpose so that an `exit` in the CLI does not terminate the test run.
    end
  end
end
|
43
|
+
|
44
|
+
context 'with various file formats' do
|
45
|
+
let(:txt_file) { fixture_path('document.txt') }
|
46
|
+
let(:pdf_file) { fixture_path('document.pdf') }
|
47
|
+
let(:docx_file) { fixture_path('document.docx') }
|
48
|
+
let(:image_file) { fixture_path('image.jpg') }
|
49
|
+
|
50
|
+
it 'processes a text file and returns expected output' do
|
51
|
+
run_cli([txt_file])
|
52
|
+
|
53
|
+
aggregate_failures do
|
54
|
+
# Check stdout for expected content
|
55
|
+
expect(stdout_content).to include('Stopping by Woods on a Snowy Evening')
|
56
|
+
expect(stdout_content).to include('Content-Type')
|
57
|
+
expect(stdout_content).not_to include('Error')
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
it 'processes a PDF file and returns expected output' do
|
62
|
+
run_cli([pdf_file])
|
63
|
+
|
64
|
+
aggregate_failures do
|
65
|
+
# Check stdout for expected content
|
66
|
+
expect(stdout_content).to include('Stopping by Woods on a Snowy Evening')
|
67
|
+
expect(stdout_content).to include('Content-Type')
|
68
|
+
expect(stdout_content).to include('Robert Frost')
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
it 'processes multiple files of different types in a single run' do
|
73
|
+
run_cli(['-a', txt_file, pdf_file, docx_file])
|
74
|
+
|
75
|
+
aggregate_failures do
|
76
|
+
# Check that all files are processed and appear in output
|
77
|
+
expect(stdout_content).to include(txt_file)
|
78
|
+
expect(stdout_content).to include(pdf_file)
|
79
|
+
expect(stdout_content).to include(docx_file)
|
80
|
+
end
|
81
|
+
end
|
82
|
+
end
|
83
|
+
|
84
|
+
context 'with various output format options' do
|
85
|
+
let(:txt_file) { fixture_path('document.txt') }
|
86
|
+
|
87
|
+
it 'outputs in text format' do
|
88
|
+
run_cli(['-ft', txt_file])
|
89
|
+
|
90
|
+
aggregate_failures do
|
91
|
+
# Check stdout for plain text format
|
92
|
+
expect(stdout_content).to include('Stopping by Woods on a Snowy Evening')
|
93
|
+
expect(stdout_content).not_to include('"content":')
|
94
|
+
# We can't really test for absence of YAML markers as the output format varies
|
95
|
+
# Just make sure it has poem content
|
96
|
+
expect(stdout_content).to include('Robert Frost')
|
97
|
+
end
|
98
|
+
end
|
99
|
+
|
100
|
+
it 'outputs in JSON format' do
|
101
|
+
run_cli(['-fj', txt_file])
|
102
|
+
|
103
|
+
aggregate_failures do
|
104
|
+
# Check stdout for JSON format
|
105
|
+
json_output = stdout_content
|
106
|
+
expect { JSON.parse(json_output) }.not_to raise_error
|
107
|
+
|
108
|
+
parsed = JSON.parse(json_output)
|
109
|
+
expect(parsed).to have_key('text')
|
110
|
+
expect(parsed).to have_key('metadata')
|
111
|
+
end
|
112
|
+
end
|
113
|
+
|
114
|
+
it 'outputs in YAML format' do
|
115
|
+
run_cli(['-fy', txt_file])
|
116
|
+
|
117
|
+
aggregate_failures do
|
118
|
+
# Check stdout for YAML format
|
119
|
+
yaml_output = stdout_content
|
120
|
+
expect { YAML.safe_load(yaml_output) }.not_to raise_error
|
121
|
+
|
122
|
+
parsed = YAML.safe_load(yaml_output)
|
123
|
+
expect(parsed).to have_key('text')
|
124
|
+
expect(parsed).to have_key('metadata')
|
125
|
+
end
|
126
|
+
end
|
127
|
+
end
|
128
|
+
|
129
|
+
context 'with error cases' do
|
130
|
+
it 'handles non-existent files gracefully' do
  # A file:// URL is passed so that the target is not recognized as an existing local file.
  non_existent_target = 'file://non_existent_file.txt'

  aggregate_failures do
    # run_cli already absorbs SystemExit; any other exception means the failure
    # was not handled gracefully and must fail this example instead of being hidden.
    expect { run_cli([non_existent_target]) }.not_to raise_error

    # The problem must be reported in some way, on stdout or stderr.
    expect(stdout_content + stderr_content).not_to be_empty
  end
end
|
144
|
+
|
145
|
+
it 'handles empty files gracefully' do
|
146
|
+
empty_file = fixture_path('empty.txt')
|
147
|
+
run_cli([empty_file])
|
148
|
+
|
149
|
+
# Instead of looking for specific error message, just verify
|
150
|
+
# empty file was processed or reported in some way
|
151
|
+
expect(stdout_content + stderr_content).not_to be_empty
|
152
|
+
end
|
153
|
+
|
154
|
+
it 'handles invalid format characters without raising an error' do
|
155
|
+
# Just make sure it doesn't crash with an invalid format
|
156
|
+
run_cli(['-fx', fixture_path('document.txt')])
|
157
|
+
|
158
|
+
# Either it will complain about the format or the file, but should output something
|
159
|
+
expect(stdout_content + stderr_content).not_to be_empty
|
160
|
+
end
|
161
|
+
end
|
162
|
+
|
163
|
+
context 'with additional options' do
|
164
|
+
let(:txt_file) { fixture_path('document.txt') }
|
165
|
+
|
166
|
+
it 'displays version information when requested' do
|
167
|
+
# Use --version flag for version info
|
168
|
+
run_cli(['--version'])
|
169
|
+
|
170
|
+
# Since we can't predict the exact output format, just check that
|
171
|
+
# the command runs without error and produces some output
|
172
|
+
expect(stdout_content).not_to be_empty
|
173
|
+
end
|
174
|
+
|
175
|
+
it 'displays help information when requested' do
|
176
|
+
# We don't need to check for SystemExit specifically since that's implementation-dependent
|
177
|
+
run_cli(['-h'])
|
178
|
+
|
179
|
+
# Just verify it shows help text with usage info
|
180
|
+
expect(stdout_content).to include('Usage:')
|
181
|
+
end
|
182
|
+
end
|
183
|
+
|
184
|
+
context 'with various combinations of options and files' do
|
185
|
+
let(:txt_file) { fixture_path('document.txt') }
|
186
|
+
let(:pdf_file) { fixture_path('document.pdf') }
|
187
|
+
|
188
|
+
it 'combines array mode with format options correctly' do
|
189
|
+
run_cli(['-a', '-fJ', txt_file, pdf_file])
|
190
|
+
|
191
|
+
aggregate_failures do
|
192
|
+
# Parse output as JSON
|
193
|
+
json_output = stdout_content
|
194
|
+
expect { JSON.parse(json_output) }.not_to raise_error
|
195
|
+
|
196
|
+
parsed = JSON.parse(json_output)
|
197
|
+
expect(parsed).to be_an(Array)
|
198
|
+
expect(parsed.size).to eq(2)
|
199
|
+
|
200
|
+
# Check first and second results
|
201
|
+
expect(parsed[0]).to be_a(Hash)
|
202
|
+
expect(parsed[1]).to be_a(Hash)
|
203
|
+
|
204
|
+
# Check contents of each result
|
205
|
+
[0, 1].each do |i|
|
206
|
+
expect(parsed[i]).to have_key('text')
|
207
|
+
expect(parsed[i]).to have_key('metadata')
|
208
|
+
end
|
209
|
+
end
|
210
|
+
end
|
211
|
+
end
|
212
|
+
end
|