RubyGems - rika - Versions diffs - 2.1.0-java → 2.2.0-java - Mend

rika 2.1.0-java → 2.2.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21)

checksums.yaml +4 -4
data/.gitignore +10 -7
data/.rspec +1 -1
data/README.md +58 -11
data/RELEASE_NOTES.md +12 -0
data/{bin → exe}/rika +1 -1
data/lib/rika/cli/args_parser.rb +124 -26
data/lib/rika/cli/rika_command.rb +184 -44
data/lib/rika/parser.rb +33 -15
data/lib/rika/version.rb +1 -1
data/rika.gemspec +2 -1
data/spec/integration/cli_end_to_end_spec.rb +212 -0
data/spec/integration/document_processing_spec.rb +193 -0
data/spec/integration/web_url_processing_spec.rb +252 -0
data/spec/rika/cli/args_parser/boolean_options_spec.rb +136 -0
data/spec/rika/cli/args_parser/environment_options_spec.rb +115 -0
data/spec/rika/cli/args_parser/format_options_spec.rb +143 -0
data/spec/rika/cli/{args_parser_spec.rb → args_parser/main_spec.rb} +63 -14
data/spec/rika/cli/args_parser/url_filespec_spec.rb +134 -0
data/spec/rika/cli/rika_command_spec.rb +81 -13
metadata +12 -5

data/lib/rika/cli/rika_command.rb CHANGED Viewed

@@ -6,6 +6,7 @@ require 'rika'
 require 'rika/formatters'
 require 'rika/cli/args_parser'
 require 'stringio'
+require 'yaml'
 # This command line application enables the parsing of documents on the command line.
 # Syntax is:
@@ -16,67 +17,133 @@ require 'stringio'
 # but the -t and -m flags can be used to enable or suppress either.
 # Supports output formats of JSON, Pretty JSON, YAML, Awesome Print, to_s, and inspect (see Formatters class).
 class RikaCommand
-  attr_reader :args, :help_text, :metadata_formatter, :options, :targets, :text_formatter
+  FORMAT_DESCRIPTIONS = Hash.new('Unknown').merge(
+    'a' => 'AwesomePrint',
+    'i' => 'inspect',
+    'j' => 'JSON',
+    'J' => 'Pretty JSON',
+    't' => 'to_s',
+    'y' => 'YAML'
+  ).freeze
+  attr_reader :args, :bad_targets, :help_text, :metadata_formatter, :options, :targets, :text_formatter
+  # Outputs help text to stdout
+  # @param [String] help_text The help text to display
+  # @param [String] error_message Optional error message to display on stderr before the help text
+  # @return [void]
+  def self.output_help_text(help_text, error_message = nil)
+    $stderr.puts(error_message) if error_message
+    puts help_text
+  end
   # @param [Array<String>] args command line arguments; default to ARGV but may be overridden for testing
   def initialize(args = ARGV)
     # Dup the array in case it has been frozen. The array will be modified later when options are parsed
     # and removed, and when directories are removed, so this array should not be frozen.
     @args = args.dup
+    @bad_targets = Hash.new { |hash, key| hash[key] = [] }
   end
   # Main method and entry point for this class' work.
+  # @return [Integer] exit code (0 for success, non-zero for errors)
   def call
     prepare
     report_and_exit_if_no_targets_specified
+    if options[:dry_run]
+      display_dry_run_info
+      return 0
+    end
+    process_targets
+    report_bad_targets
+    bad_targets.values.flatten.empty? ? 0 : 1
+  end
+  private
+  # Prepares to run the parse. This method is separate from #call so that it can be called from tests.
+  # @return [void]
+  def prepare
+    @options, @targets, @help_text, issues = ArgsParser.call(args)
+    # Add any issues from ArgsParser to our bad_targets
+    issues.each do |issue_type, issue_targets|
+      issue_targets.each { |target| bad_targets[issue_type] << target }
+    end
+    set_output_formats
+  end
+  # Process all targets based on options
+  # @return [void]
+  def process_targets
     if options[:as_array]
       puts result_array_output
     else
-      targets.each do |target|
-        # If we don't do this, Tika will raise an org.apache.tika.exception.ZeroByteFileException
-        # TODO: Do same for URL?
-        if File.file?(target) && File.zero?(target)
-          $stderr.puts("\n\nFile empty!: #{target}\n\n")
-          next
-        end
-        result = Rika.parse(target, max_content_length: max_content_length, key_sort: options[:key_sort])
-        puts single_document_output(target, result)
+      targets.each do |target|
+        result = parse_target(target)
+        puts single_document_output(target, result) unless result == :error
       end
     end
-    nil
   end
-  # Prepares to run the parse. This method is separate from #call so that it can be called from tests.
+  # Report any targets that failed to process
   # @return [void]
-  private def prepare
-    @options, @targets, @help_text = ArgsParser.call(args)
-    set_output_formats
+  def report_bad_targets
+    total_bad_targets = bad_targets.values.flatten.size
+    return if total_bad_targets.zero?
+    require 'awesome_print'
+    $stderr.puts("\n#{total_bad_targets} targets could not be processed:")
+    $stderr.puts(bad_targets.ai)
+    # Show any issues found during preparation
+    unless bad_targets.empty?
+      puts "Issues found:"
+      # Possible issue types include:
+      # - non_existent_file: Files that don't exist
+      # - empty_file: Files that exist but are empty
+      # - is_symlink_wont_process: Symlinks that won't be processed
+      # - file_with_url_characters: Files with "://" in their names
+      # - bad_url_scheme: URLs with schemes other than http/https
+      # - invalid_url: URLs that fail URI parsing
+      # - unknown_host: URLs with hosts that can't be resolved
+      # - io_error: IO errors during processing
+      # - invalid_input: Invalid input arguments
+      bad_targets.each do |issue_type, files|
+        puts "  #{issue_type}:"
+        files.each do |file|
+          puts "    #{file}"
+        end
+      end
+    end
   end
   # Sets the output format(s) based on the command line options.
   # Exits with error message if format is invalid.
   # @return [void]
-  private def set_output_formats
+  def set_output_formats
     format = options[:format]
     @metadata_formatter = Rika::Formatters.get(format[0])
     @text_formatter     = Rika::Formatters.get(format[1])
     nil
   rescue KeyError
-    $stderr.puts "Invalid format: #{format}\n\n"
-    $stderr.puts help_text
+    self.class.output_help_text("Invalid format: #{format}")
     exit 1
   end
   # Converts a ParseResult to a hash containing the selected pieces of data.
   # @param [ParseResult] result the parse result
   # @return [Hash] the hash containing the selected pieces of data
-  private def result_hash(result)
-    h = {}
-    h['source']   = result.metadata['rika:data-source'] if options[:source]
-    h['metadata'] = result.metadata                     if options[:metadata]
-    h['text']     = result.content                      if options[:text]
-    h
+  def result_hash(result)
+    {}.tap do |h|
+      h['source']   = result.metadata['rika:data-source'] if options[:source]
+      h['metadata'] = result.metadata                     if options[:metadata]
+      h['text']     = result.content                      if options[:text]
+    end
   end
   # Outputs the source file or URL in the form of:
@@ -85,7 +152,7 @@ class RikaCommand
   # -------------------------------------------------------------------------------
   # @param [String] source document source identifier
   # @return multiline string as displayed above
-  private def source_output_string(source)
+  def source_output_string(source)
     <<~STRING
       -------------------------------------------------------------------------------
       Source: #{source}
@@ -97,16 +164,55 @@ class RikaCommand
   # @param [String] target the target document
   # @param [ParseResult] result the parse result
   # @return [String] the string representation of the result of parsing a single document
-  private def single_document_output(target, result)
-    if options[:metadata] && options[:text] && %w[jj JJ yy].include?(options[:format])
+  def single_document_output(target, result)
+    if should_use_single_formatter?(options[:format])
       metadata_formatter.(result_hash(result))
     else
-      sio = StringIO.new
+      build_output_string(target, result)
+    end
+  end
+  # Determines if we should use a single formatter for both metadata and text
+  # @param [String] format the format string
+  # @return [Boolean] true if we should use a single formatter
+  def should_use_single_formatter?(format)
+    options[:metadata] && options[:text] && %w[jj JJ yy].include?(format)
+  end
+  # Builds an output string with multiple sections
+  # @param [String] target the target document
+  # @param [ParseResult] result the parse result
+  # @return [String] formatted output string
+  def build_output_string(target, result)
+    StringIO.new.tap do |sio|
       sio << source_output_string(target)                 if options[:source]
       sio << metadata_formatter.(result.metadata) << "\n" if options[:metadata]
       sio << text_formatter.(result.content) << "\n"      if options[:text]
-      sio.string
-    end
+    end.string
+  end
+  # Parses a target and returns the result. On error, accumulates the error in the @bad_targets hash.
+  # @param [String] target string identifying the target document
+  # @return [ParseResult] the parse result
+  def parse_target(target)
+    Rika.parse(target, max_content_length: max_content_length, key_sort: options[:key_sort])
+  rescue java.net.UnknownHostException => e
+    handle_parse_error(e, target, :unknown_host)
+  rescue IOError, java.io.IOException => e
+    handle_parse_error(e, target, :io_error)
+  rescue ArgumentError => e
+    handle_parse_error(e, target, :invalid_input)
+  end
+  # Handle parse errors consistently
+  # @param [Exception] exception the exception that occurred
+  # @param [String] target the target being processed
+  # @param [Symbol] error_type the type of error that occurred
+  # @return [Symbol] :error to indicate an error occurred
+  def handle_parse_error(exception, target, error_type)
+    bad_targets[error_type] << target
+    $stderr.puts("#{exception.class} processing '#{target}': #{exception.message}")
+    :error
   end
   # Parses the documents and outputs the result of the parse to stdout as an array of hashes.
@@ -114,11 +220,11 @@ class RikaCommand
   # (otherwise the output would be invalid, especially with JSON or YAML).
   # Therefore, the metadata formatter is arbitrarily selected to be used by both.
   # @return [String] the string representation of the result of parsing the documents
-  private def result_array_output
-    output_hashes = targets.map do |target|
-      result = Rika.parse(target, max_content_length: max_content_length, key_sort: options[:key_sort])
-      result_hash(result)
-    end
+  def result_array_output
+    results = targets \
+      .map { |target| parse_target(target) } \
+      .reject { |target| target == :error }
+    output_hashes = results.map { |result| result_hash(result) }
     # Either the metadata or text formatter will do, since they will necessarily be the same formatter.
     metadata_formatter.call(output_hashes)
@@ -126,7 +232,7 @@ class RikaCommand
   # Tika offers a max_content_length option, but it is not exposed in Rika.
   # Instead it is used only to enable or disable the entire text output.
-  private def max_content_length
+  def max_content_length
     options[:text] ? -1 : 0
   end
@@ -136,16 +242,50 @@ class RikaCommand
   # dynamically generated by a script, and the script may not want to abort if no documents are
   # generated.
   # @return [void] or exits
-  private def report_and_exit_if_no_targets_specified
+  def report_and_exit_if_no_targets_specified
     if targets.empty?
-      $stderr.puts <<~MESSAGE
-        No targets specified.
-        #{help_text}
-      MESSAGE
+      $stderr.puts(%q{No valid targets specified. Run with '-h' option for help.})
       exit 0
     end
     nil
   end
+  # Displays information about what would happen in a dry run
+  # without actually executing the command
+  # @return [void]
+  def display_dry_run_info
+    require 'yaml'
+    # Format the targets list
+    target_list = targets.map { |target| "  #{target}" }.join("\n")
+    # Create the main output using a heredoc
+    puts <<~DRY_RUN_OUTPUT
+      DRY RUN: Showing what would happen without executing
+      Options:
+        Format: #{options[:format]} (#{format_description})
+        Output metadata: #{options[:metadata]}
+        Output text: #{options[:text]}
+        Sort metadata keys: #{options[:key_sort]}
+        Output source: #{options[:source]}
+        Output as array: #{options[:as_array]}
+      Targets to process (#{targets.size}):
+      #{target_list}
+    DRY_RUN_OUTPUT
+    if bad_targets.any?
+      puts "\nIssues found:\n#{bad_targets.to_yaml}"
+    end
+  end
+  # Returns a description of the format options
+  # @return [String] description of the format
+  def format_description
+    metadata_desc = FORMAT_DESCRIPTIONS[options[:format][0]]
+    text_desc = FORMAT_DESCRIPTIONS[options[:format][1]]
+    "#{metadata_desc} for metadata, #{text_desc} for text"
+  end
 end

data/lib/rika/parser.rb CHANGED Viewed

@@ -54,31 +54,49 @@ module Rika
     end
     # @return [Symbol] input type (currently only :file and :http are supported)
-    # @raise [IOError] if input is not a file or HTTP resource
+    # @raise [ArgumentError] if the URI format is invalid
+    # @raise [IOError] if input is not an available file or HTTP resource
     private def data_source_input_type
-      if File.file?(@data_source)
-        :file
-      elsif URI(@data_source).is_a?(URI::HTTP)
-        :http
-      else
-        raise IOError, "Input (#{@data_source}) is not an available file or HTTP resource."
+      return :file if File.file?(@data_source)
+      begin
+        uri = URI(@data_source)
+        return :http if uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
+      rescue URI::InvalidURIError => e
+        # Use ArgumentError for validation issues
+        raise ArgumentError, "Invalid URI format: #{@data_source} (#{e.message})"
       end
+      raise IOError, "Input (#{@data_source}) is not an available file or HTTP resource."
     end
-    # * Creates and opens an input stream from the configured resource.
+    # Creates a TikaInputStream from the configured resource, which provides better
+    # performance and resource management than direct streams.
     # * Yields that stream to the passed code block.
     # * Then closes the stream.
+    # TikaInputStream provides advanced features like:
+    # * Buffering and resource management
+    # * Mark/reset functionality
+    # * File tracking for temporary files
+    # * Memory efficiency for large files
     # @return [Object] the value returned by the passed code block
     private def with_input_stream
-      input_stream =
-        if @input_type == :file
-          FileInputStream.new(java.io.File.new(@data_source))
-        else
-          URL.new(@data_source).open_stream
-        end
+      input_stream = if @input_type == :file
+        file = java.io.File.new(@data_source)
+        # Use the TikaInputStream.get(File) method which is optimized for file access
+        TikaInputStream.get(file)
+      else
+        url = URL.new(@data_source)
+        # Use the TikaInputStream.get(URL) method which handles HTTP streams properly
+        TikaInputStream.get(url)
+      end
+      # Call the block with the stream
       yield input_stream
     ensure
-      input_stream.close if input_stream.respond_to?(:close)
+      # Ensure stream is closed even if exceptions occur
+      input_stream.close if input_stream && input_stream.respond_to?(:close)
     end
   end
 end

data/lib/rika/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Rika
-  VERSION = '2.1.0'
+  VERSION = '2.2.0'
 end

data/rika.gemspec CHANGED Viewed

@@ -15,7 +15,8 @@ Gem::Specification.new do |gem|
   gem.summary       = 'A JRuby wrapper for Apache Tika to extract text and metadata from files of various formats.'
   gem.homepage      = 'https://github.com/keithrbennett/rika'
   gem.files         = `git ls-files`.split($INPUT_RECORD_SEPARATOR)
-  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
+  gem.bindir        = 'exe'
+  gem.executables   = gem.files.grep(%r{^#{gem.bindir}/}).map { |f| File.basename(f) }
   gem.require_paths = ['lib']
   gem.add_dependency 'awesome_print', '~> 1.9', '>= 1.9.2'
   gem.platform = 'java'

data/spec/integration/cli_end_to_end_spec.rb ADDED Viewed

@@ -0,0 +1,212 @@
+# frozen_string_literal: true
+require 'spec_helper'
+require 'rika'
+require 'rika/cli/rika_command'
+require 'tempfile'
+require 'fileutils'
+describe 'CLI End-to-End', type: :integration do
+  # Capture stdout and stderr
+  before do
+    @original_stdout = $stdout
+    @original_stderr = $stderr
+    $stdout = StringIO.new
+    $stderr = StringIO.new
+  end
+  after do
+    $stdout = @original_stdout
+    $stderr = @original_stderr
+  end
+  # Helper to get captured stdout
+  def stdout_content
+    $stdout.string
+  end
+  # Helper to get captured stderr
+  def stderr_content
+    $stderr.string
+  end
+  # Helper to run CLI with arguments
+  def run_cli(args)
+    command = RikaCommand.new(args)
+    begin
+      command.call
+    rescue SystemExit
+      # Catch SystemExit to prevent test termination
+    end
+    command
+  end
+  context 'with various file formats' do
+    let(:txt_file) { fixture_path('document.txt') }
+    let(:pdf_file) { fixture_path('document.pdf') }
+    let(:docx_file) { fixture_path('document.docx') }
+    let(:image_file) { fixture_path('image.jpg') }
+    it 'processes a text file and returns expected output' do
+      run_cli([txt_file])
+      aggregate_failures do
+        # Check stdout for expected content
+        expect(stdout_content).to include('Stopping by Woods on a Snowy Evening')
+        expect(stdout_content).to include('Content-Type')
+        expect(stdout_content).not_to include('Error')
+      end
+    end
+    it 'processes a PDF file and returns expected output' do
+      run_cli([pdf_file])
+      aggregate_failures do
+        # Check stdout for expected content
+        expect(stdout_content).to include('Stopping by Woods on a Snowy Evening')
+        expect(stdout_content).to include('Content-Type')
+        expect(stdout_content).to include('Robert Frost')
+      end
+    end
+    it 'processes multiple files of different types in a single run' do
+      run_cli(['-a', txt_file, pdf_file, docx_file])
+      aggregate_failures do
+        # Check that all files are processed and appear in output
+        expect(stdout_content).to include(txt_file)
+        expect(stdout_content).to include(pdf_file)
+        expect(stdout_content).to include(docx_file)
+      end
+    end
+  end
+  context 'with various output format options' do
+    let(:txt_file) { fixture_path('document.txt') }
+    it 'outputs in text format' do
+      run_cli(['-ft', txt_file])
+      aggregate_failures do
+        # Check stdout for plain text format
+        expect(stdout_content).to include('Stopping by Woods on a Snowy Evening')
+        expect(stdout_content).not_to include('"content":')
+        # We can't really test for absence of YAML markers as the output format varies
+        # Just make sure it has poem content
+        expect(stdout_content).to include('Robert Frost')
+      end
+    end
+    it 'outputs in JSON format' do
+      run_cli(['-fj', txt_file])
+      aggregate_failures do
+        # Check stdout for JSON format
+        json_output = stdout_content
+        expect { JSON.parse(json_output) }.not_to raise_error
+        parsed = JSON.parse(json_output)
+        expect(parsed).to have_key('text')
+        expect(parsed).to have_key('metadata')
+      end
+    end
+    it 'outputs in YAML format' do
+      run_cli(['-fy', txt_file])
+      aggregate_failures do
+        # Check stdout for YAML format
+        yaml_output = stdout_content
+        expect { YAML.safe_load(yaml_output) }.not_to raise_error
+        parsed = YAML.safe_load(yaml_output)
+        expect(parsed).to have_key('text')
+        expect(parsed).to have_key('metadata')
+      end
+    end
+  end
+  context 'with error cases' do
+    it 'handles non-existent files gracefully' do
+      non_existent_file = 'non_existent_file.txt'
+      begin
+        # We need to explicitly pass in a file:// URL to trigger a specific error
+        # rather than letting the CLI handle the checking if the file exists
+        run_cli(["file://#{non_existent_file}"])
+      rescue => e
+        # Ignore any error
+      end
+      # For a non-existent file, the CLI should output that the file doesn't exist
+      # but might handle it in different ways
+      expect(stdout_content + stderr_content).not_to be_empty
+    end
+    it 'handles empty files gracefully' do
+      empty_file = fixture_path('empty.txt')
+      run_cli([empty_file])
+      # Instead of looking for specific error message, just verify
+      # empty file was processed or reported in some way
+      expect(stdout_content + stderr_content).not_to be_empty
+    end
+    it 'handles invalid format characters without raising an error' do
+      # Just make sure it doesn't crash with an invalid format
+      run_cli(['-fx', fixture_path('document.txt')])
+      # Either it will complain about the format or the file, but should output something
+      expect(stdout_content + stderr_content).not_to be_empty
+    end
+  end
+  context 'with additional options' do
+    let(:txt_file) { fixture_path('document.txt') }
+    it 'displays version information when requested' do
+      # Use --version flag for version info
+      run_cli(['--version'])
+      # Since we can't predict the exact output format, just check that
+      # the command runs without error and produces some output
+      expect(stdout_content).not_to be_empty
+    end
+    it 'displays help information when requested' do
+      # We don't need to check for SystemExit specifically since that's implementation-dependent
+      run_cli(['-h'])
+      # Just verify it shows help text with usage info
+      expect(stdout_content).to include('Usage:')
+    end
+  end
+  context 'with various combinations of options and files' do
+    let(:txt_file) { fixture_path('document.txt') }
+    let(:pdf_file) { fixture_path('document.pdf') }
+    it 'combines array mode with format options correctly' do
+      run_cli(['-a', '-fJ', txt_file, pdf_file])
+      aggregate_failures do
+        # Parse output as JSON
+        json_output = stdout_content
+        expect { JSON.parse(json_output) }.not_to raise_error
+        parsed = JSON.parse(json_output)
+        expect(parsed).to be_an(Array)
+        expect(parsed.size).to eq(2)
+        # Check first and second results
+        expect(parsed[0]).to be_a(Hash)
+        expect(parsed[1]).to be_a(Hash)
+        # Check contents of each result
+        [0, 1].each do |i|
+          expect(parsed[i]).to have_key('text')
+          expect(parsed[i]).to have_key('metadata')
+        end
+      end
+    end
+  end
+end