RubyGems - ing_kontoauszug_parser - Versions diffs - 0.1.0 - Mend

ing_kontoauszug_parser 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

checksums.yaml +7 -0
data/CHANGELOG.md +19 -0
data/LICENSE +21 -0
data/README.md +159 -0
data/bin/console +8 -0
data/bin/pdf_to_json +64 -0
data/bin/setup +5 -0
data/lib/ing_kontoauszug_parser/header.rb +146 -0
data/lib/ing_kontoauszug_parser/pdf_extractor.rb +233 -0
data/lib/ing_kontoauszug_parser/statement_parser.rb +269 -0
data/lib/ing_kontoauszug_parser/text_parser.rb +905 -0
data/lib/ing_kontoauszug_parser/version.rb +14 -0
data/lib/ing_kontoauszug_parser.rb +105 -0
metadata +74 -0

data/lib/ing_kontoauszug_parser/statement_parser.rb ADDED Viewed

@@ -0,0 +1,269 @@
+# frozen_string_literal: true
+require_relative 'text_parser'
+require_relative 'pdf_extractor'
+module IngKontoauszugParser
+  # The main interface for parsing ING Bank account statements.
+  #
+  # StatementParser converts PDF or text statement exports into structured
+  # Ruby hashes containing account information and individual transactions.
+  # It handles the entire pipeline: PDF text extraction, IBAN validation,
+  # transaction parsing, and SEPA reference extraction.
+  #
+  # == Quick Start
+  #
+  # For most use cases, create a parser and call {#parse} with a PDF path:
+  #
+  #   parser = IngKontoauszugParser::StatementParser.new
+  #   result = parser.parse(file_path: 'statement.pdf')
+  #
+  # == Return Structure
+  #
+  # All parsing methods return a hash with the following structure:
+  #
+  #   {
+  #     header: {
+  #       iban: "DE89 3704 0044 0532 0130 00"
+  #     },
+  #     statements: [
+  #       {
+  #         booking_date: "01.08.2025",
+  #         value_date: "01.08.2025",
+  #         transfer_type: "Lastschrift",
+  #         recipient: "Allianz Direct Vers.",
+  #         amount_eur: "-31,49",
+  #         amount_eur_numeric: "-31.49",
+  #         amount_direction: "debit",
+  #         narrative: "Versicherungsbeitrag August",
+  #         mandate_id: "MA123456",       # optional
+  #         reference: "RF123456789",     # optional
+  #         google_pay: true              # optional
+  #       },
+  #       # ... more statements
+  #     ],
+  #     warnings: [...]  # optional, only present if issues occurred
+  #   }
+  #
+  # == PDF Backend Selection
+  #
+  # The parser supports two PDF extraction methods:
+  # - +:poppler+ - Uses the +pdftotext+ command (5-10x faster, recommended)
+  # - +:pdf_reader+ - Pure Ruby extraction (no system dependencies)
+  #
+  # By default, poppler is used if available, falling back to pdf-reader.
+  #
+  # == Language Configuration
+  #
+  # The parser is configured for German ING statements by default. For other
+  # languages, override the marker and label parameters:
+  #
+  #   parser = StatementParser.new(
+  #     end_markers: ['New Balance'],
+  #     mandate_label: 'Mandate:',
+  #     reference_label: 'Reference:'
+  #   )
+  #
+  # @example Parse a PDF statement
+  #   parser = IngKontoauszugParser::StatementParser.new
+  #   result = parser.parse(file_path: '/path/to/statement.pdf')
+  #   puts "Account: #{result[:header][:iban]}"
+  #   result[:statements].each do |s|
+  #     puts "#{s[:booking_date]}: #{s[:recipient]} #{s[:amount_eur]}"
+  #   end
+  #
+  # @example Parse with specific PDF backend
+  #   parser = IngKontoauszugParser::StatementParser.new(pdf_backend: :poppler)
+  #   result = parser.parse(file_path: 'statement.pdf')
+  #
+  # @example Parse text export (for testing or text files)
+  #   parser = IngKontoauszugParser::StatementParser.new
+  #   result = parser.parse_text(File.read('statement.txt'))
+  #
+  # @example Skip IBAN validation
+  #   parser = IngKontoauszugParser::StatementParser.new(validate_iban: false)
+  class StatementParser
+    # Creates a new parser with optional language and backend configuration.
+    #
+    # All parameters have sensible defaults for German ING statements. Override
+    # as needed for other languages or specific requirements.
+    #
+    # @param end_markers [Array<String>, nil] phrases that signal the end of the
+    #   transaction list. When any of these appear, parsing stops.
+    #   Default: ["Neuer Betrag", "Neuer Saldo"] (German)
+    # @param mandate_label [String, nil] the label preceding SEPA mandate IDs in
+    #   the narrative text. Default: "Mandat:" (German)
+    # @param reference_label [String, nil] the label preceding SEPA references in
+    #   the narrative text. Default: "Referenz:" (German)
+    # @param validate_iban [Boolean] whether to verify the IBAN checksum using
+    #   ISO 13616 mod-97. Set to false for faster parsing when validation is
+    #   handled elsewhere. Default: true
+    # @param pdf_backend [Symbol, nil] force a specific PDF extraction backend:
+    #   - +:poppler+ - Use pdftotext (requires poppler-utils installed)
+    #   - +:pdf_reader+ - Use pure Ruby pdf-reader gem
+    #   - +nil+ - Auto-select fastest available (default)
+    def initialize(end_markers: nil, mandate_label: nil, reference_label: nil,
+                   validate_iban: true, pdf_backend: nil)
+      @parser_options = {
+        validate_iban: validate_iban
+      }
+      @parser_options[:end_markers] = end_markers if end_markers
+      @parser_options[:mandate_label] = mandate_label if mandate_label
+      @parser_options[:reference_label] = reference_label if reference_label
+      @pdf_backend = pdf_backend
+    end
+    # Parses an ING PDF statement file into structured data.
+    #
+    # This is the primary method for processing PDF statements. It extracts
+    # text from the PDF, locates the IBAN, and parses all transactions.
+    #
+    # @param file_path [String] path to the PDF statement file (absolute or relative)
+    # @param reader [Class, nil] custom PDF reader class for testing. When provided,
+    #   bypasses normal extraction and uses the given class directly. The class
+    #   must respond to +new(path)+ and return an object with +pages+ that yield
+    #   objects responding to +text+.
+    # @return [Hash] parsed result containing +:header+ and +:statements+ keys.
+    #   See class documentation for the complete structure.
+    # @raise [ArgumentError] if file_path is nil or empty
+    # @raise [IngKontoauszugParser::Error] if the file doesn't exist or PDF extraction fails
+    # @raise [IngKontoauszugParser::HeaderNotFound] if no IBAN is found in the statement
+    # @raise [IngKontoauszugParser::InvalidIBAN] if IBAN validation fails (when enabled)
+    # @raise [IngKontoauszugParser::BookingParseError] if no transactions can be parsed
+    def parse(file_path:, reader: nil)
+      validate_file_path!(file_path)
+      lines = if reader
+                # Legacy path: use provided reader class (for testing)
+                extract_with_reader(file_path, reader)
+              else
+                # Optimized path: use PdfExtractor with backend selection
+                PdfExtractor.extract_lines(file_path, backend: @pdf_backend)
+              end
+      text_parser.parse_lines(lines)
+    end
+    # Parses statement text directly without PDF extraction.
+    #
+    # Use this method when you have the statement as plain text (e.g., from
+    # a text export, copy-paste, or for testing). The text should contain
+    # the complete statement including the IBAN line and all transactions.
+    #
+    # @param text [String] complete statement text with IBAN and transactions
+    # @return [Hash] parsed result with the same structure as {#parse}
+    # @raise [IngKontoauszugParser::HeaderNotFound] if no IBAN is found
+    # @raise [IngKontoauszugParser::InvalidIBAN] if IBAN validation fails
+    # @raise [IngKontoauszugParser::BookingParseError] if no transactions found
+    #
+    # @example Parse text export
+    #   text = File.read('statement_export.txt')
+    #   result = parser.parse_text(text)
+    def parse_text(text)
+      text_parser.parse_text(text)
+    end
+    # Parses pre-split statement lines.
+    #
+    # Use this when you already have the statement as an array of lines
+    # (e.g., from custom preprocessing or line-by-line file reading).
+    #
+    # @param lines [Array<String>] statement lines including IBAN and transactions
+    # @return [Hash] parsed result with the same structure as {#parse}
+    # @raise [IngKontoauszugParser::HeaderNotFound] if no IBAN is found
+    # @raise [IngKontoauszugParser::InvalidIBAN] if IBAN validation fails
+    # @raise [IngKontoauszugParser::BookingParseError] if no transactions found
+    def parse_lines(lines)
+      text_parser.parse_lines(lines)
+    end
+    # Parses transaction lines without requiring an IBAN header.
+    #
+    # Use this when you have only the transaction portion of a statement
+    # (without the account header), or when the IBAN is already known from
+    # another source.
+    #
+    # Unlike {#parse}, {#parse_text}, and {#parse_lines}, this method returns
+    # only the statements array, not a hash with header information.
+    #
+    # @param lines [Array<String>] lines containing only the transactions
+    # @return [Array<Hash>] parsed transaction hashes (not wrapped in a result hash)
+    #
+    # @example Parse transactions only
+    #   transactions = parser.parse_statement_lines(booking_lines)
+    #   transactions.first[:recipient]  #=> "Amazon EU"
+    def parse_statement_lines(lines)
+      text_parser.parse_statement_lines(lines)
+    end
+    # Checks if the fast poppler PDF backend is available.
+    #
+    # Use this to conditionally inform users about installation recommendations
+    # or to choose between processing strategies.
+    #
+    # @return [Boolean] true if +pdftotext+ is found in the system PATH
+    #
+    # @example Show recommendation to user
+    #   unless StatementParser.poppler_available?
+    #     puts "Install poppler-utils for 5-10x faster PDF processing"
+    #   end
+    def self.poppler_available?
+      PdfExtractor.poppler_available?
+    end
+    private
+    # Returns a memoized TextParser instance configured with parser options.
+    # @return [TextParser]
+    # @api private
+    def text_parser
+      @text_parser ||= TextParser.new(**@parser_options)
+    end
+    # Validates that a file path is provided and the file exists.
+    #
+    # @param file_path [String, nil] the path to validate
+    # @raise [ArgumentError] if path is nil or empty
+    # @raise [IngKontoauszugParser::Error] if file doesn't exist
+    # @api private
+    def validate_file_path!(file_path)
+      raise ArgumentError, 'file_path is required' if file_path.nil? || file_path.empty?
+      return if File.exist?(file_path)
+      raise IngKontoauszugParser::Error,
+            "File not found: #{file_path}"
+    end
+    # Extracts text using a custom PDF reader class (for testing).
+    #
+    # This method provides backward compatibility with tests that inject
+    # custom PDF reader implementations. Production code should use
+    # {PdfExtractor} directly.
+    #
+    # @param file_path [String] path to the PDF file
+    # @param reader_class [Class] PDF reader class to use
+    # @return [Array<String>] extracted text lines
+    # @raise [IngKontoauszugParser::Error] if PDF extraction fails
+    # @api private
+    def extract_with_reader(file_path, reader_class)
+      pdf = reader_class.new(file_path)
+      lines = []
+      pdf.pages.each do |page|
+        next unless page.respond_to?(:text)
+        page.text.to_s.each_line do |line|
+          lines << line.rstrip
+        end
+      end
+      lines
+    rescue StandardError => e
+      # Wrap PDF-related errors for consistent API
+      raise IngKontoauszugParser::Error, e.message if e.class.name.start_with?('PDF::Reader::')
+      raise
+    end
+  end
+end