RubyGems - acroforge - Versions diffs - 0.1.0 - Mend

acroforge 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

checksums.yaml +7 -0
data/CHANGELOG.md +11 -0
data/LICENSE.txt +21 -0
data/README.md +217 -0
data/Rakefile +10 -0
data/acroforge.gemspec +37 -0
data/exe/acroforge +5 -0
data/lib/acroforge/all_text_processor.rb +126 -0
data/lib/acroforge/annotator.rb +137 -0
data/lib/acroforge/cli.rb +351 -0
data/lib/acroforge/constants.rb +46 -0
data/lib/acroforge/engine.rb +869 -0
data/lib/acroforge/labels.rb +112 -0
data/lib/acroforge/preparer.rb +103 -0
data/lib/acroforge/relabeler.rb +179 -0
data/lib/acroforge/schema.rb +208 -0
data/lib/acroforge/validator.rb +37 -0
data/lib/acroforge/version.rb +5 -0
data/lib/acroforge.rb +18 -0
data/sig/acroforge.rbs +4 -0
metadata +81 -0

data/lib/acroforge/labels.rb ADDED Viewed

@@ -0,0 +1,112 @@
+# frozen_string_literal: true
+require "set"
+require_relative "constants"
+module AcroForge
+  # Cleans up human-readable labels extracted from PDFs.
+  #
+  # PDF text extraction often produces broken word fragments (e.g. a ligature
+  # like "fi" gets split, producing "Tax Identi fi cation No.") and labels
+  # rendered in different casing conventions across vendors (ALL UPPER, mixed,
+  # sentence case). This module normalizes both: it fixes the typo fragments
+  # using Constants::TYPO_PHRASE_REPLACEMENTS and converts the result to
+  # consistent title case. Used by Engine, Schema, and Relabeler so the same
+  # corrections appear in the verbose log, schema variations, and mapping meta.
+  module Labels
+    module_function
+    # Words that conventionally stay lowercase inside a title (except when
+    # they're the first or last word). Kept as a Set because title_case only
+    # ever asks "is this word a connector?", and frozen so the shared list
+    # cannot be mutated at runtime.
+    TITLE_CASE_CONNECTORS = %w[
+      a an the and but or nor for so yet
+      of at by in on to up from with as vs
+    ].to_set.freeze
+    def humanize(label)
+      return label unless label.is_a?(String) && !label.empty?
+      result = fix_typos(label)
+      result = title_case(result)
+      result.gsub(/\s+/, " ").strip
+    end
+    # Fix snake_case typo patterns from Constants::TYPO_PHRASE_REPLACEMENTS in
+    # the human-readable label too. "Identi fi cation" -> "Identification".
+    def fix_typos(label)
+      result = label.dup
+      Constants::TYPO_PHRASE_REPLACEMENTS.each do |bad, good|
+        parts = bad.split("_").reject(&:empty?).map { |p| Regexp.escape(p) }
+        next if parts.empty?
+        pattern = /\b#{parts.join('\s+')}\b/i
+        result = result.gsub(pattern) do |match|
+          replacement = good.tr("_", " ")
+          if match[0] == match[0].upcase
+            replacement[0].upcase + (replacement[1..] || "")
+          else
+            replacement
+          end
+        end
+      end
+      result
+    end
+    # Convert a label to standard title case:
+    #   - First and last words always capitalized
+    #   - Conventional connectors (of, the, to, ...) lowercased mid-label
+    #   - All other words capitalized on the first letter
+    #   - Short all-uppercase tokens (<= 3 chars) preserved as acronyms
+    #     (GHC, DOB, PDF, ID stay as-is; "DDMMYYYY" also preserved as a format)
+    #   - A word immediately following an opening "(" or "[" is treated as
+    #     starting a fresh title, so its first letter capitalizes even if it
+    #     would otherwise be a connector ("(For Disbursement)" not "(for ...)")
+    def title_case(text)
+      words = text.split(/(\s+)/)  # preserve whitespace between words
+      content_indices = words.each_index.select { |i| words[i].match?(/\S/) }
+      first_idx = content_indices.first
+      last_idx = content_indices.last
+      words.each_with_index.map do |word, i|
+        next word if word.match?(/^\s*$/)
+        if acronym?(word)
+          word
+        elsif i == first_idx || i == last_idx || word.start_with?("(", "[", '"', "'")
+          capitalize_first(word)
+        elsif TITLE_CASE_CONNECTORS.include?(strip_punct(word).downcase)
+          word.downcase
+        else
+          capitalize_first(word)
+        end
+      end.join
+    end
+    # Treat as an acronym if it's all uppercase AND short (<= 3 chars), OR
+    # if it has 4+ all-upper letters AND looks like a format/code rather than
+    # a word (e.g., "DDMMYYYY"). Mixed letters & digits also count.
+    def acronym?(word)
+      core = strip_punct(word)
+      return false if core.empty?
+      return true if core.length <= 3 && core == core.upcase && core.match?(/[A-Z]/)
+      # Longer all-upper tokens: keep as acronym only if they contain digits
+      # (e.g. "DDMMYYYY", "ID2024") or repeat a pattern that suggests format code.
+      return true if core.length >= 4 && core == core.upcase && core.match?(/\d|^([A-Z])\1+/)
+      false
+    end
+    def strip_punct(word)
+      word.gsub(/[[:punct:]]/, "")
+    end
+    def capitalize_first(word)
+      return word if word.empty?
+      # Preserve trailing/embedded punctuation; only fix the casing of the
+      # alphabetic part.
+      word.sub(/^([[:punct:]]*)([A-Za-z])(.*)$/) do
+        prefix = ::Regexp.last_match(1)
+        first = ::Regexp.last_match(2).upcase
+        rest = ::Regexp.last_match(3).downcase
+        "#{prefix}#{first}#{rest}"
+      end
+    end
+  end
+end

data/lib/acroforge/preparer.rb ADDED Viewed

@@ -0,0 +1,103 @@
+# frozen_string_literal: true
+require "tmpdir"
+require "hexapdf"
+require_relative "engine"
+module AcroForge
+  # Resolves PDF-internal naming conflicts so they don't get in the way of
+  # the human-review workflow.
+  #
+  # Some PDFs have multiple AcroForm fields sharing the same :T name (e.g.,
+  # three separate fields all literally named "date"). YAML mappings can't
+  # represent that cleanly — the engine has to fall back to synthetic
+  # "date#1", "date#2" suffixes. Preparer mutates the PDF up front to give
+  # each duplicate a unique name based on the spatial heuristic's proposal,
+  # so subsequent commands (bootstrap, relabel apply) see a clean PDF.
+  #
+  # Single responsibility: rename duplicate-named fields. Fields with
+  # already-unique names are never touched, regardless of what the heuristic
+  # proposes for them.
+  module Preparer
+    module_function
+    def prepare!(pdf_path, out: nil, schema: {})
+      out ||= pdf_path
+      proposals = nil
+      Dir.mktmpdir do |tmp|
+        engine = AcroForge::Engine.new(pdf_path, schema: schema, normalized_dir: tmp)
+        engine.compile!
+        proposals = engine.field_proposals
+      end
+      # Group proposals by their ORIGINAL field name (strip any #N suffix).
+      # Any name appearing more than once is a duplicate that needs resolving.
+      grouped = proposals.group_by { |p| base_name(p[:pdf_field_name]) }
+      duplicates = grouped.select { |_, occs| occs.length > 1 }
+      if duplicates.empty?
+        # Nothing to do. Don't rewrite the file when out == in.
+        FileUtils.cp(pdf_path, out) if out != pdf_path
+        return {duplicate_groups: 0, renamed: 0, skipped: 0, out_path: out}
+      end
+      doc = HexaPDF::Document.open(pdf_path)
+      form = doc.acro_form(create: false)
+      raise RelabelError, "PDF has no AcroForm: #{pdf_path}" unless form
+      field_index = AcroForge::Engine.field_index(form)
+      # Names already in use by NON-duplicate fields; we can't collide with them.
+      reserved = field_index.keys.reject { |k| k.include?("#") }.to_set
+      duplicates.each_key { |base| reserved.delete(base) }
+      renamed = 0
+      skipped = 0
+      duplicates.each_value do |occurrences|
+        occurrences.each do |proposal|
+          field = field_index[proposal[:pdf_field_name]]
+          unless field
+            skipped += 1
+            next
+          end
+          proposed = proposal[:canonical_key]
+          unless proposed
+            skipped += 1
+            next
+          end
+          target = unique_target(proposed.to_s, reserved)
+          reserved.add(target)
+          field[:T] = target
+          field[:TU] = target
+          renamed += 1
+        end
+      end
+      doc.write(out)
+      {
+        duplicate_groups: duplicates.size,
+        renamed: renamed,
+        skipped: skipped,
+        out_path: out
+      }
+    end
+    def base_name(synthetic_name)
+      synthetic_name.to_s.sub(/#\d+\z/, "")
+    end
+    def unique_target(target, reserved)
+      return target unless reserved.include?(target)
+      counter = 1
+      loop do
+        candidate = "#{target}_#{counter}"
+        return candidate unless reserved.include?(candidate)
+        counter += 1
+      end
+    end
+  end
+end

data/lib/acroforge/relabeler.rb ADDED Viewed

@@ -0,0 +1,179 @@
+# frozen_string_literal: true
+require "yaml"
+require "tmpdir"
+require "time"
+require "hexapdf"
+require_relative "engine"
+require_relative "schema"
+require_relative "version"
+module AcroForge
+  class RelabelError < StandardError; end
+  module Relabeler
+    module_function
+    KEY_REGEX = /\A[a-z][a-z0-9_]*\z/
+    def apply!(pdf_path, mapping_path)
+      data = YAML.load_file(mapping_path) || {}
+      entries = data.reject { |k, _| k.to_s.start_with?("_") }
+      validate!(entries)
+      doc = HexaPDF::Document.open(pdf_path)
+      form = doc.acro_form(create: false)
+      raise RelabelError, "PDF has no AcroForm: #{pdf_path}" unless form
+      renamed = 0
+      disambiguated = 0
+      skipped_null = 0
+      stale = 0
+      # Build a synthetic-name -> field index using the same naming scheme
+      # the engine emits during compile!. This handles PDFs where multiple
+      # fields share the same :T name: the mapping refers to "date",
+      # "date#1", "date#2", and each one resolves to the right field.
+      field_index = AcroForge::Engine.field_index(form)
+      claimed = {}
+      entries.each do |pdf_name, entry|
+        key = entry["key"]
+        if key.nil? || key.to_s.empty?
+          skipped_null += 1
+          next
+        end
+        field = field_index[pdf_name]
+        unless field
+          stale += 1
+          warn "acroforge: stale entry #{pdf_name.inspect} not found in PDF (skipping)"
+          next
+        end
+        target = key.to_s
+        counter = 1
+        while claimed.key?(target)
+          target = "#{key}_#{counter}"
+          counter += 1
+        end
+        disambiguated += 1 if target != key.to_s
+        claimed[target] = true
+        field[:T] = target
+        field[:TU] = target
+        renamed += 1
+      end
+      doc.write(pdf_path)
+      {
+        total: entries.size,
+        renamed: renamed,
+        disambiguated: disambiguated,
+        skipped_null: skipped_null,
+        stale: stale
+      }
+    end
+    def validate!(entries)
+      entries.each do |pdf_name, entry|
+        raise RelabelError, "reserved sentinel: #{pdf_name.inspect}" if pdf_name.to_s.start_with?("_")
+        key = entry["key"]
+        next if key.nil? || key.to_s.empty?
+        unless key.to_s.match?(KEY_REGEX)
+          raise RelabelError, "invalid key #{key.inspect} for field #{pdf_name.inspect}: must match #{KEY_REGEX.inspect}"
+        end
+      end
+    end
+    # Write a mapping YAML proposing semantic names for every AcroForm field.
+    #
+    # If `engine:` is given, the caller has already compiled an engine and
+    # we use its proposals directly (no second compile). This lets callers
+    # like the CLI's `bootstrap` subcommand share one compile pass with
+    # Schema.infer instead of running the engine twice.
+    def propose(pdf_path, out:, schema: {}, mode: :merge, engine: nil)
+      existing = (mode == :merge && File.exist?(out)) ? YAML.load_file(out) : nil
+      proposals = if engine
+        engine.field_proposals
+      else
+        Dir.mktmpdir do |tmp|
+          e = AcroForge::Engine.new(pdf_path, schema: schema, normalized_dir: tmp)
+          e.compile!
+          e.field_proposals
+        end
+      end
+      sorted = proposals.sort_by { |p| [p[:page], -p[:y], p[:x]] }
+      entries = sorted.each_with_object({}) do |p, acc|
+        acc[p[:pdf_field_name]] = build_entry(p, existing&.[](p[:pdf_field_name]))
+      end
+      File.write(out, render_yaml(pdf_path, entries))
+      mapped = entries.values.count { |e| !e["key"].nil? && !e["key"].to_s.empty? }
+      {
+        total: entries.size,
+        mapped: mapped,
+        unmapped: entries.size - mapped,
+        out_path: out
+      }
+    end
+    def build_entry(proposal, prior)
+      proposed_key = proposal[:canonical_key]&.to_s
+      proposed_type = infer_type(proposal).to_s
+      key_value = prior&.key?("key") ? prior["key"] : proposed_key
+      type_value = prior&.key?("type") ? prior["type"] : proposed_type
+      meta = {
+        "raw_label" => AcroForge::Schema.humanize_label(proposal[:raw_label]),
+        "confidence" => proposal[:confidence].to_s,
+        "section" => proposal[:section]&.to_s,
+        "page" => proposal[:page]
+      }
+      options = proposal[:options]&.transform_keys(&:to_s)
+      meta["options"] = options if options
+      {
+        "key" => key_value,
+        "type" => type_value,
+        "meta" => meta
+      }
+    end
+    def infer_type(proposal)
+      case proposal[:pdf_field_type]
+      when :button
+        ((proposal[:options]&.size || 0) > 1) ? :select : :boolean
+      when :choice
+        :select
+      else
+        label = proposal[:raw_label].to_s.downcase
+        case label
+        when /amount|salary|income|balance|fee|tier3/ then :money
+        when /\bdate\b|birth|expiry|employed/ then :date
+        when /email/ then :email
+        when /years|tenor|number of|\bno\.?\b/ then :number
+        else :string
+        end
+      end
+    end
+    def render_yaml(pdf_path, entries)
+      banner = {
+        "_meta" => {
+          "source_pdf" => pdf_path,
+          "generated_at" => Time.now.utc.iso8601,
+          "acroforge_version" => AcroForge::VERSION,
+          "total_fields" => entries.size
+        }
+      }
+      YAML.dump(banner.merge(entries))
+    end
+  end
+end

data/lib/acroforge/schema.rb ADDED Viewed

@@ -0,0 +1,208 @@
+# frozen_string_literal: true
+require "yaml"
+require "json"
+require_relative "engine"
+require_relative "labels"
+module AcroForge
+  module Schema
+    module_function
+    def load(path)
+      raw = case File.extname(path).downcase
+      when ".yml", ".yaml"
+        YAML.safe_load_file(path, permitted_classes: [Symbol], aliases: true)
+      when ".json"
+        JSON.parse(File.read(path), symbolize_names: false)
+      else
+        raise ArgumentError, "unknown schema file extension: #{path.inspect}"
+      end
+      normalize(symbolize_schema(raw))
+    end
+    def dump(schema, path)
+      stringified = stringify_schema(schema)
+      case File.extname(path).downcase
+      when ".yml", ".yaml"
+        File.write(path, YAML.dump(stringified))
+      when ".json"
+        File.write(path, JSON.pretty_generate(stringified))
+      else
+        raise ArgumentError, "unknown schema file extension: #{path.inspect}"
+      end
+    end
+    def symbolize_schema(raw_hash)
+      return {} if raw_hash.nil? || raw_hash.empty?
+      raw_hash.each_with_object({}) do |(key, value), out|
+        out[key.to_sym] = symbolize_entry(value)
+      end
+    end
+    def symbolize_entry(entry)
+      return entry unless entry.is_a?(Hash)
+      result = {}
+      entry.each do |k, v|
+        sym_k = k.to_sym
+        result[sym_k] = case sym_k
+        when :type
+          v.is_a?(String) ? v.to_sym : v
+        when :options
+          if v.is_a?(Array)
+            v.map { |item| item.is_a?(String) ? item.to_sym : item }
+          else
+            v
+          end
+        else
+          v
+        end
+      end
+      result
+    end
+    def stringify_schema(schema)
+      schema.each_with_object({}) do |(key, value), out|
+        out[key.to_s] = stringify_entry(value)
+      end
+    end
+    def stringify_entry(entry)
+      return entry unless entry.is_a?(Hash)
+      result = {}
+      entry.each do |k, v|
+        str_k = k.to_s
+        result[str_k] = case k.to_sym
+        when :type
+          v.is_a?(Symbol) ? v.to_s : v
+        when :options
+          if v.is_a?(Array)
+            v.map { |item| item.is_a?(Symbol) ? item.to_s : item }
+          else
+            v
+          end
+        else
+          v
+        end
+      end
+      result
+    end
+    # Infer a schema from a PDF.
+    #
+    # If `engine:` is given, the caller has already compiled an engine and
+    # we use its proposals directly. This lets callers (notably the CLI's
+    # `bootstrap` subcommand) avoid a redundant second compile when they
+    # also want to call Relabeler.propose on the same PDF.
+    def infer(pdf_path, sections: [], engine: nil)
+      return aggregate_proposals(engine.field_proposals) if engine
+      require "tmpdir"
+      Dir.mktmpdir do |tmp|
+        e = AcroForge::Engine.new(pdf_path, sections: sections, normalized_dir: tmp)
+        e.compile!
+        aggregate_proposals(e.field_proposals)
+      end
+    end
+    def aggregate_proposals(proposals)
+      proposals.each_with_object({}) do |p, schema|
+        next if p[:canonical_key].nil?
+        key = p[:canonical_key].to_sym
+        schema[key] ||= {type: infer_type(p), variations: []}
+        if p[:raw_label]
+          cleaned = humanize_label(p[:raw_label])
+          schema[key][:variations] << cleaned unless schema[key][:variations].include?(cleaned)
+        end
+        if p[:pdf_field_type] == :button && p[:options]
+          schema[key][:options] = p[:options].keys.map(&:to_sym).uniq
+        end
+      end
+    end
+    # Thin delegator. The real implementation lives in AcroForge::Labels so
+    # the Engine can apply it at the source (right after the spatial heuristic
+    # picks a label) without creating a circular require between engine.rb
+    # and schema.rb.
+    #
+    # Returns whatever Labels.humanize returns for +label+: the cleaned-up,
+    # title-cased label, or +label+ itself when it is not a non-empty String.
+    def humanize_label(label)
+      AcroForge::Labels.humanize(label)
+    end
+    def infer_type(proposal)
+      case proposal[:pdf_field_type]
+      when :button
+        ((proposal[:options]&.size || 0) > 1) ? :select : :boolean
+      when :choice
+        :select
+      else
+        label = proposal[:raw_label].to_s.downcase
+        case label
+        when /amount|salary|income|balance|fee|tier3/ then :money
+        when /\bdate\b|birth|expiry|employed/ then :date
+        when /email/ then :email
+        when /years|tenor|number of|\bno\.?\b/ then :number
+        else :string
+        end
+      end
+    end
+    # Merge a mapping file's hand-reviewed decisions back into a schema.
+    # Each non-null mapping entry contributes a canonical key (stripped of
+    # any _N collision suffix), its type, and its raw_label as a variation.
+    # Existing schema entries keep their type but gain new variations;
+    # missing entries are created. Returns the merged schema hash.
+    def merge(schema, mapping_entries)
+      result = schema.each_with_object({}) do |(k, v), out|
+        out[k] = v.is_a?(Hash) ? v.dup.tap { |d| d[:variations] = (d[:variations] || []).dup } : v
+      end
+      mapping_entries.each do |pdf_field_name, entry|
+        next if pdf_field_name.to_s.start_with?("_")
+        next unless entry.is_a?(Hash)
+        key_str = entry["key"]
+        next if key_str.nil? || key_str.to_s.empty?
+        # Strip the _N collision suffix the engine appends when multiple
+        # fields map to the same canonical key (full_name_1, full_name_2).
+        canonical = key_str.to_s.sub(/_\d+\z/, "").to_sym
+        type_str = entry["type"]
+        type_sym = type_str.is_a?(String) ? type_str.to_sym : type_str
+        result[canonical] ||= {type: type_sym || :string, variations: []}
+        # Don't overwrite an existing type unless one is actually given
+        result[canonical][:type] = type_sym if type_sym
+        raw_label = entry.dig("meta", "raw_label")
+        if raw_label && !raw_label.to_s.empty?
+          variations = result[canonical][:variations] ||= []
+          variations << raw_label.to_s unless variations.include?(raw_label.to_s)
+        end
+      end
+      result
+    end
+    def normalize(input)
+      return {} if input.nil? || input.empty?
+      input.each_with_object({}) do |(key, value), out|
+        out[key] = case value
+        when Array
+          {type: :string, variations: value}
+        when Hash
+          value
+        else
+          raise ArgumentError, "Schema entry for #{key.inspect} must be an Array or Hash, got #{value.class}"
+        end
+      end
+    end
+  end
+end

data/lib/acroforge/validator.rb ADDED Viewed

@@ -0,0 +1,37 @@
+# frozen_string_literal: true
+require "date"
+require "uri"
+module AcroForge
+  class ValidationError < StandardError; end
+  module Validator
+    def self.valid?(value, type, options = [])
+      return true if value.nil? || value.to_s.empty?
+      case type
+      when :money
+        value.to_s.gsub(/[$,]/, "").match?(/^\d+(\.\d+)?$/)
+      when :date
+        begin
+          Date.parse(value.to_s)
+          true
+        rescue ArgumentError, TypeError
+          false
+        end
+      when :email
+        value.to_s.match?(URI::MailTo::EMAIL_REGEXP)
+      when :number
+        value.to_s.gsub(/[\s-]/, "").match?(/^\d+$/)
+      when :boolean
+        ["true", "false", "yes", "no", "1", "0", "on", "off"].include?(value.to_s.downcase)
+      when :select
+        val_str = value.to_s.downcase
+        options.any? { |o| o.to_s.downcase == val_str }
+      else
+        true
+      end
+    end
+  end
+end

data/lib/acroforge/version.rb ADDED Viewed

@@ -0,0 +1,5 @@
+# frozen_string_literal: true
+module AcroForge
+  # Version of the gem. Also written into the "_meta" banner of generated
+  # mapping files as "acroforge_version".
+  VERSION = "0.1.0"
+end

data/lib/acroforge.rb ADDED Viewed

@@ -0,0 +1,18 @@
+# frozen_string_literal: true
+require_relative "acroforge/version"
+require_relative "acroforge/constants"
+module AcroForge
+  # Generic error class for the gem.
+  # NOTE(review): RelabelError and ValidationError, as declared in
+  # relabeler.rb and validator.rb, inherit from StandardError directly and
+  # not from this class, so rescuing AcroForge::Error does not catch them.
+  class Error < StandardError; end
+end
+require_relative "acroforge/all_text_processor"
+require_relative "acroforge/labels"
+require_relative "acroforge/validator"
+require_relative "acroforge/engine"
+require_relative "acroforge/schema"
+require_relative "acroforge/relabeler"
+require_relative "acroforge/annotator"
+require_relative "acroforge/preparer"
+require_relative "acroforge/cli"

data/sig/acroforge.rbs ADDED Viewed

@@ -0,0 +1,4 @@
+module AcroForge
+  # Gem version string (assigned in lib/acroforge/version.rb).
+  VERSION: String
+  # See the writing guide of rbs: https://github.com/ruby/rbs#guides
+end