RubyGems - acro_that - Versions diffs - 0.1.1 → 0.1.3 - Mend

acro_that 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +16 -0
data/Gemfile.lock +1 -1
data/README.md +49 -0
data/docs/README.md +12 -0
data/docs/clear_fields.md +202 -0
data/issues/README.md +38 -0
data/issues/refactoring-opportunities.md +269 -0
data/lib/acro_that/actions/add_field.rb +2 -55
data/lib/acro_that/actions/add_signature_appearance.rb +3 -3
data/lib/acro_that/actions/base.rb +4 -0
data/lib/acro_that/actions/remove_field.rb +1 -5
data/lib/acro_that/dict_scan.rb +7 -0
data/lib/acro_that/document.rb +480 -45
data/lib/acro_that/version.rb +1 -1
data/lib/acro_that.rb +1 -0
data/publish +183 -0
metadata +5 -1

data/lib/acro_that/document.rb CHANGED Viewed

@@ -71,62 +71,185 @@ module AcroThat
       self
     end
-    # Return an array of Field(name, value, type, ref)
-    def list_fields
-      fields = []
-      field_widgets = {}
-      widgets_by_name = {}
+    # Return an array of page information (page number, width, height, ref, metadata)
+    def list_pages
+      pages = []
+      page_objects = find_all_pages
+      # Second pass: extract information from each page
+      page_objects.each_with_index do |ref, index|
+        body = @resolver.object_body(ref)
+        next unless body
+        # Extract MediaBox, CropBox, or ArtBox for dimensions
+        width = nil
+        height = nil
+        media_box = nil
+        crop_box = nil
+        art_box = nil
+        bleed_box = nil
+        trim_box = nil
+        # Try MediaBox first (most common)
+        if body =~ %r{/MediaBox\s*\[(.*?)\]}
+          box_values = ::Regexp.last_match(1).scan(/[-+]?\d*\.?\d+/).map(&:to_f)
+          if box_values.length == 4
+            llx, lly, urx, ury = box_values
+            width = urx - llx
+            height = ury - lly
+            media_box = { llx: llx, lly: lly, urx: urx, ury: ury }
+          end
+        end
-      # First pass: collect widget information
-      @resolver.each_object do |ref, body|
-        next unless DictScan.is_widget?(body)
+        # Try CropBox
+        if body =~ %r{/CropBox\s*\[(.*?)\]}
+          box_values = ::Regexp.last_match(1).scan(/[-+]?\d*\.?\d+/).map(&:to_f)
+          if box_values.length == 4
+            llx, lly, urx, ury = box_values
+            crop_box = { llx: llx, lly: lly, urx: urx, ury: ury }
+          end
+        end
-        # Extract position from widget
-        rect_tok = DictScan.value_token_after("/Rect", body)
-        next unless rect_tok && rect_tok.start_with?("[")
+        # Try ArtBox
+        if body =~ %r{/ArtBox\s*\[(.*?)\]}
+          box_values = ::Regexp.last_match(1).scan(/[-+]?\d*\.?\d+/).map(&:to_f)
+          if box_values.length == 4
+            llx, lly, urx, ury = box_values
+            art_box = { llx: llx, lly: lly, urx: urx, ury: ury }
+          end
+        end
-        # Parse [x y x+width y+height] format
-        rect_values = rect_tok.scan(/[-+]?\d*\.?\d+/).map(&:to_f)
-        next unless rect_values.length == 4
+        # Try BleedBox
+        if body =~ %r{/BleedBox\s*\[(.*?)\]}
+          box_values = ::Regexp.last_match(1).scan(/[-+]?\d*\.?\d+/).map(&:to_f)
+          if box_values.length == 4
+            llx, lly, urx, ury = box_values
+            bleed_box = { llx: llx, lly: lly, urx: urx, ury: ury }
+          end
+        end
-        x, y, x2, y2 = rect_values
-        width = x2 - x
-        height = y2 - y
+        # Try TrimBox
+        if body =~ %r{/TrimBox\s*\[(.*?)\]}
+          box_values = ::Regexp.last_match(1).scan(/[-+]?\d*\.?\d+/).map(&:to_f)
+          if box_values.length == 4
+            llx, lly, urx, ury = box_values
+            trim_box = { llx: llx, lly: lly, urx: urx, ury: ury }
+          end
+        end
-        page_num = nil
-        if body =~ %r{/P\s+(\d+)\s+(\d+)\s+R}
-          page_ref = [Integer(::Regexp.last_match(1)), Integer(::Regexp.last_match(2))]
-          page_num = find_page_number_for_ref(page_ref)
+        # Extract rotation
+        rotate = nil
+        if body =~ %r{/Rotate\s+(\d+)}
+          rotate = Integer(::Regexp.last_match(1))
         end
-        widget_info = {
-          x: x, y: y, width: width, height: height, page: page_num
-        }
+        # Extract Resources reference
+        resources_ref = nil
+        if body =~ %r{/Resources\s+(\d+)\s+(\d+)\s+R}
+          resources_ref = [Integer(::Regexp.last_match(1)), Integer(::Regexp.last_match(2))]
+        end
+        # Extract Parent reference
+        parent_ref = nil
         if body =~ %r{/Parent\s+(\d+)\s+(\d+)\s+R}
           parent_ref = [Integer(::Regexp.last_match(1)), Integer(::Regexp.last_match(2))]
-          field_widgets[parent_ref] ||= []
-          field_widgets[parent_ref] << widget_info
         end
-        next unless body.include?("/T")
-        t_tok = DictScan.value_token_after("/T", body)
-        if t_tok
-          widget_name = DictScan.decode_pdf_string(t_tok)
-          if widget_name && !widget_name.empty?
-            widgets_by_name[widget_name] ||= []
-            widgets_by_name[widget_name] << widget_info
+        # Extract Contents reference(s)
+        contents_refs = []
+        if body =~ %r{/Contents\s+(\d+)\s+(\d+)\s+R}
+          contents_refs << [Integer(::Regexp.last_match(1)), Integer(::Regexp.last_match(2))]
+        elsif body =~ %r{/Contents\s*\[(.*?)\]}
+          contents_array = ::Regexp.last_match(1)
+          contents_array.scan(/(\d+)\s+(\d+)\s+R/) do |num_str, gen_str|
+            contents_refs << [num_str.to_i, gen_str.to_i]
           end
         end
+        # Build metadata hash
+        metadata = {
+          rotate: rotate,
+          media_box: media_box,
+          crop_box: crop_box,
+          art_box: art_box,
+          bleed_box: bleed_box,
+          trim_box: trim_box,
+          resources_ref: resources_ref,
+          parent_ref: parent_ref,
+          contents_refs: contents_refs
+        }
+        pages << {
+          page: index + 1, # Page number starting at 1
+          width: width,
+          height: height,
+          ref: ref,
+          metadata: metadata
+        }
       end
-      # Second pass: collect all fields (both field objects and widget annotations with /T)
+      pages
+    end
+    # Return an array of Field(name, value, type, ref)
+    def list_fields
+      fields = []
+      field_widgets = {}
+      widgets_by_name = {}
+      # First pass: collect widget information
       @resolver.each_object do |ref, body|
-        next unless body&.include?("/T")
+        next unless body
+        is_widget = DictScan.is_widget?(body)
+        # Collect widget information if this is a widget
+        if is_widget
+          # Extract position from widget
+          rect_tok = DictScan.value_token_after("/Rect", body)
+          if rect_tok && rect_tok.start_with?("[")
+            # Parse [x y x+width y+height] format
+            rect_values = rect_tok.scan(/[-+]?\d*\.?\d+/).map(&:to_f)
+            if rect_values.length == 4
+              x, y, x2, y2 = rect_values
+              width = x2 - x
+              height = y2 - y
-        is_widget_field = DictScan.is_widget?(body)
+              page_num = nil
+              if body =~ %r{/P\s+(\d+)\s+(\d+)\s+R}
+                page_ref = [Integer(::Regexp.last_match(1)), Integer(::Regexp.last_match(2))]
+                page_num = find_page_number_for_ref(page_ref)
+              end
+              widget_info = {
+                x: x, y: y, width: width, height: height, page: page_num
+              }
+              if body =~ %r{/Parent\s+(\d+)\s+(\d+)\s+R}
+                parent_ref = [Integer(::Regexp.last_match(1)), Integer(::Regexp.last_match(2))]
+                field_widgets[parent_ref] ||= []
+                field_widgets[parent_ref] << widget_info
+              end
+              if body.include?("/T")
+                t_tok = DictScan.value_token_after("/T", body)
+                if t_tok
+                  widget_name = DictScan.decode_pdf_string(t_tok)
+                  if widget_name && !widget_name.empty?
+                    widgets_by_name[widget_name] ||= []
+                    widgets_by_name[widget_name] << widget_info
+                  end
+                end
+              end
+            end
+          end
+        end
+        # Second pass: collect all fields (both field objects and widget annotations with /T)
+        next unless body.include?("/T")
+        is_widget_field = is_widget
         hint = body.include?("/FT") || is_widget_field || body.include?("/Kids") || body.include?("/Parent")
         next unless hint
@@ -143,8 +266,7 @@ module AcroThat
         type = ft_tok
         position = {}
-        is_widget_annot = DictScan.is_widget?(body)
-        if is_widget_annot
+        if is_widget
           rect_tok = DictScan.value_token_after("/Rect", body)
           if rect_tok && rect_tok.start_with?("[")
             rect_values = rect_tok.scan(/[-+]?\d*\.?\d+/).map(&:to_f)
@@ -270,8 +392,260 @@ module AcroThat
       field.remove
     end
+    # Clean up the PDF by removing unwanted fields.
+    # Options:
+    #   - keep_fields: Array of field names to keep (all others removed)
+    #   - remove_fields: Array of field names to remove
+    #   - remove_pattern: Regex pattern - fields matching this are removed
+    #   - block: Given field name, return true to keep, false to remove
+    # This rewrites the entire PDF (like flatten) but excludes the unwanted fields.
+    def clear(keep_fields: nil, remove_fields: nil, remove_pattern: nil)
+      root_ref = @resolver.root_ref
+      raise "Cannot clear: no /Root found" unless root_ref
+      # Build a set of fields to remove
+      fields_to_remove = Set.new
+      # Get all current fields
+      all_fields = list_fields
+      if block_given?
+        # Use block to determine which fields to keep
+        all_fields.each do |field|
+          fields_to_remove.add(field.name) unless yield(field.name)
+        end
+      elsif keep_fields
+        # Keep only specified fields
+        keep_set = Set.new(keep_fields.map(&:to_s))
+        all_fields.each do |field|
+          fields_to_remove.add(field.name) unless keep_set.include?(field.name)
+        end
+      elsif remove_fields
+        # Remove specified fields
+        remove_set = Set.new(remove_fields.map(&:to_s))
+        all_fields.each do |field|
+          fields_to_remove.add(field.name) if remove_set.include?(field.name)
+        end
+      elsif remove_pattern
+        # Remove fields matching pattern
+        all_fields.each do |field|
+          fields_to_remove.add(field.name) if field.name =~ remove_pattern
+        end
+      else
+        # No criteria specified, return original
+        return @raw
+      end
+      # Build sets of refs to exclude
+      field_refs_to_remove = Set.new
+      widget_refs_to_remove = Set.new
+      all_fields.each do |field|
+        next unless fields_to_remove.include?(field.name)
+        field_refs_to_remove.add(field.ref) if field.valid_ref?
+        # Find all widget annotations for this field
+        @resolver.each_object do |widget_ref, body|
+          next unless body && DictScan.is_widget?(body)
+          next if widget_ref == field.ref
+          # Match by /Parent reference
+          if body =~ %r{/Parent\s+(\d+)\s+(\d+)\s+R}
+            widget_parent_ref = [Integer(::Regexp.last_match(1)), Integer(::Regexp.last_match(2))]
+            if widget_parent_ref == field.ref
+              widget_refs_to_remove.add(widget_ref)
+              next
+            end
+          end
+          # Also match by field name (/T)
+          next unless body.include?("/T")
+          t_tok = DictScan.value_token_after("/T", body)
+          next unless t_tok
+          widget_name = DictScan.decode_pdf_string(t_tok)
+          if widget_name && widget_name == field.name
+            widget_refs_to_remove.add(widget_ref)
+          end
+        end
+      end
+      # Collect objects to write (excluding removed fields and widgets)
+      objects = []
+      @resolver.each_object do |ref, body|
+        next if field_refs_to_remove.include?(ref)
+        next if widget_refs_to_remove.include?(ref)
+        next unless body
+        objects << { ref: ref, body: body }
+      end
+      # Process AcroForm to remove field references from /Fields array
+      af_ref = acroform_ref
+      if af_ref
+        # Find the AcroForm object in our objects list
+        af_obj = objects.find { |o| o[:ref] == af_ref }
+        if af_obj
+          af_body = af_obj[:body]
+          fields_array_ref = DictScan.value_token_after("/Fields", af_body)
+          if fields_array_ref && fields_array_ref =~ /\A(\d+)\s+(\d+)\s+R/
+            # /Fields points to separate array object
+            arr_ref = [Integer(::Regexp.last_match(1)), Integer(::Regexp.last_match(2))]
+            arr_obj = objects.find { |o| o[:ref] == arr_ref }
+            if arr_obj
+              arr_body = arr_obj[:body]
+              field_refs_to_remove.each do |field_ref|
+                arr_body = DictScan.remove_ref_from_array(arr_body, field_ref)
+              end
+              # Clean up empty array
+              arr_body = arr_body.strip.gsub(/\[\s+\]/, "[]")
+              arr_obj[:body] = arr_body
+            end
+          elsif af_body.include?("/Fields")
+            # Inline /Fields array
+            field_refs_to_remove.each do |field_ref|
+              af_body = DictScan.remove_ref_from_inline_array(af_body, "/Fields", field_ref)
+            end
+            af_obj[:body] = af_body
+          end
+        end
+      end
+      # Process page objects to remove widget references from /Annots arrays
+      # Also remove any orphaned widget references (widgets that reference non-existent fields)
+      objects_in_file = Set.new(objects.map { |o| o[:ref] })
+      field_refs_in_file = Set.new
+      objects.each do |obj|
+        body = obj[:body]
+        # Check if this is a field object
+        if body&.include?("/FT") && body.include?("/T")
+          field_refs_in_file.add(obj[:ref])
+        end
+        body = obj[:body]
+        next unless DictScan.is_page?(body)
+        # Handle inline /Annots array
+        if body =~ %r{/Annots\s*\[(.*?)\]}
+          annots_array_str = ::Regexp.last_match(1)
+          # Remove widgets that match removed fields
+          widget_refs_to_remove.each do |widget_ref|
+            annots_array_str = annots_array_str.gsub(/\b#{widget_ref[0]}\s+#{widget_ref[1]}\s+R\b/, "").strip
+            annots_array_str = annots_array_str.gsub(/\s+/, " ")
+          end
+          # Also remove orphaned widget references (widgets not in objects_in_file or pointing to non-existent fields)
+          annots_refs = annots_array_str.scan(/(\d+)\s+(\d+)\s+R/).map { |n, g| [Integer(n), Integer(g)] }
+          annots_refs.each do |annot_ref|
+            # Check if this annotation is a widget that should be removed
+            if objects_in_file.include?(annot_ref)
+              # Widget exists - check if it's an orphaned widget (references non-existent field)
+              widget_obj = objects.find { |o| o[:ref] == annot_ref }
+              if widget_obj && DictScan.is_widget?(widget_obj[:body])
+                widget_body = widget_obj[:body]
+                # Check if widget references a parent field that doesn't exist
+                if widget_body =~ %r{/Parent\s+(\d+)\s+(\d+)\s+R}
+                  parent_ref = [Integer(::Regexp.last_match(1)), Integer(::Regexp.last_match(2))]
+                  unless field_refs_in_file.include?(parent_ref)
+                    # Parent field doesn't exist - orphaned widget, remove it
+                    annots_array_str = annots_array_str.gsub(/\b#{annot_ref[0]}\s+#{annot_ref[1]}\s+R\b/, "").strip
+                    annots_array_str = annots_array_str.gsub(/\s+/, " ")
+                  end
+                end
+              end
+            else
+              # Widget object doesn't exist - remove it
+              annots_array_str = annots_array_str.gsub(/\b#{annot_ref[0]}\s+#{annot_ref[1]}\s+R\b/, "").strip
+              annots_array_str = annots_array_str.gsub(/\s+/, " ")
+            end
+          end
+          new_annots = if annots_array_str.empty? || annots_array_str.strip.empty?
+                         "[]"
+                       else
+                         "[#{annots_array_str}]"
+                       end
+          new_body = body.sub(%r{/Annots\s*\[.*?\]}, "/Annots #{new_annots}")
+          obj[:body] = new_body
+        # Handle indirect /Annots array reference
+        elsif body =~ %r{/Annots\s+(\d+)\s+(\d+)\s+R}
+          annots_array_ref = [Integer(::Regexp.last_match(1)), Integer(::Regexp.last_match(2))]
+          annots_obj = objects.find { |o| o[:ref] == annots_array_ref }
+          if annots_obj
+            annots_body = annots_obj[:body]
+            # Remove widgets that match removed fields
+            widget_refs_to_remove.each do |widget_ref|
+              annots_body = DictScan.remove_ref_from_array(annots_body, widget_ref)
+            end
+            # Also remove orphaned widget references
+            annots_refs = annots_body.scan(/(\d+)\s+(\d+)\s+R/).map { |n, g| [Integer(n), Integer(g)] }
+            annots_refs.each do |annot_ref|
+              if objects_in_file.include?(annot_ref)
+                widget_obj = objects.find { |o| o[:ref] == annot_ref }
+                if widget_obj && DictScan.is_widget?(widget_obj[:body])
+                  widget_body = widget_obj[:body]
+                  if widget_body =~ %r{/Parent\s+(\d+)\s+(\d+)\s+R}
+                    parent_ref = [Integer(::Regexp.last_match(1)), Integer(::Regexp.last_match(2))]
+                    unless field_refs_in_file.include?(parent_ref)
+                      annots_body = DictScan.remove_ref_from_array(annots_body, annot_ref)
+                    end
+                  end
+                end
+              else
+                annots_body = DictScan.remove_ref_from_array(annots_body, annot_ref)
+              end
+            end
+            annots_obj[:body] = annots_body
+          end
+        end
+      end
+      # Sort objects by object number
+      objects.sort_by! { |obj| obj[:ref][0] }
+      # Write the cleaned PDF
+      writer = PDFWriter.new
+      writer.write_header
+      objects.each do |obj|
+        writer.write_object(obj[:ref], obj[:body])
+      end
+      writer.write_xref
+      trailer_dict = @resolver.trailer_dict
+      info_ref = nil
+      if trailer_dict =~ %r{/Info\s+(\d+)\s+(\d+)\s+R}
+        info_ref = [::Regexp.last_match(1).to_i, ::Regexp.last_match(2).to_i]
+      end
+      # Write trailer
+      max_obj_num = objects.map { |obj| obj[:ref][0] }.max || 0
+      writer.write_trailer(max_obj_num + 1, root_ref, info_ref)
+      writer.output
+    end
+    # Clean up in-place (mutates current instance)
+    def clear!(...)
+      cleaned_content = clear(...)
+      @raw = cleaned_content
+      @resolver = AcroThat::ObjectResolver.new(cleaned_content)
+      @patches = []
+      self
+    end
     # Write out with an incremental update
-    def write(path_out = nil, flatten: false)
+    def write(path_out = nil, flatten: true)
       deduped_patches = @patches.reverse.uniq { |p| p[:ref] }.reverse
       writer = AcroThat::IncrementalWriter.new(@raw, deduped_patches)
       @raw = writer.render
@@ -290,14 +664,75 @@ module AcroThat
     private
-    def find_page_number_for_ref(page_ref)
+    def collect_pages_from_tree(pages_ref, page_objects)
+      pages_body = @resolver.object_body(pages_ref)
+      return unless pages_body
+      # Extract /Kids array from Pages object
+      if pages_body =~ %r{/Kids\s*\[(.*?)\]}m
+        kids_array = ::Regexp.last_match(1)
+        # Extract all object references from Kids array in order
+        kids_array.scan(/(\d+)\s+(\d+)\s+R/) do |num_str, gen_str|
+          kid_ref = [num_str.to_i, gen_str.to_i]
+          kid_body = @resolver.object_body(kid_ref)
+          # Check if this kid is a page (not /Type/Pages)
+          if kid_body && DictScan.is_page?(kid_body)
+            page_objects << kid_ref unless page_objects.include?(kid_ref)
+          elsif kid_body && kid_body.include?("/Type /Pages")
+            # Recursively find pages in this Pages node
+            collect_pages_from_tree(kid_ref, page_objects)
+          end
+        end
+      end
+    end
+    # Find all page objects in document order
+    # Returns an array of page references [obj_num, gen_num]
+    def find_all_pages
       page_objects = []
-      @resolver.each_object do |ref, body|
-        next unless body&.include?("/Type /Page")
-        page_objects << ref
+      # First, try to get pages in document order via page tree
+      root_ref = @resolver.root_ref
+      if root_ref
+        catalog_body = @resolver.object_body(root_ref)
+        if catalog_body && catalog_body =~ %r{/Pages\s+(\d+)\s+(\d+)\s+R}
+          pages_ref = [Integer(::Regexp.last_match(1)), Integer(::Regexp.last_match(2))]
+          collect_pages_from_tree(pages_ref, page_objects)
+        end
       end
+      # Fallback: collect all page objects if page tree didn't work
+      if page_objects.empty?
+        @resolver.each_object do |ref, body|
+          next unless body
+          next unless DictScan.is_page?(body)
+          page_objects << ref unless page_objects.include?(ref)
+        end
+        # Sort by object number as fallback
+        page_objects.sort_by! { |ref| ref[0] }
+      end
+      page_objects
+    end
+    # Find a page by its page number (1-indexed)
+    # Returns [obj_num, gen_num] or nil if not found
+    def find_page_by_number(page_num)
+      page_objects = find_all_pages
+      return nil if page_objects.empty?
+      return page_objects[page_num - 1] if page_num.positive? && page_num <= page_objects.length
+      page_objects[0] # Default to first page if page_num is out of range
+    end
+    def find_page_number_for_ref(page_ref)
+      page_objects = find_all_pages
       return nil if page_objects.empty?
       page_index = page_objects.index(page_ref)

data/lib/acro_that/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module AcroThat
-  VERSION = "0.1.1"
+  VERSION = "0.1.3"
 end

data/lib/acro_that.rb CHANGED Viewed

@@ -4,6 +4,7 @@ require "strscan"
 require "stringio"
 require "zlib"
 require "base64"
+require "set"
 require_relative "acro_that/dict_scan"
 require_relative "acro_that/object_resolver"