acro_that 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,331 @@
1
+ # frozen_string_literal: true
2
+
3
module AcroThat
  # A PDF held in memory as raw bytes, with helpers to list, add, update and
  # remove form fields and to write the result back out.
  #
  # Object access goes through an ObjectResolver built from the raw bytes;
  # edits are accumulated in @patches until #write renders them.
  class Document
    attr_reader :path

    # Matches the /Type entry of a single page object. The negative lookahead
    # keeps the page-tree node type "/Pages" (or any longer name that merely
    # starts with "Page") from being counted as a page.
    PAGE_TYPE_PATTERN = %r{/Type\s*/Page(?![A-Za-z])}.freeze
    # Matches one (optionally signed, optionally fractional) number in a /Rect token.
    RECT_NUMBER_PATTERN = /[-+]?\d*\.?\d+/.freeze
    private_constant :PAGE_TYPE_PATTERN, :RECT_NUMBER_PATTERN

    # Flatten a PDF to remove incremental updates.
    #
    # @param input_path [String, #read] anything accepted by Document.new
    # @param output_path [String, nil] where to write the flattened bytes
    # @return [String, Document] output_path when one was given, otherwise a
    #   new Document wrapping the flattened bytes
    def self.flatten_pdf(input_path, output_path = nil)
      output = new(input_path).flatten
      return new(StringIO.new(output)) unless output_path

      File.binwrite(output_path, output)
      output_path
    end

    # @param path_or_io [String, #read] a file path, or an IO-like object whose
    #   whole content is read immediately
    def initialize(path_or_io)
      if path_or_io.is_a?(String)
        @path = path_or_io
        @raw = File.binread(path_or_io)
      else
        @path = nil
        # Not every readable object offers #binmode; only switch when it does.
        path_or_io.binmode if path_or_io.respond_to?(:binmode)
        @raw = path_or_io.read
      end
      @resolver = AcroThat::ObjectResolver.new(@raw)
      @patches = []
    end

    # Flatten this document to remove incremental updates.
    #
    # Every object the resolver yields with a body is re-emitted through a
    # PDFWriter in ascending object-number order, followed by a fresh xref and
    # trailer. The receiver is not modified.
    #
    # @return whatever PDFWriter#output returns (the rebuilt document)
    # @raise [RuntimeError] when the resolver reports no /Root reference
    def flatten
      root_ref = @resolver.root_ref
      raise "Cannot flatten: no /Root found" unless root_ref

      objects = []
      @resolver.each_object do |ref, body|
        objects << { ref: ref, body: body } if body
      end
      objects.sort_by! { |obj| obj[:ref][0] }

      writer = PDFWriter.new
      writer.write_header
      objects.each { |obj| writer.write_object(obj[:ref], obj[:body]) }
      writer.write_xref

      # Carry the /Info reference over from the existing trailer when present.
      info_ref = indirect_ref_after("/Info", @resolver.trailer_dict)
      max_obj_num = objects.map { |obj| obj[:ref][0] }.max || 0
      writer.write_trailer(max_obj_num + 1, root_ref, info_ref)

      writer.output
    end

    # Flatten this document in-place (mutates current instance).
    # Pending patches are discarded and the resolver is rebuilt.
    #
    # @return [Document] self
    def flatten!
      flattened_content = flatten
      @raw = flattened_content
      @resolver = AcroThat::ObjectResolver.new(flattened_content)
      @patches = []

      self
    end

    # Return an array of Field(name, value, type, ref), one per distinct name.
    #
    # Pass one records the geometry of every widget annotation, keyed by the
    # widget's /Parent reference and by its own /T name. Pass two builds a
    # Field for every object that looks like a field and attaches a position
    # taken from the object itself (when it is a widget), from a widget that
    # names it as /Parent, or from a widget sharing its name. When no field is
    # found that way, dictionaries are scanned straight from the raw bytes and
    # reported with the placeholder ref [-1, 0]. When several fields share a
    # name, the one with the lowest object number is kept.
    #
    # @return [Array<Field>]
    def list_fields
      # The page index costs a full object walk, so build it at most once per call.
      page_numbers = nil
      page_lookup = ->(page_ref) { (page_numbers ||= page_number_index)[page_ref] }

      field_widgets = {}
      widgets_by_name = {}

      # First pass: collect widget information
      @resolver.each_object do |_ref, body|
        next unless body && DictScan.is_widget?(body)

        widget_info = widget_position(body, page_lookup)
        next unless widget_info

        parent_ref = indirect_ref_after("/Parent", body)
        (field_widgets[parent_ref] ||= []) << widget_info if parent_ref

        widget_name = field_name_in(body)
        (widgets_by_name[widget_name] ||= []) << widget_info if widget_name
      end

      # Second pass: collect all fields (both field objects and widget annotations with /T)
      fields = []
      @resolver.each_object do |ref, body|
        next unless body&.include?("/T")

        widget = DictScan.is_widget?(body)
        next unless widget || field_hint?(body)

        name = field_name_in(body)
        next unless name # Skip fields with empty names (deleted fields)

        position = if widget
                     widget_position(body, page_lookup) || {}
                   else
                     (field_widgets[ref] || widgets_by_name[name])&.first || {}
                   end

        fields << Field.new(name, field_value_in(body), field_type_in(body), ref, self, position)
      end

      if fields.empty?
        # Fallback: scan dictionaries in the raw bytes (stream bodies removed).
        stripped = DictScan.strip_stream_bodies(@raw)
        DictScan.each_dictionary(stripped) do |dict_src|
          next unless dict_src.include?("/T")
          next unless DictScan.is_widget?(dict_src) || field_hint?(dict_src)

          name = field_name_in(dict_src)
          next unless name # Skip fields with empty names (deleted fields)

          fields << Field.new(name, field_value_in(dict_src), field_type_in(dict_src), [-1, 0], self)
        end
      end

      fields.group_by(&:name).values.map { |same_name| same_name.min_by { |f| f.ref[0] } }
    end

    # Add a new field to the AcroForm /Fields array.
    #
    # @param name [String] field name
    # @param options [Hash] forwarded to Actions::AddField; :x, :y, :width,
    #   :height and :page also populate the returned Field's position
    #   (defaults 100, 500, 100, 20 and 1)
    # @return [Field, nil] the new field, or nil when the action reports failure
    def add_field(name, options = {})
      action = Actions::AddField.new(self, name, options)
      return nil unless action.call

      position = {
        x: options[:x] || 100,
        y: options[:y] || 500,
        width: options[:width] || 100,
        height: options[:height] || 20,
        page: options[:page] || 1
      }

      Field.new(name, action.field_value, action.field_type, [action.field_obj_num, 0], self, position)
    end

    # Update field by name, setting /V and optionally /AS on widgets.
    #
    # Looks the field up in #list_fields first, then among patches that have
    # been queued but not written yet.
    #
    # @return false when no such field exists, otherwise the result of Field#update
    def update_field(name, new_value, new_name: nil)
      field = list_fields.find { |f| f.name == name } || pending_field(name)
      return false unless field

      field.update(new_value, new_name: new_name)
    end

    # Remove field by name from the AcroForm /Fields array.
    #
    # @param fld [Field, String] the field itself or its name
    # @return false when no such field exists, otherwise the result of Field#remove
    def remove_field(fld)
      field = fld.is_a?(Field) ? fld : list_fields.find { |f| f.name == fld }
      return false unless field

      field.remove
    end

    # Write out with an incremental update.
    #
    # When several queued patches target the same ref only the last one is
    # applied. The in-memory bytes and resolver are replaced by the rendered
    # result and the patch queue is emptied.
    #
    # @param path_out [String, nil] destination file; nil returns the bytes instead
    # @param flatten [Boolean] also run #flatten! after applying the patches
    # @return true when written to path_out, otherwise the resulting content
    def write(path_out = nil, flatten: false)
      deduped_patches = @patches.reverse.uniq { |patch| patch[:ref] }.reverse
      @raw = AcroThat::IncrementalWriter.new(@raw, deduped_patches).render
      @patches = []
      @resolver = AcroThat::ObjectResolver.new(@raw)

      flatten! if flatten

      return @raw unless path_out

      File.binwrite(path_out, @raw)
      true
    end

    private

    # 1-based position of page_ref among the page objects, in the order the
    # resolver yields them; nil when page_ref is not a page object.
    def find_page_number_for_ref(page_ref)
      page_number_index[page_ref]
    end

    # Map of page object ref => 1-based page number, numbered in resolver order.
    def page_number_index
      index = {}
      @resolver.each_object do |ref, body|
        next unless body&.match?(PAGE_TYPE_PATTERN)

        index[ref] ||= index.size + 1
      end
      index
    end

    # One more than the highest object number known to the resolver or to any
    # queued patch.
    def next_fresh_object_number
      highest = 0
      @resolver.each_object { |ref, _| highest = ref[0] if ref[0] > highest }
      @patches.each { |patch| highest = patch[:ref][0] if patch[:ref][0] > highest }
      highest + 1
    end

    # Reference of the /AcroForm dictionary named by the catalog, or nil when
    # there is no /Root or the catalog holds no indirect /AcroForm reference.
    def acroform_ref
      root_ref = @resolver.root_ref
      return nil unless root_ref

      indirect_ref_after("/AcroForm", @resolver.object_body(root_ref))
    end

    # Parse "<key> <num> <gen> R" out of source.
    # Uses #to_i rather than Integer() so zero-padded numbers are read as
    # decimal instead of octal (and "08" does not raise).
    #
    # @return [Array(Integer, Integer), nil] [num, gen], or nil when absent
    def indirect_ref_after(key, source)
      match = %r{#{Regexp.escape(key)}\s+(\d+)\s+(\d+)\s+R}.match(source)
      match && [match[1].to_i, match[2].to_i]
    end

    # Geometry of a widget annotation, read from its /Rect ([x y x2 y2]) and
    # its /P page reference. Returns nil when /Rect is missing or does not
    # hold exactly four numbers.
    def widget_position(body, page_lookup)
      rect_tok = DictScan.value_token_after("/Rect", body)
      return nil unless rect_tok&.start_with?("[")

      rect_values = rect_tok.scan(RECT_NUMBER_PATTERN).map(&:to_f)
      return nil unless rect_values.length == 4

      x, y, x2, y2 = rect_values
      page_ref = indirect_ref_after("/P", body)

      { x: x, y: y, width: x2 - x, height: y2 - y, page: page_ref && page_lookup.call(page_ref) }
    end

    # True when source carries one of the keys that mark it as part of the
    # field tree (/FT, /Kids or /Parent).
    def field_hint?(source)
      source.include?("/FT") || source.include?("/Kids") || source.include?("/Parent")
    end

    # Decoded /T name of a dictionary, or nil when it is absent or empty.
    def field_name_in(source)
      return nil unless source.include?("/T")

      t_tok = DictScan.value_token_after("/T", source)
      return nil unless t_tok

      name = DictScan.decode_pdf_string(t_tok)
      name unless name.nil? || name.empty?
    end

    # Decoded /V value of a dictionary; nil when absent or when the value is
    # itself a dictionary ("<<").
    def field_value_in(source)
      return nil unless source.include?("/V")

      v_tok = DictScan.value_token_after("/V", source)
      v_tok && v_tok != "<<" ? DictScan.decode_pdf_string(v_tok) : nil
    end

    # Raw /FT token of a dictionary, or nil when there is none.
    def field_type_in(source)
      source.include?("/FT") ? DictScan.value_token_after("/FT", source) : nil
    end

    # Temporary Field for a field that was added but not written yet, built
    # from the first queued patch whose /T equals name. Returns nil when no
    # such patch exists or the patch has no /FT.
    def pending_field(name)
      field_patch = @patches.find do |patch|
        body = patch[:body]
        next false unless body&.include?("/T")

        t_tok = DictScan.value_token_after("/T", body)
        t_tok && DictScan.decode_pdf_string(t_tok) == name
      end
      return nil unless field_patch && field_patch[:body].include?("/FT")

      ft_tok = DictScan.value_token_after("/FT", field_patch[:body])
      ft_tok && Field.new(name, nil, ft_tok, field_patch[:ref], self, {})
    end
  end
end
@@ -0,0 +1,143 @@
1
+ # frozen_string_literal: true
2
+
3
module AcroThat
  # Represents a PDF form field: its name, value, type token, object
  # reference and (when known) the geometry of its widget.
  class Field
    attr_accessor :value
    attr_reader :name, :type, :ref, :x, :y, :width, :height, :page

    # Symbol shorthand => field type token.
    TYPES = {
      text: "/Tx",
      button: "/Btn",
      choice: "/Ch",
      signature: "/Sig"
    }.freeze

    # Reverse lookup: map type strings to symbol keys
    TYPE_KEYS = TYPES.invert.freeze

    # @param name [String] field name
    # @param value [Object, nil] current value
    # @param type [Symbol, String, nil] a TYPES key or a type token; unknown
    #   symbols, nil and blank strings all fall back to "/Tx"
    # @param ref [Array(Integer, Integer)] [object number, generation];
    #   [-1, 0] marks a field whose object could not be identified
    # @param document [Object, nil] owning document, handed to the update and
    #   remove actions
    # @param position [Hash, nil] optional :x, :y, :width, :height and :page
    def initialize(name, value, type, ref, document = nil, position = {})
      # Tolerate an explicit nil so callers need not special-case "no position".
      position ||= {}

      @name = name
      @value = value
      @type = normalize_type(type)
      @ref = ref
      @document = document
      @x = position[:x]
      @y = position[:y]
      @width = position[:width]
      @height = position[:height]
      @page = position[:page]
    end

    # Check if this is a text field
    def text_field?
      type == "/Tx"
    end

    # Check if this is a button field (checkbox/radio)
    def button_field?
      type == "/Btn"
    end

    # Check if this is a choice field (dropdown/list)
    def choice_field?
      type == "/Ch"
    end

    # Check if this is a signature field
    def signature_field?
      type == "/Sig"
    end

    # Check if the field has a value (neither nil nor an empty string form)
    def has_value?
      !value.nil? && !value.to_s.empty?
    end

    # Get the object number (first element of ref)
    def object_number
      ref[0]
    end

    # Get the generation number (second element of ref)
    def generation
      ref[1]
    end

    # Check if field reference is valid (not [-1, 0] placeholder)
    def valid_ref?
      ref != [-1, 0]
    end

    # Equality comparison: same name, value, type and ref.
    # Position and owning document are not compared.
    def ==(other)
      return false unless other.is_a?(Field)

      name == other.name &&
        value == other.value &&
        type == other.type &&
        ref == other.ref
    end

    # Keep eql?/hash in step with ==, so equal fields also collapse in
    # Array#uniq and behave as the same Hash key or Set member.
    alias eql? ==

    # Hash over exactly the attributes that == compares.
    def hash
      [name, value, type, ref].hash
    end

    # String representation for debugging
    def to_s
      type_str = type.inspect
      type_str += " (:#{type_key})" if type_key
      pos_str = has_position? ? " x=#{x} y=#{y} w=#{width} h=#{height}" : " position=(unknown)"
      page_str = page ? " page=#{page}" : ""
      "#<AcroThat::Field name=#{name.inspect} type=#{type_str} value=#{value.inspect} " \
        "ref=#{ref.inspect}#{pos_str}#{page_str}>"
    end

    alias inspect to_s

    # Check if position is known (x, y, width and height are all present)
    def has_position?
      !x.nil? && !y.nil? && !width.nil? && !height.nil?
    end

    # Get the symbol key for the field type (e.g., :text for "/Tx")
    # Returns nil if the type is not in the TYPES mapping
    def type_key
      TYPE_KEYS[type]
    end

    # Update this field's value and optionally rename it in the document.
    # Returns false without doing anything when the field has no document or
    # only a placeholder ref; otherwise returns the result of
    # Actions::UpdateField#call. The local value and name are updated only
    # when that result is truthy.
    def update(new_value, new_name: nil)
      return false unless @document
      return false unless valid_ref?

      result = Actions::UpdateField.new(@document, @name, new_value, new_name: new_name).call

      if result
        @value = new_value
        @name = new_name if new_name && !new_name.empty?
      end

      result
    end

    # Remove this field from the AcroForm /Fields array and mark the field object as deleted.
    # Note: This does not purge page /Annots widgets (non-trivial); most viewers will hide the field
    # once it is no longer in the field tree.
    # Returns false when the field has no document or only a placeholder ref;
    # otherwise the result of Actions::RemoveField#call.
    def remove
      return false unless @document
      return false unless valid_ref?

      Actions::RemoveField.new(@document, self).call
    end

    private

    # Normalize type: accept symbol keys or type strings, default to "/Tx"
    def normalize_type(type)
      normalized = type.is_a?(Symbol) ? TYPES[type] || "/Tx" : type.to_s.strip
      normalized.empty? ? "/Tx" : normalized
    end
  end
end