acroforge 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +11 -0
- data/LICENSE.txt +21 -0
- data/README.md +217 -0
- data/Rakefile +10 -0
- data/acroforge.gemspec +37 -0
- data/exe/acroforge +5 -0
- data/lib/acroforge/all_text_processor.rb +126 -0
- data/lib/acroforge/annotator.rb +137 -0
- data/lib/acroforge/cli.rb +351 -0
- data/lib/acroforge/constants.rb +46 -0
- data/lib/acroforge/engine.rb +869 -0
- data/lib/acroforge/labels.rb +112 -0
- data/lib/acroforge/preparer.rb +103 -0
- data/lib/acroforge/relabeler.rb +179 -0
- data/lib/acroforge/schema.rb +208 -0
- data/lib/acroforge/validator.rb +37 -0
- data/lib/acroforge/version.rb +5 -0
- data/lib/acroforge.rb +18 -0
- data/sig/acroforge.rbs +4 -0
- metadata +81 -0
|
@@ -0,0 +1,869 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "date"
require "fileutils"
require "json"
require "uri"

require "hexapdf"
|
|
7
|
+
|
|
8
|
+
require_relative "all_text_processor"
|
|
9
|
+
require_relative "validator"
|
|
10
|
+
require_relative "constants"
|
|
11
|
+
require_relative "labels"
|
|
12
|
+
|
|
13
|
+
module AcroForge
|
|
14
|
+
class Engine
|
|
15
|
+
attr_reader :template_path, :schema, :overrides, :sections, :normalized_path,
|
|
16
|
+
:mapped_fields, :unmapped_fields, :filled_fields, :missing_fields,
|
|
17
|
+
:select_field_options, :new_fields_detected
|
|
18
|
+
|
|
19
|
+
# Builds an engine for one PDF template.
#
# template_path  - path of the source PDF.
# schema         - canonical-key => entry hash, consulted when labels are resolved.
# overrides      - explicit mappings, looked up by original PDF field name or label.
# sections       - section heading strings used to prefix generated keys.
# normalized_dir - directory for the normalized PDF; defaults to the directory
#                  of the template.
#
# The output directory is created here (including missing parents), so a
# nested normalized_dir such as "tmp/forms/out" works on a fresh checkout.
def initialize(template_path, schema: {}, overrides: {}, sections: [], normalized_dir: nil)
  @template_path = template_path
  @schema = schema
  @overrides = overrides
  @sections = sections

  dir = normalized_dir || File.dirname(template_path)
  base = File.basename(template_path, ".*")
  # Avoid double suffixes like "_normalized_normalized.pdf" when the
  # template already contains the normalized marker.
  normalized_base = base.sub(/_normalized\z/, "")
  # mkdir_p creates intermediate directories and is a no-op when the
  # directory already exists, so no separate existence check is needed.
  FileUtils.mkdir_p(dir)
  @normalized_path = File.join(dir, "#{normalized_base}_normalized.pdf")

  @mapped_fields = {}
  @unmapped_fields = []
  @filled_fields = {}
  @missing_fields = []
  @select_field_options = {}
  @new_fields_detected = []
  # nil (rather than []) marks "compile! has not run yet".
  @field_proposals = nil
end
|
|
41
|
+
|
|
42
|
+
# Lazily opens the template PDF and caches the document for later calls.
def source_doc
  return @source_doc if @source_doc

  @source_doc = HexaPDF::Document.open(@template_path)
end
|
|
45
|
+
|
|
46
|
+
# Returns the template's AcroForm without creating one, caching a truthy
# result. A template without a form yields nil and is looked up again on the
# next call.
def source_form
  @source_form || (@source_form = source_doc.acro_form(create: false))
end
|
|
49
|
+
|
|
50
|
+
# Lists every AcroForm field of the template as a hash with :name (full field
# name), :type (:text, :button, :choice or :other) and :alternate_name (the
# field's /TU entry). Returns [] when the template has no form.
def raw_fields
  form = source_form
  return [] unless form

  [].tap do |collected|
    form.each_field do |field|
      next unless field.is_a?(HexaPDF::Type::AcroForm::Field)

      kind =
        case field
        when HexaPDF::Type::AcroForm::TextField then :text
        when HexaPDF::Type::AcroForm::ButtonField then :button
        when HexaPDF::Type::AcroForm::ChoiceField then :choice
        else :other
        end

      collected << {name: field.full_field_name, type: kind, alternate_name: field[:TU]}
    end
  end
end
|
|
64
|
+
|
|
65
|
+
# Full field names of every raw field, in the order raw_fields reports them.
def raw_field_names
  raw_fields.collect { |entry| entry[:name] }
end
|
|
68
|
+
|
|
69
|
+
# True when the template exposes at least one raw form field.
def any_raw_fields?
  !raw_fields.empty?
end
|
|
72
|
+
|
|
73
|
+
# True when no field is currently recorded as unmapped.
def fully_mapped?
  @unmapped_fields.size.zero?
end
|
|
76
|
+
|
|
77
|
+
# Number of entries in the original-name => target-key mapping.
def mapped_count
  @mapped_fields.length
end
|
|
80
|
+
|
|
81
|
+
# Distinct target keys that fields have been mapped to, in first-seen order.
def mapped_field_names
  targets = @mapped_fields.map { |_original, target| target }
  targets.uniq
end
|
|
84
|
+
|
|
85
|
+
# Proposals recorded for each processed field. Raises a RuntimeError while the
# list is still nil, i.e. before compile! has populated it.
def field_proposals
  return @field_proposals unless @field_proposals.nil?

  raise "field_proposals available only after compile!"
end
|
|
89
|
+
|
|
90
|
+
# Builds a lookup from synthetic field names to AcroForm field objects.
#
# Fields sharing one full field name are told apart by position: the first
# keeps the bare name ("date"), later ones get an occurrence suffix
# ("date#1", "date#2", ...). Objects that are not AcroForm fields and fields
# without a name are skipped. Callers (notably Relabeler.apply!) use this to
# resolve mapping keys back to the right field. Returns {} for a nil form.
def self.field_index(form)
  return {} unless form

  seen = Hash.new(0)
  lookup = {}
  form.each_field do |field|
    next unless field.is_a?(HexaPDF::Type::AcroForm::Field)

    name = field.full_field_name
    next unless name

    ordinal = seen[name]
    seen[name] = ordinal + 1
    synthetic = ordinal.zero? ? name : "#{name}##{ordinal}"
    lookup[synthetic] = field
  end
  lookup
end
|
|
110
|
+
|
|
111
|
+
# ------------------------------------------
|
|
112
|
+
# PHASE 1: THE HIERARCHICAL COMPILER
|
|
113
|
+
# ------------------------------------------
|
|
114
|
+
# Walks every AcroForm field of the template, derives a semantic key for it
# (explicit override first, otherwise the nearest text label), renames the
# field's /T to that key and writes the result to @normalized_path.
#
# Side effects: resets and fills @mapped_fields, @unmapped_fields,
# @select_field_options, @new_fields_detected and @field_proposals; prints
# progress with puts; writes the normalized PDF.
#
# Returns a hash with :mapped, :unmapped, :select_options and
# :new_fields_detected.
def compile!
  puts ">> Compiling template: #{@template_path}"
  form = source_doc.acro_form(create: true)

  @mapped_fields = {}
  @unmapped_fields = []
  @select_field_options = {}
  @new_fields_detected = []
  @field_proposals = []

  page_text_map = {}
  source_doc.pages.each_with_index do |page, index|
    processor = AllTextProcessor.new
    page.process_contents(processor)
    page_text_map[index] = processor.text_chunks
  end

  section_map = build_section_map(page_text_map)

  # Track occurrences of each field name so duplicates (e.g., three separate
  # fields all named "date") get a synthetic suffix: "date", "date#1",
  # "date#2". The bare name is preserved for the first occurrence so existing
  # mappings stay backwards compatible.
  field_name_counts = Hash.new(0)

  form.each_field do |field|
    next unless field.is_a?(HexaPDF::Type::AcroForm::Field)

    widget = field.each_widget.first
    next unless widget && widget[:Rect]

    page_index = widget_page_index(widget)
    next unless page_index

    base_field_name = field.full_field_name
    # A nameless field cannot be referenced by overrides or mappings, and
    # field_index skips such fields as well.
    next unless base_field_name

    occurrence = field_name_counts[base_field_name]
    field_name_counts[base_field_name] += 1
    original_field_name = (occurrence == 0) ? base_field_name : "#{base_field_name}##{occurrence}"

    is_btn = field.is_a?(HexaPDF::Type::AcroForm::ButtonField) || field.is_a?(HexaPDF::Type::AcroForm::ChoiceField)
    is_radio_group = is_btn && field.each_widget.count > 1

    page_chunks = page_text_map[page_index]
    options_map = nil
    # Stays nil for single-widget button/choice fields: no label lookup is
    # performed for them, so they are only mapped through an override.
    raw_label = nil
    # Widget whose position represents the field; the first widget unless the
    # field is a multi-widget group.
    anchor_widget = widget

    if is_radio_group
      # Sort by highest Y, then leftmost X, to find the top-left box of
      # multi-line groups. Widgets without a /Rect cannot be positioned and
      # are ignored; the first widget is known to have one.
      positioned = field.each_widget.select { |w| w[:Rect] }
      anchor_widget = positioned.min_by { |w| [-w[:Rect][1], w[:Rect][0]] } || widget

      raw_label = find_nearest_text(page_chunks, anchor_widget[:Rect], mode: :group_label)
      raw_label = AcroForge::Labels.humanize(raw_label)

      if raw_label
        if raw_label.include?(":")
          # "Title: Mr Mrs" -> "Title"; nil-safe for colon-only text.
          raw_label = raw_label.split(":").first&.strip
        elsif raw_label.downcase.include?("title")
          raw_label = "Title"
        end
      end

      options_map = radio_group_options(field, page_chunks)
    elsif field.is_a?(HexaPDF::Type::AcroForm::ButtonField)
      options_map = checkbox_options(field)
    elsif field.is_a?(HexaPDF::Type::AcroForm::ChoiceField)
      options_map = choice_field_options(field)
    else
      raw_label = find_nearest_text(page_chunks, widget[:Rect], mode: :standard)
      raw_label = AcroForge::Labels.humanize(raw_label)
    end

    y_center = (anchor_widget[:Rect][1] + anchor_widget[:Rect][3]) / 2.0
    active_section = get_active_section(section_map, page_index, y_center)

    # Reads raw_label and options_map at call time, so later corrections to
    # raw_label are reflected in the recorded proposal.
    build_proposal = lambda do |canonical_key, confidence|
      {
        pdf_field_name: original_field_name,
        pdf_field_type: acro_field_type(field),
        canonical_key: canonical_key,
        raw_label: raw_label,
        confidence: confidence,
        section: active_section,
        page: page_index,
        y: y_center,
        x: (widget[:Rect][0] + widget[:Rect][2]) / 2.0,
        options: options_map
      }
    end

    target_key = nil

    # Overrides may be keyed by the original PDF field name, as a string or a
    # symbol (e.g. "page0_field6"). When one exists, the field is mapped to
    # the semantic :key it declares (e.g. :full_name), falling back to the
    # override's own key when :key is absent.
    name_str = original_field_name.to_s
    override_key_used = @overrides.key?(name_str) ? name_str : original_field_name.to_sym
    override_entry = @overrides[name_str] || @overrides[original_field_name.to_sym]

    if override_entry
      semantic_name = override_entry[:key] || override_key_used
      mapped_semantic = semantic_name.to_sym
      target_key = (is_btn && !mapped_semantic.to_s.end_with?("_btn")) ? :"#{mapped_semantic}_btn" : mapped_semantic
      target_key = unique_target_key(target_key)

      puts " [Override] '#{original_field_name}' -> :#{target_key} (Override)"
    elsif raw_label
      base_key = sanitize_key(raw_label)
      unless base_key
        @unmapped_fields << original_field_name
        @field_proposals << build_proposal.call(nil, :none)
        puts " [Failed] Could not derive a valid key for field: #{original_field_name}"
        next
      end

      if is_btn
        base_key, override_label = normalize_button_base_key(base_key, options_map)
        # The spatial nearby-text guess can be wrong (e.g., a Title radio
        # group sitting close to a "First Name" input). If the options
        # identify the field, trust them and replace the misleading label.
        raw_label = override_label if override_label
      end

      canonical_schema_key = canonical_schema_key_for(base_key, raw_label)
      if canonical_schema_key
        base_key = canonical_schema_key
      elsif !likely_noisy_key?(base_key)
        @new_fields_detected << base_key.to_s unless @new_fields_detected.include?(base_key.to_s)
      end

      target_key = active_section ? :"#{active_section}_#{base_key}" : base_key
      target_key = @overrides[raw_label].to_sym if @overrides[raw_label]
      target_key = :"#{target_key}_btn" if is_btn && !target_key.to_s.end_with?("_btn")
      target_key = unique_target_key(target_key)
    end

    if target_key
      field[:T] = target_key.to_s
      @mapped_fields[original_field_name] = target_key
      @field_proposals << build_proposal.call(target_key, confidence_for(raw_label, target_key))

      has_options = is_btn && options_map && options_map.any?
      if has_options
        @select_field_options[target_key.to_s] = options_map
        # Reuse TU to persist the mapping in the normalized template.
        field[:TU] = options_map.to_json
      end

      prefix_notice = active_section ? "[#{active_section.upcase}] " : ""
      puts " [Auto-Mapped] #{prefix_notice}'#{raw_label || original_field_name}' -> :#{target_key}"
      puts " └─ Valid Options Hash: #{options_map.keys.inspect}" if has_options
    else
      @unmapped_fields << original_field_name
      @field_proposals << build_proposal.call(nil, :none)
      puts " [Failed] Could not find a text label for field: #{original_field_name}"
    end
  end

  source_doc.write(@normalized_path, optimize: true)
  puts ">> Compilation Complete. #{mapped_count} fields mapped."
  puts ">> Clean template saved to: #{@normalized_path}\n\n"

  {
    mapped: @mapped_fields,
    unmapped: @unmapped_fields,
    select_options: @select_field_options,
    new_fields_detected: @new_fields_detected
  }
end

# Index of the first page whose /Annots contains the widget, or nil.
private def widget_page_index(widget)
  source_doc.pages.each_with_index do |page, idx|
    return idx if page[:Annots]&.include?(widget)
  end
  nil
end

# Coarse field kind recorded in proposals: :text, :button, :choice or :other.
private def acro_field_type(field)
  case field
  when HexaPDF::Type::AcroForm::TextField then :text
  when HexaPDF::Type::AcroForm::ButtonField then :button
  when HexaPDF::Type::AcroForm::ChoiceField then :choice
  else :other
  end
end

# Returns target_key, or target_key with the first free "_N" suffix when the
# key is already used as a value in @mapped_fields.
private def unique_target_key(target_key)
  candidate = target_key
  counter = 1
  while @mapped_fields.value?(candidate)
    candidate = :"#{target_key}_#{counter}"
    counter += 1
  end
  candidate
end

# Maps a payload-friendly key to the export value of each positioned widget of
# a multi-widget button/choice field.
private def radio_group_options(field, text_chunks)
  options = {}
  field.each_widget do |w|
    next unless w[:Rect]

    opt_text = find_nearest_text(text_chunks, w[:Rect], mode: :button_option)
    # "Gender: Male" -> "Male"; nil-safe for colon-only text.
    opt_text = opt_text.split(":").last&.strip if opt_text&.include?(":")

    export_val = w[:AP]&.[](:N)&.value&.keys&.find { |k| k != :Off && k != :Off.to_s }
    next unless export_val

    ev_str = export_val.to_s.downcase
    # Export names such as "Yes", "Choice1" or "2" say nothing about the
    # option, so the nearby text is preferred for those.
    is_generic = %w[yes on off choice button group].any? { |g| ev_str.include?(g) } || ev_str.match?(/\A[0-9]+\z/)
    sanitized_export = is_generic ? nil : sanitize_key(export_val)

    final_key = if sanitized_export
      sanitized_export.to_s
    else
      sanitized_opt = opt_text ? sanitize_key(opt_text)&.to_s : nil
      (sanitized_opt.nil? || sanitized_opt.empty?) ? ev_str : sanitized_opt
    end

    options[final_key] = export_val.to_s
  end
  options
end

# Predictable on/off aliases for a single-widget button (usually a checkbox),
# so payload values can resolve to the exact export state.
private def checkbox_options(field)
  options = {}
  on_state = button_on_states(field).first
  if on_state
    on_export = on_state.to_s
    on_keys = %w[yes true on 1 checked]
    sanitized_on = sanitize_key(on_export)&.to_s
    on_keys << sanitized_on if sanitized_on && !sanitized_on.empty?
    on_keys.uniq.each { |k| options[k] = on_export }
  end

  %w[no false off 0 unchecked].each { |k| options[k] = "Off" }
  options
end

# Maps sanitized export/display values of a choice field's /Opt entries to the
# export value.
private def choice_field_options(field)
  options = {}
  entries = plain_pdf_array(field[:Opt])
  return options unless entries

  entries.each do |opt|
    pair = plain_pdf_array(opt)
    export_val = pair ? pair[0].to_s : opt.to_s
    candidates = pair ? [export_val, pair[1].to_s] : [export_val]
    candidates.each do |candidate|
      normalized = sanitize_key(candidate)&.to_s
      options[normalized] = export_val if normalized && !normalized.empty?
    end
  end
  options
end

# Returns value as a plain Array when it is an Array or a HexaPDF::PDFArray
# wrapper, otherwise nil.
# NOTE(review): assumes HexaPDF returns array-valued entries such as /Opt as
# HexaPDF::PDFArray (not an Array subclass) - confirm against the HexaPDF
# version in use.
private def plain_pdf_array(value)
  return value if value.is_a?(Array)

  value.to_a if defined?(HexaPDF::PDFArray) && value.is_a?(HexaPDF::PDFArray)
end
|
|
399
|
+
|
|
400
|
+
# ------------------------------------------
|
|
401
|
+
# PHASE 2: THE CRASH-PROOF INJECTOR
|
|
402
|
+
# ------------------------------------------
|
|
403
|
+
# Fills the normalized template with payload values and writes output_path.
#
# payload        - key => value pairs; keys are matched against field /T names
#                  via to_s. nil values are skipped.
# output_path    - where the filled PDF is written.
# image_overlays - key => {page:, coords: [x, y, w, h]}; for these keys the
#                  payload value is treated as an image file path and stamped
#                  onto the page over a white rectangle instead of being
#                  written into a form field.
#
# Raises RuntimeError when the normalized template does not exist, and
# whatever validate_payload! raises for invalid values. HexaPDF::Error raised
# while setting one field is reported with a warning and does not abort the
# run.
#
# Returns {filled: ..., missing: ...}.
def fill!(payload, output_path, image_overlays = {})
  puts ">> Injecting data into: #{@normalized_path}"

  unless File.exist?(@normalized_path)
    raise "Normalized template missing. Please run compile! first."
  end

  validate_payload!(payload)

  normalized_doc = HexaPDF::Document.open(@normalized_path)
  form = normalized_doc.acro_form

  @filled_fields = {}
  @missing_fields = []

  # Index fields by /T once instead of rescanning the form for every payload
  # key. The first field carrying a given name wins.
  fields_by_name = {}
  form&.each_field do |f|
    next unless f.is_a?(HexaPDF::Type::AcroForm::Field)

    fields_by_name[f[:T].to_s] ||= f
  end

  payload.each do |key, value|
    next if value.nil?
    next if image_overlays.key?(key) # Silence the harmless warnings for image overlays

    doc_field = fields_by_name[key.to_s]
    unless doc_field
      @missing_fields << key
      puts " [Warning] Field :#{key} not found in template."
      next
    end

    begin
      if doc_field.is_a?(HexaPDF::Type::AcroForm::ButtonField) ||
          doc_field.is_a?(HexaPDF::Type::AcroForm::ChoiceField)
        fill_select_field(doc_field, key, value)
      else
        if doc_field.key?(:MaxLen)
          # Drop the length limit and clear bit 25 of /Ff (1 << 24) before
          # writing the value.
          doc_field[:Ff] = (doc_field[:Ff] || 0) & ~(1 << 24)
          doc_field.delete(:MaxLen)
        end
        doc_field.field_value = value.to_s
      end

      @filled_fields[key] = value
      puts " [Filled] :#{key} = #{value}"
    rescue HexaPDF::Error => e
      puts " [Warning] Rejected :#{key} - PDF formatting conflict (#{e.message.split(" (HexaPDF").first})"
    end
  end

  image_overlays.each do |key, config|
    image_path = payload[key]
    next unless image_path && File.exist?(image_path)

    page_index = config[:page] || 0
    x, y, w, h = config[:coords]

    page = normalized_doc.pages[page_index]
    unless page
      puts " [Warning] Overlay :#{key} skipped - page #{page_index} does not exist."
      next
    end

    canvas = page.canvas(type: :overlay)
    canvas.fill_color(255, 255, 255)
    canvas.rectangle(x, y, w, h).fill
    # Hand HexaPDF the path rather than an open File: no handle is left open
    # by this method.
    canvas.image(image_path.to_s, at: [x, y], width: w, height: h)

    puts " [Overlay] Stamped :#{key} onto page #{page_index}"
  end

  normalized_doc.write(output_path, optimize: true)
  puts ">> Success! Saved filled PDF to: #{output_path}\n\n"

  {filled: @filled_fields, missing: @missing_fields}
end

# Sets the value of a button or choice field: first through the options map
# persisted in /TU, then through the generic true/false aliases (buttons), and
# finally by writing the value as given.
private def fill_select_field(doc_field, key, value)
  is_button = doc_field.is_a?(HexaPDF::Type::AcroForm::ButtonField)
  return if fill_from_options_map(doc_field, key, value, is_button)

  unless is_button
    doc_field.field_value = value.to_s
    return
  end

  normalized_val = value.to_s.downcase.strip
  on_state_sym = button_on_states(doc_field).first || :Yes

  if %w[true yes on 1].include?(normalized_val)
    doc_field.field_value = on_state_sym.to_s
    doc_field.each_widget { |w| w[:AS] = on_state_sym }
  elsif %w[false no off 0].include?(normalized_val)
    doc_field.field_value = "Off"
    doc_field.each_widget { |w| w[:AS] = :Off }
  else
    doc_field.field_value = value.to_s
  end
end

# Resolves value through the JSON options map stored in the field's /TU.
# Returns true when the value was found and applied, false otherwise
# (no /TU, /TU is not a JSON object, or the value is not a known option).
private def fill_from_options_map(doc_field, key, value, is_button)
  raw = doc_field[:TU]
  return false unless raw

  options_map = begin
    JSON.parse(raw)
  rescue JSON::ParserError
    nil
  end
  # /TU holds arbitrary tooltip text in general; only a JSON object is a map.
  return false unless options_map.is_a?(Hash)

  normalized_user_val = sanitize_key(value)&.to_s
  unless normalized_user_val && options_map.key?(normalized_user_val)
    puts " [Warning] :#{key} - '#{value}' not found in select options: #{options_map.keys.join(", ")}" if is_button
    return false
  end

  target_val = options_map[normalized_user_val]
  doc_field.field_value = target_val

  if is_button
    # Switch each widget's appearance state so only the widget whose on-state
    # equals the chosen export value is shown as selected.
    doc_field.each_widget do |w|
      next unless w[:AP] && w[:AP][:N]

      w_export = w[:AP][:N].value.keys.find { |k| k != :Off && k.to_s.downcase != "off" }
      w[:AS] = (w_export && w_export.to_s == target_val.to_s) ? w_export : :Off
    end
  end

  true
end
|
|
519
|
+
|
|
520
|
+
private
|
|
521
|
+
|
|
522
|
+
# Rates a mapping: :none without a usable label, :high when the target key is
# a schema key, :medium for any other target key, :low when there is a label
# but no target key.
def confidence_for(raw_label, target_key)
  label_missing = raw_label.nil? || raw_label.strip.empty?

  if label_missing
    :none
  elsif !target_key
    :low
  elsif @schema.key?(target_key.to_s.to_sym)
    :high
  else
    :medium
  end
end
|
|
528
|
+
|
|
529
|
+
# Turns arbitrary label text into a snake_case symbol, or nil when nothing
# usable remains.
#
# Steps: lowercase, drop apostrophes and asterisks, collapse every other run of
# non-alphanumerics into "_", trim one leading/trailing "_", repeatedly merge
# tokens that look like broken text extraction, then apply the typo and
# known-label fixes.
def sanitize_key(string)
  slug = string.to_s.downcase
  slug = slug.gsub(/['’*]/, "")
  slug = slug.gsub(/[^a-z0-9]+/, "_").squeeze("_")
  slug = slug.sub(/_$/, "").sub(/^_/, "")

  # Merge rules for common split artifacts, applied in this order until the
  # key stops changing.
  merge_rules = [
    # Prefix split: "c_ertify" => "certify". At least 3 chars are required in
    # the tail to avoid merging across real word boundaries.
    [/(^|_)([a-z])_([a-z0-9]{3,})(?=_|$)/, '\\1\\2\\3'],
    # Suffix split with trailing consonant: "an_d" => "and". Kept narrow so
    # valid tokens like "party_has" aren't corrupted.
    [/(^|_)([a-z0-9]{1,2})_([bcdfghjklmnpqrstvwxyz])(?=_|$)/, '\\1\\2\\3'],
    # Two-letter head split: "th_at" => "that".
    [/(^|_)(th|wh)_([a-z0-9]{2,})(?=_|$)/, '\\1\\2\\3']
  ]

  loop do
    before = slug
    merge_rules.each { |pattern, replacement| slug = slug.gsub(pattern, replacement) }
    break if slug == before
  end

  slug = canonicalize_known_label_key(fix_token_typos(slug))
  slug.empty? ? nil : slug.to_sym
end
|
|
562
|
+
|
|
563
|
+
# Repairs known fragmented tokens inside a sanitized key and collapses two
# long, frequently mangled labels to fixed canonical keys. Returns a new
# string; the argument is not modified.
def canonicalize_known_label_key(key)
  # Fragment => repaired text, applied as plain substring replacements in
  # this order.
  fragment_fixes = {
    "t_ax" => "tax",
    "identi_cation" => "identification",
    "ide_ntity" => "identity",
    "othe_rbank" => "other_bank",
    "cha_r_ge" => "charge",
    "complet_e" => "complete",
    "a_nd" => "and",
    "ot_her" => "other",
    "sa_vings" => "savings",
    "aloan" => "a_loan",
    "tob_eused" => "to_be_used",
    "documen_tveri_edb_y" => "document_verified_by",
    "contac_tpersons" => "contact_persons",
    "modeof" => "mode_of",
    "mrmrs" => "mr_mrs",
    "mobile_n_o" => "mobile_no",
    "account_n_o" => "account_no",
    "name_of_authorized_ocial" => "name_of_authorized_official",
    "signature_of_authorized_ocial" => "signature_of_authorized_official",
    "na_onal_id" => "national_id",
    "posi_on_title" => "position_title",
    "contribu_on" => "contribution",
    "con_rmed" => "confirmed"
  }

  repaired = fragment_fixes.reduce(key.dup) { |text, (from, to)| text.gsub(from, to) }
  contains_all = ->(markers) { markers.all? { |marker| repaired.include?(marker) } }

  # Long disclaimer/attestation label: recognised by five marker phrases.
  if contains_all.call(%w[certify_that_my savings_balance sole_property other_party claim_over_it])
    return "certify_that_my_pledged_tier3_other_savings_balance_is_my_sole_property_and_that_no_other_party_has_a_claim_over_it"
  end

  # Long employer-loan question: recognised the same way.
  if contains_all.call(%w[does_the_employer loan lien recovered employer_contribution])
    return "does_the_employer_have_a_loan_lien_to_be_recovered_from_employer_contribution"
  end

  repaired
end
|
|
618
|
+
|
|
619
|
+
# Applies every from => to pair of Constants::TYPO_PHRASE_REPLACEMENTS to the
# key, then tidies separators: runs of "_" are squeezed and one leading and one
# trailing "_" are removed. Returns a new string.
def fix_token_typos(key)
  corrected = AcroForge::Constants::TYPO_PHRASE_REPLACEMENTS.reduce(key.dup) do |text, (from, to)|
    text.gsub(from, to)
  end

  corrected.squeeze("_").sub(/^_/, "").sub(/_$/, "")
end
|
|
629
|
+
|
|
630
|
+
# Locates section headings on every page.
#
# page_text_map - page index => array of text chunks; each chunk is read via
#                 :text and :y_min.
#
# A chunk counts as a heading when its text, reduced to lowercase letters
# only, equals one of @sections reduced the same way (the first matching
# section wins). Returns page index => array of {key:, y_min:} sorted from the
# highest y_min downwards.
def build_section_map(page_text_map)
  letters_only = ->(text) { text.downcase.gsub(/[^a-z]/, "") }

  # Normalise the section names once instead of once per text chunk.
  sections_by_signature = {}
  @sections.each do |section|
    sections_by_signature[letters_only.call(section)] ||= section
  end

  page_text_map.each_with_object({}) do |(page_idx, chunks), map|
    headings = []
    chunks.each do |chunk|
      text = chunk[:text]
      next unless text # a chunk without text cannot be a heading

      matched_section = sections_by_signature[letters_only.call(text)]
      next unless matched_section

      # A section configured with the misspelling "Adress Details" is keyed
      # under the corrected spelling.
      clean_section = (matched_section.downcase == "adress details") ? "Address Details" : matched_section
      headings << {key: sanitize_key(clean_section), y_min: chunk[:y_min]}
    end
    map[page_idx] = headings.sort_by { |heading| -heading[:y_min] }
  end
end
|
|
647
|
+
|
|
648
|
+
# Collects the distinct non-"Off" keys found in the normal appearance
# (/AP /N) of the field's widgets, in first-seen order. Widgets without a
# normal appearance are ignored.
def button_on_states(field)
  found = []

  field.each_widget do |widget|
    next unless widget[:AP] && widget[:AP][:N]

    widget[:AP][:N].value.keys.each do |state|
      is_off = state == :Off || state.to_s.downcase == "off"
      found << state unless is_off
    end
  end

  found.uniq
end
|
|
664
|
+
|
|
665
|
+
# Returns [resolved_base_key, canonical_label_or_nil].
#
# When the option keys of a radio group / choice field clearly identify the
# field's role, the spatially derived base_key is replaced and the matching
# human label is returned so callers can replace a misleading raw_label too:
#   * two or more of dr/mr/mrs/miss/title       -> [:title, "Title"]
#   * both male and female                      -> [:gender, "Gender"]
#   * two or more marital-status words          -> [:marital_status, "Marital Status"]
# Otherwise (or without a non-empty options hash) base_key is returned
# unchanged with a nil label.
def normalize_button_base_key(base_key, options_map)
  unchanged = [base_key, nil]
  return unchanged unless options_map.is_a?(Hash) && options_map.any?

  option_keys = options_map.keys.map { |raw| sanitize_key(raw)&.to_s }.compact.uniq
  at_least_two = ->(tokens) { (option_keys & tokens).size >= 2 }

  if at_least_two.call(%w[dr mr mrs miss title])
    [:title, "Title"]
  elsif option_keys.include?("male") && option_keys.include?("female")
    [:gender, "Gender"]
  elsif at_least_two.call(%w[single married divorced widow_widower widowed])
    [:marital_status, "Marital Status"]
  else
    unchanged
  end
end
|
|
686
|
+
|
|
687
|
+
# Label variations declared for a schema key: the :variations entry when the
# schema value is a hash, otherwise the value itself wrapped in an array.
# Returns [] for an unknown key.
def schema_variations(canonical_key)
  entry = @schema[canonical_key]

  case entry
  when nil, false then []
  when Hash then Array(entry[:variations])
  else Array(entry)
  end
end
|
|
692
|
+
|
|
693
|
+
def canonical_schema_key_for(base_key, raw_label)
|
|
694
|
+
candidates = []
|
|
695
|
+
candidates << base_key.to_s if base_key
|
|
696
|
+
candidates << sanitize_key(raw_label).to_s if raw_label && sanitize_key(raw_label)
|
|
697
|
+
|
|
698
|
+
@schema.each do |canonical, _info|
|
|
699
|
+
variations = schema_variations(canonical)
|
|
700
|
+
canonical_key = sanitize_key(canonical.to_s)&.to_s
|
|
701
|
+
return canonical if candidates.include?(canonical_key)
|
|
702
|
+
|
|
703
|
+
variations.each do |label|
|
|
704
|
+
normalized = sanitize_key(label)&.to_s
|
|
705
|
+
next unless normalized
|
|
706
|
+
|
|
707
|
+
return canonical if candidates.include?(normalized)
|
|
708
|
+
end
|
|
709
|
+
end
|
|
710
|
+
|
|
711
|
+
nil
|
|
712
|
+
end
|
|
713
|
+
|
|
714
|
+
# True for keys that look machine generated rather than label derived: an
# empty key, or one containing an "imageN", "textN" or "pageN_fieldN" token
# at the start or right after an underscore.
def likely_noisy_key?(key)
  text = key.to_s
  text.empty? || text.match?(/(?:^|_)image\d+|(?:^|_)text\d+|(?:^|_)page\d+_field\d+/)
end
|
|
720
|
+
|
|
721
|
+
# Picks the section a field belongs to: walking the page's headings in their
# stored order, the last heading whose y_min is strictly greater than the
# field's vertical centre, stopping at the first heading that is not. Returns
# that heading's :key, or nil when the page has no headings or none qualifies.
def get_active_section(section_map, page_idx, field_y_center)
  headings = section_map[page_idx]
  return nil unless headings

  above = headings.take_while { |heading| heading[:y_min] > field_y_center }
  nearest = above.last
  nearest ? nearest[:key] : nil
end
|
|
733
|
+
|
|
734
|
+
# Validates every non-empty payload value against the type expected for its
# key and raises AcroForge::ValidationError on the first failure.
#
# The expected type is chosen in this order: :select for any key containing
# "_btn", the :type of a matching override, the schema entry's :type (or
# :string for a non-hash schema entry), and finally infer_type(key).
# Allowed options are the schema entry's :options plus the option keys
# recorded for the key in @select_field_options.
def validate_payload!(payload)
  payload.each do |key, value|
    next if value.nil? || value.to_s.empty?

    # Strip suffixes like _1 or _btn to find the base canonical key for schema lookup
    key_str = key.to_s
    base_key = key_str.sub(/_btn(?:_\d+)?\z/, "").sub(/_\d+\z/, "").to_sym

    # @overrides may be keyed by original PDF field names (strings like
    # "page0_field6"), so also look the override up by its semantic :key.
    # Override hashes without :key are valid, hence the safe navigation:
    # nil never equals base_key, so such entries are simply not matched here.
    override_info = @overrides[base_key] || @overrides[base_key.to_s] ||
      @overrides.values.find { |v| v.is_a?(Hash) && v[:key]&.to_sym == base_key }

    type_info = @schema[base_key]

    # If it's a button field, it's a select type by nature
    type = if key_str.include?("_btn")
      :select
    elsif override_info
      override_info[:type]
    elsif type_info
      type_info.is_a?(Hash) ? type_info[:type] : :string
    else
      infer_type(key)
    end

    schema_options = type_info.is_a?(Hash) ? (type_info[:options] || []) : []
    pdf_options = @select_field_options[key.to_s]&.keys || []
    allowed_options = (schema_options + pdf_options).uniq

    next if AcroForge::Validator.valid?(value, type, allowed_options)

    msg = "Validation failed for field :#{key} (base: :#{base_key}): Expected #{type}, got '#{value}'."
    msg += " (Allowed options: #{allowed_options.join(", ")})" if type == :select
    raise AcroForge::ValidationError, msg
  end
end
|
|
776
|
+
|
|
777
|
+
# Guesses a field type from the words in its key.
#
# The key is lower-cased and split on underscores; the first rule (in the
# order listed below) that shares a word with the key decides the type.
#
# @param key [#to_s] field key
# @return [Symbol] :date, :money, :email, :number, or :string when no rule matches
def infer_type(key)
  words = key.to_s.downcase.split("_")

  # Order matters: earlier rules win when a key matches several.
  keyword_rules = [
    [:date, %w[date dob expiry]],
    [:money, %w[amount salary income balance]],
    [:email, %w[email]],
    [:number, %w[tenor years age]]
  ]

  keyword_rules.each do |type, keywords|
    return type unless (words & keywords).empty?
  end

  :string
end
|
|
791
|
+
|
|
792
|
+
# ------------------------------------------
|
|
793
|
+
# THE UNIVERSAL DYNAMIC HEURISTIC (WEIGHTS FIXED)
|
|
794
|
+
# ------------------------------------------
|
|
795
|
+
# Picks the text chunk that most plausibly labels a field, using a
# position-based score (lower is better).
#
# Scoring by +mode+:
# * :button_option — text on (roughly) the same line, within 60 units to the
#   right of the field (preferred) or to its left (+5 penalty).
# * :group_label   — text on the same line to the left (strongly preferred,
#   -1000), else text just above the field (+50).
# * :standard      — text just above and horizontally aligned with the field
#   (-2000), else text inline to the left (-1000), else a looser inline match.
#
# In :group_label and :standard modes, text ending in ":" or "?" gets a bonus.
# In every mode, text equal to one of @sections (letters only, case-insensitive)
# gets a +10000 penalty so that it only wins when nothing else qualifies.
# Any other mode value yields no candidates.
#
# @param text_chunks [Array<Hash>] chunks with :text, :x_min, :x_max, :y_min, :y_max
# @param field_rect [Array<Numeric>] [x_min, y_min, x_max, y_max] of the field
# @param mode [Symbol] :standard, :group_label or :button_option
# @return [String, nil] the winning text, trimmed and without one trailing
#   colon, or nil when no chunk qualifies
def find_nearest_text(text_chunks, field_rect, mode: :standard)
  f_x_min, f_y_min, f_x_max, f_y_max = field_rect
  f_y_center = (f_y_min + f_y_max) / 2.0

  # Section names do not change per chunk, so normalise them once up front.
  section_names = Array(@sections).map { |s| s.downcase.gsub(/[^a-z]/, "") }

  best_text = nil
  best_score = 99_999

  text_chunks.each do |chunk|
    text = chunk[:text]
    t_x_min = chunk[:x_min]
    t_x_max = chunk[:x_max]
    t_x_center = (t_x_min + t_x_max) / 2.0
    t_y_min = chunk[:y_min]
    t_y_max = chunk[:y_max]
    t_y_center = (t_y_min + t_y_max) / 2.0

    dx_left = f_x_min - t_x_max        # gap when the text sits left of the field
    dx_right = t_x_min - f_x_max       # gap when the text sits right of the field
    dy_top = t_y_min - f_y_max         # gap when the text sits at larger y than the field
    dy_center = (t_y_center - f_y_center).abs

    is_section_header = section_names.include?(text.downcase.gsub(/[^a-z]/, ""))
    has_colon_or_q = text.strip.match?(/[:?]\z/)

    score = nil

    case mode
    when :button_option
      if dy_center < 12
        if dx_right > -10 && dx_right < 60
          score = dx_right.abs
        elsif dx_left > -10 && dx_left < 60
          score = dx_left.abs + 5
        end
      end

    when :group_label
      if dy_center < 15 && dx_left > -20 && dx_left < 300
        score = dx_left.abs - 1000
        score -= 300 if has_colon_or_q # Colon tie-breaker bonus
      elsif dy_top > -5 && dy_top < 30 && (t_x_max > f_x_min - 20)
        score = dy_top.abs + 50
        score -= 300 if has_colon_or_q
      end

    when :standard
      is_grid_locked = dy_top > -5 && dy_top < 30 &&
                       t_x_center >= (f_x_min - 20) && t_x_center <= (f_x_max + 20)
      is_inline = dy_center < 10 && dx_left > -10 && dx_left < 200

      if is_grid_locked
        score = dy_top.abs - 2000
        score -= 200 if has_colon_or_q
      elsif is_inline
        score = dx_left.abs - 1000
        score -= 200 if has_colon_or_q
      elsif dy_center < 15 && dx_left > -10 && dx_left < 150
        score = dx_left.abs
        score -= 200 if has_colon_or_q
      end
    end

    next unless score

    score += 10_000 if is_section_header

    if score < best_score
      best_score = score
      best_text = text
    end
  end

  # Trim first so a colon followed by whitespace ("Name: ") is still removed,
  # matching how has_colon_or_q detects the colon.
  best_text&.strip&.sub(/\s*:\z/, "")
end
|
|
866
|
+
|
|
867
|
+
# Make validate_payload! part of the public API.
# NOTE(review): presumably it is defined under a `private` section earlier in
# the class (not visible from here) — confirm.
public :validate_payload!
|
|
868
|
+
end
|
|
869
|
+
end
|