acroforge 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,869 @@
1
+ # frozen_string_literal: true
2
+
3
require "date"
require "fileutils"
require "json"
require "uri"

require "hexapdf"
7
+
8
+ require_relative "all_text_processor"
9
+ require_relative "validator"
10
+ require_relative "constants"
11
+ require_relative "labels"
12
+
13
+ module AcroForge
14
+ class Engine
15
+ attr_reader :template_path, :schema, :overrides, :sections, :normalized_path,
16
+ :mapped_fields, :unmapped_fields, :filled_fields, :missing_fields,
17
+ :select_field_options, :new_fields_detected
18
+
19
# Builds an engine bound to a single PDF template.
#
# The normalized template path is derived from the template's base name
# (any existing "_normalized" suffix is stripped first so the marker is
# never doubled) and placed in +normalized_dir+, or next to the template
# when no directory is given. The target directory is created on demand,
# including any missing parent directories.
#
# @param template_path [String] path to the source PDF template
# @param schema [Hash] canonical keys mapped to type info / label variations
# @param overrides [Hash] manual field mappings consulted by compile! and validate_payload!
# @param sections [Array<String>] section header texts used to prefix keys
# @param normalized_dir [String, nil] directory for the normalized template
def initialize(template_path, schema: {}, overrides: {}, sections: [], normalized_dir: nil)
  @template_path = template_path
  @schema = schema
  @overrides = overrides
  @sections = sections

  dir = normalized_dir || File.dirname(template_path)
  base = File.basename(template_path, ".*")
  # Avoid double suffixes like "_normalized_normalized.pdf" when the
  # template already contains the normalized marker.
  normalized_base = base.sub(/_normalized\z/, "")
  # mkdir_p handles nested paths and is a no-op when the directory already
  # exists, so there is no exist?/mkdir race and no ENOENT on missing parents.
  FileUtils.mkdir_p(dir)
  @normalized_path = File.join(dir, "#{normalized_base}_normalized.pdf")

  @mapped_fields = {}
  @unmapped_fields = []
  @filled_fields = {}
  @missing_fields = []
  @select_field_options = {}
  @new_fields_detected = []
  # nil marks "compile! has not run yet"; see field_proposals.
  @field_proposals = nil
end
41
+
42
# Returns the HexaPDF document for the template, opening it on first use
# and reusing the cached instance afterwards.
def source_doc
  return @source_doc if @source_doc

  @source_doc = HexaPDF::Document.open(@template_path)
end
45
+
46
# Returns the template's AcroForm without creating one (create: false).
# A truthy result is cached; a missing form is looked up again on the
# next call.
def source_form
  return @source_form if @source_form

  @source_form = source_doc.acro_form(create: false)
end
49
+
50
# Lists the template's form fields as plain hashes.
#
# @return [Array<Hash>] one {name:, type:, alternate_name:} entry per
#   AcroForm field, where type is :text, :button, :choice or :other and
#   alternate_name is the field's /TU entry; empty when there is no form
def raw_fields
  form = source_form
  return [] unless form

  [].tap do |collected|
    form.each_field do |field|
      next unless field.is_a?(HexaPDF::Type::AcroForm::Field)

      kind = case field
             when HexaPDF::Type::AcroForm::TextField then :text
             when HexaPDF::Type::AcroForm::ButtonField then :button
             when HexaPDF::Type::AcroForm::ChoiceField then :choice
             else :other
             end
      collected << {name: field.full_field_name, type: kind, alternate_name: field[:TU]}
    end
  end
end
64
+
65
# Returns just the :name of every entry produced by raw_fields, in order.
def raw_field_names
  raw_fields.each_with_object([]) { |entry, names| names << entry[:name] }
end
68
+
69
# True when raw_fields yields at least one (truthy) entry.
def any_raw_fields?
  !raw_fields.none?
end
72
+
73
# True when no field is currently recorded as unmapped.
def fully_mapped?
  @unmapped_fields.size.zero?
end
76
+
77
# Number of entries currently held in the mapped-fields table.
def mapped_count
  @mapped_fields.length
end
80
+
81
# Distinct target keys (the values of the mapped-fields table), in
# first-seen order.
def mapped_field_names
  @mapped_fields.map { |_pdf_name, target| target }.uniq
end
84
+
85
# Returns the proposals recorded by compile!.
#
# @raise [RuntimeError] while the proposals are still nil, i.e. before
#   compile! has populated them
def field_proposals
  return @field_proposals unless @field_proposals.nil?

  raise "field_proposals available only after compile!"
end
89
+
90
+ # Returns a hash mapping synthetic field names ("date", "date#1", "date#2")
91
+ # to the underlying AcroForm field objects, using the same naming scheme
92
+ # compile! emits. Callers (notably Relabeler.apply!) use this to resolve
93
+ # mapping keys back to the right field even when the PDF has multiple
94
+ # fields sharing the same :T name. The first occurrence keeps the bare
95
+ # base name; subsequent occurrences get a #N suffix.
96
# Maps synthetic field names to field objects for the given form.
#
# The first field with a given full name is stored under that name; each
# later field sharing the name is stored under "name#1", "name#2", ...
# Objects that are not AcroForm fields, and fields without a name, are
# skipped and do not consume a suffix.
#
# @param form [#each_field, nil] the AcroForm to index
# @return [Hash{String => Object}] empty when form is nil
def self.field_index(form)
  return {} unless form

  seen = Hash.new(0)
  lookup = {}
  form.each_field do |field|
    next unless field.is_a?(HexaPDF::Type::AcroForm::Field)

    base = field.full_field_name
    next unless base

    nth = seen[base]
    seen[base] = nth + 1
    key = nth.zero? ? base : "#{base}##{nth}"
    lookup[key] = field
  end
  lookup
end
110
+
111
+ # ------------------------------------------
112
+ # PHASE 1: THE HIERARCHICAL COMPILER
113
+ # ------------------------------------------
114
# Scans the template, derives a semantic key for every positioned form
# field, renames the fields (/T) accordingly and writes the result to
# @normalized_path.
#
# For each AcroForm field that has a widget with a /Rect located on some
# page, the key is taken from (in priority order) an entry in @overrides
# keyed by the synthetic PDF field name, or the nearest text label
# (canonicalised through the schema and prefixed with the active section).
# Button/choice fields get a "_btn" suffix and their option map is stored
# as JSON in /TU. Fields that yield no key are recorded as unmapped.
# Progress is reported on stdout.
#
# Duplicate field names are disambiguated as "name", "name#1", "name#2".
# The occurrence counter advances for every field visited - before the
# widget/page filters - so the numbering matches Engine.field_index, which
# counts every named field.
#
# @return [Hash] {mapped:, unmapped:, select_options:, new_fields_detected:}
def compile!
  puts ">> Compiling template: #{@template_path}"
  form = source_doc.acro_form(create: true)

  @mapped_fields = {}
  @unmapped_fields = []
  @select_field_options = {}
  @new_fields_detected = []
  @field_proposals = []

  page_text_map = {}
  source_doc.pages.each_with_index do |page, index|
    processor = AllTextProcessor.new
    page.process_contents(processor)
    page_text_map[index] = processor.text_chunks
  end

  section_map = build_section_map(page_text_map)

  # Occurrence counter per full field name, used to build the synthetic
  # "name#N" identifiers for duplicates.
  field_name_counts = Hash.new(0)

  form.each_field do |field|
    next unless field.is_a?(HexaPDF::Type::AcroForm::Field)

    # Count first, filter afterwards: a skipped duplicate must still consume
    # its suffix, otherwise later duplicates would be numbered differently
    # from Engine.field_index.
    base_field_name = field.full_field_name
    occurrence = field_name_counts[base_field_name]
    field_name_counts[base_field_name] += 1
    original_field_name = occurrence.zero? ? base_field_name : "#{base_field_name}##{occurrence}"

    widget = field.each_widget.first
    next unless widget && widget[:Rect]

    page_index = compile_page_index(widget)
    next unless page_index

    page_chunks = page_text_map[page_index]

    is_btn = field.is_a?(HexaPDF::Type::AcroForm::ButtonField) || field.is_a?(HexaPDF::Type::AcroForm::ChoiceField)
    is_radio_group = is_btn && field.each_widget.count > 1

    options_map = nil
    raw_label = nil
    # Widget whose position represents the field vertically. For radio
    # groups this becomes the top-left widget of the group.
    anchor_widget = widget

    if is_radio_group
      # Sort by highest Y, then leftmost X, to find the top-left box of
      # multi-line groups. Widgets without a /Rect cannot be positioned and
      # are ignored (the first widget is known to have one).
      anchor_widget = field.each_widget.select { |w| w[:Rect] }.min_by { |w| [-w[:Rect][1], w[:Rect][0]] }

      raw_label = find_nearest_text(page_chunks, anchor_widget[:Rect], mode: :group_label)
      raw_label = AcroForge::Labels.humanize(raw_label)

      if raw_label
        if raw_label.include?(":")
          raw_label = raw_label.split(":").first.strip
        elsif raw_label.downcase.include?("title")
          raw_label = "Title"
        end
      end

      options_map = compile_radio_options(field, page_chunks)
    elsif field.is_a?(HexaPDF::Type::AcroForm::ButtonField)
      options_map = compile_checkbox_options(field)
    elsif field.is_a?(HexaPDF::Type::AcroForm::ChoiceField)
      options_map = compile_choice_options(field)
    else
      raw_label = find_nearest_text(page_chunks, widget[:Rect], mode: :standard)
      raw_label = AcroForge::Labels.humanize(raw_label)
    end

    y_center = (anchor_widget[:Rect][1] + anchor_widget[:Rect][3]) / 2.0
    x_center = (widget[:Rect][0] + widget[:Rect][2]) / 2.0

    active_section = get_active_section(section_map, page_index, y_center)

    # Appends one proposal record. raw_label is read when the lambda is
    # called, so a label replaced further down is the one that is reported.
    record_proposal = lambda do |canonical_key, confidence|
      @field_proposals << {
        pdf_field_name: original_field_name,
        pdf_field_type: compile_field_type(field),
        canonical_key: canonical_key,
        raw_label: raw_label,
        confidence: confidence,
        section: active_section,
        page: page_index,
        y: y_center,
        x: x_center,
        options: options_map
      }
    end

    target_key = nil

    # Apply overrides if applicable. Support @overrides keyed by the
    # original PDF field names (strings like "page0_field6"). When an
    # override exists, map the PDF field to the semantic :key declared in
    # the override (e.g. :full_name) so downstream validation uses semantic
    # keys.
    override_key_used = @overrides.key?(original_field_name.to_s) ? original_field_name.to_s : original_field_name.to_sym
    override_entry = @overrides[original_field_name.to_s] || @overrides[original_field_name.to_sym]
    if override_entry
      semantic_name = override_entry[:key] || override_key_used
      mapped_semantic = semantic_name.to_sym
      target_key = (is_btn && !mapped_semantic.to_s.end_with?("_btn")) ? :"#{mapped_semantic}_btn" : mapped_semantic

      # Ensure uniqueness when multiple fields map to the same semantic key.
      target_key = compile_unique_key(target_key)

      puts " [Override] '#{original_field_name}' -> :#{target_key} (Override)"
    elsif raw_label
      base_key = sanitize_key(raw_label)
      unless base_key
        @unmapped_fields << original_field_name
        record_proposal.call(nil, :none)
        puts " [Failed] Could not derive a valid key for field: #{original_field_name}"
        next
      end

      if is_btn
        base_key, override_label = normalize_button_base_key(base_key, options_map)
        # Spatial heuristic's nearby-text guess can be wrong (e.g., a Title
        # radio group sitting close to a "First Name" text input). If the
        # options unambiguously identify the field, trust them and overwrite
        # the misleading raw_label so variations + meta stay self-consistent.
        raw_label = override_label if override_label
      end

      canonical_schema_key = canonical_schema_key_for(base_key, raw_label)
      if canonical_schema_key
        base_key = canonical_schema_key
      elsif !likely_noisy_key?(base_key)
        @new_fields_detected << base_key.to_s unless @new_fields_detected.include?(base_key.to_s)
      end

      target_key = active_section ? :"#{active_section}_#{base_key}" : base_key
      target_key = @overrides[raw_label].to_sym if @overrides[raw_label]
      target_key = :"#{target_key}_btn" if is_btn && !target_key.to_s.end_with?("_btn")

      target_key = compile_unique_key(target_key)
    end

    if target_key
      field[:T] = target_key.to_s
      @mapped_fields[original_field_name] = target_key
      record_proposal.call(target_key, confidence_for(raw_label, target_key))

      has_options = is_btn && options_map && options_map.any?
      if has_options
        @select_field_options[target_key.to_s] = options_map
        # Reuse TU to persist the mapping in the normalized template.
        field[:TU] = options_map.to_json
      end

      prefix_notice = active_section ? "[#{active_section.upcase}] " : ""
      puts " [Auto-Mapped] #{prefix_notice}'#{raw_label || original_field_name}' -> :#{target_key}"
      puts " └─ Valid Options Hash: #{options_map.keys.inspect}" if has_options
    else
      @unmapped_fields << original_field_name
      record_proposal.call(nil, :none)
      puts " [Failed] Could not find a text label for field: #{original_field_name}"
    end
  end

  source_doc.write(@normalized_path, optimize: true)
  puts ">> Compilation Complete. #{mapped_count} fields mapped."
  puts ">> Clean template saved to: #{@normalized_path}\n\n"

  {
    mapped: @mapped_fields,
    unmapped: @unmapped_fields,
    select_options: @select_field_options,
    new_fields_detected: @new_fields_detected
  }
end

# Classifies a field as :text, :button, :choice or :other.
def compile_field_type(field)
  case field
  when HexaPDF::Type::AcroForm::TextField then :text
  when HexaPDF::Type::AcroForm::ButtonField then :button
  when HexaPDF::Type::AcroForm::ChoiceField then :choice
  else :other
  end
end

# Index of the first page whose /Annots contains the widget, or nil.
def compile_page_index(widget)
  source_doc.pages.each_with_index do |page, idx|
    return idx if page[:Annots]&.include?(widget)
  end
  nil
end

# Appends _1, _2, ... to the candidate until no mapped field uses the key.
def compile_unique_key(candidate)
  unique = candidate
  counter = 1
  while @mapped_fields.value?(unique)
    unique = :"#{candidate}_#{counter}"
    counter += 1
  end
  unique
end

# Builds {option key => export value} for a multi-widget button/choice
# field. The key comes from the export value unless that value looks
# generic (yes/on/off/choice/button/group or purely numeric), in which case
# the text nearest to the widget is used instead.
def compile_radio_options(field, text_chunks)
  options = {}
  field.each_widget do |w|
    next unless w[:Rect]

    opt_text = find_nearest_text(text_chunks, w[:Rect], mode: :button_option)
    opt_text = opt_text.split(":").last.strip if opt_text&.include?(":")

    export_val = w[:AP]&.[](:N)&.value&.keys&.find { |k| k != :Off && k != :Off.to_s }
    next unless export_val

    ev_str = export_val.to_s.downcase
    is_generic = ["yes", "on", "off", "choice", "button", "group"].any? { |g| ev_str.include?(g) } || ev_str.match?(/^[0-9]+$/)

    sanitized_export = is_generic ? nil : sanitize_key(export_val)
    final_key = if sanitized_export
      sanitized_export.to_s
    else
      sanitized_opt = opt_text ? sanitize_key(opt_text)&.to_s : nil
      (sanitized_opt.nil? || sanitized_opt.empty?) ? ev_str : sanitized_opt
    end

    options[final_key] = export_val.to_s
  end
  options
end

# Builds a predictable option map for a single-widget button (usually a
# checkbox) so payload values resolve to the exact export state.
def compile_checkbox_options(field)
  options = {}
  on_state = button_on_states(field).first
  if on_state
    on_export = on_state.to_s
    on_keys = ["yes", "true", "on", "1", "checked"]
    sanitized_on = sanitize_key(on_export)&.to_s
    on_keys << sanitized_on if sanitized_on && !sanitized_on.empty?
    on_keys.uniq.each { |k| options[k] = on_export }
  end

  ["no", "false", "off", "0", "unchecked"].each { |k| options[k] = "Off" }
  options
end

# Builds an option map from a choice field's /Opt entries; both the export
# and the display value of a pair resolve to the export value.
def compile_choice_options(field)
  options = {}
  return options unless field[:Opt].is_a?(Array)

  field[:Opt].each do |opt|
    if opt.is_a?(Array)
      export_val = opt[0].to_s
      display_val = opt[1].to_s
      [export_val, display_val].each do |candidate|
        normalized = sanitize_key(candidate)&.to_s
        options[normalized] = export_val if normalized && !normalized.empty?
      end
    else
      export_val = opt.to_s
      normalized = sanitize_key(export_val)&.to_s
      options[normalized] = export_val if normalized && !normalized.empty?
    end
  end
  options
end

private :compile_field_type, :compile_page_index, :compile_unique_key,
        :compile_radio_options, :compile_checkbox_options, :compile_choice_options
399
+
400
+ # ------------------------------------------
401
+ # PHASE 2: THE CRASH-PROOF INJECTOR
402
+ # ------------------------------------------
403
# Fills the normalized template with payload values and writes the result.
#
# The payload is validated first (see validate_payload!). Each non-nil
# payload entry is matched to the field whose /T equals the key; button and
# choice fields are resolved through the JSON option map stored in /TU when
# one is present, text fields have /MaxLen (and the matching flag bit)
# removed before the value is set. Values rejected by HexaPDF are reported
# and skipped. Keys listed in image_overlays are not treated as fields;
# instead the image file named by the payload value is stamped on the page.
# Progress is reported on stdout.
#
# @param payload [Hash] field key => value
# @param output_path [String] where the filled PDF is written
# @param image_overlays [Hash] key => {page:, coords: [x, y, w, h]}
# @return [Hash] {filled:, missing:}
# @raise [RuntimeError] when the normalized template does not exist
def fill!(payload, output_path, image_overlays = {})
  puts ">> Injecting data into: #{@normalized_path}"

  unless File.exist?(@normalized_path)
    raise "Normalized template missing. Please run compile! first."
  end

  validate_payload!(payload)

  normalized_doc = HexaPDF::Document.open(@normalized_path)
  form = normalized_doc.acro_form

  @filled_fields = {}
  @missing_fields = []

  # Index the fields once instead of rescanning the form for every payload
  # key. ||= keeps the first field for a given /T, matching a linear search.
  # A template without a form simply yields an empty index, so every key is
  # reported as missing rather than raising on nil.
  fields_by_name = {}
  form&.each_field do |f|
    next unless f.is_a?(HexaPDF::Type::AcroForm::Field)

    fields_by_name[f[:T].to_s] ||= f
  end

  payload.each do |key, value|
    next if value.nil?
    next if image_overlays.key?(key) # Silence the harmless warnings for image overlays

    doc_field = fields_by_name[key.to_s]
    unless doc_field
      @missing_fields << key
      puts " [Warning] Field :#{key} not found in template."
      next
    end

    begin
      is_button = doc_field.is_a?(HexaPDF::Type::AcroForm::ButtonField)

      if is_button || doc_field.is_a?(HexaPDF::Type::AcroForm::ChoiceField)
        resolved_from_map = false
        options_map = fill_options_map(doc_field[:TU])

        if options_map
          normalized_user_val = sanitize_key(value)&.to_s

          if normalized_user_val && options_map.key?(normalized_user_val)
            target_val = options_map[normalized_user_val]
            doc_field.field_value = target_val
            resolved_from_map = true

            if is_button
              # Point every widget's appearance state at its own export
              # value when it is the selected one, otherwise at Off.
              doc_field.each_widget do |w|
                next unless w[:AP] && w[:AP][:N]

                w_export = w[:AP][:N].value.keys.find { |k| k != :Off && k.to_s.downcase != "off" }
                w[:AS] = (w_export.to_s == target_val.to_s) ? w_export : :Off
              end
            end
          elsif is_button
            puts " [Warning] :#{key} - '#{value}' not found in select options: #{options_map.keys.join(", ")}"
          end
        end

        if resolved_from_map
          # done
        elsif is_button
          normalized_val = value.to_s.downcase.strip
          on_state_sym = button_on_states(doc_field).first || :Yes

          if ["true", "yes", "on", "1"].include?(normalized_val)
            doc_field.field_value = on_state_sym.to_s
            doc_field.each_widget { |w| w[:AS] = on_state_sym }
          elsif ["false", "no", "off", "0"].include?(normalized_val)
            doc_field.field_value = "Off"
            doc_field.each_widget { |w| w[:AS] = :Off }
          else
            doc_field.field_value = value.to_s
          end
        else
          doc_field.field_value = value.to_s
        end
      else
        if doc_field.key?(:MaxLen)
          # Clear flag bit 25 together with /MaxLen so longer values are accepted.
          doc_field[:Ff] = (doc_field[:Ff] || 0) & ~(1 << 24)
          doc_field.delete(:MaxLen)
        end
        doc_field.field_value = value.to_s
      end

      @filled_fields[key] = value
      puts " [Filled] :#{key} = #{value}"
    rescue HexaPDF::Error => e
      puts " [Warning] Rejected :#{key} - PDF formatting conflict (#{e.message.split(" (HexaPDF").first})"
    end
  end

  image_overlays.each do |key, config|
    next unless payload[key] && File.exist?(payload[key])

    page_index = config[:page] || 0
    x, y, w, h = config[:coords]

    page = normalized_doc.pages[page_index]
    canvas = page.canvas(type: :overlay)

    canvas.fill_color(255, 255, 255)
    canvas.rectangle(x, y, w, h).fill
    # Hand HexaPDF the path rather than an open File: an explicitly opened
    # handle would never be closed here.
    canvas.image(File.path(payload[key]), at: [x, y], width: w, height: h)

    puts " [Overlay] Stamped :#{key} onto page #{page_index}"
  end

  normalized_doc.write(output_path, optimize: true)
  puts ">> Success! Saved filled PDF to: #{output_path}\n\n"

  {filled: @filled_fields, missing: @missing_fields}
end

# Decodes the option map stored in a field's /TU entry.
# Returns nil unless the entry is a String holding a JSON object, so a
# plain-text tooltip, a JSON scalar or a JSON array is treated as "no map"
# instead of raising.
def fill_options_map(raw)
  return nil unless raw.is_a?(String)

  parsed = JSON.parse(raw)
  parsed.is_a?(Hash) ? parsed : nil
rescue JSON::ParserError
  nil
end

private :fill_options_map
519
+
520
+ private
521
+
522
# Rates how trustworthy a label-to-key mapping is.
#
# @return [Symbol] :none for a nil/blank label, :low when there is no
#   target key, :high when the key (as a symbol) is a schema key,
#   :medium otherwise
def confidence_for(raw_label, target_key)
  return :none if raw_label.nil? || raw_label.strip.empty?
  return :low unless target_key

  @schema.key?(target_key.to_s.to_sym) ? :high : :medium
end
528
+
529
# Turns arbitrary label text into a snake_case symbol key.
#
# Steps: lowercase, drop apostrophes/asterisks, collapse every other
# non-alphanumeric run to "_", trim edge underscores, repeatedly re-join
# tokens that look like split words, then apply fix_token_typos and
# canonicalize_known_label_key.
#
# @param string [#to_s] label text
# @return [Symbol, nil] nil when nothing usable remains
def sanitize_key(string)
  slug = string.to_s.downcase
  slug = slug.gsub(/['’*]/, "")
  slug = slug.gsub(/[^a-z0-9]+/, "_").squeeze("_")
  slug = slug.sub(/_$/, "").sub(/^_/, "")

  # Repairs for split artifacts from broken text extraction, applied in
  # this order until a full pass changes nothing:
  merge_rules = [
    # Prefix split: "c_ertify" => "certify". The tail needs at least 3
    # chars so real word boundaries are not merged.
    /(^|_)([a-z])_([a-z0-9]{3,})(?=_|$)/,
    # Suffix split with a trailing consonant: "an_d" => "and". Kept narrow
    # so valid tokens like "party_has" are not corrupted.
    /(^|_)([a-z0-9]{1,2})_([bcdfghjklmnpqrstvwxyz])(?=_|$)/,
    # Two-letter head split: "th_at" => "that".
    /(^|_)(th|wh)_([a-z0-9]{2,})(?=_|$)/
  ]

  loop do
    before = slug
    merge_rules.each { |rule| slug = slug.gsub(rule, '\\1\\2\\3') }
    break if slug == before
  end

  slug = canonicalize_known_label_key(fix_token_typos(slug))
  slug.empty? ? nil : slug.to_sym
end
562
+
563
# Repairs known fragmented tokens in a key and collapses two recurring
# long labels to a single canonical spelling.
#
# @param key [String] underscore-separated key text
# @return [String] a new string; the input is not modified
def canonicalize_known_label_key(key)
  # Fragmented tokens observed across vendor forms, applied in order as
  # plain substring replacements.
  fragment_fixes = [
    ["t_ax", "tax"],
    ["identi_cation", "identification"],
    ["ide_ntity", "identity"],
    ["othe_rbank", "other_bank"],
    ["cha_r_ge", "charge"],
    ["complet_e", "complete"],
    ["a_nd", "and"],
    ["ot_her", "other"],
    ["sa_vings", "savings"],
    ["aloan", "a_loan"],
    ["tob_eused", "to_be_used"],
    ["documen_tveri_edb_y", "document_verified_by"],
    ["contac_tpersons", "contact_persons"],
    ["modeof", "mode_of"],
    ["mrmrs", "mr_mrs"],
    ["mobile_n_o", "mobile_no"],
    ["account_n_o", "account_no"],
    ["name_of_authorized_ocial", "name_of_authorized_official"],
    ["signature_of_authorized_ocial", "signature_of_authorized_official"],
    ["na_onal_id", "national_id"],
    ["posi_on_title", "position_title"],
    ["contribu_on", "contribution"],
    ["con_rmed", "confirmed"]
  ]

  repaired = fragment_fixes.reduce(key.dup) do |text, (broken, fixed)|
    text.gsub(broken, fixed)
  end

  # The long disclaimer/attestation label: recognised by the presence of
  # all of its marker phrases, wherever they sit in the key.
  attestation_markers = %w[certify_that_my savings_balance sole_property other_party claim_over_it]
  if attestation_markers.all? { |marker| repaired.include?(marker) }
    return "certify_that_my_pledged_tier3_other_savings_balance_is_my_sole_property_and_that_no_other_party_has_a_claim_over_it"
  end

  # The long employer-loan question, recognised the same way.
  employer_markers = %w[does_the_employer loan lien recovered employer_contribution]
  if employer_markers.all? { |marker| repaired.include?(marker) }
    return "does_the_employer_have_a_loan_lien_to_be_recovered_from_employer_contribution"
  end

  repaired
end
618
+
619
# Applies every pair from AcroForge::Constants::TYPO_PHRASE_REPLACEMENTS to
# the key, in order, then tidies the separators: repeated underscores are
# collapsed and one leading/trailing underscore is removed.
#
# @param key [String]
# @return [String] a new string; the input is not modified
def fix_token_typos(key)
  patched = AcroForge::Constants::TYPO_PHRASE_REPLACEMENTS.reduce(key.dup) do |text, (wrong, right)|
    text.gsub(wrong, right)
  end

  patched.squeeze("_").sub(/^_/, "").sub(/_$/, "")
end
629
+
630
# Locates section headers on every page.
#
# A text chunk counts as a header when its letters-only, lowercased text
# equals that of one of @sections. The misspelt section "adress details"
# is keyed as "Address Details".
#
# @param page_text_map [Hash{Integer => Array<Hash>}] page index => chunks
#   (each chunk provides :text and :y_min)
# @return [Hash{Integer => Array<Hash>}] page index => [{key:, y_min:}, ...]
#   ordered from highest y_min to lowest
def build_section_map(page_text_map)
  letters_only = ->(text) { text.downcase.gsub(/[^a-z]/, "") }

  page_text_map.each_with_object({}) do |(page_idx, chunks), map|
    headers = []
    chunks.each do |chunk|
      chunk_letters = letters_only.call(chunk[:text])
      section = @sections.find { |name| chunk_letters == letters_only.call(name) }
      next unless section

      label = section.downcase == "adress details" ? "Address Details" : section
      headers << {key: sanitize_key(label), y_min: chunk[:y_min]}
    end
    map[page_idx] = headers.sort_by { |header| -header[:y_min] }
  end
end
647
+
648
# Collects the distinct non-"Off" appearance state names found in the
# normal appearance (/AP /N) of the field's widgets, in first-seen order.
# Widgets without /AP or /N are ignored.
#
# @param field [#each_widget]
# @return [Array] state names as stored in the appearance dictionaries
def button_on_states(field)
  collected = []

  field.each_widget do |widget|
    appearance = widget[:AP]
    normal = appearance && appearance[:N]
    next unless normal

    on_names = normal.value.keys.reject do |state|
      state == :Off || state.to_s.downcase == "off"
    end
    collected.concat(on_names)
  end

  collected.uniq
end
664
+
665
+ # Returns [resolved_base_key, canonical_label_or_nil].
666
+ # When the options of a radio group / choice field clearly identify
667
+ # the field's semantic role (e.g., options [dr, mr, mrs, miss] mean
668
+ # the field IS a title selector), we override the spatially-derived
669
+ # base_key AND return the canonical human label so callers can
670
+ # replace a misleading raw_label too.
671
# Lets a button/choice field's options decide its key when they are
# unambiguous.
#
# @param base_key [Symbol] key derived from the nearby label
# @param options_map [Hash, nil] option key => export value
# @return [Array] [key, label]: [:title, "Title"] when at least two
#   title tokens are present, [:gender, "Gender"] when both male and female
#   are present, [:marital_status, "Marital Status"] when at least two
#   marital tokens are present, otherwise [base_key, nil]
def normalize_button_base_key(base_key, options_map)
  unchanged = [base_key, nil]
  return unchanged unless options_map.is_a?(Hash) && options_map.any?

  tokens = options_map.keys.map { |name| sanitize_key(name)&.to_s }.compact.uniq

  if (tokens & %w[dr mr mrs miss title]).size >= 2
    [:title, "Title"]
  elsif tokens.include?("male") && tokens.include?("female")
    [:gender, "Gender"]
  elsif (tokens & %w[single married divorced widow_widower widowed]).size >= 2
    [:marital_status, "Marital Status"]
  else
    unchanged
  end
end
686
+
687
# Label variations registered in the schema for a canonical key.
# A Hash entry contributes its :variations; any other entry is itself
# treated as the variation(s). Always returns an Array.
def schema_variations(canonical_key)
  entry = @schema[canonical_key]
  return [] unless entry

  labels = entry.is_a?(Hash) ? entry[:variations] : entry
  Array(labels)
end
692
+
693
# Finds the schema key that a derived key or raw label refers to.
#
# The candidates are base_key (as a string) and the sanitized raw label.
# The first schema key - in schema order - whose own sanitized name or any
# of whose sanitized variations equals a candidate is returned.
#
# @return [Object, nil] the matching schema key, or nil when none matches
def canonical_schema_key_for(base_key, raw_label)
  candidates = []
  candidates.push(base_key.to_s) if base_key
  label_key = sanitize_key(raw_label) if raw_label
  candidates.push(label_key.to_s) if label_key

  @schema.each_key.find do |canonical|
    names = [canonical.to_s] + schema_variations(canonical)
    names.any? do |name|
      normalized = sanitize_key(name)&.to_s
      normalized && candidates.include?(normalized)
    end
  end
end
713
+
714
# True for keys that are empty or contain a token that looks like a
# machine-generated name (imageN, textN or pageN_fieldN) at the start of
# the key or right after an underscore.
def likely_noisy_key?(key)
  text = key.to_s
  text.empty? || text.match?(/(?:^|_)image\d+|(?:^|_)text\d+|(?:^|_)page\d+_field\d+/)
end
720
+
721
# Picks the section a field belongs to.
#
# The page's sections are walked in their stored order for as long as each
# header's y_min is greater than the field's vertical centre; the last
# header reached that way wins.
#
# @return [Object, nil] the section key, or nil when the page has no
#   sections or the first header is not above the field
def get_active_section(section_map, page_idx, field_y_center)
  page_sections = section_map[page_idx]
  return nil unless page_sections

  enclosing = page_sections.take_while { |sec| sec[:y_min] > field_y_center }.last
  enclosing && enclosing[:key]
end
733
+
734
# Validates every non-blank payload value against its expected type.
#
# The expected type is resolved per key: any key containing "_btn" is a
# :select; otherwise the :type of a matching override hash, then the schema
# entry (:type of a Hash entry, :string for any other entry), and finally a
# guess from the key name (infer_type). Allowed options are the schema's
# :options plus the option keys detected for that field by compile!.
#
# @param payload [Hash] field key => value
# @raise [AcroForge::ValidationError] on the first value that fails
def validate_payload!(payload)
  payload.each do |key, value|
    next if value.nil? || value.to_s.empty?

    # Strip suffixes like _1 or _btn to find the base canonical key for schema lookup
    key_str = key.to_s
    base_key = key_str.sub(/_btn(?:_\d+)?\z/, "").sub(/_\d+\z/, "").to_sym

    # @overrides may be keyed by original PDF field names (strings like
    # "page0_field6"), so look up by base key first and then by the semantic
    # :key declared inside an override hash. Only Hash entries carry a :type;
    # scalar entries and hashes without :key (both accepted elsewhere in this
    # class) must not raise here.
    override_info = [@overrides[base_key], @overrides[base_key.to_s]].find { |entry| entry.is_a?(Hash) } ||
      @overrides.values.find { |v| v.is_a?(Hash) && v[:key]&.to_sym == base_key }

    type_info = @schema[base_key]

    # If it's a button field, it's a select type by nature
    type = if key_str.include?("_btn")
      :select
    elsif override_info
      override_info[:type]
    elsif type_info
      type_info.is_a?(Hash) ? type_info[:type] : :string
    else
      infer_type(key)
    end

    schema_options = type_info.is_a?(Hash) ? (type_info[:options] || []) : []
    pdf_options = @select_field_options[key.to_s]&.keys || []

    allowed_options = (schema_options + pdf_options).uniq

    unless AcroForge::Validator.valid?(value, type, allowed_options)
      msg = "Validation failed for field :#{key} (base: :#{base_key}): Expected #{type}, got '#{value}'."
      msg += " (Allowed options: #{allowed_options.join(", ")})" if type == :select
      raise AcroForge::ValidationError, msg
    end
  end
end
776
+
777
# Guesses a value type from the underscore-separated tokens of a key.
# Rules are checked in order and the first one sharing a token wins;
# :string is the fallback.
#
# @return [Symbol] :date, :money, :email, :number or :string
def infer_type(key)
  tokens = key.to_s.downcase.split("_")

  rules = [
    [:date, %w[date dob expiry]],
    [:money, %w[amount salary income balance]],
    [:email, %w[email]],
    [:number, %w[tenor years age]]
  ]

  matched = rules.find { |_type, words| (tokens & words).any? }
  matched ? matched.first : :string
end
791
+
792
+ # ------------------------------------------
793
+ # THE UNIVERSAL DYNAMIC HEURISTIC (WEIGHTS FIXED)
794
+ # ------------------------------------------
795
# Picks the text chunk that most plausibly labels a field rectangle.
#
# Every chunk is scored according to the mode; the lowest score wins and
# chunks that fit none of the mode's layouts are ignored:
# * :button_option - text on roughly the same line, just right of the
#   field (preferred) or just left of it
# * :group_label   - text on the same line to the left, else text just
#   above; text ending in ":" or "?" gets a bonus
# * :standard      - text directly above and horizontally aligned, else
#   inline to the left, else loosely to the left; same ":"/"?" bonus
# Chunks whose text matches one of @sections are heavily penalised.
#
# @param text_chunks [Array<Hash>] chunks with :text, :x_min, :x_max, :y_min, :y_max
# @param field_rect [Array<Numeric>] [x_min, y_min, x_max, y_max]
# @param mode [Symbol] :standard, :group_label or :button_option
# @return [String, nil] the winning text with one trailing ":" removed and
#   surrounding whitespace stripped, or nil when no chunk qualifies
def find_nearest_text(text_chunks, field_rect, mode: :standard)
  f_x_min, f_y_min, f_x_max, f_y_max = field_rect
  f_y_center = (f_y_min + f_y_max) / 2.0
  letters_only = ->(text) { text.downcase.gsub(/[^a-z]/, "") }

  winner = nil
  winner_score = 99999

  text_chunks.each do |chunk|
    text = chunk[:text]
    center_x = (chunk[:x_min] + chunk[:x_max]) / 2.0
    center_y = (chunk[:y_min] + chunk[:y_max]) / 2.0

    gap_left = f_x_min - chunk[:x_max]    # chunk ends this far left of the field
    gap_right = chunk[:x_min] - f_x_max   # chunk starts this far right of the field
    gap_above = chunk[:y_min] - f_y_max   # chunk sits this far above the field
    line_offset = (center_y - f_y_center).abs

    section_header = @sections.any? { |name| letters_only.call(text) == letters_only.call(name) }
    punctuated = text.strip.match?(/[:?]\z/)

    base = nil
    bonus = 0

    case mode
    when :button_option
      if line_offset < 12
        if gap_right > -10 && gap_right < 60
          base = gap_right.abs
        elsif gap_left > -10 && gap_left < 60
          base = gap_left.abs + 5
        end
      end
    when :group_label
      bonus = 300
      if line_offset < 15 && gap_left > -20 && gap_left < 300
        base = gap_left.abs - 1000
      elsif gap_above > -5 && gap_above < 30 && (chunk[:x_max] > f_x_min - 20)
        base = gap_above.abs + 50
      end
    when :standard
      bonus = 200
      grid_locked = gap_above > -5 && gap_above < 30 &&
        center_x >= (f_x_min - 20) && center_x <= (f_x_max + 20)
      inline = line_offset < 10 && gap_left > -10 && gap_left < 200

      if grid_locked
        base = gap_above.abs - 2000
      elsif inline
        base = gap_left.abs - 1000
      elsif line_offset < 15 && gap_left > -10 && gap_left < 150
        base = gap_left.abs
      end
    end

    next unless base

    score = base
    # Tie-breaker bonus for label-like text; never applies to button options.
    score -= bonus if punctuated && mode != :button_option
    score += 10000 if section_header

    if score < winner_score
      winner_score = score
      winner = text
    end
  end

  winner&.sub(/:\z/, "")&.strip
end
866
+
867
+ public :validate_payload!
868
+ end
869
+ end