acroforge 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,351 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "optparse"
4
+ require "yaml"
5
+ require_relative "../acroforge"
6
+
7
+ module AcroForge
8
+ module CLI
9
# Process exit statuses returned by the CLI entry points.
EXIT_OK = 0
EXIT_USER_ERROR = 1
EXIT_VALIDATION_ERROR = 2
EXIT_INTERNAL_ERROR = 3

# First-position words accepted on the command line.
SUBCOMMANDS = %w[
  schema relabel compile bootstrap
  annotate prepare version help
].freeze
15
+
16
+ module_function
17
+
18
# CLI entry point: dispatches the first element of `argv` to the matching
# `cmd_*` handler and converts failures into exit codes.
#
# @param argv [Array<String>] raw command-line arguments; the caller's array
#   is not mutated (a copy is consumed instead).
# @return [Integer] the handler's own return value on success;
#   EXIT_USER_ERROR for an unknown subcommand, unparseable options, a missing
#   file or a bad argument; EXIT_VALIDATION_ERROR for validation/relabel
#   failures; EXIT_INTERNAL_ERROR for any other StandardError.
def run(argv)
  argv = argv.dup
  sub = argv.shift
  return print_help(argv) if sub.nil? || sub == "help"
  return print_version if sub == "version"

  unless SUBCOMMANDS.include?(sub)
    warn "acroforge: unknown subcommand #{sub.inspect}. Try `acroforge help`."
    return EXIT_USER_ERROR
  end

  # Dynamic dispatch is bounded: `sub` was just checked against SUBCOMMANDS.
  send("cmd_#{sub}", argv)
rescue AcroForge::ValidationError, AcroForge::RelabelError => e
  warn "acroforge: #{e.message}"
  EXIT_VALIDATION_ERROR
rescue OptionParser::ParseError, Errno::ENOENT, ArgumentError => e
  # An unknown flag or a missing option value is the caller's mistake, so it
  # must not be reported as an internal error.
  warn "acroforge: #{e.message}"
  EXIT_USER_ERROR
rescue => e
  warn "acroforge: internal error (#{e.class}): #{e.message}"
  EXIT_INTERNAL_ERROR
end
40
+
41
# Writes the gem's version string to standard output.
#
# @return [Integer] EXIT_OK
def print_version
  $stdout.puts(AcroForge::VERSION)
  EXIT_OK
end
45
+
46
# Prints the usage summary for every subcommand to standard output.
#
# The single positional parameter exists so callers can hand over the
# remaining argv; it is ignored.
#
# NOTE(review): the usage lines are reproduced exactly as they appear in this
# view of the file; any leading indentation inside the heredoc could not be
# recovered here -- confirm against the packaged source.
#
# @return [Integer] EXIT_OK
def print_help(_)
  usage = <<~HELP
    acroforge: PDF AcroForm engine + relabeler

    Usage:
    acroforge schema infer <pdf> [--out schema.yml] [--sections a,b,c] [-v]
    acroforge schema merge <mapping.yml> [--schema schema.yml] [--out schema.yml]
    acroforge relabel propose <pdf> [--out mapping.yml] [--schema schema.yml] [--merge|--overwrite] [-v]
    acroforge relabel apply <pdf> <mapping.yml> [--annotate[=PATH]] [-v]
    acroforge compile <pdf> [--schema schema.yml]
    acroforge bootstrap <pdf> [--schema-out s.yml] [--mapping-out m.yml] [-v]
    acroforge annotate <pdf> [--mapping mapping.yml] [--out annotated.pdf]
    acroforge prepare <pdf> [--out prepared.pdf] [--schema schema.yml]
    acroforge version
    acroforge help

    Pass -v or --verbose to bootstrap, schema infer, relabel propose, and
    relabel apply to see the engine's per-field reasoning on stdout.
  HELP
  puts usage
  EXIT_OK
end
67
+
68
# Yields to the caller's block while the global $stdout points at the null
# device, so anything the block prints through $stdout is discarded. With
# `verbose: true` the block runs with $stdout untouched. Keeps the engine's
# per-field chatter out of normal CLI runs.
#
# $stdout is restored (and the null device closed) even if the block raises.
#
# @param verbose [Boolean] when true, do not redirect output.
# @return [Object] whatever the block returns.
def silenced(verbose: false)
  return yield if verbose

  previous_stdout = $stdout
  File.open(File::NULL, "w") do |sink|
    $stdout = sink
    begin
      yield
    ensure
      # Restore before File.open closes the sink on block exit.
      $stdout = previous_stdout
    end
  end
end
82
+
83
# Prints a one-line summary of a mapping proposal.
#
# @param result [Hash] reads :total, :mapped and :out_path.
# @return [nil]
def summarize_propose(result)
  field_total = result[:total]
  proposed = result[:mapped]
  destination = result[:out_path]

  outcome =
    if field_total.zero?
      "no AcroForm fields found in the PDF."
    elsif proposed == field_total
      "#{proposed} of #{field_total} fields proposed."
    else
      "#{proposed} of #{field_total} fields proposed; #{field_total - proposed} need manual review."
    end

  puts "Wrote #{destination}: #{outcome}"
end
95
+
96
# Prints a one-line summary of an applied mapping. The renamed count is
# always shown; the other counters appear only when greater than zero.
#
# @param result [Hash] reads :renamed, :disambiguated, :skipped_null, :stale.
# @param pdf [String] path of the PDF the mapping was applied to.
# @return [nil]
def summarize_apply(result, pdf)
  optional_labels = {
    disambiguated: "disambiguated",
    skipped_null: "skipped (no key)",
    stale: "stale"
  }
  extras = optional_labels
    .select { |counter, _label| result[counter] > 0 }
    .map { |counter, label| "#{result[counter]} #{label}" }
  segments = ["#{result[:renamed]} renamed"] + extras
  puts "Applied to #{pdf}: #{segments.join(", ")}."
end
103
+
104
# Handles `acroforge schema <action>`, where action is "infer" or "merge".
#
# @param argv [Array<String>] arguments after the "schema" word; consumed in place.
# @return [Integer] EXIT_OK on success, EXIT_USER_ERROR for an unknown action.
# @raise [ArgumentError] when a required positional argument is missing.
# @raise [Errno::ENOENT] when the PDF or the mapping file does not exist.
def cmd_schema(argv)
  action = argv.shift
  case action
  when "infer" then schema_infer(argv)
  when "merge" then schema_merge(argv)
  else
    warn "acroforge: unknown schema action #{action.inspect}"
    EXIT_USER_ERROR
  end
end

# `schema infer <pdf>`: infers a schema from the PDF and writes it to
# `--out` (default "schema.yml"), then prints how many keys were inferred.
#
# @param argv [Array<String>] options plus the <pdf> positional argument.
# @return [Integer] EXIT_OK
def schema_infer(argv)
  out = "schema.yml"
  sections = []
  verbose = false
  OptionParser.new do |opts|
    opts.on("--out PATH") { |v| out = v }
    opts.on("--sections LIST") { |v| sections = v.split(",").map(&:strip) }
    opts.on("-v", "--verbose") { verbose = true }
  end.parse!(argv)
  pdf = argv.shift
  raise ArgumentError, "missing <pdf> argument" if pdf.nil?
  raise Errno::ENOENT, pdf unless File.exist?(pdf)

  schema = silenced(verbose: verbose) { AcroForge::Schema.infer(pdf, sections: sections) }
  AcroForge::Schema.dump(schema, out)
  count = schema.size
  puts "Wrote #{out}: #{count} canonical key#{"s" unless count == 1} inferred."
  EXIT_OK
end

# `schema merge <mapping.yml>`: merges the mapping's entries into the schema
# at `--schema` (default "schema.yml"; treated as empty when the file does not
# exist), writes the result to `--out` (default: the schema path) and reports
# how many keys were added or had their variations changed.
#
# @param argv [Array<String>] options plus the <mapping.yml> positional argument.
# @return [Integer] EXIT_OK
def schema_merge(argv)
  # Set and #to_set are only loaded automatically from Ruby 3.2 on.
  require "set"

  schema_path = "schema.yml"
  out = nil
  OptionParser.new do |opts|
    opts.on("--schema PATH") { |v| schema_path = v }
    opts.on("--out PATH") { |v| out = v }
  end.parse!(argv)
  mapping_path = argv.shift
  raise ArgumentError, "missing <mapping.yml> argument" if mapping_path.nil?
  raise Errno::ENOENT, mapping_path unless File.exist?(mapping_path)
  out ||= schema_path

  existing = File.exist?(schema_path) ? AcroForge::Schema.load(schema_path) : {}
  # Snapshot taken before merging, in case Schema.merge mutates `existing`.
  variations_before = existing.each_with_object({}) do |(key, entry), acc|
    acc[key] = schema_entry_variations(entry)
  end

  # NOTE(review): on Psych < 4 (Ruby <= 3.0) YAML.load_file can instantiate
  # arbitrary objects; consider YAML.safe_load_file if mapping files may come
  # from untrusted sources.
  mapping = YAML.load_file(mapping_path) || {}
  # Keys starting with "_" are not passed on to the merge.
  merged = AcroForge::Schema.merge(existing, mapping.reject { |k, _| k.to_s.start_with?("_") })
  AcroForge::Schema.dump(merged, out)

  added = merged.keys.count { |key| !variations_before.key?(key) }
  updated = merged.keys.count do |key|
    next false unless variations_before.key?(key)
    schema_entry_variations(merged[key]) != variations_before[key]
  end
  summarize_schema_merge(out, added, updated)
  EXIT_OK
end

# Variations of one schema entry as a Set, so comparisons ignore order.
# Entries that are not Hashes (or carry no :variations) yield an empty Set;
# the same rule is applied to pre- and post-merge entries.
#
# @param entry [Object] a schema value.
# @return [Set]
def schema_entry_variations(entry)
  (entry.is_a?(Hash) ? (entry[:variations] || []) : []).to_set
end
159
+
160
# Prints a one-line summary of a schema merge.
#
# @param out [String] path the merged schema was written to.
# @param added [Integer] number of keys that did not exist before the merge.
# @param updated [Integer] number of pre-existing keys that changed.
# @return [nil]
def summarize_schema_merge(out, added, updated)
  segments = []
  if added > 0
    noun = added == 1 ? "key" : "keys"
    segments.push("#{added} new #{noun} added")
  end
  if updated > 0
    noun = updated == 1 ? "key" : "keys"
    segments.push("#{updated} existing #{noun} updated")
  end
  summary = segments.any? ? segments.join(", ") : "no changes"
  puts "Merged into #{out}: #{summary}."
end
167
+
168
# Handles `acroforge relabel <action>`, where action is "propose" or "apply".
#
# @param argv [Array<String>] arguments after the "relabel" word; consumed in place.
# @return [Integer] EXIT_OK on success, EXIT_USER_ERROR for an unknown action.
# @raise [ArgumentError] when a required positional argument is missing.
# @raise [Errno::ENOENT] when the PDF, mapping or `--schema` file does not exist.
def cmd_relabel(argv)
  action = argv.shift
  case action
  when "propose" then relabel_propose(argv)
  when "apply" then relabel_apply(argv)
  else
    warn "acroforge: unknown relabel action #{action.inspect}"
    EXIT_USER_ERROR
  end
end

# `relabel propose <pdf>`: asks the relabeler for a mapping proposal, written
# to `--out` (default "mapping.yml"), and prints a summary.
#
# @param argv [Array<String>] options plus the <pdf> positional argument.
# @return [Integer] EXIT_OK
def relabel_propose(argv)
  out = "mapping.yml"
  schema_path = nil
  mode = :merge
  verbose = false
  OptionParser.new do |opts|
    opts.on("--out PATH") { |v| out = v }
    opts.on("--schema PATH") { |v| schema_path = v }
    opts.on("--merge") { mode = :merge }
    opts.on("--overwrite") { mode = :overwrite }
    opts.on("-v", "--verbose") { verbose = true }
  end.parse!(argv)
  pdf = argv.shift
  raise ArgumentError, "missing <pdf> argument" if pdf.nil?
  raise Errno::ENOENT, pdf unless File.exist?(pdf)
  # Fail fast with a user-level error rather than depending on how
  # Schema.load reports a missing file.
  raise Errno::ENOENT, schema_path if schema_path && !File.exist?(schema_path)

  schema = schema_path ? AcroForge::Schema.load(schema_path) : {}
  result = silenced(verbose: verbose) do
    AcroForge::Relabeler.propose(pdf, out: out, schema: schema, mode: mode)
  end
  summarize_propose(result)
  EXIT_OK
end

# `relabel apply <pdf> <mapping.yml>`: optionally writes an annotated review
# PDF, then applies the mapping to the PDF and prints a summary.
#
# @param argv [Array<String>] options plus the two positional arguments.
# @return [Integer] EXIT_OK
def relabel_apply(argv)
  verbose = false
  # `annotate_out` tracks three states:
  #   false       -> --annotate not passed; no annotation
  #   true        -> --annotate passed without value; use default path
  #   "some/path" -> --annotate=path passed explicitly
  annotate_out = false
  OptionParser.new do |opts|
    opts.on("-v", "--verbose") { verbose = true }
    opts.on("--annotate [PATH]", "Also write an annotated review PDF (default: <source>_annotated.pdf)") do |v|
      annotate_out = v || true
    end
  end.parse!(argv)
  pdf = argv.shift
  mapping = argv.shift
  raise ArgumentError, "missing arguments: expected <pdf> <mapping.yml>" if pdf.nil? || mapping.nil?
  raise Errno::ENOENT, pdf unless File.exist?(pdf)
  raise Errno::ENOENT, mapping unless File.exist?(mapping)

  # Annotation runs BEFORE the rename so the badges show
  # original_field_name -> proposed_key. After the rename, the mapping's
  # PDF field names no longer match the file, so post-rename annotation
  # would render every entry as "missing in mapping" -- useless.
  annotate_path = nil
  if annotate_out
    annotate_path = (annotate_out == true) ? default_annotated_path(pdf) : annotate_out
    silenced(verbose: verbose) do
      AcroForge::Annotator.annotate(pdf, out: annotate_path, mapping: mapping)
    end
  end

  result = silenced(verbose: verbose) { AcroForge::Relabeler.apply!(pdf, mapping) }
  summarize_apply(result, pdf)
  puts "Wrote #{annotate_path}: review snapshot of the mapping plan." if annotate_path
  EXIT_OK
end
233
+
234
# Handles `acroforge compile <pdf> [--schema schema.yml]`: runs the engine's
# compile pass against the PDF and prints the mapped / unmapped counts.
#
# @param argv [Array<String>] options plus the <pdf> positional argument; consumed in place.
# @return [Integer] EXIT_OK
# @raise [ArgumentError] when <pdf> is missing.
# @raise [Errno::ENOENT] when the PDF or the `--schema` file does not exist.
def cmd_compile(argv)
  schema_path = nil
  OptionParser.new do |opts|
    opts.on("--schema PATH") { |v| schema_path = v }
  end.parse!(argv)
  pdf = argv.shift
  raise ArgumentError, "missing <pdf> argument" if pdf.nil?
  raise Errno::ENOENT, pdf unless File.exist?(pdf)
  # Fail fast with a user-level error rather than depending on how
  # Schema.load reports a missing file.
  raise Errno::ENOENT, schema_path if schema_path && !File.exist?(schema_path)

  schema = schema_path ? AcroForge::Schema.load(schema_path) : {}
  # Loaded lazily: only this code path needs a scratch directory.
  require "tmpdir"
  Dir.mktmpdir do |tmp|
    # The scratch directory is handed to the engine as `normalized_dir` and
    # removed when the block exits.
    engine = AcroForge::Engine.new(pdf, schema: schema, normalized_dir: tmp)
    result = engine.compile!
    puts "Mapped: #{result[:mapped].size}, Unmapped: #{result[:unmapped].size}"
  end
  EXIT_OK
end
252
+
253
# Handles `acroforge prepare <pdf> [--out prepared.pdf] [--schema schema.yml]`.
# The preparer always runs with its stdout output suppressed; the summary is
# printed afterwards.
#
# @param argv [Array<String>] options plus the <pdf> positional argument; consumed in place.
# @return [Integer] EXIT_OK
# @raise [ArgumentError] when <pdf> is missing.
# @raise [Errno::ENOENT] when the PDF or the `--schema` file does not exist.
def cmd_prepare(argv)
  options = { out: nil, schema: nil }
  parser = OptionParser.new
  parser.on("--out PATH") { |value| options[:out] = value }
  parser.on("--schema PATH") { |value| options[:schema] = value }
  parser.parse!(argv)

  pdf = argv.shift
  raise ArgumentError, "missing <pdf> argument" if pdf.nil?

  # The PDF is checked first, then the schema when one was given.
  [pdf, options[:schema]].compact.each do |path|
    raise Errno::ENOENT, path unless File.exist?(path)
  end

  schema = options[:schema] ? AcroForge::Schema.load(options[:schema]) : {}
  result = silenced(verbose: false) do
    AcroForge::Preparer.prepare!(pdf, out: options[:out], schema: schema)
  end
  summarize_prepare(result, pdf, options[:out])
  EXIT_OK
end
272
+
273
# Prints a one-line summary of a prepare run.
#
# @param result [Hash] reads :out_path, :duplicate_groups, :renamed, :skipped.
# @param in_path [String] path of the input PDF.
# @param explicit_out [String, nil] accepted for the caller's convenience; not used here.
# @return [nil]
def summarize_prepare(result, in_path, explicit_out)
  group_count = result[:duplicate_groups]
  return puts("Nothing to do: #{in_path} has no duplicate field names.") if group_count.zero?

  destination = result[:out_path]
  placement = destination == in_path ? "in place" : "to #{destination}"
  details = ["#{result[:renamed]} duplicates renamed"]
  details << "#{result[:skipped]} skipped (no heuristic proposal)" if result[:skipped] > 0
  puts "Prepared #{placement}: #{group_count} duplicate groups, #{details.join(", ")}."
end
284
+
285
# Handles `acroforge annotate <pdf> [--mapping mapping.yml] [--out annotated.pdf]`.
# When `--out` is omitted the output path comes from default_annotated_path.
#
# @param argv [Array<String>] options plus the <pdf> positional argument; consumed in place.
# @return [Integer] EXIT_OK
# @raise [ArgumentError] when <pdf> is missing.
# @raise [Errno::ENOENT] when the PDF or the `--mapping` file does not exist.
def cmd_annotate(argv)
  options = { out: nil, mapping: nil }
  parser = OptionParser.new
  parser.on("--out PATH") { |value| options[:out] = value }
  parser.on("--mapping PATH") { |value| options[:mapping] = value }
  parser.parse!(argv)

  pdf = argv.shift
  raise ArgumentError, "missing <pdf> argument" if pdf.nil?

  # The PDF is checked first, then the mapping when one was given.
  [pdf, options[:mapping]].compact.each do |path|
    raise Errno::ENOENT, path unless File.exist?(path)
  end

  destination = options[:out] || default_annotated_path(pdf)
  result = AcroForge::Annotator.annotate(pdf, out: destination, mapping: options[:mapping])
  summarize_annotate(result, options[:mapping])
  EXIT_OK
end
302
+
303
# Builds the default output path for an annotated copy: same directory as the
# input, with the file's last extension replaced by "_annotated.pdf".
#
# @param pdf [String] path of the source PDF.
# @return [String]
def default_annotated_path(pdf)
  directory = File.dirname(pdf)
  stem = File.basename(pdf, ".*")
  File.join(directory, stem + "_annotated.pdf")
end
307
+
308
# Prints a one-line summary of an annotate run. The per-category breakdown is
# shown only when a mapping file was supplied.
#
# @param result [Hash] reads :out_path and :annotated, plus :mapped,
#   :unmapped and :missing when a mapping was given.
# @param mapping_path [String, nil] the mapping file used, if any.
# @return [nil]
def summarize_annotate(result, mapping_path)
  headline = "Wrote #{result[:out_path]}: #{result[:annotated]} fields annotated"
  return puts("#{headline}.") unless mapping_path

  breakdown = ["#{result[:mapped]} mapped"]
  breakdown << "#{result[:unmapped]} no key" if result[:unmapped] > 0
  breakdown << "#{result[:missing]} not in mapping" if result[:missing] > 0
  puts "#{headline} (#{breakdown.join(", ")})."
end
318
+
319
# Handles `acroforge bootstrap <pdf> [--schema-out s.yml] [--mapping-out m.yml] [-v]`:
# compiles the PDF once, then writes an inferred schema and a mapping proposal.
#
# @param argv [Array<String>] options plus the <pdf> positional argument; consumed in place.
# @return [Integer] EXIT_OK
# @raise [ArgumentError] when <pdf> is missing.
# @raise [Errno::ENOENT] when the PDF does not exist.
def cmd_bootstrap(argv)
  paths = { schema: "schema.yml", mapping: "mapping.yml" }
  verbose = false
  parser = OptionParser.new
  parser.on("--schema-out PATH") { |value| paths[:schema] = value }
  parser.on("--mapping-out PATH") { |value| paths[:mapping] = value }
  parser.on("-v", "--verbose") { verbose = true }
  parser.parse!(argv)

  pdf = argv.shift
  raise ArgumentError, "missing <pdf> argument" if pdf.nil?
  raise Errno::ENOENT, pdf unless File.exist?(pdf)

  # One compile pass only: the same engine is handed to Schema.infer and
  # Relabeler.propose through their `engine:` keyword so neither runs its own
  # pass (which would print the verbose chatter twice).
  require "tmpdir"
  Dir.mktmpdir do |scratch_dir|
    engine = AcroForge::Engine.new(pdf, normalized_dir: scratch_dir)
    silenced(verbose: verbose) { engine.compile! }

    schema = AcroForge::Schema.infer(pdf, engine: engine)
    AcroForge::Schema.dump(schema, paths[:schema])
    key_count = schema.size
    plural = key_count == 1 ? "" : "s"
    puts "Wrote #{paths[:schema]}: #{key_count} canonical key#{plural} inferred."

    proposal = AcroForge::Relabeler.propose(
      pdf, out: paths[:mapping], schema: schema, mode: :overwrite, engine: engine
    )
    summarize_propose(proposal)
  end
  EXIT_OK
end
350
+ end
351
+ end
@@ -0,0 +1,46 @@
1
+ # frozen_string_literal: true
2
+
3
module AcroForge
  # Lookup tables used when normalizing text.
  module Constants
    # Mis-split or mis-joined phrases mapped to their corrected form.
    #
    # Keys that are a prefix of another key are listed AFTER the longer key
    # ("modeofr" before "modeof"), so applying the table in insertion order
    # with substring replacement still reaches the longer key.
    # NOTE(review): how this table is applied is not visible in this file;
    # the ordering is harmless for plain key lookups -- confirm against the caller.
    TYPO_PHRASE_REPLACEMENTS = {
      "identi_fi_cation" => "identification",
      "identi_cation" => "identification",
      "ide_ntity" => "identity",
      "contribu_on" => "contribution",
      "con_rmed" => "confirmed",
      "na_onal" => "national",
      "ocial" => "official",
      "modeofr" => "mode_of_r",
      "modeof" => "mode_of",
      "nameof" => "name_of"
    }.freeze

    # PDF text extraction returns Unicode quirks: ligatures (a single "fi"
    # glyph instead of f+i), fullwidth letters, curly quotes, etc.
    # AllTextProcessor normalizes via Unicode NFKC first, which handles most
    # of the "compatibility" subset automatically (ligatures, fullwidth,
    # superscript digits, ...). NFKC does NOT touch these characters - they
    # are separate codepoints, not compatibility decompositions - so we
    # substitute them manually.
    UNICODE_REPLACEMENTS = {
      "\u{2018}" => "'",   # left single quote
      "\u{2019}" => "'",   # right single quote
      "\u{201A}" => "'",   # single low-9 quote
      "\u{201C}" => '"',   # left double quote
      "\u{201D}" => '"',   # right double quote
      "\u{201E}" => '"',   # double low-9 quote
      "\u{2013}" => "-",   # en dash
      "\u{2014}" => "-",   # em dash
      "\u{2010}" => "-",   # hyphen
      "\u{2011}" => "-",   # non-breaking hyphen
      "\u{2212}" => "-",   # minus sign
      "\u{00AD}" => "",    # soft hyphen (often invisible artifact)
      "\u{200B}" => "",    # zero-width space
      "\u{200C}" => "",    # zero-width non-joiner
      "\u{200D}" => "",    # zero-width joiner
      "\u{FEFF}" => "",    # zero-width no-break space (BOM)
      "\u{2026}" => "...", # ellipsis
      "\u{2022}" => "*",   # bullet
      "\u{00B7}" => "*"    # middle dot used as bullet
    }.freeze
  end
end