acroforge 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +11 -0
- data/LICENSE.txt +21 -0
- data/README.md +217 -0
- data/Rakefile +10 -0
- data/acroforge.gemspec +37 -0
- data/exe/acroforge +5 -0
- data/lib/acroforge/all_text_processor.rb +126 -0
- data/lib/acroforge/annotator.rb +137 -0
- data/lib/acroforge/cli.rb +351 -0
- data/lib/acroforge/constants.rb +46 -0
- data/lib/acroforge/engine.rb +869 -0
- data/lib/acroforge/labels.rb +112 -0
- data/lib/acroforge/preparer.rb +103 -0
- data/lib/acroforge/relabeler.rb +179 -0
- data/lib/acroforge/schema.rb +208 -0
- data/lib/acroforge/validator.rb +37 -0
- data/lib/acroforge/version.rb +5 -0
- data/lib/acroforge.rb +18 -0
- data/sig/acroforge.rbs +4 -0
- metadata +81 -0
|
@@ -0,0 +1,351 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "optparse"
|
|
4
|
+
require "yaml"
|
|
5
|
+
require_relative "../acroforge"
|
|
6
|
+
|
|
7
|
+
module AcroForge
|
|
8
|
+
module CLI
|
|
9
|
+
# Process exit codes returned by the CLI entry point and its subcommands.
EXIT_OK = 0               # command completed
EXIT_USER_ERROR = 1       # bad invocation (unknown subcommand/action, missing file or argument)
EXIT_VALIDATION_ERROR = 2 # AcroForge::ValidationError or AcroForge::RelabelError was raised
EXIT_INTERNAL_ERROR = 3   # any other StandardError

# Names accepted as the first command-line argument.
SUBCOMMANDS = %w[
  schema relabel compile bootstrap
  annotate prepare version help
].freeze
|
|
15
|
+
|
|
16
|
+
module_function
|
|
17
|
+
|
|
18
|
+
# Entry point of the command-line interface. Takes the first element of
# `argv` as the subcommand, dispatches the remaining arguments to the
# matching `cmd_<subcommand>` method, and turns failures into exit codes.
#
# @param argv [Array<String>] raw command-line arguments (a copy is consumed,
#   the caller's array is left untouched)
# @return [Integer] EXIT_OK, EXIT_USER_ERROR, EXIT_VALIDATION_ERROR or
#   EXIT_INTERNAL_ERROR
def run(argv)
  argv = argv.dup
  sub = argv.shift
  return print_help(argv) if sub.nil? || sub == "help"
  return print_version if sub == "version"

  unless SUBCOMMANDS.include?(sub)
    warn "acroforge: unknown subcommand #{sub.inspect}. Try `acroforge help`."
    return EXIT_USER_ERROR
  end

  # The dynamic method name is safe: `sub` was just checked against SUBCOMMANDS.
  send("cmd_#{sub}", argv)
rescue AcroForge::ValidationError, AcroForge::RelabelError => e
  warn "acroforge: #{e.message}"
  EXIT_VALIDATION_ERROR
rescue Errno::ENOENT, ArgumentError, OptionParser::ParseError => e
  # OptionParser::ParseError (unknown flag, missing flag value, ...) derives
  # from RuntimeError, not ArgumentError, so it has to be listed explicitly;
  # otherwise a mistyped option would be reported as an internal error.
  warn "acroforge: #{e.message}"
  EXIT_USER_ERROR
rescue => e
  warn "acroforge: internal error (#{e.class}): #{e.message}"
  EXIT_INTERNAL_ERROR
end
|
|
40
|
+
|
|
41
|
+
# Writes the gem version (AcroForge::VERSION) to stdout and returns EXIT_OK.
def print_version
  $stdout.puts(AcroForge::VERSION)
  EXIT_OK
end
|
|
45
|
+
|
|
46
|
+
# Prints the usage summary covering every subcommand to stdout.
# The single argument is ignored. Returns EXIT_OK.
def print_help(_)
  usage = <<~HELP
    acroforge: PDF AcroForm engine + relabeler

    Usage:
    acroforge schema infer <pdf> [--out schema.yml] [--sections a,b,c] [-v]
    acroforge schema merge <mapping.yml> [--schema schema.yml] [--out schema.yml]
    acroforge relabel propose <pdf> [--out mapping.yml] [--schema schema.yml] [--merge|--overwrite] [-v]
    acroforge relabel apply <pdf> <mapping.yml> [--annotate[=PATH]] [-v]
    acroforge compile <pdf> [--schema schema.yml]
    acroforge bootstrap <pdf> [--schema-out s.yml] [--mapping-out m.yml] [-v]
    acroforge annotate <pdf> [--mapping mapping.yml] [--out annotated.pdf]
    acroforge prepare <pdf> [--out prepared.pdf] [--schema schema.yml]
    acroforge version
    acroforge help

    Pass -v or --verbose to bootstrap, schema infer, relabel propose, and
    relabel apply to see the engine's per-field reasoning on stdout.
  HELP
  puts usage
  EXIT_OK
end
|
|
67
|
+
|
|
68
|
+
# Executes the given block while $stdout points at the null device, so
# whatever the block prints is discarded. With `verbose: true` the block runs
# with $stdout untouched. Returns the block's value either way.
# Used to hide the engine's per-field chatter during normal CLI runs.
def silenced(verbose: false)
  return yield if verbose

  previous_stdout = $stdout
  File.open(File::NULL, "w") do |sink|
    $stdout = sink
    begin
      yield
    ensure
      # Put the real stream back before the sink gets closed.
      $stdout = previous_stdout
    end
  end
end
|
|
82
|
+
|
|
83
|
+
# Prints a one-line report for a mapping proposal.
# Reads :total, :mapped and :out_path from `result`; the wording depends on
# whether the PDF had no fields, every field got a proposal, or some remain.
def summarize_propose(result)
  field_count = result[:total]
  proposed = result[:mapped]
  destination = result[:out_path]

  detail =
    if field_count.zero?
      "no AcroForm fields found in the PDF."
    elsif proposed == field_count
      "#{proposed} of #{field_count} fields proposed."
    else
      "#{proposed} of #{field_count} fields proposed; #{field_count - proposed} need manual review."
    end
  puts "Wrote #{destination}: #{detail}"
end
|
|
95
|
+
|
|
96
|
+
# Prints a one-line report after a mapping has been applied to `pdf`.
# The :renamed count is always shown; :disambiguated, :skipped_null and
# :stale are listed only when greater than zero.
def summarize_apply(result, pdf)
  segments = ["#{result[:renamed]} renamed"]
  {
    disambiguated: "disambiguated",
    skipped_null: "skipped (no key)",
    stale: "stale"
  }.each do |key, label|
    tally = result[key]
    segments << "#{tally} #{label}" if tally > 0
  end
  puts "Applied to #{pdf}: #{segments.join(", ")}."
end
|
|
103
|
+
|
|
104
|
+
# Handles `acroforge schema <action>`.
#
#   infer <pdf>          infers a schema from the PDF and writes it to --out
#                        (default "schema.yml")
#   merge <mapping.yml>  merges the mapping file into the schema at --schema
#                        (default "schema.yml") and writes the result to --out
#                        (default: the --schema path)
#
# @param argv [Array<String>] arguments following `schema`; consumed in place
# @return [Integer] EXIT_OK on success, EXIT_USER_ERROR for an unknown action
# @raise [ArgumentError] when the positional argument is missing, or the
#   mapping file does not contain a YAML mapping
# @raise [Errno::ENOENT] when the PDF or the mapping file does not exist
def cmd_schema(argv)
  action = argv.shift
  case action
  when "infer"
    out = "schema.yml"
    sections = []
    verbose = false
    OptionParser.new do |opts|
      opts.on("--out PATH") { |v| out = v }
      opts.on("--sections LIST") { |v| sections = v.split(",").map(&:strip) }
      opts.on("-v", "--verbose") { verbose = true }
    end.parse!(argv)
    pdf = argv.shift
    raise ArgumentError, "missing <pdf> argument" if pdf.nil?
    raise Errno::ENOENT, pdf unless File.exist?(pdf)

    schema = silenced(verbose: verbose) { AcroForge::Schema.infer(pdf, sections: sections) }
    AcroForge::Schema.dump(schema, out)
    count = schema.size
    puts "Wrote #{out}: #{count} canonical key#{"s" unless count == 1} inferred."
    EXIT_OK
  when "merge"
    schema_path = "schema.yml"
    out = nil
    OptionParser.new do |opts|
      opts.on("--schema PATH") { |v| schema_path = v }
      opts.on("--out PATH") { |v| out = v }
    end.parse!(argv)
    mapping_path = argv.shift
    raise ArgumentError, "missing <mapping.yml> argument" if mapping_path.nil?
    raise Errno::ENOENT, mapping_path unless File.exist?(mapping_path)
    out ||= schema_path

    # A missing schema file simply means "start from an empty schema".
    existing = File.exist?(schema_path) ? AcroForge::Schema.load(schema_path) : {}
    variations_before = existing.each_with_object({}) do |(key, entry), acc|
      acc[key] = schema_variations(entry)
    end

    mapping = YAML.load_file(mapping_path) || {}
    unless mapping.is_a?(Hash)
      raise ArgumentError, "#{mapping_path} must contain a YAML mapping (got #{mapping.class})"
    end

    # Keys starting with "_" are left out of the merge.
    merged = AcroForge::Schema.merge(existing, mapping.reject { |k, _| k.to_s.start_with?("_") })
    AcroForge::Schema.dump(merged, out)

    added = (merged.keys - existing.keys).size
    # An existing key counts as updated when its set of variations changed.
    updated = merged.count do |key, entry|
      variations_before.key?(key) && schema_variations(entry) != variations_before[key]
    end
    summarize_schema_merge(out, added, updated)
    EXIT_OK
  else
    warn "acroforge: unknown schema action #{action.inspect}"
    EXIT_USER_ERROR
  end
end

# Returns the `:variations` of a schema entry as a Set. Entries that are not
# Hashes, or that carry no `:variations`, yield an empty Set, so old and
# merged schemas are compared under the same rules.
def schema_variations(entry)
  # Set is only autoloaded from Ruby 3.2 on; load it explicitly for older Rubies.
  require "set"
  (entry.is_a?(Hash) ? (entry[:variations] || []) : []).to_set
end
|
|
159
|
+
|
|
160
|
+
# Prints a one-line report for `schema merge`: how many keys were added and
# how many existing keys were updated, or "no changes" when both are zero.
def summarize_schema_merge(out, added, updated)
  plural = ->(n) { n == 1 ? "" : "s" }

  changes = []
  changes.push("#{added} new key#{plural.call(added)} added") if added.positive?
  changes.push("#{updated} existing key#{plural.call(updated)} updated") if updated.positive?

  summary = changes.empty? ? "no changes" : changes.join(", ")
  puts "Merged into #{out}: #{summary}."
end
|
|
167
|
+
|
|
168
|
+
# Handles `acroforge relabel <action>`.
#
#   propose <pdf>              asks AcroForge::Relabeler for a mapping proposal
#                              written to --out (default "mapping.yml")
#   apply <pdf> <mapping.yml>  applies the mapping to the PDF, optionally
#                              writing an annotated review PDF first
#
# @param argv [Array<String>] arguments following `relabel`; consumed in place
# @return [Integer] EXIT_OK on success, EXIT_USER_ERROR for an unknown action
# @raise [ArgumentError] when a required positional argument is missing
# @raise [Errno::ENOENT] when the PDF, the mapping file, or an explicitly
#   given --schema file does not exist
def cmd_relabel(argv)
  action = argv.shift
  case action
  when "propose"
    out = "mapping.yml"
    schema_path = nil
    mode = :merge
    verbose = false
    OptionParser.new do |opts|
      opts.on("--out PATH") { |v| out = v }
      opts.on("--schema PATH") { |v| schema_path = v }
      opts.on("--merge") { mode = :merge }
      opts.on("--overwrite") { mode = :overwrite }
      opts.on("-v", "--verbose") { verbose = true }
    end.parse!(argv)
    pdf = argv.shift
    raise ArgumentError, "missing <pdf> argument" if pdf.nil?
    raise Errno::ENOENT, pdf unless File.exist?(pdf)
    # Same up-front check `prepare` performs for its --schema option.
    raise Errno::ENOENT, schema_path if schema_path && !File.exist?(schema_path)

    schema = schema_path ? AcroForge::Schema.load(schema_path) : {}
    result = silenced(verbose: verbose) do
      AcroForge::Relabeler.propose(pdf, out: out, schema: schema, mode: mode)
    end
    summarize_propose(result)
    EXIT_OK
  when "apply"
    verbose = false
    # `annotate_out` tracks three states:
    #   false       -> --annotate not passed; no annotation
    #   true        -> --annotate passed without value; use default path
    #   "some/path" -> --annotate=path passed explicitly
    annotate_out = false
    OptionParser.new do |opts|
      opts.on("-v", "--verbose") { verbose = true }
      # Declared as "--annotate[=PATH]" on purpose: the "--annotate [PATH]"
      # spelling makes optparse take the NEXT word as the value, so
      # `apply --annotate form.pdf mapping.yml` would swallow the PDF path.
      opts.on("--annotate[=PATH]", "Also write an annotated review PDF (default: <source>_annotated.pdf)") do |v|
        # A bare `--annotate` (or an empty `--annotate=`) selects the default path.
        annotate_out = (v.nil? || v.empty?) ? true : v
      end
    end.parse!(argv)
    pdf = argv.shift
    mapping = argv.shift
    raise ArgumentError, "missing arguments: expected <pdf> <mapping.yml>" if pdf.nil? || mapping.nil?
    raise Errno::ENOENT, pdf unless File.exist?(pdf)
    raise Errno::ENOENT, mapping unless File.exist?(mapping)

    # Annotation runs BEFORE the rename so the badges show
    # original_field_name -> proposed_key. After the rename, the mapping's
    # PDF field names no longer match the file, so post-rename annotation
    # would render every entry as "missing in mapping" -- useless.
    annotate_path = nil
    if annotate_out
      annotate_path = (annotate_out == true) ? default_annotated_path(pdf) : annotate_out
      silenced(verbose: verbose) do
        AcroForge::Annotator.annotate(pdf, out: annotate_path, mapping: mapping)
      end
    end

    result = silenced(verbose: verbose) { AcroForge::Relabeler.apply!(pdf, mapping) }
    summarize_apply(result, pdf)
    puts "Wrote #{annotate_path}: review snapshot of the mapping plan." if annotate_path
    EXIT_OK
  else
    warn "acroforge: unknown relabel action #{action.inspect}"
    EXIT_USER_ERROR
  end
end
|
|
233
|
+
|
|
234
|
+
# Handles `acroforge compile <pdf> [--schema PATH]`: runs the engine's
# compile pass inside a temporary directory and prints how many entries ended
# up in the result's :mapped and :unmapped collections.
#
# @param argv [Array<String>] arguments following `compile`; consumed in place
# @return [Integer] EXIT_OK
# @raise [ArgumentError] when the <pdf> argument is missing
# @raise [Errno::ENOENT] when the PDF or an explicitly given schema file does
#   not exist
def cmd_compile(argv)
  schema_path = nil
  OptionParser.new do |opts|
    opts.on("--schema PATH") { |v| schema_path = v }
  end.parse!(argv)
  pdf = argv.shift
  raise ArgumentError, "missing <pdf> argument" if pdf.nil?
  raise Errno::ENOENT, pdf unless File.exist?(pdf)
  # Same up-front check `prepare` performs for its --schema option.
  raise Errno::ENOENT, schema_path if schema_path && !File.exist?(schema_path)

  schema = schema_path ? AcroForge::Schema.load(schema_path) : {}
  require "tmpdir"
  # The temporary directory is handed to the engine as `normalized_dir` and
  # removed again when the block exits.
  Dir.mktmpdir do |tmp|
    engine = AcroForge::Engine.new(pdf, schema: schema, normalized_dir: tmp)
    result = engine.compile!
    puts "Mapped: #{result[:mapped].size}, Unmapped: #{result[:unmapped].size}"
  end
  EXIT_OK
end
|
|
252
|
+
|
|
253
|
+
# Handles `acroforge prepare <pdf> [--out PATH] [--schema PATH]`.
# Checks that the PDF (and the schema file, when one is given) exist, calls
# AcroForge::Preparer.prepare! with stdout silenced, prints a summary of the
# result and returns EXIT_OK.
def cmd_prepare(argv)
  destination = nil
  schema_file = nil

  parser = OptionParser.new
  parser.on("--out PATH") { |path| destination = path }
  parser.on("--schema PATH") { |path| schema_file = path }
  parser.parse!(argv)

  source_pdf = argv.shift
  raise ArgumentError, "missing <pdf> argument" if source_pdf.nil?
  raise Errno::ENOENT, source_pdf unless File.exist?(source_pdf)
  raise Errno::ENOENT, schema_file unless schema_file.nil? || File.exist?(schema_file)

  schema = schema_file.nil? ? {} : AcroForge::Schema.load(schema_file)
  outcome = silenced(verbose: false) { AcroForge::Preparer.prepare!(source_pdf, out: destination, schema: schema) }
  summarize_prepare(outcome, source_pdf, destination)
  EXIT_OK
end
|
|
272
|
+
|
|
273
|
+
# Prints a one-line report for `prepare`. When `result[:duplicate_groups]`
# is zero it says there was nothing to do; otherwise it reports the number of
# duplicate groups, renamed fields and (if any) skipped fields, and whether
# the output went back to `in_path` or to a different file.
# `explicit_out` is accepted but not used.
def summarize_prepare(result, in_path, explicit_out)
  written_to = result[:out_path]
  location =
    if written_to == in_path
      "in place"
    else
      "to #{written_to}"
    end

  group_count = result[:duplicate_groups]
  return puts("Nothing to do: #{in_path} has no duplicate field names.") if group_count.zero?

  details = ["#{result[:renamed]} duplicates renamed"]
  skipped = result[:skipped]
  details << "#{skipped} skipped (no heuristic proposal)" if skipped > 0
  puts "Prepared #{location}: #{group_count} duplicate groups, #{details.join(", ")}."
end
|
|
284
|
+
|
|
285
|
+
# Handles `acroforge annotate <pdf> [--mapping PATH] [--out PATH]`.
# Checks that the PDF (and the mapping file, when one is given) exist, calls
# AcroForge::Annotator.annotate, prints a summary and returns EXIT_OK.
# Without --out the annotated file goes to default_annotated_path(pdf).
def cmd_annotate(argv)
  destination = nil
  mapping_file = nil

  parser = OptionParser.new
  parser.on("--out PATH") { |path| destination = path }
  parser.on("--mapping PATH") { |path| mapping_file = path }
  parser.parse!(argv)

  source_pdf = argv.shift
  raise ArgumentError, "missing <pdf> argument" if source_pdf.nil?
  raise Errno::ENOENT, source_pdf unless File.exist?(source_pdf)
  raise Errno::ENOENT, mapping_file unless mapping_file.nil? || File.exist?(mapping_file)

  destination = default_annotated_path(source_pdf) unless destination
  outcome = AcroForge::Annotator.annotate(source_pdf, out: destination, mapping: mapping_file)
  summarize_annotate(outcome, mapping_file)
  EXIT_OK
end
|
|
302
|
+
|
|
303
|
+
# Builds the default output path for an annotated copy of `pdf`: same
# directory, file name with its last extension replaced by "_annotated.pdf".
def default_annotated_path(pdf)
  directory, file_name = File.split(pdf)
  stem = File.basename(file_name, ".*")
  File.join(directory, "#{stem}_annotated.pdf")
end
|
|
307
|
+
|
|
308
|
+
# Prints a one-line report for an annotation run. When a mapping file was
# used, the line also breaks the total down into :mapped, :unmapped ("no key")
# and :missing ("not in mapping") counts; zero counts of the latter two are
# omitted.
def summarize_annotate(result, mapping_path)
  unless mapping_path
    puts "Wrote #{result[:out_path]}: #{result[:annotated]} fields annotated."
    return
  end

  breakdown = ["#{result[:mapped]} mapped"]
  { unmapped: "no key", missing: "not in mapping" }.each do |key, label|
    tally = result[key]
    breakdown << "#{tally} #{label}" if tally > 0
  end
  puts "Wrote #{result[:out_path]}: #{result[:annotated]} fields annotated (#{breakdown.join(", ")})."
end
|
|
318
|
+
|
|
319
|
+
# Handles `acroforge bootstrap <pdf> [--schema-out PATH] [--mapping-out PATH] [-v]`.
# Infers a schema from the PDF, writes it, then writes a mapping proposal
# based on that schema (always in :overwrite mode). Returns EXIT_OK.
def cmd_bootstrap(argv)
  schema_target = "schema.yml"
  mapping_target = "mapping.yml"
  chatty = false

  parser = OptionParser.new
  parser.on("--schema-out PATH") { |path| schema_target = path }
  parser.on("--mapping-out PATH") { |path| mapping_target = path }
  parser.on("-v", "--verbose") { chatty = true }
  parser.parse!(argv)

  source_pdf = argv.shift
  raise ArgumentError, "missing <pdf> argument" if source_pdf.nil?
  raise Errno::ENOENT, source_pdf unless File.exist?(source_pdf)

  # A single engine is compiled once and handed to both Schema.infer and
  # Relabeler.propose through their `engine:` keyword, so neither runs its
  # own compile pass (which would print the verbose chatter twice).
  require "tmpdir"
  Dir.mktmpdir do |scratch|
    engine = AcroForge::Engine.new(source_pdf, normalized_dir: scratch)
    silenced(verbose: chatty) { engine.compile! }

    inferred = AcroForge::Schema.infer(source_pdf, engine: engine)
    AcroForge::Schema.dump(inferred, schema_target)
    key_count = inferred.size
    puts "Wrote #{schema_target}: #{key_count} canonical key#{"s" unless key_count == 1} inferred."

    proposal = AcroForge::Relabeler.propose(
      source_pdf, out: mapping_target, schema: inferred, mode: :overwrite, engine: engine
    )
    summarize_propose(proposal)
  end
  EXIT_OK
end
|
|
350
|
+
end
|
|
351
|
+
end
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module AcroForge
  # Lookup tables used when cleaning up text extracted from PDFs.
  module Constants
    # Known broken spellings found in extracted text mapped to their intended
    # form (dropped ligatures such as "fi"/"ti", and run-together words).
    #
    # Order matters for overlapping keys: "modeofr" is listed before its
    # prefix "modeof" so that a consumer applying the entries in insertion
    # order (or as a regexp alternation) reaches the longer key first.
    # NOTE(review): the consumer is not part of this file -- confirm how the
    # table is applied. Also note "ocial" is a substring of "social"; verify
    # the consumer matches whole tokens rather than arbitrary substrings.
    TYPO_PHRASE_REPLACEMENTS = {
      "identi_fi_cation" => "identification",
      "identi_cation" => "identification",
      "ide_ntity" => "identity",
      "contribu_on" => "contribution",
      "con_rmed" => "confirmed",
      "na_onal" => "national",
      "ocial" => "official",
      "modeofr" => "mode_of_r",
      "modeof" => "mode_of",
      "nameof" => "name_of"
    }.freeze

    # PDF text extraction returns Unicode quirks: ligatures (fi instead of f+i),
    # fullwidth letters, curly quotes, etc. AllTextProcessor normalizes via
    # Unicode NFKC first, which handles most of the "compatibility" subset
    # automatically (ligatures, fullwidth, superscript digits, ...). NFKC
    # leaves most of the characters below untouched -- they are separate
    # codepoints without compatibility decompositions -- so we substitute them
    # manually. Two entries do have NFKC mappings (U+2026 -> "..." and
    # U+2011 -> U+2010); they are kept so the table gives the same result for
    # text that has not been through NFKC.
    UNICODE_REPLACEMENTS = {
      "\u{2018}" => "'", # left single quote
      "\u{2019}" => "'", # right single quote
      "\u{201A}" => "'", # single low-9 quote
      "\u{201C}" => '"', # left double quote
      "\u{201D}" => '"', # right double quote
      "\u{201E}" => '"', # double low-9 quote
      "\u{2013}" => "-", # en dash
      "\u{2014}" => "-", # em dash
      "\u{2010}" => "-", # hyphen
      "\u{2011}" => "-", # non-breaking hyphen
      "\u{2212}" => "-", # minus sign
      "\u{00AD}" => "", # soft hyphen (often invisible artifact)
      "\u{200B}" => "", # zero-width space
      "\u{200C}" => "", # zero-width non-joiner
      "\u{200D}" => "", # zero-width joiner
      "\u{FEFF}" => "", # zero-width no-break space (BOM)
      "\u{2026}" => "...", # ellipsis
      "\u{2022}" => "*", # bullet
      "\u{00B7}" => "*" # middle dot used as bullet
    }.freeze
  end
end
|