deghost 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,336 @@
1
+ /**
2
+ * Unicode character categories that deghost can target.
3
+ *
4
+ * Each maps to a Unicode general category or a curated set of codepoints:
5
+ * - `format` — \p{Cf}: zero-width joiners, directional marks, soft hyphens
6
+ * - `control` — \p{Cc}: C0/C1 control characters (U+0000–U+001F, U+007F DELETE, U+0080–U+009F). NOTE(review): plain \p{Cc} also covers TAB/LF/CR — confirm whether the pattern exempts them
7
+ * - `spaces` — \p{Zs}: space separators (NBSP, en/em space, thin space, ideographic space)
8
+ * - `tag` — U+E0001–U+E007F: Unicode tag characters (U+E0001 LANGUAGE TAG is deprecated; U+E0020–U+E007F appear in emoji flag tag sequences)
9
+ * - `bom` — U+FEFF: byte order mark / zero-width no-break space
10
+ * - `fillers` — Script-specific fillers (Hangul, Khmer, Mongolian, Ogham)
11
+ * - `math` — Invisible math operators (U+2061–U+2064)
12
+ */
13
+ type Category = 'format' | 'control' | 'spaces' | 'tag' | 'bom' | 'fillers' | 'math';
14
+ /** Action a rule applies to matched characters: `strip` removes them, `normalize` substitutes an optional fixed replacement, `replace` maps each one through a callback. */
15
+ type Action = 'strip' | 'normalize' | 'replace';
16
+ /** A single detected invisible character with metadata. */
17
+ interface Detection {
18
+ /** The invisible character itself. */
19
+ char: string;
20
+ /** Unicode codepoint as hex string (e.g., "U+200B"). */
21
+ codepoint: string;
22
+ /** Unicode character name (e.g., "ZERO WIDTH SPACE"). */
23
+ name: string;
24
+ /** Which deghost category this character belongs to. */
25
+ category: Category;
26
+ /** Zero-based offset in the input string. NOTE(review): presumably a UTF-16 code-unit index (differs from a codepoint index after astral characters such as `tag`) — confirm. */
27
+ offset: number;
28
+ }
29
+ /** Configuration for a single rule in a cleaner pipeline: a union discriminated on `action`, where `normalize` may carry a `replacement` string and `replace` requires a `mapper` callback. */
30
+ type Rule = {
31
+ category: Category;
32
+ action: 'strip';
33
+ } | {
34
+ category: Category;
35
+ action: 'normalize';
36
+ replacement?: string;
37
+ } | {
38
+ category: Category;
39
+ action: 'replace';
40
+ mapper: (detection: Detection) => string;
41
+ };
42
+ /** A reusable cleaning function built from a set of rules. */
43
+ type CleanerFn = (input: string) => string;
44
+ /** Options for highlight(). */
45
+ interface HighlightOptions {
46
+ /** Only highlight characters in these categories. Defaults to all. */
47
+ categories?: Category[];
48
+ /** Custom formatter for each detection. Defaults to `[U+XXXX]`. */
49
+ formatter?: (detection: Detection) => string;
50
+ }
51
+ /** Options for the deghost() entry point. */
52
+ interface DeghostOptions {
53
+ /** Trim leading/trailing whitespace from the result. Default: true */
54
+ trim?: boolean;
55
+ }
56
+
57
+ /**
58
+ * Fluent chain for composing text cleaning operations.
59
+ *
60
+ * Each transforming method returns a new DeghostChain (immutable), so you can
61
+ * branch chains without side effects.
62
+ *
63
+ * @example
64
+ * ```ts
65
+ * new DeghostChain('text\u00A0with\u200Bghosts')
66
+ * .strip('format')
67
+ * .normalize('spaces')
68
+ * .trim()
69
+ * .toString()
70
+ * // 'text withghosts'
71
+ * ```
72
+ */
73
+ declare class DeghostChain {
74
+ #private;
75
+ constructor(value: string);
76
+ /** Remove all characters in the given category. */
77
+ strip(category: Category): DeghostChain;
78
+ /** Replace all characters in the given category with a substitute. */
79
+ normalize(category: Category, replacement?: string): DeghostChain;
80
+ /** Replace matched ghosts in a category using a mapper function. */
81
+ replace(category: Category, mapper: (detection: Detection) => string): DeghostChain;
82
+ /** Replace ghosts with visible markers like `[U+200B]`. */
83
+ highlight(category?: Category, formatter?: (detection: Detection) => string): DeghostChain;
84
+ /** Return detections for the current chain value. */
85
+ detect(categories?: Category[]): Detection[];
86
+ /** Check if the current chain value contains invisible characters. */
87
+ hasGhosts(categories?: Category[]): boolean;
88
+ /** Count invisible characters by category in the current chain value. */
89
+ count(categories?: Category[]): Partial<Record<Category, number>>;
90
+ /** Returns true if the current chain value has no invisible characters. */
91
+ isClean(categories?: Category[]): boolean;
92
+ /** Return a human-readable report of ghosts in the current chain value. */
93
+ summary(categories?: Category[]): string;
94
+ /** Collapse runs of whitespace into a single space. */
95
+ collapse(): DeghostChain;
96
+ /** Trim leading and trailing whitespace. */
97
+ trim(): DeghostChain;
98
+ /** Apply the default cleaning preset: strip format + control, normalize spaces, trim. NOTE(review): `presets.clean` is documented as also stripping BOM and collapsing whitespace — confirm which description is current. */
99
+ clean(): DeghostChain;
100
+ /** Extract the cleaned string. */
101
+ toString(): string;
102
+ /** Extract the cleaned string (alias for toString). */
103
+ valueOf(): string;
104
+ /** Support JSON.stringify. */
105
+ toJSON(): string;
106
+ }
107
+
108
+ /**
109
+ * Builder for creating reusable cleaning functions.
110
+ *
111
+ * Compile a cleaning pipeline once, apply it to many strings.
112
+ *
113
+ * @example
114
+ * ```ts
115
+ * const clean = cleaner()
116
+ * .strip('format')
117
+ * .strip('control')
118
+ * .normalize('spaces')
119
+ * .trim()
120
+ * .build()
121
+ *
122
+ * clean('dirty\u00A0string') // 'dirty string'
123
+ * clean('another\u200Bone') // 'anotherone'
124
+ * ```
125
+ */
126
+ declare class CleanerBuilder {
127
+ #private;
128
+ /** Add a strip rule — remove all characters in this category. */
129
+ strip(category: Category): this;
130
+ /** Add a normalize rule — replace characters in this category. */
131
+ normalize(category: Category, replacement?: string): this;
132
+ /** Add a replace rule — transform characters using detection metadata. */
133
+ replace(category: Category, mapper: (detection: Detection) => string): this;
134
+ /** Add a highlight step — annotate characters with visible markers. NOTE(review): `category` is required here but optional on DeghostChain.highlight() — confirm the difference is intentional. */
135
+ highlight(category: Category, formatter?: (detection: Detection) => string): this;
136
+ /** Enable whitespace trimming as a final step. */
137
+ trim(): this;
138
+ /** Enable collapsing runs of whitespace as a final step. */
139
+ collapse(): this;
140
+ /** Compile the pipeline into a reusable function. */
141
+ build(): CleanerFn;
142
+ }
143
+ /** Create a new cleaner builder. */
144
+ declare function cleaner(): CleanerBuilder;
145
+
146
+ /**
147
+ * Lazily scan a string for invisible Unicode characters, yielding metadata for each.
148
+ *
149
+ * Like `detect()` but returns a generator — useful for large strings where you
150
+ * may not need all results, or want to break early.
151
+ */
152
+ declare function scan(input: string, categories?: Category[]): Generator<Detection>;
153
+ /**
154
+ * Scan a string for invisible Unicode characters and return metadata for each.
155
+ *
156
+ * Optionally pass a list of categories to restrict detection.
157
+ *
158
+ * @example
159
+ * ```ts
160
+ * detect('hello\u200Bworld')
161
+ * // [{ char: '\u200B', codepoint: 'U+200B', name: 'ZERO WIDTH SPACE', category: 'format', offset: 5 }]
162
+ *
163
+ * detect('a\u00a0b\u200Bc', ['format'])
164
+ * // [{ char: '\u200B', ..., category: 'format' }]
165
+ * ```
166
+ */
167
+ declare function detect(input: string, categories?: Category[]): Detection[];
168
+ /**
169
+ * Return the first invisible character detected, or `undefined` if clean.
170
+ *
171
+ * Uses `scan()` internally — stops at the first match.
172
+ */
173
+ declare function first(input: string, categories?: Category[]): Detection | undefined;
174
+ /**
175
+ * Returns true if the string contains any invisible Unicode characters.
176
+ *
177
+ * Faster than `detect()` when you don't need metadata.
178
+ */
179
+ declare function hasGhosts(input: string, categories?: Category[]): boolean;
180
+ /**
181
+ * Returns true if the string contains no invisible Unicode characters.
182
+ *
183
+ * Inverse of `hasGhosts()`.
184
+ */
185
+ declare function isClean(input: string, categories?: Category[]): boolean;
186
+ /**
187
+ * Count invisible characters by category.
188
+ *
189
+ * @example
190
+ * ```ts
191
+ * count('a\u00A0b\u200Bc')
192
+ * // { spaces: 1, format: 1 }
193
+ * ```
194
+ */
195
+ declare function count(input: string, categories?: Category[]): Partial<Record<Category, number>>;
196
+ /** Result of identifying a single character: the metadata `identify()` returns for a tracked invisible character. */
197
+ interface CharInfo {
198
+ codepoint: string; // Codepoint as a "U+XXXX" hex string, e.g. "U+200B"
199
+ name: string; // Unicode character name, e.g. "ZERO WIDTH SPACE"
200
+ category: Category; // deghost category the character belongs to
201
+ }
202
+ /**
203
+ * Identify a single character or codepoint — returns its category and name,
204
+ * or `undefined` if it's not an invisible character deghost tracks.
205
+ *
206
+ * @example
207
+ * ```ts
208
+ * identify('\u200B')
209
+ * // { codepoint: 'U+200B', name: 'ZERO WIDTH SPACE', category: 'format' }
210
+ *
211
+ * identify(0x00A0)
212
+ * // { codepoint: 'U+00A0', name: 'NO-BREAK SPACE', category: 'spaces' }
213
+ * ```
214
+ */
215
+ declare function identify(input: string | number): CharInfo | undefined;
216
+
217
+ /**
218
+ * Replace invisible characters with visible markers for debugging.
219
+ *
220
+ * By default each ghost is replaced with its codepoint in brackets,
221
+ * e.g. `[U+200B]`. Pass a custom formatter or an options object to
222
+ * filter by category.
223
+ *
224
+ * @example
225
+ * ```ts
226
+ * highlight('hello\u200Bworld')
227
+ * // 'hello[U+200B]world'
228
+ *
229
+ * highlight('a\u200Bb', (d) => `{${d.name}}`)
230
+ * // 'a{ZERO WIDTH SPACE}b'
231
+ *
232
+ * highlight('a\u00a0b\u200Bc', { categories: ['format'] })
233
+ * // 'a\u00a0b[U+200B]c'
234
+ * ```
235
+ */
236
+ declare function highlight(input: string): string;
237
+ declare function highlight(input: string, formatter: (detection: Detection) => string): string;
238
+ declare function highlight(input: string, options: HighlightOptions): string;
239
+
240
+ /**
241
+ * Return a human-readable report of all invisible characters in a string.
242
+ *
243
+ * @example
244
+ * ```ts
245
+ * summary('hello\u200Bworld')
246
+ * // "1 invisible character found.\n\nBy category:\n format: 1\n\nDetails:\n U+200B ZERO WIDTH SPACE (format, offset 5)"
247
+ * ```
248
+ */
249
+ declare function summary(input: string, categories?: Category[]): string;
250
+
251
+ /**
252
+ * Pre-built cleaning functions for common use cases.
253
+ *
254
+ * @example
255
+ * ```ts
256
+ * import { presets } from 'deghost'
257
+ *
258
+ * presets.clean('text\u00A0with\u200Bghosts')
259
+ * // 'text withghosts'
260
+ * ```
261
+ */
262
+ declare const presets: {
263
+ /**
264
+ * Default clean: strip format + control + BOM, normalize spaces, collapse, trim.
265
+ *
266
+ * The right choice for most text processing — catches invisible chars from
267
+ * binary formats (Garmin FIT, PDFs), APIs, and copy-paste while preserving
268
+ * word boundaries.
269
+ */
270
+ clean: CleanerFn;
271
+ /**
272
+ * Aggressive: strip everything invisible, including fillers, math operators, and tags.
273
+ *
274
+ * Use when you want maximally clean output and don't need to preserve any
275
+ * invisible Unicode semantics (ligature joiners, bidi marks, etc.).
276
+ */
277
+ aggressive: CleanerFn;
278
+ /**
279
+ * Spaces only: normalize Unicode whitespace to ASCII space.
280
+ *
281
+ * Leaves format/control characters alone. Useful when you only care about
282
+ * NBSP and exotic spaces (common in data from Garmin, Strava, etc.).
283
+ */
284
+ spaces: CleanerFn;
285
+ };
286
+
287
+ /**
288
+ * Regex patterns for each invisible character category.
289
+ *
290
+ * Uses ES2018 Unicode property escapes where possible (\p{Cf}, \p{Cc}, \p{Zs}).
291
+ * Falls back to explicit codepoint ranges for categories not covered by a
292
+ * single Unicode general category.
293
+ */
294
+ declare const patterns: Record<Category, RegExp>;
295
+ /** All category names as a readonly array. */
296
+ declare const categories: readonly Category[];
297
+ /** Human-readable descriptions for each category. */
298
+ declare const descriptions: Record<Category, string>;
299
+ /**
300
+ * Unicode character names for well-known invisible codepoints.
301
+ *
302
+ * This is not exhaustive — \p{Cf} alone covers 170+ characters.
303
+ * We map the ~40 most commonly encountered ones for the detect() API.
304
+ */
305
+ declare const charNames: Record<number, string>;
306
+ /** Resolve the deghost category for a given codepoint. */
307
+ declare function categorize(codepoint: number): Category | undefined;
308
+
309
+ /**
310
+ * Clean a string of invisible Unicode characters.
311
+ *
312
+ * With no chaining, applies the default preset (strip format + control + BOM,
313
+ * normalize spaces, collapse, trim). Per `DeghostOptions`, `trim` defaults to true; pass `{ trim: false }` to skip the final trim. Chain methods for fine-grained control.
314
+ *
315
+ * @example
316
+ * ```ts
317
+ * // Quick clean — sensible defaults
318
+ * deghost('Plant\u00a064\u00a0-\u00a0Woodbridge')
319
+ * // → 'Plant 64 - Woodbridge'
320
+ *
321
+ * // Tagged template literal
322
+ * deghost`text\u200B\u00a0here`
323
+ * // → 'text here'
324
+ *
325
+ * // Chainable — pick what to handle
326
+ * deghost('text\u200B\u00a0here')
327
+ * .strip('format')
328
+ * .normalize('spaces')
329
+ * .trim()
330
+ * .toString()
331
+ * ```
332
+ */
333
+ declare function deghost(strings: TemplateStringsArray, ...values: unknown[]): DeghostChain & string;
334
+ declare function deghost(input: string, options?: DeghostOptions): DeghostChain & string;
335
+
336
+ export { type Action, type Category, type CharInfo, CleanerBuilder, type CleanerFn, DeghostChain, type DeghostOptions, type Detection, type HighlightOptions, type Rule, categories, categorize, charNames, cleaner, count, deghost, descriptions, detect, first, hasGhosts, highlight, identify, isClean, patterns, presets, scan, summary };