od-temp 1.0.4 → 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1531 @@
1
+ import { createRequire } from "node:module";
2
+
3
+ //#region rolldown:runtime
4
// Bundler runtime helpers. Cached references to the Object built-ins they rely on.
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __hasOwnProp = Object.prototype.hasOwnProperty;

/**
 * Wrap a module initializer so that it runs at most once.
 * The initializer slot is cleared *before* the call, so a throwing
 * initializer is not retried; the cached result is returned afterwards.
 */
var __esmMin = (fn, res) => () => {
	if (fn) {
		const initializer = fn;
		fn = 0;
		res = initializer(0);
	}
	return res;
};

/**
 * Build a namespace-like object whose enumerable getters delegate to the
 * functions in `all`. When `symbols` is truthy the object is tagged "Module".
 */
var __exportAll = (all, symbols) => {
	const target = {};
	for (const exportName in all) {
		__defProp(target, exportName, {
			get: all[exportName],
			enumerable: true
		});
	}
	if (symbols) __defProp(target, Symbol.toStringTag, { value: "Module" });
	return target;
};

/**
 * Define live getters on `to` for every own property name of `from` that
 * `to` does not already own, skipping the key named by `except`.
 * `desc` is only used as scratch space for the source descriptor.
 */
var __copyProps = (to, from, except, desc) => {
	const copyable = from && typeof from === "object" || typeof from === "function";
	if (!copyable) return to;
	for (const key of __getOwnPropNames(from)) {
		if (__hasOwnProp.call(to, key) || key === except) continue;
		__defProp(to, key, {
			get: () => from[key],
			enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable
		});
	}
	return to;
};

/**
 * Convert a module namespace to its CommonJS shape: an explicit
 * "module.exports" member wins, otherwise a copy flagged `__esModule`.
 */
var __toCommonJS = (mod) => {
	if (__hasOwnProp.call(mod, "module.exports")) return mod["module.exports"];
	return __copyProps(__defProp({}, "__esModule", { value: true }), mod);
};

// CommonJS-style require bound to this module's URL (used for optional dependencies).
var __require = /* @__PURE__ */ createRequire(import.meta.url);
38
+
39
+ //#endregion
40
+ //#region src/document/OCRProcessor.ts
41
/**
 * OCR processor with optional Tesseract.js support.
 * Requires peer dependency: tesseract.js
 *
 * The dependency is loaded lazily in the constructor; when it is missing the
 * instance is still usable, but every OCR method throws and `isAvailable()`
 * returns false.
 */
var OCRProcessor = class {
	constructor() {
		try {
			this.tesseract = __require("tesseract.js");
		} catch {
			// Deliberate best-effort: tesseract.js is an optional peer dependency.
			// Absence is reported through isAvailable() and by the OCR methods.
		}
	}
	/**
	 * Extract text from an image buffer using OCR.
	 *
	 * @param buffer Image data handed to the worker's `recognize`.
	 * @param options Optional `language` (string or array joined with "+",
	 *   default "eng"), `oem` (default 3) and `psm`.
	 * @returns `{ text, confidence, processingTime }`, processingTime in ms
	 *   rounded to two decimals.
	 * @throws Error when tesseract.js is not installed or recognition fails.
	 */
	async recognizeText(buffer, options) {
		if (!this.tesseract) throw new Error("[OCRProcessor] OCR support requires tesseract.js. Install with: npm install tesseract.js");
		const startTime = performance.now();
		try {
			const language = Array.isArray(options?.language) ? options.language.join("+") : options?.language || "eng";
			// `??` rather than `||`: an explicit engine mode of 0 must not be replaced by the default.
			const worker = await this.tesseract.createWorker(language, options?.oem ?? 3);
			let result;
			try {
				if (options?.psm !== void 0) await worker.setParameters({ tessedit_pageseg_mode: options.psm });
				result = await worker.recognize(buffer);
			} catch (error) {
				// Release the worker on failure, without letting a shutdown error mask the real one.
				try {
					await worker.terminate();
				} catch {}
				throw error;
			}
			await worker.terminate();
			const processingTime = Math.round((performance.now() - startTime) * 100) / 100;
			return {
				text: result.data.text || "",
				confidence: result.data.confidence || 0,
				processingTime
			};
		} catch (error) {
			throw new Error(`[OCRProcessor] OCR recognition failed: ${error.message}`, { cause: error });
		}
	}
	/**
	 * Check if OCR is available (tesseract.js installed).
	 */
	isAvailable() {
		return !!this.tesseract;
	}
	/**
	 * Create a scheduler for batch OCR processing.
	 * More efficient for processing multiple images.
	 * Any scheduler previously created by this instance is terminated first.
	 *
	 * @param workerCount Number of workers to attach (default 4).
	 * @param language Language string passed to each worker (default "eng").
	 * @returns The new scheduler, also stored on `this.scheduler`.
	 * @throws Error when tesseract.js is not installed or a worker cannot be created.
	 */
	async createScheduler(workerCount = 4, language = "eng") {
		if (!this.tesseract) throw new Error("[OCRProcessor] OCR support requires tesseract.js. Install with: npm install tesseract.js");
		await this.cleanup();
		const scheduler = this.tesseract.createScheduler();
		this.scheduler = scheduler;
		try {
			for (let i = 0; i < workerCount; i++) {
				const worker = await this.tesseract.createWorker(language);
				scheduler.addWorker(worker);
			}
		} catch (error) {
			// Do not leave a half-populated scheduler (and its workers) running.
			this.scheduler = void 0;
			try {
				await scheduler.terminate();
			} catch {}
			throw error;
		}
		return scheduler;
	}
	/**
	 * Batch process multiple images through a temporary scheduler.
	 *
	 * @param buffers Image buffers; results are returned in the same order.
	 * @param _options Optional settings; `language` (string or array) selects
	 *   the worker language, default "eng".
	 * @returns Array of `{ text, confidence, processingTime }`.
	 * @throws Error when tesseract.js is not installed or any job fails.
	 */
	async recognizeBatch(buffers, _options) {
		if (!this.tesseract) throw new Error("[OCRProcessor] OCR support requires tesseract.js. Install with: npm install tesseract.js");
		const language = Array.isArray(_options?.language) ? _options.language.join("+") : _options?.language || "eng";
		const scheduler = await this.createScheduler(4, language);
		let results;
		try {
			results = await Promise.all(buffers.map(async (buffer) => {
				const startTime = performance.now();
				const result = await scheduler.addJob("recognize", buffer);
				const endTime = performance.now();
				return {
					text: result.data.text || "",
					confidence: result.data.confidence || 0,
					processingTime: Math.round((endTime - startTime) * 100) / 100
				};
			}));
		} catch (error) {
			this.scheduler = void 0;
			// Shutdown errors must not hide the job failure.
			try {
				await scheduler.terminate();
			} catch {}
			throw new Error(`[OCRProcessor] Batch OCR failed: ${error.message}`, { cause: error });
		}
		this.scheduler = void 0;
		try {
			await scheduler.terminate();
		} catch (error) {
			throw new Error(`[OCRProcessor] Batch OCR failed: ${error.message}`, { cause: error });
		}
		return results;
	}
	/**
	 * Terminate any running scheduler.
	 * The reference is cleared before awaiting so a failed shutdown does not
	 * leave a stale scheduler behind.
	 */
	async cleanup() {
		const scheduler = this.scheduler;
		if (!scheduler) return;
		this.scheduler = void 0;
		await scheduler.terminate();
	}
};
134
/**
 * Factory: build a fresh OCRProcessor.
 * @returns A new OCRProcessor instance.
 */
function createOCRProcessor() {
	const processor = new OCRProcessor();
	return processor;
}
140
+
141
+ //#endregion
142
+ //#region src/document/JsonProcessor.ts
143
/**
 * Processor for JSON documents.
 *
 * Walks a parsed JSON value, runs a caller-supplied detector over every
 * primitive leaf (string, number, boolean) and produces a redacted copy.
 * Leaf paths use dotted keys and bracketed array indices, e.g. `users[0].email`.
 */
var JsonProcessor = class {
	constructor() {
		this.defaultOptions = {
			maxDepth: 100,
			scanKeys: false,
			alwaysRedact: [],
			skipPaths: [],
			piiIndicatorKeys: [
				"email",
				"e-mail",
				"mail",
				"phone",
				"tel",
				"telephone",
				"mobile",
				"ssn",
				"social_security",
				"address",
				"street",
				"city",
				"zip",
				"postal",
				"name",
				"firstname",
				"lastname",
				"fullname",
				"password",
				"pwd",
				"secret",
				"token",
				"key",
				"card",
				"credit_card",
				"creditcard",
				"account",
				"iban",
				"swift",
				"passport",
				"license",
				"licence"
			],
			preserveStructure: true
		};
	}
	/**
	 * Parse JSON from buffer or string.
	 * @throws Error prefixed with "[JsonProcessor] Invalid JSON" on malformed input.
	 */
	parse(input) {
		try {
			const text = typeof input === "string" ? input : input.toString("utf-8");
			return JSON.parse(text);
		} catch (error) {
			throw new Error(`[JsonProcessor] Invalid JSON: ${error.message}`, { cause: error });
		}
	}
	/**
	 * Detect PII in JSON data.
	 *
	 * @param data Parsed JSON value.
	 * @param detector Object exposing `detect(text)` that resolves to `{ detections }`.
	 * @param options Overrides for `defaultOptions`.
	 * @returns Result with `original`/`redacted` JSON strings, `detections`,
	 *   `redactionMap` (placeholder -> value), `stats`, `pathsDetected` and
	 *   `matchesByPath`.
	 */
	async detect(data, detector, options) {
		const opts = {
			...this.defaultOptions,
			...options
		};
		// Collect the leaves first so results can be merged in traversal order,
		// independent of how the detector's promises happen to resolve.
		const leaves = [];
		this.traverse(data, "", opts, (path, value, key) => {
			leaves.push({
				path,
				value,
				key
			});
		});
		const perLeaf = await Promise.all(leaves.map((leaf) => this.detectLeaf(leaf, detector, opts)));
		const pathsDetected = [];
		const matchesByPath = {};
		const allDetections = [];
		for (const entries of perLeaf) for (const { path, detections } of entries) {
			matchesByPath[path] = detections;
			pathsDetected.push(path);
			allDetections.push(...detections);
		}
		const original = JSON.stringify(data);
		const redacted = this.redact(data, {
			original,
			redacted: original,
			detections: allDetections,
			redactionMap: {},
			stats: { piiCount: allDetections.length },
			pathsDetected,
			matchesByPath
		}, opts);
		const redactionMap = {};
		allDetections.forEach((det) => {
			redactionMap[det.placeholder] = det.value;
		});
		return {
			original,
			redacted: typeof redacted === "string" ? redacted : JSON.stringify(redacted),
			detections: allDetections,
			redactionMap,
			stats: { piiCount: allDetections.length },
			pathsDetected,
			matchesByPath
		};
	}
	/**
	 * Run detection for one primitive leaf.
	 *
	 * @param leaf `{ path, value, key }` as reported by `traverse`.
	 * @returns Zero to two `{ path, detections }` entries: an optional entry for
	 *   the key (path suffixed with `.__key__`) followed by one for the value.
	 */
	async detectLeaf({ path, value, key }, detector, opts) {
		if (this.shouldSkip(path, opts.skipPaths)) return [];
		const text = String(value);
		if (this.shouldAlwaysRedact(path, opts.alwaysRedact)) return [{
			path,
			detections: [{
				type: "SENSITIVE_FIELD",
				value: text,
				placeholder: `[SENSITIVE_FIELD]`,
				position: [0, text.length],
				severity: "high",
				confidence: 1
			}]
		}];
		const found = [];
		if (opts.scanKeys && key) {
			const keyResult = await detector.detect(key);
			if (keyResult.detections.length > 0) found.push({
				path: `${path}.__key__`,
				detections: keyResult.detections
			});
		}
		const valueResult = await detector.detect(text);
		if (valueResult.detections.length > 0) found.push({
			path,
			detections: this.boostConfidenceFromKey(valueResult.detections, key, opts.piiIndicatorKeys)
		});
		return found;
	}
	/**
	 * Redact PII in JSON data.
	 * With `preserveStructure` (the default) flagged paths are replaced by
	 * type-appropriate blanks; otherwise the text-based fallback is used.
	 */
	redact(data, detectionResult, options) {
		const opts = {
			...this.defaultOptions,
			...options
		};
		// NOTE(review): detections produced by detect() carry positions relative to
		// each individual value, while this fallback applies them to the whole
		// pretty-printed document — confirm the intended contract of this path.
		if (!opts.preserveStructure) return this.parse(this.redactText(JSON.stringify(data, null, 2), detectionResult));
		return this.redactPreservingStructure(data, detectionResult.pathsDetected);
	}
	/**
	 * Redact specific paths in JSON while preserving structure.
	 * Strings become "[REDACTED]", numbers 0, booleans false, arrays [],
	 * objects {}; null stays null. The input is not mutated.
	 */
	redactPreservingStructure(data, pathsToRedact) {
		const pathSet = new Set(pathsToRedact);
		const blankFor = (value) => {
			if (typeof value === "string") return "[REDACTED]";
			if (typeof value === "number") return 0;
			if (typeof value === "boolean") return false;
			if (value === null) return null;
			if (Array.isArray(value)) return [];
			if (typeof value === "object") return {};
			return "[REDACTED]";
		};
		const redactValue = (value, currentPath) => {
			if (pathSet.has(currentPath)) return blankFor(value);
			if (Array.isArray(value)) return value.map((item, index) => redactValue(item, `${currentPath}[${index}]`));
			if (value !== null && typeof value === "object") {
				// Object.fromEntries defines own properties, so a "__proto__" key coming
				// from JSON.parse is copied as data instead of changing the prototype.
				return Object.fromEntries(Object.entries(value).map(([key, val]) => [key, redactValue(val, currentPath ? `${currentPath}.${key}` : key)]));
			}
			return value;
		};
		return redactValue(data, "");
	}
	/**
	 * Simple text-based redaction (fallback).
	 * Replaces each detection's `position` span with its placeholder, working
	 * from the end of the text so earlier offsets stay valid.
	 */
	redactText(text, detectionResult) {
		let redacted = text;
		const sortedDetections = [...detectionResult.detections].sort((a, b) => b.position[0] - a.position[0]);
		for (const detection of sortedDetections) {
			const [start, end] = detection.position;
			redacted = redacted.slice(0, start) + detection.placeholder + redacted.slice(end);
		}
		return redacted;
	}
	/**
	 * Traverse JSON structure and call `callback(path, value, key)` exactly once
	 * for each primitive value. `key` is only supplied for object members.
	 *
	 * @throws Error when nesting exceeds `options.maxDepth`.
	 */
	traverse(obj, path, options, callback, depth = 0) {
		if (depth > options.maxDepth) throw new Error(`[JsonProcessor] Maximum depth (${options.maxDepth}) exceeded`);
		if (obj === null || obj === void 0) return;
		if (this.isPrimitive(obj)) {
			callback(path, obj);
			return;
		}
		if (Array.isArray(obj)) {
			obj.forEach((item, index) => {
				const itemPath = `${path}[${index}]`;
				// Only containers are recursed into; recursing into a primitive as well
				// would report the same leaf a second time.
				if (this.isPrimitive(item)) callback(itemPath, item);
				else this.traverse(item, itemPath, options, callback, depth + 1);
			});
			return;
		}
		if (typeof obj === "object") for (const [key, value] of Object.entries(obj)) {
			const valuePath = path ? `${path}.${key}` : key;
			if (this.isPrimitive(value)) callback(valuePath, value, key);
			else this.traverse(value, valuePath, options, callback, depth + 1);
		}
	}
	/**
	 * Check if value is primitive (string, number, boolean).
	 */
	isPrimitive(value) {
		return typeof value === "string" || typeof value === "number" || typeof value === "boolean";
	}
	/**
	 * Test a leaf path against a pattern. `*` matches one or more characters
	 * other than "."; every other character is matched literally.
	 */
	matchesPathPattern(path, pattern) {
		if (path === pattern) return true;
		const source = pattern.split("*").map((part) => part.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")).join("[^.]+");
		return new RegExp(`^${source}$`).test(path);
	}
	/**
	 * Check if path should be skipped.
	 */
	shouldSkip(path, skipPaths) {
		return skipPaths.some((skipPath) => this.matchesPathPattern(path, skipPath));
	}
	/**
	 * Check if path should always be redacted.
	 */
	shouldAlwaysRedact(path, alwaysRedact) {
		return alwaysRedact.some((redactPath) => this.matchesPathPattern(path, redactPath));
	}
	/**
	 * Boost confidence if key name indicates PII.
	 * When the lower-cased key contains any indicator, each detection's
	 * confidence (0.5 when falsy) is multiplied by 1.2 and capped at 1.
	 */
	boostConfidenceFromKey(detections, key, piiIndicatorKeys) {
		if (!key) return detections;
		const keyLower = key.toLowerCase();
		if (!piiIndicatorKeys.some((indicator) => keyLower.includes(indicator.toLowerCase()))) return detections;
		return detections.map((detection) => ({
			...detection,
			confidence: Math.min(1, (detection.confidence || .5) * 1.2)
		}));
	}
	/**
	 * Extract all string values (and, with `scanKeys`, their keys) from JSON,
	 * joined by single spaces, for simple text-based detection.
	 */
	extractText(data, options) {
		const opts = {
			...this.defaultOptions,
			...options
		};
		const textParts = [];
		this.traverse(data, "", opts, (_path, value, key) => {
			if (opts.scanKeys && key) textParts.push(key);
			if (typeof value === "string") textParts.push(value);
		});
		return textParts.join(" ");
	}
	/**
	 * Validate JSON buffer/string.
	 */
	isValid(input) {
		try {
			this.parse(input);
			return true;
		} catch {
			return false;
		}
	}
	/**
	 * JSON Lines (JSONL) support - split by newlines and parse each non-blank line.
	 * @throws Error naming the 1-based line number in the input that failed to parse.
	 */
	parseJsonLines(input) {
		const text = typeof input === "string" ? input : input.toString("utf-8");
		const documents = [];
		text.split("\n").forEach((line, index) => {
			if (line.trim().length === 0) return;
			try {
				documents.push(JSON.parse(line));
			} catch (error) {
				// `index` is taken before blank lines are dropped, so it matches the input.
				throw new Error(`[JsonProcessor] Invalid JSON at line ${index + 1}: ${error.message}`, { cause: error });
			}
		});
		return documents;
	}
	/**
	 * Detect PII in JSON Lines format; resolves to one result per document.
	 */
	async detectJsonLines(input, detector, options) {
		const documents = this.parseJsonLines(input);
		return Promise.all(documents.map((doc) => this.detect(doc, detector, options)));
	}
};
427
/**
 * Factory: build a fresh JsonProcessor.
 * @returns A new JsonProcessor instance.
 */
function createJsonProcessor() {
	const processor = new JsonProcessor();
	return processor;
}
433
+
434
+ //#endregion
435
+ //#region src/document/CsvProcessor.ts
436
+ /**
437
+ * CSV processor for tabular data
438
+ */
439
+ var CsvProcessor = class {
440
+ constructor() {
441
+ this.defaultOptions = {
442
+ quote: "\"",
443
+ escape: "\"",
444
+ skipEmptyLines: true,
445
+ piiIndicatorNames: [
446
+ "email",
447
+ "e-mail",
448
+ "mail",
449
+ "email_address",
450
+ "phone",
451
+ "tel",
452
+ "telephone",
453
+ "mobile",
454
+ "phone_number",
455
+ "ssn",
456
+ "social_security",
457
+ "social_security_number",
458
+ "address",
459
+ "street",
460
+ "street_address",
461
+ "city",
462
+ "zip",
463
+ "zipcode",
464
+ "postal",
465
+ "postcode",
466
+ "name",
467
+ "firstname",
468
+ "first_name",
469
+ "lastname",
470
+ "last_name",
471
+ "fullname",
472
+ "full_name",
473
+ "password",
474
+ "pwd",
475
+ "secret",
476
+ "token",
477
+ "api_key",
478
+ "card",
479
+ "credit_card",
480
+ "creditcard",
481
+ "card_number",
482
+ "account",
483
+ "account_number",
484
+ "iban",
485
+ "swift",
486
+ "passport",
487
+ "passport_number",
488
+ "license",
489
+ "licence",
490
+ "driver_license",
491
+ "dob",
492
+ "date_of_birth",
493
+ "birth_date",
494
+ "birthdate"
495
+ ],
496
+ treatFirstRowAsHeader: true
497
+ };
498
+ }
499
+ /**
500
+ * Parse CSV from buffer or string
501
+ */
502
+ parse(input, options) {
503
+ const opts = {
504
+ ...this.defaultOptions,
505
+ ...options
506
+ };
507
+ const text = typeof input === "string" ? input : input.toString("utf-8");
508
+ const delimiter = opts.delimiter || this.detectDelimiter(text);
509
+ const lines = text.split(/\r?\n/);
510
+ const rows = [];
511
+ let rowIndex = 0;
512
+ for (let i = 0; i < lines.length; i++) {
513
+ const line = lines[i];
514
+ if (opts.skipEmptyLines && line.trim().length === 0) continue;
515
+ if (opts.maxRows !== void 0 && rowIndex >= opts.maxRows) break;
516
+ const values = this.parseRow(line, delimiter, opts.quote, opts.escape);
517
+ rows.push({
518
+ index: rowIndex,
519
+ values
520
+ });
521
+ rowIndex++;
522
+ }
523
+ return rows;
524
+ }
525
+ /**
526
+ * Detect PII in CSV data
527
+ */
528
+ async detect(input, detector, options) {
529
+ const opts = {
530
+ ...this.defaultOptions,
531
+ ...options
532
+ };
533
+ const rows = this.parse(input, options);
534
+ if (rows.length === 0) {
535
+ const original = typeof input === "string" ? input : input.toString("utf-8");
536
+ return {
537
+ original,
538
+ redacted: original,
539
+ detections: [],
540
+ redactionMap: {},
541
+ stats: { piiCount: 0 },
542
+ rowCount: 0,
543
+ columnCount: 0,
544
+ columnStats: {},
545
+ matchesByCell: []
546
+ };
547
+ }
548
+ const hasHeader = opts.hasHeader !== void 0 ? opts.hasHeader : this.detectHeader(rows);
549
+ const headers = hasHeader && rows.length > 0 ? rows[0].values : void 0;
550
+ const dataRows = hasHeader ? rows.slice(1) : rows;
551
+ const columnCount = rows[0].values.length;
552
+ const columnNameToIndex = /* @__PURE__ */ new Map();
553
+ if (headers) headers.forEach((header, index) => {
554
+ columnNameToIndex.set(header.toLowerCase().trim(), index);
555
+ });
556
+ const alwaysRedactCols = new Set(opts.alwaysRedactColumns || []);
557
+ if (opts.alwaysRedactColumnNames && headers) opts.alwaysRedactColumnNames.forEach((name) => {
558
+ const index = columnNameToIndex.get(name.toLowerCase().trim());
559
+ if (index !== void 0) alwaysRedactCols.add(index);
560
+ });
561
+ const skipCols = new Set(opts.skipColumns || []);
562
+ const columnStats = {};
563
+ const matchesByCell = [];
564
+ const allDetections = [];
565
+ for (let col = 0; col < columnCount; col++) columnStats[col] = {
566
+ columnIndex: col,
567
+ columnName: headers?.[col],
568
+ piiCount: 0,
569
+ piiPercentage: 0,
570
+ piiTypes: []
571
+ };
572
+ for (const row of dataRows) for (let col = 0; col < row.values.length; col++) {
573
+ if (skipCols.has(col)) continue;
574
+ const cellValue = row.values[col];
575
+ if (alwaysRedactCols.has(col)) {
576
+ const detection = {
577
+ type: "SENSITIVE_COLUMN",
578
+ value: cellValue,
579
+ placeholder: `[SENSITIVE_COLUMN_${col}]`,
580
+ position: [0, cellValue.length],
581
+ severity: "high",
582
+ confidence: 1
583
+ };
584
+ matchesByCell.push({
585
+ row: row.index,
586
+ column: col,
587
+ columnName: headers?.[col],
588
+ value: cellValue,
589
+ matches: [detection]
590
+ });
591
+ allDetections.push(detection);
592
+ columnStats[col].piiCount++;
593
+ continue;
594
+ }
595
+ const result = await detector.detect(cellValue);
596
+ if (result.detections.length > 0) {
597
+ const boostedDetections = this.boostConfidenceFromColumnName(result.detections, headers?.[col], opts.piiIndicatorNames || []);
598
+ matchesByCell.push({
599
+ row: row.index,
600
+ column: col,
601
+ columnName: headers?.[col],
602
+ value: cellValue,
603
+ matches: boostedDetections
604
+ });
605
+ allDetections.push(...boostedDetections);
606
+ columnStats[col].piiCount += boostedDetections.length;
607
+ const columnTypes = new Set(columnStats[col].piiTypes);
608
+ boostedDetections.forEach((d) => columnTypes.add(d.type));
609
+ columnStats[col].piiTypes = Array.from(columnTypes);
610
+ }
611
+ }
612
+ for (let col = 0; col < columnCount; col++) {
613
+ const rowsWithPii = matchesByCell.filter((m) => m.column === col).length;
614
+ columnStats[col].piiPercentage = dataRows.length > 0 ? rowsWithPii / dataRows.length * 100 : 0;
615
+ }
616
+ const original = typeof input === "string" ? input : input.toString("utf-8");
617
+ const redacted = this.redact(original, {
618
+ original,
619
+ redacted: original,
620
+ detections: allDetections,
621
+ redactionMap: {},
622
+ stats: { piiCount: allDetections.length },
623
+ rowCount: dataRows.length,
624
+ columnCount,
625
+ headers,
626
+ columnStats,
627
+ matchesByCell
628
+ }, opts);
629
+ const redactionMap = {};
630
+ allDetections.forEach((det) => {
631
+ redactionMap[det.placeholder] = det.value;
632
+ });
633
+ return {
634
+ original,
635
+ redacted,
636
+ detections: allDetections,
637
+ redactionMap,
638
+ stats: { piiCount: allDetections.length },
639
+ rowCount: dataRows.length,
640
+ columnCount,
641
+ headers: headers?.filter((h) => h !== void 0),
642
+ columnStats,
643
+ matchesByCell
644
+ };
645
+ }
646
+ /**
647
+ * Redact PII in CSV data
648
+ */
649
+ redact(input, detectionResult, options) {
650
+ const opts = {
651
+ ...this.defaultOptions,
652
+ ...options
653
+ };
654
+ const rows = this.parse(input, options);
655
+ if (rows.length === 0) return "";
656
+ const delimiter = opts.delimiter || this.detectDelimiter(typeof input === "string" ? input : input.toString("utf-8"));
657
+ const hasHeader = detectionResult.headers !== void 0;
658
+ const redactionMap = /* @__PURE__ */ new Map();
659
+ for (const cellMatch of detectionResult.matchesByCell) {
660
+ if (!redactionMap.has(cellMatch.row)) redactionMap.set(cellMatch.row, /* @__PURE__ */ new Map());
661
+ redactionMap.get(cellMatch.row).set(cellMatch.column, "[REDACTED]");
662
+ }
663
+ const outputRows = [];
664
+ for (let i = 0; i < rows.length; i++) {
665
+ const row = rows[i];
666
+ if (hasHeader && i === 0) outputRows.push(this.formatRow(row.values, delimiter, opts.quote));
667
+ else {
668
+ const rowIndex = hasHeader ? i - 1 : i;
669
+ const redactedValues = row.values.map((value, colIndex) => {
670
+ return redactionMap.get(rowIndex)?.get(colIndex) || value;
671
+ });
672
+ outputRows.push(this.formatRow(redactedValues, delimiter, opts.quote));
673
+ }
674
+ }
675
+ return outputRows.join("\n");
676
+ }
677
+ /**
678
+ * Parse a single CSV row
679
+ */
680
+ parseRow(line, delimiter, quote, _escape) {
681
+ const values = [];
682
+ let current = "";
683
+ let inQuotes = false;
684
+ let i = 0;
685
+ while (i < line.length) {
686
+ const char = line[i];
687
+ const nextChar = line[i + 1];
688
+ if (char === quote) if (inQuotes && nextChar === quote) {
689
+ current += quote;
690
+ i += 2;
691
+ } else {
692
+ inQuotes = !inQuotes;
693
+ i++;
694
+ }
695
+ else if (char === delimiter && !inQuotes) {
696
+ values.push(current);
697
+ current = "";
698
+ i++;
699
+ } else {
700
+ current += char;
701
+ i++;
702
+ }
703
+ }
704
+ values.push(current);
705
+ return values;
706
+ }
707
+ /**
708
+ * Format a row as CSV
709
+ */
710
+ formatRow(values, delimiter, quote) {
711
+ return values.map((value) => {
712
+ if (value.includes(delimiter) || value.includes(quote) || value.includes("\n")) return `${quote}${value.replace(new RegExp(quote, "g"), quote + quote)}${quote}`;
713
+ return value;
714
+ }).join(delimiter);
715
+ }
716
+ /**
717
+ * Auto-detect CSV delimiter
718
+ */
719
+ detectDelimiter(text) {
720
+ const delimiters = [
721
+ ",",
722
+ " ",
723
+ ";",
724
+ "|"
725
+ ];
726
+ const lines = text.split(/\r?\n/).slice(0, 5);
727
+ let bestDelimiter = ",";
728
+ let bestScore = 0;
729
+ for (const delimiter of delimiters) {
730
+ const counts = lines.map((line) => {
731
+ let count = 0;
732
+ let inQuotes = false;
733
+ for (const char of line) {
734
+ if (char === "\"") inQuotes = !inQuotes;
735
+ if (char === delimiter && !inQuotes) count++;
736
+ }
737
+ return count;
738
+ });
739
+ if (counts.length > 0 && counts[0] > 0) {
740
+ const avg = counts.reduce((a, b) => a + b, 0) / counts.length;
741
+ const score = avg / (counts.reduce((sum, c) => sum + Math.pow(c - avg, 2), 0) / counts.length + 1);
742
+ if (score > bestScore) {
743
+ bestScore = score;
744
+ bestDelimiter = delimiter;
745
+ }
746
+ }
747
+ }
748
+ return bestDelimiter;
749
+ }
750
+ /**
751
+ * Detect if first row is likely a header
752
+ */
753
+ detectHeader(rows) {
754
+ if (rows.length < 2) return false;
755
+ const firstRow = rows[0].values;
756
+ const secondRow = rows[1].values;
757
+ if (firstRow.reduce((sum, v) => sum + v.length, 0) / firstRow.length > secondRow.reduce((sum, v) => sum + v.length, 0) / secondRow.length * 1.5) return false;
758
+ const firstRowNumeric = firstRow.filter((v) => !isNaN(Number(v)) && v.trim() !== "").length;
759
+ return firstRow.length - firstRowNumeric >= firstRowNumeric;
760
+ }
761
+ /**
762
+ * Boost confidence if column name indicates PII
763
+ */
764
+ boostConfidenceFromColumnName(detections, columnName, piiIndicatorNames) {
765
+ if (!columnName) return detections;
766
+ const nameLower = columnName.toLowerCase().trim();
767
+ if (!piiIndicatorNames.some((indicator) => nameLower.includes(indicator.toLowerCase()))) return detections;
768
+ return detections.map((detection) => ({
769
+ ...detection,
770
+ confidence: Math.min(1, (detection.confidence || .5) * 1.2)
771
+ }));
772
+ }
773
+ /**
774
+ * Extract all cell values as text
775
+ */
776
+ extractText(input, options) {
777
+ const rows = this.parse(input, options);
778
+ const textParts = [];
779
+ for (const row of rows) for (const value of row.values) if (value.trim().length > 0) textParts.push(value);
780
+ return textParts.join(" ");
781
+ }
782
+ /**
783
+ * Get column statistics without full PII detection
784
+ */
785
+ getColumnInfo(input, options) {
786
+ const rows = this.parse(input, options);
787
+ if (rows.length === 0) return {
788
+ columnCount: 0,
789
+ rowCount: 0,
790
+ sampleRows: []
791
+ };
792
+ const opts = {
793
+ ...this.defaultOptions,
794
+ ...options
795
+ };
796
+ const hasHeader = opts.hasHeader !== void 0 ? opts.hasHeader : this.detectHeader(rows);
797
+ const headers = hasHeader && rows.length > 0 ? rows[0].values : void 0;
798
+ const dataRows = hasHeader ? rows.slice(1) : rows;
799
+ const sampleRows = dataRows.slice(0, 5).map((r) => r.values);
800
+ return {
801
+ columnCount: rows[0].values.length,
802
+ rowCount: dataRows.length,
803
+ headers,
804
+ sampleRows
805
+ };
806
+ }
807
+ };
808
/**
 * Factory: build a fresh CsvProcessor.
 * @returns A new CsvProcessor instance.
 */
function createCsvProcessor() {
	const processor = new CsvProcessor();
	return processor;
}
814
+
815
+ //#endregion
816
+ //#region src/document/XlsxProcessor.ts
817
+ /**
818
+ * XLSX processor for spreadsheet data
819
+ */
820
+ var XlsxProcessor = class {
821
+ constructor() {
822
+ this.defaultOptions = {
823
+ piiIndicatorNames: [
824
+ "email",
825
+ "e-mail",
826
+ "mail",
827
+ "email_address",
828
+ "phone",
829
+ "tel",
830
+ "telephone",
831
+ "mobile",
832
+ "phone_number",
833
+ "ssn",
834
+ "social_security",
835
+ "social_security_number",
836
+ "address",
837
+ "street",
838
+ "street_address",
839
+ "city",
840
+ "zip",
841
+ "zipcode",
842
+ "postal",
843
+ "postcode",
844
+ "name",
845
+ "firstname",
846
+ "first_name",
847
+ "lastname",
848
+ "last_name",
849
+ "fullname",
850
+ "full_name",
851
+ "password",
852
+ "pwd",
853
+ "secret",
854
+ "token",
855
+ "api_key",
856
+ "card",
857
+ "credit_card",
858
+ "creditcard",
859
+ "card_number",
860
+ "account",
861
+ "account_number",
862
+ "iban",
863
+ "swift",
864
+ "passport",
865
+ "passport_number",
866
+ "license",
867
+ "licence",
868
+ "driver_license",
869
+ "dob",
870
+ "date_of_birth",
871
+ "birth_date",
872
+ "birthdate"
873
+ ],
874
+ preserveFormatting: true,
875
+ preserveFormulas: true
876
+ };
877
+ try {
878
+ this.xlsx = __require("xlsx");
879
+ } catch {}
880
+ }
881
+ /**
882
+ * Check if XLSX support is available
883
+ */
884
+ isAvailable() {
885
+ return !!this.xlsx;
886
+ }
887
+ /**
888
+ * Parse XLSX from buffer
889
+ */
890
+ parse(buffer) {
891
+ if (!this.xlsx) throw new Error("[XlsxProcessor] XLSX support requires xlsx package. Install with: npm install xlsx");
892
+ try {
893
+ return this.xlsx.read(buffer, {
894
+ type: "buffer",
895
+ cellFormula: true,
896
+ cellStyles: true
897
+ });
898
+ } catch (error) {
899
+ throw new Error(`[XlsxProcessor] Failed to parse XLSX: ${error.message}`);
900
+ }
901
+ }
902
+ /**
903
+ * Detect PII in XLSX data
904
+ */
905
+ async detect(buffer, detector, options) {
906
+ if (!this.xlsx) throw new Error("[XlsxProcessor] XLSX support requires xlsx package. Install with: npm install xlsx");
907
+ const opts = {
908
+ ...this.defaultOptions,
909
+ ...options
910
+ };
911
+ const workbook = this.parse(buffer);
912
+ const sheetNames = this.getSheetNamesToProcess(workbook, opts);
913
+ const sheetResults = [];
914
+ const allDetections = [];
915
+ const allTypes = /* @__PURE__ */ new Set();
916
+ for (let sheetIndex = 0; sheetIndex < sheetNames.length; sheetIndex++) {
917
+ const sheetName = sheetNames[sheetIndex];
918
+ const sheet = workbook.Sheets[sheetName];
919
+ const sheetResult = await this.detectSheet(sheet, sheetName, sheetIndex, detector, opts);
920
+ sheetResults.push(sheetResult);
921
+ allDetections.push(...sheetResult.matchesByCell.flatMap((c) => c.matches));
922
+ sheetResult.matchesByCell.forEach((cell) => {
923
+ cell.matches.forEach((det) => allTypes.add(det.type));
924
+ });
925
+ }
926
+ const original = this.extractText(buffer, options);
927
+ const redactedBuffer = this.redact(buffer, {
928
+ original,
929
+ redacted: original,
930
+ detections: allDetections,
931
+ redactionMap: {},
932
+ stats: { piiCount: allDetections.length },
933
+ sheetResults,
934
+ sheetCount: sheetResults.length
935
+ }, options);
936
+ const redacted = this.extractText(redactedBuffer, options);
937
+ const redactionMap = {};
938
+ allDetections.forEach((det) => {
939
+ redactionMap[det.placeholder] = det.value;
940
+ });
941
+ return {
942
+ original,
943
+ redacted,
944
+ detections: allDetections,
945
+ redactionMap,
946
+ stats: { piiCount: allDetections.length },
947
+ sheetResults,
948
+ sheetCount: sheetResults.length
949
+ };
950
+ }
951
+ /**
952
+ * Detect PII in a single sheet
953
+ */
954
+ async detectSheet(sheet, sheetName, sheetIndex, detector, options) {
955
+ const range = this.xlsx.utils.decode_range(sheet["!ref"] || "A1");
956
+ const startRow = range.s.r;
957
+ const endRow = options.maxRows !== void 0 ? Math.min(range.e.r, startRow + options.maxRows - 1) : range.e.r;
958
+ const startCol = range.s.c;
959
+ const endCol = range.e.c;
960
+ const columnCount = endCol - startCol + 1;
961
+ const hasHeader = options.hasHeader !== void 0 ? options.hasHeader : this.detectHeader(sheet, range);
962
+ const headers = hasHeader ? this.getRowValues(sheet, startRow, startCol, endCol) : void 0;
963
+ const dataStartRow = hasHeader ? startRow + 1 : startRow;
964
+ const columnNameToIndex = /* @__PURE__ */ new Map();
965
+ if (headers) headers.forEach((header, index) => {
966
+ if (header) columnNameToIndex.set(header.toLowerCase().trim(), index);
967
+ });
968
+ const alwaysRedactCols = new Set(options.alwaysRedactColumns || []);
969
+ if (options.alwaysRedactColumnNames && headers) options.alwaysRedactColumnNames.forEach((name) => {
970
+ const index = columnNameToIndex.get(name.toLowerCase().trim());
971
+ if (index !== void 0) alwaysRedactCols.add(index);
972
+ });
973
+ const skipCols = new Set(options.skipColumns || []);
974
+ const columnStats = {};
975
+ for (let col = 0; col <= endCol - startCol; col++) columnStats[col] = {
976
+ columnIndex: col,
977
+ columnLetter: this.columnToLetter(col),
978
+ columnName: headers?.[col],
979
+ piiCount: 0,
980
+ piiPercentage: 0,
981
+ piiTypes: []
982
+ };
983
+ const matchesByCell = [];
984
+ for (let row = dataStartRow; row <= endRow; row++) for (let col = startCol; col <= endCol; col++) {
985
+ const colIndex = col - startCol;
986
+ if (skipCols.has(colIndex)) continue;
987
+ const cellRef = this.xlsx.utils.encode_cell({
988
+ r: row,
989
+ c: col
990
+ });
991
+ const cell = sheet[cellRef];
992
+ if (!cell) continue;
993
+ const cellValue = this.getCellValue(cell);
994
+ if (!cellValue) continue;
995
+ const cellFormula = cell.f;
996
+ if (alwaysRedactCols.has(colIndex)) {
997
+ const detection = {
998
+ type: "SENSITIVE_COLUMN",
999
+ value: cellValue,
1000
+ placeholder: `[SENSITIVE_COLUMN_${colIndex}]`,
1001
+ position: [0, cellValue.length],
1002
+ severity: "high",
1003
+ confidence: 1
1004
+ };
1005
+ matchesByCell.push({
1006
+ cell: cellRef,
1007
+ row: row + 1,
1008
+ column: colIndex,
1009
+ columnLetter: this.columnToLetter(colIndex),
1010
+ columnName: headers?.[colIndex],
1011
+ value: cellValue,
1012
+ formula: cellFormula,
1013
+ matches: [detection]
1014
+ });
1015
+ columnStats[colIndex].piiCount++;
1016
+ continue;
1017
+ }
1018
+ const result = await detector.detect(cellValue);
1019
+ if (result.detections.length > 0) {
1020
+ const boostedDetections = this.boostConfidenceFromColumnName(result.detections, headers?.[colIndex], options.piiIndicatorNames || []);
1021
+ matchesByCell.push({
1022
+ cell: cellRef,
1023
+ row: row + 1,
1024
+ column: colIndex,
1025
+ columnLetter: this.columnToLetter(colIndex),
1026
+ columnName: headers?.[colIndex],
1027
+ value: cellValue,
1028
+ formula: cellFormula,
1029
+ matches: boostedDetections
1030
+ });
1031
+ columnStats[colIndex].piiCount += boostedDetections.length;
1032
+ const columnTypes = new Set(columnStats[colIndex].piiTypes);
1033
+ boostedDetections.forEach((d) => columnTypes.add(d.type));
1034
+ columnStats[colIndex].piiTypes = Array.from(columnTypes);
1035
+ }
1036
+ }
1037
+ const dataRowCount = endRow - dataStartRow + 1;
1038
+ for (let col = 0; col <= endCol - startCol; col++) {
1039
+ const rowsWithPii = matchesByCell.filter((m) => m.column === col).length;
1040
+ columnStats[col].piiPercentage = dataRowCount > 0 ? rowsWithPii / dataRowCount * 100 : 0;
1041
+ }
1042
+ return {
1043
+ sheetName,
1044
+ sheetIndex,
1045
+ rowCount: dataRowCount,
1046
+ columnCount,
1047
+ headers: headers?.filter((h) => h !== void 0),
1048
+ columnStats,
1049
+ matchesByCell
1050
+ };
1051
+ }
1052
+ /**
1053
+ * Redact PII in XLSX data
1054
+ */
1055
+ redact(buffer, detectionResult, options) {
1056
+ if (!this.xlsx) throw new Error("[XlsxProcessor] XLSX support requires xlsx package. Install with: npm install xlsx");
1057
+ const opts = {
1058
+ ...this.defaultOptions,
1059
+ ...options
1060
+ };
1061
+ const workbook = this.parse(buffer);
1062
+ for (const sheetResult of detectionResult.sheetResults) {
1063
+ const sheet = workbook.Sheets[sheetResult.sheetName];
1064
+ for (const cellMatch of sheetResult.matchesByCell) {
1065
+ const cell = sheet[cellMatch.cell];
1066
+ if (!cell) continue;
1067
+ cell.v = "[REDACTED]";
1068
+ cell.w = "[REDACTED]";
1069
+ if (!opts.preserveFormulas) delete cell.f;
1070
+ cell.t = "s";
1071
+ }
1072
+ }
1073
+ return this.xlsx.write(workbook, {
1074
+ type: "buffer",
1075
+ bookType: "xlsx"
1076
+ });
1077
+ }
1078
+ /**
1079
+ * Get cell value as string
1080
+ */
1081
+ getCellValue(cell) {
1082
+ if (!cell) return "";
1083
+ if (cell.w !== void 0) return String(cell.w);
1084
+ if (cell.v !== void 0) return String(cell.v);
1085
+ return "";
1086
+ }
1087
+ /**
1088
+ * Get row values
1089
+ */
1090
+ getRowValues(sheet, row, startCol, endCol) {
1091
+ const values = [];
1092
+ for (let col = startCol; col <= endCol; col++) {
1093
+ const cell = sheet[this.xlsx.utils.encode_cell({
1094
+ r: row,
1095
+ c: col
1096
+ })];
1097
+ values.push(cell ? this.getCellValue(cell) : void 0);
1098
+ }
1099
+ return values;
1100
+ }
1101
+ /**
1102
+ * Detect if first row is likely a header
1103
+ */
1104
+ detectHeader(sheet, range) {
1105
+ const firstRow = this.getRowValues(sheet, range.s.r, range.s.c, range.e.c);
1106
+ const secondRow = range.s.r + 1 <= range.e.r ? this.getRowValues(sheet, range.s.r + 1, range.s.c, range.e.c) : null;
1107
+ if (!secondRow) return false;
1108
+ const firstRowValues = firstRow.filter((v) => v !== void 0);
1109
+ const secondRowValues = secondRow.filter((v) => v !== void 0);
1110
+ if (firstRowValues.length === 0 || secondRowValues.length === 0) return false;
1111
+ if (firstRowValues.reduce((sum, v) => sum + v.length, 0) / firstRowValues.length > secondRowValues.reduce((sum, v) => sum + v.length, 0) / secondRowValues.length * 1.5) return false;
1112
+ const firstRowNumeric = firstRowValues.filter((v) => !isNaN(Number(v)) && v.trim() !== "").length;
1113
+ return firstRowValues.length - firstRowNumeric >= firstRowNumeric;
1114
+ }
1115
+ /**
1116
+ * Convert column index to letter (0 = A, 25 = Z, 26 = AA)
1117
+ */
1118
+ columnToLetter(col) {
1119
+ let letter = "";
1120
+ while (col >= 0) {
1121
+ letter = String.fromCharCode(col % 26 + 65) + letter;
1122
+ col = Math.floor(col / 26) - 1;
1123
+ }
1124
+ return letter;
1125
+ }
1126
+ /**
1127
+ * Get sheet names to process based on options
1128
+ */
1129
+ getSheetNamesToProcess(workbook, options) {
1130
+ const allSheetNames = workbook.SheetNames;
1131
+ if (options.sheets && options.sheets.length > 0) return options.sheets.filter((name) => allSheetNames.includes(name));
1132
+ if (options.sheetIndices && options.sheetIndices.length > 0) return options.sheetIndices.filter((index) => index >= 0 && index < allSheetNames.length).map((index) => allSheetNames[index]);
1133
+ return allSheetNames;
1134
+ }
1135
+ /**
1136
+ * Boost confidence if column name indicates PII
1137
+ */
1138
+ boostConfidenceFromColumnName(detections, columnName, piiIndicatorNames) {
1139
+ if (!columnName) return detections;
1140
+ const nameLower = columnName.toLowerCase().trim();
1141
+ if (!piiIndicatorNames.some((indicator) => nameLower.includes(indicator.toLowerCase()))) return detections;
1142
+ return detections.map((detection) => ({
1143
+ ...detection,
1144
+ confidence: Math.min(1, (detection.confidence || .5) * 1.2)
1145
+ }));
1146
+ }
1147
+ /**
1148
+ * Extract all cell values as text
1149
+ */
1150
+ extractText(buffer, options) {
1151
+ if (!this.xlsx) throw new Error("[XlsxProcessor] XLSX support requires xlsx package. Install with: npm install xlsx");
1152
+ const workbook = this.parse(buffer);
1153
+ const opts = {
1154
+ ...this.defaultOptions,
1155
+ ...options
1156
+ };
1157
+ const sheetNames = this.getSheetNamesToProcess(workbook, opts);
1158
+ const textParts = [];
1159
+ for (const sheetName of sheetNames) {
1160
+ const sheet = workbook.Sheets[sheetName];
1161
+ const range = this.xlsx.utils.decode_range(sheet["!ref"] || "A1");
1162
+ for (let row = range.s.r; row <= range.e.r; row++) for (let col = range.s.c; col <= range.e.c; col++) {
1163
+ const cell = sheet[this.xlsx.utils.encode_cell({
1164
+ r: row,
1165
+ c: col
1166
+ })];
1167
+ if (cell) {
1168
+ const value = this.getCellValue(cell);
1169
+ if (value.trim().length > 0) textParts.push(value);
1170
+ }
1171
+ }
1172
+ }
1173
+ return textParts.join(" ");
1174
+ }
1175
+ /**
1176
+ * Get workbook metadata
1177
+ */
1178
+ getMetadata(buffer) {
1179
+ if (!this.xlsx) throw new Error("[XlsxProcessor] XLSX support requires xlsx package. Install with: npm install xlsx");
1180
+ const workbook = this.parse(buffer);
1181
+ return {
1182
+ sheetNames: workbook.SheetNames,
1183
+ sheetCount: workbook.SheetNames.length
1184
+ };
1185
+ }
1186
+ };
1187
/**
 * Factory: build a fresh XlsxProcessor.
 */
function createXlsxProcessor() {
  const processor = new XlsxProcessor();
  return processor;
}
1193
+
1194
+ //#endregion
1195
+ //#region src/document/DocumentProcessor.ts
1196
/**
 * Document processor with optional PDF, DOCX, OCR, JSON, CSV, and XLSX support
 * Requires peer dependencies:
 * - pdf-parse (for PDF)
 * - mammoth (for DOCX)
 * - tesseract.js (for OCR/images)
 * - xlsx (for Excel/XLSX)
 *
 * Format-specific work is delegated to the OCR/JSON/CSV/XLSX processors that
 * the constructor instantiates; this class sniffs the format and dispatches.
 */
var DocumentProcessor = class {
  constructor() {
    // pdf-parse and mammoth are optional: a failed require is deliberately
    // ignored and simply leaves the field unset (see isFormatSupported).
    try {
      this.pdfParse = __require("pdf-parse");
    } catch {}
    try {
      this.mammoth = __require("mammoth");
    } catch {}
    this.ocrProcessor = new OCRProcessor();
    this.jsonProcessor = new JsonProcessor();
    this.csvProcessor = new CsvProcessor();
    this.xlsxProcessor = new XlsxProcessor();
  }
  /**
   * Extract text from document buffer
   *
   * @param buffer - document bytes.
   * @param options - optional; `format` skips detection, `maxSize` overrides
   *   the 50 MiB limit, the rest is forwarded to the format-specific extractor.
   * @returns the extracted text.
   * @throws Error when the buffer is too large, the format cannot be detected
   *   or is unsupported, or the format-specific extractor fails.
   */
  async extractText(buffer, options) {
    // Enforce the size limit first: format detection decodes the whole buffer,
    // which is exactly the work the limit is meant to prevent.
    const maxSize = options?.maxSize || 50 * 1024 * 1024;
    if (buffer.length > maxSize) throw new Error(`[DocumentProcessor] Document size (${buffer.length} bytes) exceeds maximum (${maxSize} bytes)`);
    const format = options?.format || this.detectFormat(buffer);
    if (!format) throw new Error("[DocumentProcessor] Unable to detect document format. Supported: PDF, DOCX, TXT, images (with OCR)");
    switch (format) {
      case "pdf": return this.extractPdfText(buffer, options);
      case "docx": return this.extractDocxText(buffer, options);
      case "txt": return buffer.toString("utf-8");
      case "image": return this.extractImageText(buffer, options);
      case "json": return this.extractJsonText(buffer, options);
      case "csv": return this.extractCsvText(buffer, options);
      case "xlsx": return this.extractXlsxText(buffer, options);
      default: throw new Error(`[DocumentProcessor] Unsupported format: ${format}`);
    }
  }
  /**
   * Get document metadata
   *
   * @param buffer - document bytes.
   * @param options - optional; `format` skips detection.
   * @returns an object with at least `format` and `pages`.
   * @throws Error when the format cannot be detected or is unsupported.
   */
  async getMetadata(buffer, options) {
    const format = options?.format || this.detectFormat(buffer);
    if (!format) throw new Error("[DocumentProcessor] Unable to detect document format");
    switch (format) {
      case "pdf": return this.getPdfMetadata(buffer, options);
      case "docx": return this.getDocxMetadata(buffer, options);
      case "txt": return {
        format: "txt",
        pages: void 0
      };
      case "image": return this.getImageMetadata(buffer, options);
      case "json": return this.getJsonMetadata(buffer, options);
      case "csv": return this.getCsvMetadata(buffer, options);
      case "xlsx": return this.getXlsxMetadata(buffer, options);
      default: throw new Error(`[DocumentProcessor] Unsupported format: ${format}`);
    }
  }
  /**
   * Detect document format from buffer
   *
   * Checks, in order: PDF and image magic bytes, ZIP-based Office containers,
   * JSON, delimiter-separated text, and finally "mostly printable" plain text.
   *
   * @returns "pdf" | "image" | "docx" | "xlsx" | "json" | "csv" | "txt", or
   *   null when nothing matches or the buffer is shorter than 4 bytes.
   */
  detectFormat(buffer) {
    if (buffer.length < 4) return null;
    if (buffer.toString("utf-8", 0, 4) === "%PDF") return "pdf";
    // PNG: 0x89 "PNG"
    if (buffer.length >= 8 && buffer[0] === 137 && buffer[1] === 80 && buffer[2] === 78 && buffer[3] === 71) return "image";
    // JPEG: FF D8 FF
    if (buffer[0] === 255 && buffer[1] === 216 && buffer[2] === 255) return "image";
    // TIFF: "II*\0" (little-endian) or "MM\0*" (big-endian)
    if (buffer[0] === 73 && buffer[1] === 73 && buffer[2] === 42 && buffer[3] === 0 || buffer[0] === 77 && buffer[1] === 77 && buffer[2] === 0 && buffer[3] === 42) return "image";
    // BMP: "BM"
    if (buffer[0] === 66 && buffer[1] === 77) return "image";
    // WebP: "RIFF" .... "WEBP"
    if (buffer.length >= 12 && buffer[0] === 82 && buffer[1] === 73 && buffer[2] === 70 && buffer[3] === 70 && buffer[8] === 87 && buffer[9] === 69 && buffer[10] === 66 && buffer[11] === 80) return "image";
    // ZIP container ("PK"): tell DOCX and XLSX apart by their main part names.
    if (buffer[0] === 80 && buffer[1] === 75) {
      // Entry names are stored uncompressed in the archive, so search the whole
      // buffer. "[Content_Types].xml" is present in XLSX files as well and
      // therefore must not decide the format before these specific names do.
      if (buffer.includes("word/document")) return "docx";
      if (buffer.includes("xl/workbook")) return "xlsx";
      const zipHeader = buffer.toString("utf-8", 0, Math.min(500, buffer.length));
      if (zipHeader.includes("word/")) return "docx";
      if (zipHeader.includes("xl/")) return "xlsx";
      // Legacy fallback: an unidentified OOXML package is still reported as DOCX.
      if (zipHeader.includes("[Content_Types].xml")) return "docx";
    }
    const text = buffer.toString("utf-8");
    const trimmed = text.trim();
    if (trimmed.startsWith("{") && trimmed.endsWith("}") || trimmed.startsWith("[") && trimmed.endsWith("]")) {
      if (this.jsonProcessor.isValid(buffer)) return "json";
    }
    const allLines = text.split(/\r?\n/);
    // A trailing newline yields an empty last line that would break the
    // "same delimiter count on every line" test for short files.
    while (allLines.length > 0 && allLines[allLines.length - 1] === "") allLines.pop();
    const lines = allLines.slice(0, 5);
    if (lines.length >= 2) {
      for (const delimiter of [",", "\t", ";", "|"]) {
        // Count literal occurrences; a RegExp built from "|" matches the empty
        // string and never counts actual pipes.
        const counts = lines.map((line) => line.split(delimiter).length - 1);
        if (counts[0] > 0 && counts.every((c) => c === counts[0])) return "csv";
      }
    }
    // Plain text: fewer than 10% control bytes (other than tab, LF, CR) in the
    // first 1000 bytes.
    const sample = buffer.subarray(0, Math.min(1e3, buffer.length));
    let controlBytes = 0;
    for (const byte of sample) {
      if (byte < 32 && byte !== 9 && byte !== 10 && byte !== 13) controlBytes++;
    }
    if (controlBytes < sample.length * .1) return "txt";
    return null;
  }
  /**
   * Check if format is supported
   *
   * "txt", "json" and "csv" are always supported; the other formats depend on
   * whether their optional dependency is available.
   */
  isFormatSupported(format) {
    switch (format) {
      case "pdf": return !!this.pdfParse;
      case "docx": return !!this.mammoth;
      case "txt": return true;
      case "image": return this.ocrProcessor.isAvailable();
      case "json": return true;
      case "csv": return true;
      case "xlsx": return this.xlsxProcessor.isAvailable();
      default: return false;
    }
  }
  /**
   * Extract text from PDF
   *
   * `options.pages` only bounds how far the parser is asked to read (its
   * highest entry is passed as `max`); the returned text is not filtered per
   * page here.
   *
   * @returns the extracted text, "" when the parser reports none.
   * @throws Error when pdf-parse is missing or parsing fails.
   */
  async extractPdfText(buffer, options) {
    if (!this.pdfParse) throw new Error("[DocumentProcessor] PDF support requires pdf-parse. Install with: npm install pdf-parse");
    try {
      // An empty pages array must not turn into Math.max() === -Infinity.
      const pages = options?.pages;
      const data = await this.pdfParse(buffer, {
        password: options?.password,
        max: pages && pages.length > 0 ? Math.max(...pages) : void 0
      });
      return data.text || "";
    } catch (error) {
      throw new Error(`[DocumentProcessor] PDF extraction failed: ${error.message}`, { cause: error });
    }
  }
  /**
   * Extract text from DOCX
   *
   * @throws Error when mammoth is missing or extraction fails.
   */
  async extractDocxText(buffer, _options) {
    if (!this.mammoth) throw new Error("[DocumentProcessor] DOCX support requires mammoth. Install with: npm install mammoth");
    try {
      const result = await this.mammoth.extractRawText({ buffer });
      return result.value || "";
    } catch (error) {
      throw new Error(`[DocumentProcessor] DOCX extraction failed: ${error.message}`, { cause: error });
    }
  }
  /**
   * Get PDF metadata
   *
   * @returns format, page count, title, author, creation/modification dates
   *   and the raw info dictionary under `custom`.
   * @throws Error when pdf-parse is missing or parsing fails.
   */
  async getPdfMetadata(buffer, _options) {
    if (!this.pdfParse) throw new Error("[DocumentProcessor] PDF support requires pdf-parse. Install with: npm install pdf-parse");
    try {
      const data = await this.pdfParse(buffer, { password: _options?.password });
      const info = data.info;
      return {
        format: "pdf",
        pages: data.numpages,
        title: info?.Title,
        author: info?.Author,
        creationDate: info?.CreationDate ? this.parsePdfDate(info.CreationDate) : void 0,
        modifiedDate: info?.ModDate ? this.parsePdfDate(info.ModDate) : void 0,
        custom: info
      };
    } catch (error) {
      throw new Error(`[DocumentProcessor] PDF metadata extraction failed: ${error.message}`, { cause: error });
    }
  }
  /**
   * Convert a PDF info-dictionary date to a Date.
   *
   * PDF dates look like `D:YYYYMMDDHHmmSS+HH'mm'` (everything after the year
   * is optional), which `new Date(string)` cannot parse. Values that do not
   * start with that shape are handed to `new Date` unchanged. A date without
   * a timezone designator is interpreted as UTC.
   */
  parsePdfDate(value) {
    const match = typeof value === "string" ? /^D:(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?([Zz+-])?(\d{2})?'?(\d{2})?/.exec(value) : null;
    if (!match) return new Date(value);
    const [, year, month = "01", day = "01", hour = "00", minute = "00", second = "00", sign, offsetHours = "00", offsetMinutes = "00"] = match;
    let timestamp = Date.UTC(Number(year), Number(month) - 1, Number(day), Number(hour), Number(minute), Number(second));
    if (sign === "+" || sign === "-") {
      const offsetMs = (Number(offsetHours) * 60 + Number(offsetMinutes)) * 6e4;
      // "+" means local time is ahead of UTC, so subtract to get back to UTC.
      timestamp += sign === "+" ? -offsetMs : offsetMs;
    }
    return new Date(timestamp);
  }
  /**
   * Get DOCX metadata
   *
   * Only the format is reported; the buffer is not inspected.
   */
  async getDocxMetadata(_buffer, _options) {
    return {
      format: "docx",
      pages: void 0
    };
  }
  /**
   * Extract text from image using OCR
   *
   * @throws Error when OCR is unavailable or recognition fails.
   */
  async extractImageText(buffer, options) {
    if (!this.ocrProcessor.isAvailable()) throw new Error("[DocumentProcessor] Image/OCR support requires tesseract.js. Install with: npm install tesseract.js");
    try {
      const recognized = await this.ocrProcessor.recognizeText(buffer, options?.ocrOptions);
      return recognized.text;
    } catch (error) {
      throw new Error(`[DocumentProcessor] Image text extraction failed: ${error.message}`, { cause: error });
    }
  }
  /**
   * Get image metadata
   *
   * Runs OCR to report its confidence. When OCR is unavailable or fails the
   * result is the minimal metadata with `usedOCR: false` (best effort, no throw).
   */
  async getImageMetadata(buffer, options) {
    const withoutOcr = () => ({
      format: "image",
      pages: void 0,
      usedOCR: false
    });
    if (!this.ocrProcessor.isAvailable()) return withoutOcr();
    try {
      const recognized = await this.ocrProcessor.recognizeText(buffer, options?.ocrOptions);
      return {
        format: "image",
        pages: void 0,
        usedOCR: true,
        ocrConfidence: recognized.confidence
      };
    } catch {
      return withoutOcr();
    }
  }
  /**
   * Extract text from JSON
   *
   * @throws Error when the JSON processor fails.
   */
  async extractJsonText(buffer, _options) {
    try {
      return this.jsonProcessor.extractText(buffer);
    } catch (error) {
      throw new Error(`[DocumentProcessor] JSON extraction failed: ${error.message}`, { cause: error });
    }
  }
  /**
   * Extract text from CSV
   *
   * @throws Error when the CSV processor fails.
   */
  async extractCsvText(buffer, _options) {
    try {
      return this.csvProcessor.extractText(buffer);
    } catch (error) {
      throw new Error(`[DocumentProcessor] CSV extraction failed: ${error.message}`, { cause: error });
    }
  }
  /**
   * Extract text from XLSX
   *
   * @throws Error when the xlsx package is missing or extraction fails.
   */
  async extractXlsxText(buffer, _options) {
    if (!this.xlsxProcessor.isAvailable()) throw new Error("[DocumentProcessor] XLSX support requires xlsx package. Install with: npm install xlsx");
    try {
      return this.xlsxProcessor.extractText(buffer);
    } catch (error) {
      throw new Error(`[DocumentProcessor] XLSX extraction failed: ${error.message}`, { cause: error });
    }
  }
  /**
   * Get JSON metadata
   *
   * Reports whether the top-level value is an array and its item/key count.
   * Any failure degrades to the minimal metadata (best effort, no throw).
   */
  async getJsonMetadata(buffer, _options) {
    try {
      const data = this.jsonProcessor.parse(buffer);
      const isArray = Array.isArray(data);
      return {
        format: "json",
        pages: void 0,
        custom: {
          isArray,
          itemCount: isArray ? data.length : Object.keys(data).length
        }
      };
    } catch {
      return {
        format: "json",
        pages: void 0
      };
    }
  }
  /**
   * Get CSV metadata
   *
   * Reports row count, column count and headers from the CSV processor.
   * Any failure degrades to the minimal metadata (best effort, no throw).
   */
  async getCsvMetadata(buffer, _options) {
    try {
      const info = this.csvProcessor.getColumnInfo(buffer);
      return {
        format: "csv",
        pages: void 0,
        custom: {
          rowCount: info.rowCount,
          columnCount: info.columnCount,
          headers: info.headers
        }
      };
    } catch {
      return {
        format: "csv",
        pages: void 0
      };
    }
  }
  /**
   * Get XLSX metadata
   *
   * Reports sheet names and sheet count. When the xlsx package is missing or
   * parsing fails the result is the minimal metadata (best effort, no throw).
   */
  async getXlsxMetadata(buffer, _options) {
    if (!this.xlsxProcessor.isAvailable()) return {
      format: "xlsx",
      pages: void 0
    };
    try {
      const metadata = this.xlsxProcessor.getMetadata(buffer);
      return {
        format: "xlsx",
        pages: void 0,
        custom: {
          sheetNames: metadata.sheetNames,
          sheetCount: metadata.sheetCount
        }
      };
    } catch {
      return {
        format: "xlsx",
        pages: void 0
      };
    }
  }
  /**
   * Get OCR processor instance
   */
  getOCRProcessor() {
    return this.ocrProcessor;
  }
  /**
   * Get JSON processor instance
   */
  getJsonProcessor() {
    return this.jsonProcessor;
  }
  /**
   * Get CSV processor instance
   */
  getCsvProcessor() {
    return this.csvProcessor;
  }
  /**
   * Get XLSX processor instance
   */
  getXlsxProcessor() {
    return this.xlsxProcessor;
  }
};
1522
/**
 * Factory: build a fresh DocumentProcessor.
 */
function createDocumentProcessor() {
  const processor = new DocumentProcessor();
  return processor;
}
1528
+
1529
+ //#endregion
1530
+ export { CsvProcessor as a, createJsonProcessor as c, __esmMin as d, __exportAll as f, createXlsxProcessor as i, OCRProcessor as l, __toCommonJS as m, createDocumentProcessor as n, createCsvProcessor as o, __require as p, XlsxProcessor as r, JsonProcessor as s, DocumentProcessor as t, createOCRProcessor as u };
1531
+ //# sourceMappingURL=document-ChTIy4sC.mjs.map