openredaction 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1701 @@
1
+ import {
2
+ __require
3
+ } from "./chunk-WMJKH4XE.mjs";
4
+
5
+ // src/document/OCRProcessor.ts
6
/**
 * OCR front-end backed by the optional `tesseract.js` package.
 *
 * The package is loaded in the constructor. When loading fails the instance
 * is still usable: `isAvailable()` returns false and every recognition
 * method throws an error explaining how to install the dependency.
 */
var OCRProcessor = class {
  constructor() {
    try {
      this.tesseract = __require("tesseract.js");
    } catch {
      // Deliberate best-effort: the dependency is optional, so a failed load
      // simply leaves `this.tesseract` undefined (see assertAvailable()).
    }
  }

  /**
   * Throw the standard "dependency missing" error when tesseract.js could
   * not be loaded.
   *
   * @throws {Error} when tesseract.js is unavailable
   */
  assertAvailable() {
    if (!this.tesseract) {
      throw new Error(
        "[OCRProcessor] OCR support requires tesseract.js. Install with: npm install tesseract.js"
      );
    }
  }

  /**
   * Extract text from an image buffer using OCR.
   *
   * @param {*} buffer - image data handed unchanged to `worker.recognize`
   * @param {object} [options]
   * @param {string|string[]} [options.language="eng"] - one language, or
   *   several that are joined with "+"
   * @param {number} [options.oem=3] - second argument of `createWorker`;
   *   an explicit 0 is honoured
   * @param {*} [options.psm] - when defined, sent as `tessedit_pageseg_mode`
   * @returns {Promise<{text: string, confidence: number, processingTime: number}>}
   *   `processingTime` is in milliseconds, rounded to two decimals
   * @throws {Error} when tesseract.js is missing or recognition fails
   */
  async recognizeText(buffer, options) {
    this.assertAvailable();
    const startTime = performance.now();
    try {
      const language = Array.isArray(options?.language) ? options.language.join("+") : options?.language || "eng";
      // `??` rather than `||`: 0 is falsy and used to be replaced by 3.
      const worker = await this.tesseract.createWorker(language, options?.oem ?? 3);
      let result;
      try {
        if (options?.psm !== void 0) {
          await worker.setParameters({
            tessedit_pageseg_mode: options.psm
          });
        }
        result = await worker.recognize(buffer);
      } finally {
        // Always release the worker, including when recognition throws.
        await worker.terminate();
      }
      const endTime = performance.now();
      return {
        text: result.data.text || "",
        confidence: result.data.confidence || 0,
        processingTime: Math.round((endTime - startTime) * 100) / 100
      };
    } catch (error) {
      throw new Error(`[OCRProcessor] OCR recognition failed: ${error.message}`, { cause: error });
    }
  }

  /**
   * Check if OCR is available (tesseract.js could be loaded).
   *
   * @returns {boolean}
   */
  isAvailable() {
    return !!this.tesseract;
  }

  /**
   * Create a scheduler for batch OCR processing. Any scheduler previously
   * created by this instance is terminated first. Workers are always
   * created for the "eng" language.
   *
   * @param {number} [workerCount=4] - number of workers added to the scheduler
   * @returns {Promise<*>} the new scheduler (also stored on `this.scheduler`)
   * @throws {Error} when tesseract.js is missing
   */
  async createScheduler(workerCount = 4) {
    this.assertAvailable();
    if (this.scheduler) {
      await this.scheduler.terminate();
    }
    this.scheduler = this.tesseract.createScheduler();
    for (let i = 0; i < workerCount; i++) {
      const worker = await this.tesseract.createWorker("eng");
      this.scheduler.addWorker(worker);
    }
    return this.scheduler;
  }

  /**
   * Batch process multiple images through a temporary scheduler.
   *
   * @param {Array<*>} buffers - images, each submitted as a "recognize" job
   * @param {object} [_options] - currently unused
   * @returns {Promise<Array<{text: string, confidence: number, processingTime: number}>>}
   *   one result per buffer, in input order
   * @throws {Error} when tesseract.js is missing or any job fails
   */
  async recognizeBatch(buffers, _options) {
    this.assertAvailable();
    if (buffers.length === 0) {
      // Nothing to recognise: skip starting (and tearing down) workers.
      return [];
    }
    // Never start more workers than there are jobs to run.
    const scheduler = await this.createScheduler(Math.min(4, buffers.length));
    try {
      return await Promise.all(
        buffers.map(async (buffer) => {
          const startTime = performance.now();
          const result = await scheduler.addJob("recognize", buffer);
          const endTime = performance.now();
          return {
            text: result.data.text || "",
            confidence: result.data.confidence || 0,
            processingTime: Math.round((endTime - startTime) * 100) / 100
          };
        })
      );
    } catch (error) {
      throw new Error(`[OCRProcessor] Batch OCR failed: ${error.message}`, { cause: error });
    } finally {
      // Single teardown path for both success and failure.
      await scheduler.terminate();
      this.scheduler = void 0;
    }
  }

  /**
   * Terminate any scheduler still held by this instance.
   *
   * @returns {Promise<void>}
   */
  async cleanup() {
    if (this.scheduler) {
      await this.scheduler.terminate();
      this.scheduler = void 0;
    }
  }
};
116
/**
 * Factory helper that builds a new OCRProcessor.
 *
 * @returns {OCRProcessor} a freshly constructed processor
 */
function createOCRProcessor() {
  const processor = new OCRProcessor();
  return processor;
}
119
+
120
+ // src/document/JsonProcessor.ts
121
/**
 * Detects and redacts PII inside JSON documents.
 *
 * Values are addressed by path strings: object members are joined with "."
 * and array items are written as "[index]" (for example "users[0].email").
 * Detection itself is delegated to a caller-supplied `detector` whose
 * `detect(text)` method returns an object with a `detections` array.
 */
var JsonProcessor = class {
  constructor() {
    this.defaultOptions = {
      maxDepth: 100,
      scanKeys: false,
      alwaysRedact: [],
      skipPaths: [],
      piiIndicatorKeys: [
        "email",
        "e-mail",
        "mail",
        "phone",
        "tel",
        "telephone",
        "mobile",
        "ssn",
        "social_security",
        "address",
        "street",
        "city",
        "zip",
        "postal",
        "name",
        "firstname",
        "lastname",
        "fullname",
        "password",
        "pwd",
        "secret",
        "token",
        "key",
        "card",
        "credit_card",
        "creditcard",
        "account",
        "iban",
        "swift",
        "passport",
        "license",
        "licence"
      ],
      preserveStructure: true
    };
  }

  /**
   * Parse JSON from a string, or from any value whose `toString("utf-8")`
   * yields JSON text (such as a Buffer).
   *
   * @param {string|Buffer} input
   * @returns {*} the parsed value
   * @throws {Error} when the text is not valid JSON
   */
  parse(input) {
    try {
      const text = typeof input === "string" ? input : input.toString("utf-8");
      return JSON.parse(text);
    } catch (error) {
      throw new Error(`[JsonProcessor] Invalid JSON: ${error.message}`);
    }
  }

  /**
   * Detect PII in JSON data and produce a redacted serialisation.
   *
   * For every primitive value (string, number, boolean), in order:
   *  1. paths matching `skipPaths` are ignored;
   *  2. paths matching `alwaysRedact` yield a SENSITIVE_FIELD detection
   *     without consulting the detector;
   *  3. with `scanKeys`, the member name is scanned and any hits are
   *     recorded under `<path>.__key__`;
   *  4. the stringified value is scanned, and hits have their confidence
   *     boosted when the member name looks PII-related.
   *
   * @param {*} data - already-parsed JSON value
   * @param {{detect: function(string): {detections: Array<object>}}} detector
   * @param {object} [options] - overrides for `this.defaultOptions`
   * @returns {object} `{ original, redacted, detections, redactionMap,
   *   stats, pathsDetected, matchesByPath }`; `original` and `redacted` are
   *   JSON strings
   * @throws {Error} when nesting exceeds `maxDepth`
   */
  detect(data, detector, options) {
    const opts = { ...this.defaultOptions, ...options };
    const pathsDetected = [];
    const matchesByPath = {};
    const allDetections = [];
    this.traverse(data, "", opts, (path, value, key) => {
      if (this.shouldSkip(path, opts.skipPaths)) {
        return;
      }
      const valueStr = String(value);
      if (this.shouldAlwaysRedact(path, opts.alwaysRedact)) {
        const detection = {
          type: "SENSITIVE_FIELD",
          value: valueStr,
          placeholder: `[SENSITIVE_FIELD]`,
          position: [0, valueStr.length],
          severity: "high",
          confidence: 1
        };
        matchesByPath[path] = [detection];
        pathsDetected.push(path);
        allDetections.push(detection);
        return;
      }
      if (opts.scanKeys && key) {
        const keyResult = detector.detect(key);
        if (keyResult.detections.length > 0) {
          const keyPath = `${path}.__key__`;
          matchesByPath[keyPath] = keyResult.detections;
          pathsDetected.push(keyPath);
          allDetections.push(...keyResult.detections);
        }
      }
      const result = detector.detect(valueStr);
      if (result.detections.length > 0) {
        const boostedDetections = this.boostConfidenceFromKey(
          result.detections,
          key,
          opts.piiIndicatorKeys
        );
        matchesByPath[path] = boostedDetections;
        pathsDetected.push(path);
        allDetections.push(...boostedDetections);
      }
    });
    const original = JSON.stringify(data);
    const redacted = this.redact(data, {
      original,
      redacted: original,
      detections: allDetections,
      redactionMap: {},
      stats: { piiCount: allDetections.length },
      pathsDetected,
      matchesByPath
    }, opts);
    // Keyed by placeholder, so detections sharing a placeholder overwrite
    // each other and only the last value is kept.
    const redactionMap = {};
    allDetections.forEach((det) => {
      redactionMap[det.placeholder] = det.value;
    });
    return {
      original,
      redacted: typeof redacted === "string" ? redacted : JSON.stringify(redacted),
      detections: allDetections,
      redactionMap,
      stats: {
        piiCount: allDetections.length
      },
      pathsDetected,
      matchesByPath
    };
  }

  /**
   * Redact PII in JSON data.
   *
   * With `preserveStructure` (the default) every path listed in
   * `detectionResult.pathsDetected` is replaced by a type-appropriate
   * neutral value. Otherwise the data is pretty-printed, the detections are
   * applied as text replacements, and the result is parsed again.
   *
   * NOTE(review): the text fallback applies `detection.position` to the
   * serialised document, while detect() in this class records positions
   * relative to each individual value. Confirm that callers using
   * `preserveStructure: false` supply document-relative positions.
   *
   * @param {*} data
   * @param {object} detectionResult - needs `pathsDetected`, plus
   *   `detections` for the text fallback
   * @param {object} [options]
   * @returns {*} a redacted copy of `data`
   */
  redact(data, detectionResult, options) {
    const opts = { ...this.defaultOptions, ...options };
    if (!opts.preserveStructure) {
      return this.parse(this.redactText(JSON.stringify(data, null, 2), detectionResult));
    }
    return this.redactPreservingStructure(data, detectionResult.pathsDetected);
  }

  /**
   * Build a copy of `data` in which each listed path is replaced by a
   * neutral value of the same type: "[REDACTED]" for strings, 0 for
   * numbers, false for booleans, null for null, [] for arrays and {} for
   * objects.
   *
   * @param {*} data
   * @param {string[]} pathsToRedact
   * @returns {*}
   */
  redactPreservingStructure(data, pathsToRedact) {
    const pathSet = new Set(pathsToRedact);
    const redactValue = (value, currentPath) => {
      if (pathSet.has(currentPath)) {
        if (typeof value === "string") {
          return "[REDACTED]";
        } else if (typeof value === "number") {
          return 0;
        } else if (typeof value === "boolean") {
          return false;
        } else if (value === null) {
          return null;
        } else if (Array.isArray(value)) {
          return [];
        } else if (typeof value === "object") {
          return {};
        }
        return "[REDACTED]";
      }
      if (Array.isArray(value)) {
        return value.map(
          (item, index) => redactValue(item, `${currentPath}[${index}]`)
        );
      }
      if (value !== null && typeof value === "object") {
        const result = {};
        for (const [key, val] of Object.entries(value)) {
          const newPath = currentPath ? `${currentPath}.${key}` : key;
          result[key] = redactValue(val, newPath);
        }
        return result;
      }
      return value;
    };
    return redactValue(data, "");
  }

  /**
   * Simple text-based redaction (fallback): replace each detection's
   * `[start, end)` range with its placeholder.
   *
   * @param {string} text
   * @param {{detections: Array<{position: number[], placeholder: string}>}} detectionResult
   * @returns {string}
   */
  redactText(text, detectionResult) {
    let redacted = text;
    // Work from the end of the text backwards so earlier offsets stay valid.
    const sortedDetections = [...detectionResult.detections].sort((a, b) => b.position[0] - a.position[0]);
    for (const detection of sortedDetections) {
      const [start, end] = detection.position;
      redacted = redacted.slice(0, start) + detection.placeholder + redacted.slice(end);
    }
    return redacted;
  }

  /**
   * Walk a JSON structure depth-first and invoke `callback(path, value, key)`
   * exactly once for every primitive value. `key` is the owning member name
   * for object members and undefined for array items and a primitive root.
   *
   * @param {*} obj - current node
   * @param {string} path - path of `obj` ("" for the root)
   * @param {{maxDepth: number}} options
   * @param {function(string, (string|number|boolean), string=): void} callback
   * @param {number} [depth=0] - nesting level of `obj`
   * @param {string} [key] - member name under which `obj` is stored
   * @throws {Error} when `depth` exceeds `options.maxDepth`
   */
  traverse(obj, path, options, callback, depth = 0, key = void 0) {
    if (depth > options.maxDepth) {
      throw new Error(`[JsonProcessor] Maximum depth (${options.maxDepth}) exceeded`);
    }
    if (obj === null || obj === void 0) {
      return;
    }
    if (Array.isArray(obj)) {
      obj.forEach((item, index) => {
        const itemPath = path ? `${path}[${index}]` : `[${index}]`;
        this.traverse(item, itemPath, options, callback, depth + 1);
      });
      return;
    }
    if (typeof obj === "object") {
      for (const [memberName, value] of Object.entries(obj)) {
        const valuePath = path ? `${path}.${memberName}` : memberName;
        this.traverse(value, valuePath, options, callback, depth + 1, memberName);
      }
      return;
    }
    // Primitives are reported only here. Reporting them at the parent as
    // well would deliver every value twice.
    if (this.isPrimitive(obj)) {
      callback(path, obj, key);
    }
  }

  /**
   * Check if value is primitive (string, number, boolean).
   *
   * @param {*} value
   * @returns {boolean}
   */
  isPrimitive(value) {
    return typeof value === "string" || typeof value === "number" || typeof value === "boolean";
  }

  /**
   * Check if path should be skipped.
   *
   * @param {string} path
   * @param {string[]} skipPaths - exact paths or patterns using "*"
   * @returns {boolean}
   */
  shouldSkip(path, skipPaths) {
    return this.matchesPathPattern(path, skipPaths);
  }

  /**
   * Check if path should always be redacted.
   *
   * @param {string} path
   * @param {string[]} alwaysRedact - exact paths or patterns using "*"
   * @returns {boolean}
   */
  shouldAlwaysRedact(path, alwaysRedact) {
    return this.matchesPathPattern(path, alwaysRedact);
  }

  /**
   * Test `path` against a list of path patterns. A pattern matches when it
   * equals the path, or when it matches after treating each "*" as "one or
   * more characters other than '.'". Every other character is literal, so
   * "." and "[0]" in a pattern mean exactly that.
   *
   * @param {string} path
   * @param {string[]} patterns
   * @returns {boolean}
   */
  matchesPathPattern(path, patterns) {
    return patterns.some((pattern) => {
      if (path === pattern) return true;
      // Escape every regex metacharacter first, then turn the (now escaped)
      // "*" back into the wildcard.
      const source = pattern.replace(/[.*+?^${}()|[\]\\]/g, "\\$&").replace(/\\\*/g, "[^.]+");
      return new RegExp(`^${source}$`).test(path);
    });
  }

  /**
   * Boost confidence if the key name indicates PII: when the lower-cased
   * key contains any indicator, each detection's confidence (0.5 when
   * missing or 0) is multiplied by 1.2 and capped at 1.
   *
   * @param {Array<object>} detections
   * @param {string|undefined} key
   * @param {string[]} piiIndicatorKeys
   * @returns {Array<object>} the input array, or boosted copies
   */
  boostConfidenceFromKey(detections, key, piiIndicatorKeys) {
    if (!key) return detections;
    const keyLower = key.toLowerCase();
    const isPiiKey = piiIndicatorKeys.some(
      (indicator) => keyLower.includes(indicator.toLowerCase())
    );
    if (!isPiiKey) return detections;
    return detections.map((detection) => ({
      ...detection,
      confidence: Math.min(1, (detection.confidence || 0.5) * 1.2)
    }));
  }

  /**
   * Extract all string values (and, with `scanKeys`, the member names of
   * primitive members) joined by single spaces.
   *
   * @param {*} data
   * @param {object} [options]
   * @returns {string}
   */
  extractText(data, options) {
    const opts = { ...this.defaultOptions, ...options };
    const textParts = [];
    this.traverse(data, "", opts, (_path, value, key) => {
      if (opts.scanKeys && key) {
        textParts.push(key);
      }
      if (typeof value === "string") {
        textParts.push(value);
      }
    });
    return textParts.join(" ");
  }

  /**
   * Validate a JSON buffer or string.
   *
   * @param {string|Buffer} input
   * @returns {boolean} true when `parse(input)` succeeds
   */
  isValid(input) {
    try {
      this.parse(input);
      return true;
    } catch {
      return false;
    }
  }

  /**
   * Parse JSON Lines (JSONL): split on "\n", drop blank lines and parse
   * each remaining line as one JSON document.
   *
   * @param {string|Buffer} input
   * @returns {Array<*>}
   * @throws {Error} naming the 1-based position among the non-blank lines
   */
  parseJsonLines(input) {
    const text = typeof input === "string" ? input : input.toString("utf-8");
    const lines = text.split("\n").filter((line) => line.trim().length > 0);
    return lines.map((line, index) => {
      try {
        return JSON.parse(line);
      } catch (error) {
        throw new Error(`[JsonProcessor] Invalid JSON at line ${index + 1}: ${error.message}`);
      }
    });
  }

  /**
   * Detect PII in JSON Lines input, one `detect()` result per document.
   *
   * @param {string|Buffer} input
   * @param {object} detector
   * @param {object} [options]
   * @returns {Array<object>}
   */
  detectJsonLines(input, detector, options) {
    const documents = this.parseJsonLines(input);
    return documents.map((doc) => this.detect(doc, detector, options));
  }
};
435
/**
 * Factory helper that builds a new JsonProcessor.
 *
 * @returns {JsonProcessor} a freshly constructed processor
 */
function createJsonProcessor() {
  const processor = new JsonProcessor();
  return processor;
}
438
+
439
+ // src/document/CsvProcessor.ts
440
/**
 * Detects and redacts PII in CSV text.
 *
 * Parsing is line based: the input is split on line breaks first and each
 * line is then split into fields, so a quoted field cannot span lines.
 * Detection is delegated to a caller-supplied `detector` whose
 * `detect(text)` method returns an object with a `detections` array.
 */
var CsvProcessor = class {
  constructor() {
    this.defaultOptions = {
      quote: '"',
      escape: '"',
      skipEmptyLines: true,
      piiIndicatorNames: [
        "email",
        "e-mail",
        "mail",
        "email_address",
        "phone",
        "tel",
        "telephone",
        "mobile",
        "phone_number",
        "ssn",
        "social_security",
        "social_security_number",
        "address",
        "street",
        "street_address",
        "city",
        "zip",
        "zipcode",
        "postal",
        "postcode",
        "name",
        "firstname",
        "first_name",
        "lastname",
        "last_name",
        "fullname",
        "full_name",
        "password",
        "pwd",
        "secret",
        "token",
        "api_key",
        "card",
        "credit_card",
        "creditcard",
        "card_number",
        "account",
        "account_number",
        "iban",
        "swift",
        "passport",
        "passport_number",
        "license",
        "licence",
        "driver_license",
        "dob",
        "date_of_birth",
        "birth_date",
        "birthdate"
      ],
      treatFirstRowAsHeader: true
    };
  }

  /**
   * Parse CSV from buffer or string.
   *
   * @param {string|Buffer} input
   * @param {object} [options] - overrides for `this.defaultOptions`;
   *   `delimiter` is auto-detected when absent and `maxRows` caps the
   *   number of rows returned
   * @returns {Array<{index: number, values: string[]}>} `index` counts the
   *   rows that were kept (blank lines skipped), starting at 0
   */
  parse(input, options) {
    const opts = { ...this.defaultOptions, ...options };
    const text = typeof input === "string" ? input : input.toString("utf-8");
    const delimiter = opts.delimiter || this.detectDelimiter(text);
    const rows = [];
    for (const line of text.split(/\r?\n/)) {
      if (opts.skipEmptyLines && line.trim().length === 0) {
        continue;
      }
      if (opts.maxRows !== void 0 && rows.length >= opts.maxRows) {
        break;
      }
      rows.push({
        index: rows.length,
        values: this.parseRow(line, delimiter, opts.quote, opts.escape)
      });
    }
    return rows;
  }

  /**
   * Detect PII in CSV data and produce a redacted copy of the text.
   *
   * Cells in `alwaysRedactColumns` / `alwaysRedactColumnNames` produce a
   * SENSITIVE_COLUMN detection without consulting the detector; cells in
   * `skipColumns` are ignored; all other cells are scanned, and hits get a
   * confidence boost when the column header looks PII-related.
   *
   * @param {string|Buffer} input
   * @param {{detect: function(string): {detections: Array<object>}}} detector
   * @param {object} [options]
   * @returns {object} `{ original, redacted, detections, redactionMap,
   *   stats, rowCount, columnCount, headers, columnStats, matchesByCell }`.
   *   `matchesByCell[].row` is the row's `index` as returned by `parse()`
   *   (the header row, when present, is index 0). `columnCount` is the
   *   width of the first row.
   */
  detect(input, detector, options) {
    const opts = { ...this.defaultOptions, ...options };
    const original = typeof input === "string" ? input : input.toString("utf-8");
    const rows = this.parse(original, options);
    if (rows.length === 0) {
      return {
        original,
        redacted: original,
        detections: [],
        redactionMap: {},
        stats: {
          piiCount: 0
        },
        rowCount: 0,
        columnCount: 0,
        columnStats: {},
        matchesByCell: []
      };
    }
    const hasHeader = opts.hasHeader !== void 0 ? opts.hasHeader : this.detectHeader(rows);
    const headers = hasHeader ? rows[0].values : void 0;
    const dataRows = hasHeader ? rows.slice(1) : rows;
    const columnCount = rows[0].values.length;
    const columnNameToIndex = /* @__PURE__ */ new Map();
    if (headers) {
      headers.forEach((header, index) => {
        columnNameToIndex.set(header.toLowerCase().trim(), index);
      });
    }
    const alwaysRedactCols = new Set(opts.alwaysRedactColumns || []);
    if (opts.alwaysRedactColumnNames && headers) {
      opts.alwaysRedactColumnNames.forEach((name) => {
        const index = columnNameToIndex.get(name.toLowerCase().trim());
        if (index !== void 0) {
          alwaysRedactCols.add(index);
        }
      });
    }
    const skipCols = new Set(opts.skipColumns || []);
    const columnStats = {};
    const matchesByCell = [];
    const allDetections = [];
    // Number of cells with at least one match, per column.
    const piiCellsByColumn = /* @__PURE__ */ new Map();
    // Stats exist up front for the first row's columns. A data row wider
    // than the first row gets its extra columns created on demand, so a
    // ragged file cannot hit an undefined stats entry.
    const statsFor = (col) => {
      if (columnStats[col] === void 0) {
        columnStats[col] = {
          columnIndex: col,
          columnName: headers?.[col],
          piiCount: 0,
          piiPercentage: 0,
          piiTypes: []
        };
      }
      return columnStats[col];
    };
    const recordCell = (row, col, cellValue, matches) => {
      matchesByCell.push({
        row: row.index,
        column: col,
        columnName: headers?.[col],
        value: cellValue,
        matches
      });
      allDetections.push(...matches);
      piiCellsByColumn.set(col, (piiCellsByColumn.get(col) || 0) + 1);
    };
    for (let col = 0; col < columnCount; col++) {
      statsFor(col);
    }
    for (const row of dataRows) {
      for (let col = 0; col < row.values.length; col++) {
        if (skipCols.has(col)) {
          continue;
        }
        const cellValue = row.values[col];
        if (alwaysRedactCols.has(col)) {
          const detection = {
            type: "SENSITIVE_COLUMN",
            value: cellValue,
            placeholder: `[SENSITIVE_COLUMN_${col}]`,
            position: [0, cellValue.length],
            severity: "high",
            confidence: 1
          };
          recordCell(row, col, cellValue, [detection]);
          statsFor(col).piiCount++;
          continue;
        }
        const result = detector.detect(cellValue);
        if (result.detections.length > 0) {
          const boostedDetections = this.boostConfidenceFromColumnName(
            result.detections,
            headers?.[col],
            opts.piiIndicatorNames || []
          );
          recordCell(row, col, cellValue, boostedDetections);
          const stats = statsFor(col);
          stats.piiCount += boostedDetections.length;
          const columnTypes = new Set(stats.piiTypes);
          boostedDetections.forEach((d) => columnTypes.add(d.type));
          stats.piiTypes = Array.from(columnTypes);
        }
      }
    }
    for (const stats of Object.values(columnStats)) {
      const cellsWithPii = piiCellsByColumn.get(stats.columnIndex) || 0;
      stats.piiPercentage = dataRows.length > 0 ? cellsWithPii / dataRows.length * 100 : 0;
    }
    const redacted = this.redact(original, {
      original,
      redacted: original,
      detections: allDetections,
      redactionMap: {},
      stats: { piiCount: allDetections.length },
      rowCount: dataRows.length,
      columnCount,
      headers,
      columnStats,
      matchesByCell
    }, opts);
    // Keyed by placeholder, so detections sharing a placeholder overwrite
    // each other and only the last value is kept.
    const redactionMap = {};
    allDetections.forEach((det) => {
      redactionMap[det.placeholder] = det.value;
    });
    return {
      original,
      redacted,
      detections: allDetections,
      redactionMap,
      stats: {
        piiCount: allDetections.length
      },
      rowCount: dataRows.length,
      columnCount,
      headers: headers?.filter((h) => h !== void 0),
      columnStats,
      matchesByCell
    };
  }

  /**
   * Redact PII in CSV data: every cell listed in
   * `detectionResult.matchesByCell` is replaced by "[REDACTED]". When
   * `detectionResult.headers` is defined the first row is emitted as-is.
   * Rows are joined with "\n".
   *
   * @param {string|Buffer} input
   * @param {{headers?: string[], matchesByCell: Array<{row: number, column: number}>}} detectionResult
   *   `row` must be the row index as returned by `parse()`
   * @param {object} [options]
   * @returns {string} redacted CSV text ("" when there are no rows)
   */
  redact(input, detectionResult, options) {
    const opts = { ...this.defaultOptions, ...options };
    const rows = this.parse(input, options);
    if (rows.length === 0) {
      return "";
    }
    const delimiter = opts.delimiter || this.detectDelimiter(
      typeof input === "string" ? input : input.toString("utf-8")
    );
    const hasHeader = detectionResult.headers !== void 0;
    // parse() row index -> set of column indexes to blank out.
    const redactedCells = /* @__PURE__ */ new Map();
    for (const { row, column } of detectionResult.matchesByCell) {
      if (!redactedCells.has(row)) {
        redactedCells.set(row, /* @__PURE__ */ new Set());
      }
      redactedCells.get(row).add(column);
    }
    const outputRows = rows.map((row, i) => {
      if (hasHeader && i === 0) {
        return this.formatRow(row.values, delimiter, opts.quote);
      }
      // Look up by the row's own index: detect() records `row.index`, which
      // already counts the header row, so no offset must be applied here.
      const columns = redactedCells.get(row.index);
      const values = columns ? row.values.map((value, colIndex) => columns.has(colIndex) ? "[REDACTED]" : value) : row.values;
      return this.formatRow(values, delimiter, opts.quote);
    });
    return outputRows.join("\n");
  }

  /**
   * Parse a single CSV row. Inside a quoted section the delimiter is
   * literal and a doubled quote character stands for one quote.
   *
   * @param {string} line
   * @param {string} delimiter
   * @param {string} quote
   * @param {string} [_escape] - unused
   * @returns {string[]} field values with the surrounding quotes removed
   */
  parseRow(line, delimiter, quote, _escape) {
    const values = [];
    let current = "";
    let inQuotes = false;
    let i = 0;
    while (i < line.length) {
      const char = line[i];
      const nextChar = line[i + 1];
      if (char === quote) {
        if (inQuotes && nextChar === quote) {
          current += quote;
          i += 2;
        } else {
          inQuotes = !inQuotes;
          i++;
        }
      } else if (char === delimiter && !inQuotes) {
        values.push(current);
        current = "";
        i++;
      } else {
        current += char;
        i++;
      }
    }
    values.push(current);
    return values;
  }

  /**
   * Format a row as CSV. A value is quoted when it contains the delimiter,
   * the quote character or a line break; embedded quotes are doubled.
   *
   * @param {string[]} values
   * @param {string} delimiter
   * @param {string} quote
   * @returns {string}
   */
  formatRow(values, delimiter, quote) {
    return values.map((value) => {
      const needsQuoting = value.includes(delimiter) || value.includes(quote) || value.includes("\n") || value.includes("\r");
      if (!needsQuoting) {
        return value;
      }
      // split/join is a literal replace; a RegExp built from `quote` would
      // misbehave for quote characters that are regex metacharacters.
      const escaped = value.split(quote).join(quote + quote);
      return `${quote}${escaped}${quote}`;
    }).join(delimiter);
  }

  /**
   * Auto-detect the CSV delimiter from the first five lines. Each candidate
   * is scored by `mean / (variance + 1)` of its per-line count outside
   * double quotes; candidates absent from the first line are ignored.
   *
   * @param {string} text
   * @returns {string} the best-scoring candidate, "," when none qualifies
   */
  detectDelimiter(text) {
    // The tab is written as an escape so the candidate is unambiguous.
    const delimiters = [",", "\t", ";", "|"];
    const lines = text.split(/\r?\n/).slice(0, 5);
    let bestDelimiter = ",";
    let bestScore = 0;
    for (const delimiter of delimiters) {
      const counts = lines.map((line) => {
        let count = 0;
        let inQuotes = false;
        for (const char of line) {
          if (char === '"') inQuotes = !inQuotes;
          if (char === delimiter && !inQuotes) count++;
        }
        return count;
      });
      if (counts.length > 0 && counts[0] > 0) {
        const avg = counts.reduce((a, b) => a + b, 0) / counts.length;
        const variance = counts.reduce((sum, c) => sum + Math.pow(c - avg, 2), 0) / counts.length;
        const score = avg / (variance + 1);
        if (score > bestScore) {
          bestScore = score;
          bestDelimiter = delimiter;
        }
      }
    }
    return bestDelimiter;
  }

  /**
   * Guess whether the first row is a header. Returns false with fewer than
   * two rows, or when the first row's average cell length exceeds 1.5x the
   * second row's; otherwise true when non-numeric cells are at least as
   * many as numeric ones in the first row.
   *
   * @param {Array<{values: string[]}>} rows
   * @returns {boolean}
   */
  detectHeader(rows) {
    if (rows.length < 2) {
      return false;
    }
    const firstRow = rows[0].values;
    const secondRow = rows[1].values;
    const firstRowAvgLen = firstRow.reduce((sum, v) => sum + v.length, 0) / firstRow.length;
    const secondRowAvgLen = secondRow.reduce((sum, v) => sum + v.length, 0) / secondRow.length;
    if (firstRowAvgLen > secondRowAvgLen * 1.5) {
      return false;
    }
    const firstRowNumeric = firstRow.filter((v) => !isNaN(Number(v)) && v.trim() !== "").length;
    const firstRowNonNumeric = firstRow.length - firstRowNumeric;
    return firstRowNonNumeric >= firstRowNumeric;
  }

  /**
   * Boost confidence if the column name indicates PII: when the trimmed,
   * lower-cased name contains any indicator, each detection's confidence
   * (0.5 when missing or 0) is multiplied by 1.2 and capped at 1.
   *
   * @param {Array<object>} detections
   * @param {string|undefined} columnName
   * @param {string[]} piiIndicatorNames
   * @returns {Array<object>} the input array, or boosted copies
   */
  boostConfidenceFromColumnName(detections, columnName, piiIndicatorNames) {
    if (!columnName) return detections;
    const nameLower = columnName.toLowerCase().trim();
    const isPiiColumn = piiIndicatorNames.some(
      (indicator) => nameLower.includes(indicator.toLowerCase())
    );
    if (!isPiiColumn) return detections;
    return detections.map((detection) => ({
      ...detection,
      confidence: Math.min(1, (detection.confidence || 0.5) * 1.2)
    }));
  }

  /**
   * Extract all non-blank cell values, joined by single spaces.
   *
   * @param {string|Buffer} input
   * @param {object} [options]
   * @returns {string}
   */
  extractText(input, options) {
    const rows = this.parse(input, options);
    const textParts = [];
    for (const row of rows) {
      for (const value of row.values) {
        if (value.trim().length > 0) {
          textParts.push(value);
        }
      }
    }
    return textParts.join(" ");
  }

  /**
   * Get column statistics without running PII detection.
   *
   * @param {string|Buffer} input
   * @param {object} [options]
   * @returns {{columnCount: number, rowCount: number, headers?: string[], sampleRows: string[][]}}
   *   `sampleRows` holds at most the first five data rows
   */
  getColumnInfo(input, options) {
    const rows = this.parse(input, options);
    if (rows.length === 0) {
      return {
        columnCount: 0,
        rowCount: 0,
        sampleRows: []
      };
    }
    const opts = { ...this.defaultOptions, ...options };
    const hasHeader = opts.hasHeader !== void 0 ? opts.hasHeader : this.detectHeader(rows);
    const headers = hasHeader ? rows[0].values : void 0;
    const dataRows = hasHeader ? rows.slice(1) : rows;
    const sampleRows = dataRows.slice(0, 5).map((r) => r.values);
    return {
      columnCount: rows[0].values.length,
      rowCount: dataRows.length,
      headers,
      sampleRows
    };
  }
};
850
/**
 * Factory helper that builds a new CsvProcessor.
 *
 * @returns {CsvProcessor} a freshly constructed processor
 */
function createCsvProcessor() {
  const processor = new CsvProcessor();
  return processor;
}
853
+
854
+ // src/document/XlsxProcessor.ts
855
+ var XlsxProcessor = class {
856
+ constructor() {
857
+ this.defaultOptions = {
858
+ piiIndicatorNames: [
859
+ "email",
860
+ "e-mail",
861
+ "mail",
862
+ "email_address",
863
+ "phone",
864
+ "tel",
865
+ "telephone",
866
+ "mobile",
867
+ "phone_number",
868
+ "ssn",
869
+ "social_security",
870
+ "social_security_number",
871
+ "address",
872
+ "street",
873
+ "street_address",
874
+ "city",
875
+ "zip",
876
+ "zipcode",
877
+ "postal",
878
+ "postcode",
879
+ "name",
880
+ "firstname",
881
+ "first_name",
882
+ "lastname",
883
+ "last_name",
884
+ "fullname",
885
+ "full_name",
886
+ "password",
887
+ "pwd",
888
+ "secret",
889
+ "token",
890
+ "api_key",
891
+ "card",
892
+ "credit_card",
893
+ "creditcard",
894
+ "card_number",
895
+ "account",
896
+ "account_number",
897
+ "iban",
898
+ "swift",
899
+ "passport",
900
+ "passport_number",
901
+ "license",
902
+ "licence",
903
+ "driver_license",
904
+ "dob",
905
+ "date_of_birth",
906
+ "birth_date",
907
+ "birthdate"
908
+ ],
909
+ preserveFormatting: true,
910
+ preserveFormulas: true
911
+ };
912
+ try {
913
+ this.xlsx = __require("xlsx");
914
+ } catch {
915
+ }
916
+ }
917
+ /**
918
+ * Check if XLSX support is available
919
+ */
920
+ isAvailable() {
921
+ return !!this.xlsx;
922
+ }
923
+ /**
924
+ * Parse XLSX from buffer
925
+ */
926
+ parse(buffer) {
927
+ if (!this.xlsx) {
928
+ throw new Error(
929
+ "[XlsxProcessor] XLSX support requires xlsx package. Install with: npm install xlsx"
930
+ );
931
+ }
932
+ try {
933
+ return this.xlsx.read(buffer, { type: "buffer", cellFormula: true, cellStyles: true });
934
+ } catch (error) {
935
+ throw new Error(`[XlsxProcessor] Failed to parse XLSX: ${error.message}`);
936
+ }
937
+ }
938
+ /**
939
+ * Detect PII in XLSX data
940
+ */
941
+ detect(buffer, detector, options) {
942
+ if (!this.xlsx) {
943
+ throw new Error(
944
+ "[XlsxProcessor] XLSX support requires xlsx package. Install with: npm install xlsx"
945
+ );
946
+ }
947
+ const opts = { ...this.defaultOptions, ...options };
948
+ const workbook = this.parse(buffer);
949
+ const sheetNames = this.getSheetNamesToProcess(workbook, opts);
950
+ const sheetResults = [];
951
+ const allDetections = [];
952
+ const allTypes = /* @__PURE__ */ new Set();
953
+ for (let sheetIndex = 0; sheetIndex < sheetNames.length; sheetIndex++) {
954
+ const sheetName = sheetNames[sheetIndex];
955
+ const sheet = workbook.Sheets[sheetName];
956
+ const sheetResult = this.detectSheet(
957
+ sheet,
958
+ sheetName,
959
+ sheetIndex,
960
+ detector,
961
+ opts
962
+ );
963
+ sheetResults.push(sheetResult);
964
+ allDetections.push(...sheetResult.matchesByCell.flatMap((c) => c.matches));
965
+ sheetResult.matchesByCell.forEach((cell) => {
966
+ cell.matches.forEach((det) => allTypes.add(det.type));
967
+ });
968
+ }
969
+ const original = this.extractText(buffer, options);
970
+ const redactedBuffer = this.redact(buffer, {
971
+ original,
972
+ redacted: original,
973
+ detections: allDetections,
974
+ redactionMap: {},
975
+ stats: { piiCount: allDetections.length },
976
+ sheetResults,
977
+ sheetCount: sheetResults.length
978
+ }, options);
979
+ const redacted = this.extractText(redactedBuffer, options);
980
+ const redactionMap = {};
981
+ allDetections.forEach((det) => {
982
+ redactionMap[det.placeholder] = det.value;
983
+ });
984
+ return {
985
+ original,
986
+ redacted,
987
+ detections: allDetections,
988
+ redactionMap,
989
+ stats: {
990
+ piiCount: allDetections.length
991
+ },
992
+ sheetResults,
993
+ sheetCount: sheetResults.length
994
+ };
995
+ }
996
+ /**
997
+ * Detect PII in a single sheet
998
+ */
999
+ detectSheet(sheet, sheetName, sheetIndex, detector, options) {
1000
+ const range = this.xlsx.utils.decode_range(sheet["!ref"] || "A1");
1001
+ const startRow = range.s.r;
1002
+ const endRow = options.maxRows !== void 0 ? Math.min(range.e.r, startRow + options.maxRows - 1) : range.e.r;
1003
+ const startCol = range.s.c;
1004
+ const endCol = range.e.c;
1005
+ const columnCount = endCol - startCol + 1;
1006
+ const hasHeader = options.hasHeader !== void 0 ? options.hasHeader : this.detectHeader(sheet, range);
1007
+ const headers = hasHeader ? this.getRowValues(sheet, startRow, startCol, endCol) : void 0;
1008
+ const dataStartRow = hasHeader ? startRow + 1 : startRow;
1009
+ const columnNameToIndex = /* @__PURE__ */ new Map();
1010
+ if (headers) {
1011
+ headers.forEach((header, index) => {
1012
+ if (header) {
1013
+ columnNameToIndex.set(header.toLowerCase().trim(), index);
1014
+ }
1015
+ });
1016
+ }
1017
+ const alwaysRedactCols = new Set(options.alwaysRedactColumns || []);
1018
+ if (options.alwaysRedactColumnNames && headers) {
1019
+ options.alwaysRedactColumnNames.forEach((name) => {
1020
+ const index = columnNameToIndex.get(name.toLowerCase().trim());
1021
+ if (index !== void 0) {
1022
+ alwaysRedactCols.add(index);
1023
+ }
1024
+ });
1025
+ }
1026
+ const skipCols = new Set(options.skipColumns || []);
1027
+ const columnStats = {};
1028
+ for (let col = 0; col <= endCol - startCol; col++) {
1029
+ columnStats[col] = {
1030
+ columnIndex: col,
1031
+ columnLetter: this.columnToLetter(col),
1032
+ columnName: headers?.[col],
1033
+ piiCount: 0,
1034
+ piiPercentage: 0,
1035
+ piiTypes: []
1036
+ };
1037
+ }
1038
+ const matchesByCell = [];
1039
+ for (let row = dataStartRow; row <= endRow; row++) {
1040
+ for (let col = startCol; col <= endCol; col++) {
1041
+ const colIndex = col - startCol;
1042
+ if (skipCols.has(colIndex)) {
1043
+ continue;
1044
+ }
1045
+ const cellRef = this.xlsx.utils.encode_cell({ r: row, c: col });
1046
+ const cell = sheet[cellRef];
1047
+ if (!cell) continue;
1048
+ const cellValue = this.getCellValue(cell);
1049
+ if (!cellValue) continue;
1050
+ const cellFormula = cell.f;
1051
+ if (alwaysRedactCols.has(colIndex)) {
1052
+ const detection = {
1053
+ type: "SENSITIVE_COLUMN",
1054
+ value: cellValue,
1055
+ placeholder: `[SENSITIVE_COLUMN_${colIndex}]`,
1056
+ position: [0, cellValue.length],
1057
+ severity: "high",
1058
+ confidence: 1
1059
+ };
1060
+ matchesByCell.push({
1061
+ cell: cellRef,
1062
+ row: row + 1,
1063
+ // 1-indexed for Excel
1064
+ column: colIndex,
1065
+ columnLetter: this.columnToLetter(colIndex),
1066
+ columnName: headers?.[colIndex],
1067
+ value: cellValue,
1068
+ formula: cellFormula,
1069
+ matches: [detection]
1070
+ });
1071
+ columnStats[colIndex].piiCount++;
1072
+ continue;
1073
+ }
1074
+ const result = detector.detect(cellValue);
1075
+ if (result.detections.length > 0) {
1076
+ const boostedDetections = this.boostConfidenceFromColumnName(
1077
+ result.detections,
1078
+ headers?.[colIndex],
1079
+ options.piiIndicatorNames || []
1080
+ );
1081
+ matchesByCell.push({
1082
+ cell: cellRef,
1083
+ row: row + 1,
1084
+ // 1-indexed for Excel
1085
+ column: colIndex,
1086
+ columnLetter: this.columnToLetter(colIndex),
1087
+ columnName: headers?.[colIndex],
1088
+ value: cellValue,
1089
+ formula: cellFormula,
1090
+ matches: boostedDetections
1091
+ });
1092
+ columnStats[colIndex].piiCount += boostedDetections.length;
1093
+ const columnTypes = new Set(columnStats[colIndex].piiTypes);
1094
+ boostedDetections.forEach((d) => columnTypes.add(d.type));
1095
+ columnStats[colIndex].piiTypes = Array.from(columnTypes);
1096
+ }
1097
+ }
1098
+ }
1099
+ const dataRowCount = endRow - dataStartRow + 1;
1100
+ for (let col = 0; col <= endCol - startCol; col++) {
1101
+ const rowsWithPii = matchesByCell.filter((m) => m.column === col).length;
1102
+ columnStats[col].piiPercentage = dataRowCount > 0 ? rowsWithPii / dataRowCount * 100 : 0;
1103
+ }
1104
+ return {
1105
+ sheetName,
1106
+ sheetIndex,
1107
+ rowCount: dataRowCount,
1108
+ columnCount,
1109
+ headers: headers?.filter((h) => h !== void 0),
1110
+ columnStats,
1111
+ matchesByCell
1112
+ };
1113
+ }
1114
+ /**
1115
+ * Redact PII in XLSX data
1116
+ */
1117
+ redact(buffer, detectionResult, options) {
1118
+ if (!this.xlsx) {
1119
+ throw new Error(
1120
+ "[XlsxProcessor] XLSX support requires xlsx package. Install with: npm install xlsx"
1121
+ );
1122
+ }
1123
+ const opts = { ...this.defaultOptions, ...options };
1124
+ const workbook = this.parse(buffer);
1125
+ for (const sheetResult of detectionResult.sheetResults) {
1126
+ const sheet = workbook.Sheets[sheetResult.sheetName];
1127
+ for (const cellMatch of sheetResult.matchesByCell) {
1128
+ const cellRef = cellMatch.cell;
1129
+ const cell = sheet[cellRef];
1130
+ if (!cell) continue;
1131
+ cell.v = "[REDACTED]";
1132
+ cell.w = "[REDACTED]";
1133
+ if (!opts.preserveFormulas) {
1134
+ delete cell.f;
1135
+ }
1136
+ cell.t = "s";
1137
+ }
1138
+ }
1139
+ return this.xlsx.write(workbook, { type: "buffer", bookType: "xlsx" });
1140
+ }
1141
+ /**
1142
+ * Get cell value as string
1143
+ */
1144
+ getCellValue(cell) {
1145
+ if (!cell) return "";
1146
+ if (cell.w !== void 0) {
1147
+ return String(cell.w);
1148
+ }
1149
+ if (cell.v !== void 0) {
1150
+ return String(cell.v);
1151
+ }
1152
+ return "";
1153
+ }
1154
+ /**
1155
+ * Get row values
1156
+ */
1157
+ getRowValues(sheet, row, startCol, endCol) {
1158
+ const values = [];
1159
+ for (let col = startCol; col <= endCol; col++) {
1160
+ const cellRef = this.xlsx.utils.encode_cell({ r: row, c: col });
1161
+ const cell = sheet[cellRef];
1162
+ values.push(cell ? this.getCellValue(cell) : void 0);
1163
+ }
1164
+ return values;
1165
+ }
1166
+ /**
1167
+ * Detect if first row is likely a header
1168
+ */
1169
+ detectHeader(sheet, range) {
1170
+ const firstRow = this.getRowValues(sheet, range.s.r, range.s.c, range.e.c);
1171
+ const secondRow = range.s.r + 1 <= range.e.r ? this.getRowValues(sheet, range.s.r + 1, range.s.c, range.e.c) : null;
1172
+ if (!secondRow) return false;
1173
+ const firstRowValues = firstRow.filter((v) => v !== void 0);
1174
+ const secondRowValues = secondRow.filter((v) => v !== void 0);
1175
+ if (firstRowValues.length === 0 || secondRowValues.length === 0) {
1176
+ return false;
1177
+ }
1178
+ const firstRowAvgLen = firstRowValues.reduce((sum, v) => sum + v.length, 0) / firstRowValues.length;
1179
+ const secondRowAvgLen = secondRowValues.reduce((sum, v) => sum + v.length, 0) / secondRowValues.length;
1180
+ if (firstRowAvgLen > secondRowAvgLen * 1.5) {
1181
+ return false;
1182
+ }
1183
+ const firstRowNumeric = firstRowValues.filter((v) => !isNaN(Number(v)) && v.trim() !== "").length;
1184
+ const firstRowNonNumeric = firstRowValues.length - firstRowNumeric;
1185
+ return firstRowNonNumeric >= firstRowNumeric;
1186
+ }
1187
+ /**
1188
+ * Convert column index to letter (0 = A, 25 = Z, 26 = AA)
1189
+ */
1190
+ columnToLetter(col) {
1191
+ let letter = "";
1192
+ while (col >= 0) {
1193
+ letter = String.fromCharCode(col % 26 + 65) + letter;
1194
+ col = Math.floor(col / 26) - 1;
1195
+ }
1196
+ return letter;
1197
+ }
1198
+ /**
1199
+ * Get sheet names to process based on options
1200
+ */
1201
+ getSheetNamesToProcess(workbook, options) {
1202
+ const allSheetNames = workbook.SheetNames;
1203
+ if (options.sheets && options.sheets.length > 0) {
1204
+ return options.sheets.filter((name) => allSheetNames.includes(name));
1205
+ }
1206
+ if (options.sheetIndices && options.sheetIndices.length > 0) {
1207
+ return options.sheetIndices.filter((index) => index >= 0 && index < allSheetNames.length).map((index) => allSheetNames[index]);
1208
+ }
1209
+ return allSheetNames;
1210
+ }
1211
+ /**
1212
+ * Boost confidence if column name indicates PII
1213
+ */
1214
+ boostConfidenceFromColumnName(detections, columnName, piiIndicatorNames) {
1215
+ if (!columnName) return detections;
1216
+ const nameLower = columnName.toLowerCase().trim();
1217
+ const isPiiColumn = piiIndicatorNames.some(
1218
+ (indicator) => nameLower.includes(indicator.toLowerCase())
1219
+ );
1220
+ if (!isPiiColumn) return detections;
1221
+ return detections.map((detection) => ({
1222
+ ...detection,
1223
+ confidence: Math.min(1, (detection.confidence || 0.5) * 1.2)
1224
+ }));
1225
+ }
1226
+ /**
1227
+ * Extract all cell values as text
1228
+ */
1229
+ extractText(buffer, options) {
1230
+ if (!this.xlsx) {
1231
+ throw new Error(
1232
+ "[XlsxProcessor] XLSX support requires xlsx package. Install with: npm install xlsx"
1233
+ );
1234
+ }
1235
+ const workbook = this.parse(buffer);
1236
+ const opts = { ...this.defaultOptions, ...options };
1237
+ const sheetNames = this.getSheetNamesToProcess(workbook, opts);
1238
+ const textParts = [];
1239
+ for (const sheetName of sheetNames) {
1240
+ const sheet = workbook.Sheets[sheetName];
1241
+ const range = this.xlsx.utils.decode_range(sheet["!ref"] || "A1");
1242
+ for (let row = range.s.r; row <= range.e.r; row++) {
1243
+ for (let col = range.s.c; col <= range.e.c; col++) {
1244
+ const cellRef = this.xlsx.utils.encode_cell({ r: row, c: col });
1245
+ const cell = sheet[cellRef];
1246
+ if (cell) {
1247
+ const value = this.getCellValue(cell);
1248
+ if (value.trim().length > 0) {
1249
+ textParts.push(value);
1250
+ }
1251
+ }
1252
+ }
1253
+ }
1254
+ }
1255
+ return textParts.join(" ");
1256
+ }
1257
+ /**
1258
+ * Get workbook metadata
1259
+ */
1260
+ getMetadata(buffer) {
1261
+ if (!this.xlsx) {
1262
+ throw new Error(
1263
+ "[XlsxProcessor] XLSX support requires xlsx package. Install with: npm install xlsx"
1264
+ );
1265
+ }
1266
+ const workbook = this.parse(buffer);
1267
+ return {
1268
+ sheetNames: workbook.SheetNames,
1269
+ sheetCount: workbook.SheetNames.length
1270
+ };
1271
+ }
1272
+ };
1273
/**
 * Factory helper: builds a fresh XlsxProcessor instance.
 * @returns {XlsxProcessor} a newly constructed processor
 */
function createXlsxProcessor() {
  const processor = new XlsxProcessor();
  return processor;
}
1276
+
1277
+ // src/document/DocumentProcessor.ts
1278
/**
 * Format-agnostic front end for text and metadata extraction.
 *
 * Sniffs the format of a buffer (PDF, DOCX, XLSX, images, JSON, CSV, plain
 * text) and delegates to the matching extractor. `pdf-parse`, `mammoth`,
 * `tesseract.js` and `xlsx` are optional dependencies; the formats backed by
 * them throw a descriptive error when the package is not installed.
 */
var DocumentProcessor = class {
  constructor() {
    // Optional dependencies: a failed require is deliberately ignored here and
    // reported later, when the corresponding format is actually requested.
    try {
      this.pdfParse = __require("pdf-parse");
    } catch {
    }
    try {
      this.mammoth = __require("mammoth");
    } catch {
    }
    this.ocrProcessor = new OCRProcessor();
    this.jsonProcessor = new JsonProcessor();
    this.csvProcessor = new CsvProcessor();
    this.xlsxProcessor = new XlsxProcessor();
  }
  /**
   * Extract text from document buffer
   *
   * @param {Buffer} buffer document bytes
   * @param {object} [options] `format` skips auto-detection, `maxSize`
   *   overrides the 50 MiB limit; the rest is forwarded to the extractor
   * @returns {Promise<string>} extracted text
   * @throws {Error} when the format is undetectable or unsupported, the
   *   buffer exceeds the size limit, or the extractor fails
   */
  async extractText(buffer, options) {
    const format = options?.format || this.detectFormat(buffer);
    if (!format) {
      throw new Error("[DocumentProcessor] Unable to detect document format. Supported: PDF, DOCX, TXT, images (with OCR)");
    }
    const maxSize = options?.maxSize || 50 * 1024 * 1024;
    if (buffer.length > maxSize) {
      throw new Error(`[DocumentProcessor] Document size (${buffer.length} bytes) exceeds maximum (${maxSize} bytes)`);
    }
    switch (format) {
      case "pdf":
        return this.extractPdfText(buffer, options);
      case "docx":
        return this.extractDocxText(buffer, options);
      case "txt":
        return buffer.toString("utf-8");
      case "image":
        return this.extractImageText(buffer, options);
      case "json":
        return this.extractJsonText(buffer, options);
      case "csv":
        return this.extractCsvText(buffer, options);
      case "xlsx":
        return this.extractXlsxText(buffer, options);
      default:
        throw new Error(`[DocumentProcessor] Unsupported format: ${format}`);
    }
  }
  /**
   * Get document metadata
   *
   * @param {Buffer} buffer document bytes
   * @param {object} [options] `format` skips auto-detection
   * @returns {Promise<object>} object with at least `format` and `pages`
   * @throws {Error} when the format is undetectable or unsupported
   */
  async getMetadata(buffer, options) {
    const format = options?.format || this.detectFormat(buffer);
    if (!format) {
      throw new Error("[DocumentProcessor] Unable to detect document format");
    }
    switch (format) {
      case "pdf":
        return this.getPdfMetadata(buffer, options);
      case "docx":
        return this.getDocxMetadata(buffer, options);
      case "txt":
        return {
          format: "txt",
          pages: void 0
        };
      case "image":
        return this.getImageMetadata(buffer, options);
      case "json":
        return this.getJsonMetadata(buffer, options);
      case "csv":
        return this.getCsvMetadata(buffer, options);
      case "xlsx":
        return this.getXlsxMetadata(buffer, options);
      default:
        throw new Error(`[DocumentProcessor] Unsupported format: ${format}`);
    }
  }
  /**
   * Detect document format from buffer
   *
   * Checks, in order: binary magic numbers (PDF, PNG, JPEG, TIFF, BMP, WebP),
   * ZIP-based Office containers, JSON, delimiter-separated text, and finally
   * falls back to "txt" when fewer than 10% of the first 1000 bytes are
   * control characters.
   *
   * @param {Buffer} buffer document bytes
   * @returns {string|null} one of "pdf", "image", "docx", "xlsx", "json",
   *   "csv", "txt", or null when nothing matches
   */
  detectFormat(buffer) {
    if (buffer.length < 4) {
      return null;
    }
    if (buffer.toString("utf-8", 0, 4) === "%PDF") {
      return "pdf";
    }
    // PNG: 0x89 "PNG"
    if (buffer.length >= 8 && buffer[0] === 137 && buffer[1] === 80 && buffer[2] === 78 && buffer[3] === 71) {
      return "image";
    }
    // JPEG: FF D8 FF
    if (buffer[0] === 255 && buffer[1] === 216 && buffer[2] === 255) {
      return "image";
    }
    // TIFF: "II*\0" (little-endian) or "MM\0*" (big-endian)
    if (buffer[0] === 73 && buffer[1] === 73 && buffer[2] === 42 && buffer[3] === 0 || buffer[0] === 77 && buffer[1] === 77 && buffer[2] === 0 && buffer[3] === 42) {
      return "image";
    }
    // BMP: "BM"
    if (buffer[0] === 66 && buffer[1] === 77) {
      return "image";
    }
    // WebP: "RIFF" .... "WEBP"
    if (buffer.length >= 12 && buffer[0] === 82 && buffer[1] === 73 && buffer[2] === 70 && buffer[3] === 70 && buffer[8] === 87 && buffer[9] === 69 && buffer[10] === 66 && buffer[11] === 80) {
      return "image";
    }
    // ZIP container ("PK"): may be an Office Open XML document.
    if (buffer[0] === 80 && buffer[1] === 75) {
      const officeFormat = this.detectOfficeZipFormat(buffer);
      if (officeFormat) {
        return officeFormat;
      }
    }
    const text = buffer.toString("utf-8");
    const trimmed = text.trim();
    if (trimmed.startsWith("{") && trimmed.endsWith("}") || trimmed.startsWith("[") && trimmed.endsWith("]")) {
      if (this.jsonProcessor.isValid(buffer)) {
        return "json";
      }
    }
    const allLines = text.split(/\r?\n/);
    // A terminating newline yields a trailing empty "line" whose delimiter
    // count is 0; without dropping it, short CSV files were never detected.
    if (allLines[allLines.length - 1] === "") {
      allLines.pop();
    }
    const lines = allLines.slice(0, 5);
    if (lines.length >= 2) {
      // NOTE(review): the second delimiter renders as a single whitespace
      // character in the reviewed source; tab is assumed - confirm.
      const delimiters = [",", "\t", ";", "|"];
      for (const delimiter of delimiters) {
        // Literal split instead of `new RegExp(delimiter)`: "|" as a regex
        // matches the empty string, which counted characters, not pipes.
        const counts = lines.map((line) => line.split(delimiter).length - 1);
        if (counts[0] > 0 && counts.every((c) => c === counts[0])) {
          return "csv";
        }
      }
    }
    const sample = buffer.slice(0, Math.min(1e3, buffer.length));
    // Control bytes other than TAB, LF and CR.
    const nonPrintable = sample.filter((byte) => byte < 32 && byte !== 9 && byte !== 10 && byte !== 13).length;
    if (nonPrintable < sample.length * 0.1) {
      return "txt";
    }
    return null;
  }
  /**
   * Classify a ZIP buffer as DOCX or XLSX.
   *
   * Every Office Open XML package contains `[Content_Types].xml`, so that
   * name cannot tell the formats apart. ZIP stores entry names uncompressed,
   * so the main-part names are searched for across the whole buffer first;
   * the original first-500-bytes heuristic is kept as a fallback.
   *
   * @param {Buffer} buffer bytes starting with the "PK" signature
   * @returns {string|null} "docx", "xlsx", or null when neither is recognised
   */
  detectOfficeZipFormat(buffer) {
    if (buffer.includes("word/document")) {
      return "docx";
    }
    if (buffer.includes("xl/workbook")) {
      return "xlsx";
    }
    const zipHeader = buffer.toString("utf-8", 0, Math.min(500, buffer.length));
    if (zipHeader.includes("word/") || zipHeader.includes("[Content_Types].xml")) {
      return "docx";
    }
    if (zipHeader.includes("xl/")) {
      return "xlsx";
    }
    return null;
  }
  /**
   * Check if format is supported
   *
   * @param {string} format format identifier as returned by detectFormat
   * @returns {boolean} true when the format can be processed in this
   *   environment (i.e. its optional dependency, if any, was loaded)
   */
  isFormatSupported(format) {
    switch (format) {
      case "pdf":
        return !!this.pdfParse;
      case "docx":
        return !!this.mammoth;
      case "image":
        return this.ocrProcessor.isAvailable();
      case "xlsx":
        return this.xlsxProcessor.isAvailable();
      // Handled natively, no optional dependency involved.
      case "txt":
      case "json":
      case "csv":
        return true;
      default:
        return false;
    }
  }
  /**
   * Extract text from PDF
   *
   * `options.password` is forwarded to pdf-parse; when `options.pages` is
   * given, its largest entry is passed as pdf-parse's `max` option.
   *
   * @returns {Promise<string>} extracted text, "" when pdf-parse yields none
   * @throws {Error} when pdf-parse is missing or parsing fails
   */
  async extractPdfText(buffer, options) {
    if (!this.pdfParse) {
      throw new Error(
        "[DocumentProcessor] PDF support requires pdf-parse. Install with: npm install pdf-parse"
      );
    }
    try {
      const data = await this.pdfParse(buffer, {
        password: options?.password,
        max: options?.pages ? Math.max(...options.pages) : void 0
      });
      // Always hand back a string, whether or not a page list was supplied.
      return data.text || "";
    } catch (error) {
      throw new Error(`[DocumentProcessor] PDF extraction failed: ${error.message}`, { cause: error });
    }
  }
  /**
   * Extract text from DOCX
   *
   * @returns {Promise<string>} raw text as produced by mammoth
   * @throws {Error} when mammoth is missing or extraction fails
   */
  async extractDocxText(buffer, _options) {
    if (!this.mammoth) {
      throw new Error(
        "[DocumentProcessor] DOCX support requires mammoth. Install with: npm install mammoth"
      );
    }
    try {
      const result = await this.mammoth.extractRawText({ buffer });
      return result.value || "";
    } catch (error) {
      throw new Error(`[DocumentProcessor] DOCX extraction failed: ${error.message}`, { cause: error });
    }
  }
  /**
   * Get PDF metadata
   *
   * @returns {Promise<object>} page count plus title/author/dates taken from
   *   the `info` object returned by pdf-parse (also exposed as `custom`)
   * @throws {Error} when pdf-parse is missing or parsing fails
   */
  async getPdfMetadata(buffer, _options) {
    if (!this.pdfParse) {
      throw new Error(
        "[DocumentProcessor] PDF support requires pdf-parse. Install with: npm install pdf-parse"
      );
    }
    try {
      const data = await this.pdfParse(buffer, {
        password: _options?.password
      });
      return {
        format: "pdf",
        pages: data.numpages,
        title: data.info?.Title,
        author: data.info?.Author,
        // NOTE(review): PDF date strings usually look like "D:YYYYMMDD...",
        // which `new Date` may not parse - verify these are valid dates.
        creationDate: data.info?.CreationDate ? new Date(data.info.CreationDate) : void 0,
        modifiedDate: data.info?.ModDate ? new Date(data.info.ModDate) : void 0,
        custom: data.info
      };
    } catch (error) {
      throw new Error(`[DocumentProcessor] PDF metadata extraction failed: ${error.message}`, { cause: error });
    }
  }
  /**
   * Get DOCX metadata
   *
   * Only the format is reported; the buffer is not inspected.
   */
  async getDocxMetadata(_buffer, _options) {
    return {
      format: "docx",
      pages: void 0
      // Word doesn't have fixed pages
    };
  }
  /**
   * Extract text from image using OCR
   *
   * @returns {Promise<string>} recognised text
   * @throws {Error} when tesseract.js is missing or recognition fails
   */
  async extractImageText(buffer, options) {
    if (!this.ocrProcessor.isAvailable()) {
      throw new Error(
        "[DocumentProcessor] Image/OCR support requires tesseract.js. Install with: npm install tesseract.js"
      );
    }
    try {
      const result = await this.ocrProcessor.recognizeText(buffer, options?.ocrOptions);
      return result.text;
    } catch (error) {
      throw new Error(`[DocumentProcessor] Image text extraction failed: ${error.message}`, { cause: error });
    }
  }
  /**
   * Get image metadata
   *
   * Runs a full OCR pass to obtain the confidence value. Never throws: when
   * OCR is unavailable or fails, `usedOCR` is false.
   */
  async getImageMetadata(buffer, options) {
    if (!this.ocrProcessor.isAvailable()) {
      return {
        format: "image",
        pages: void 0,
        usedOCR: false
      };
    }
    try {
      const result = await this.ocrProcessor.recognizeText(buffer, options?.ocrOptions);
      return {
        format: "image",
        pages: void 0,
        usedOCR: true,
        ocrConfidence: result.confidence
      };
    } catch {
      // Best effort: metadata is still returned, just without OCR details.
      return {
        format: "image",
        pages: void 0,
        usedOCR: false
      };
    }
  }
  /**
   * Extract text from JSON
   *
   * @throws {Error} when the JSON processor fails
   */
  async extractJsonText(buffer, _options) {
    try {
      return this.jsonProcessor.extractText(buffer);
    } catch (error) {
      throw new Error(`[DocumentProcessor] JSON extraction failed: ${error.message}`, { cause: error });
    }
  }
  /**
   * Extract text from CSV
   *
   * @throws {Error} when the CSV processor fails
   */
  async extractCsvText(buffer, _options) {
    try {
      return this.csvProcessor.extractText(buffer);
    } catch (error) {
      throw new Error(`[DocumentProcessor] CSV extraction failed: ${error.message}`, { cause: error });
    }
  }
  /**
   * Extract text from XLSX
   *
   * @throws {Error} when the xlsx package is missing or extraction fails
   */
  async extractXlsxText(buffer, _options) {
    if (!this.xlsxProcessor.isAvailable()) {
      throw new Error(
        "[DocumentProcessor] XLSX support requires xlsx package. Install with: npm install xlsx"
      );
    }
    try {
      return this.xlsxProcessor.extractText(buffer);
    } catch (error) {
      throw new Error(`[DocumentProcessor] XLSX extraction failed: ${error.message}`, { cause: error });
    }
  }
  /**
   * Get JSON metadata
   *
   * Never throws: when parsing fails only the format is reported.
   */
  async getJsonMetadata(buffer, _options) {
    try {
      const data = this.jsonProcessor.parse(buffer);
      const isArray = Array.isArray(data);
      return {
        format: "json",
        pages: void 0,
        custom: {
          isArray,
          // Element count for arrays, own-key count otherwise.
          itemCount: isArray ? data.length : Object.keys(data).length
        }
      };
    } catch {
      return {
        format: "json",
        pages: void 0
      };
    }
  }
  /**
   * Get CSV metadata
   *
   * Never throws: when column analysis fails only the format is reported.
   */
  async getCsvMetadata(buffer, _options) {
    try {
      const info = this.csvProcessor.getColumnInfo(buffer);
      return {
        format: "csv",
        pages: void 0,
        custom: {
          rowCount: info.rowCount,
          columnCount: info.columnCount,
          headers: info.headers
        }
      };
    } catch {
      return {
        format: "csv",
        pages: void 0
      };
    }
  }
  /**
   * Get XLSX metadata
   *
   * Never throws: when the xlsx package is missing or parsing fails only the
   * format is reported.
   */
  async getXlsxMetadata(buffer, _options) {
    if (!this.xlsxProcessor.isAvailable()) {
      return {
        format: "xlsx",
        pages: void 0
      };
    }
    try {
      const metadata = this.xlsxProcessor.getMetadata(buffer);
      return {
        format: "xlsx",
        pages: void 0,
        custom: {
          sheetNames: metadata.sheetNames,
          sheetCount: metadata.sheetCount
        }
      };
    } catch {
      return {
        format: "xlsx",
        pages: void 0
      };
    }
  }
  /**
   * Get OCR processor instance
   */
  getOCRProcessor() {
    return this.ocrProcessor;
  }
  /**
   * Get JSON processor instance
   */
  getJsonProcessor() {
    return this.jsonProcessor;
  }
  /**
   * Get CSV processor instance
   */
  getCsvProcessor() {
    return this.csvProcessor;
  }
  /**
   * Get XLSX processor instance
   */
  getXlsxProcessor() {
    return this.xlsxProcessor;
  }
};
1685
/**
 * Factory helper: builds a fresh DocumentProcessor instance.
 * @returns {DocumentProcessor} a newly constructed processor
 */
function createDocumentProcessor() {
  const processor = new DocumentProcessor();
  return processor;
}
1688
+
1689
+ export {
1690
+ OCRProcessor,
1691
+ createOCRProcessor,
1692
+ JsonProcessor,
1693
+ createJsonProcessor,
1694
+ CsvProcessor,
1695
+ createCsvProcessor,
1696
+ XlsxProcessor,
1697
+ createXlsxProcessor,
1698
+ DocumentProcessor,
1699
+ createDocumentProcessor
1700
+ };
1701
+ //# sourceMappingURL=chunk-7OGNW2MU.mjs.map