od-temp 1.0.4 → 1.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cli.cjs +17459 -0
- package/dist/index.d.mts.map +1 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +1980 -25
- package/dist/index.js.map +1 -0
- package/dist/index.mjs +2004 -8
- package/dist/index.mjs.map +1 -0
- package/package.json +2 -7
package/dist/index.js
CHANGED
|
@@ -40,14 +40,13 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
|
|
|
40
40
|
/**
 * Produce the CommonJS-facing value for a bundled module namespace.
 *
 * When `mod` has an own "module.exports" property, that value is returned
 * as-is. Otherwise a fresh object carrying a non-enumerable-by-default
 * `__esModule: true` marker is created and handed to `__copyProps` together
 * with `mod`; whatever `__copyProps` returns is the result.
 */
var __toCommonJS = (mod) => {
	if (__hasOwnProp.call(mod, "module.exports")) return mod["module.exports"];
	const flagged = __defProp({}, "__esModule", { value: true });
	return __copyProps(flagged, mod);
};
|
|
41
41
|
|
|
42
42
|
//#endregion
|
|
43
|
-
const require_document = require('./document-C4T2JLdu.js');
|
|
44
|
-
const require_workers = require('./workers-BmzAqLSu.js');
|
|
45
|
-
const require_HealthCheck = require('./HealthCheck-CFX1wPqE.js');
|
|
46
43
|
let crypto = require("crypto");
|
|
47
44
|
let fs = require("fs");
|
|
48
45
|
fs = __toESM(fs);
|
|
49
46
|
let path = require("path");
|
|
50
47
|
path = __toESM(path);
|
|
48
|
+
let worker_threads = require("worker_threads");
|
|
49
|
+
let os = require("os");
|
|
51
50
|
let react = require("react");
|
|
52
51
|
|
|
53
52
|
//#region src/audit/AuditLogger.ts
|
|
@@ -15754,6 +15753,1961 @@ var init_ConfigExporter = __esmMin((() => {
|
|
|
15754
15753
|
};
|
|
15755
15754
|
}));
|
|
15756
15755
|
|
|
15756
|
+
//#endregion
|
|
15757
|
+
//#region src/health/HealthCheck.ts
|
|
15758
|
+
// Export table for src/health/HealthCheck.ts.
// Every entry is a zero-argument function returning the current binding, so
// the value is looked up when the function is invoked rather than when this
// table is built (HealthChecker is only assigned inside init_HealthCheck).
var HealthCheck_exports = /* @__PURE__ */ __exportAll({
	HealthChecker: () => {
		return HealthChecker;
	},
	createHealthChecker: () => {
		return createHealthChecker;
	},
	healthCheckMiddleware: () => {
		return healthCheckMiddleware;
	}
});
|
|
15763
|
+
/**
 * Factory wrapper around the HealthChecker constructor.
 *
 * @param detector - Detector instance passed unchanged to `new HealthChecker(...)`.
 * @returns The newly constructed HealthChecker.
 */
function createHealthChecker(detector) {
	const checker = new HealthChecker(detector);
	return checker;
}
|
|
15769
|
+
/**
 * Build an Express-style `(req, res)` handler that reports detector health.
 *
 * One HealthChecker is created per middleware and reused for every request.
 * Each request runs a full check (test detection and performance probe on,
 * both thresholds at 100) and answers with the check result as JSON:
 * HTTP 200 for "healthy" and "degraded", HTTP 503 for anything else.
 * If the check or the response call throws, a minimal 503 "unhealthy"
 * payload with the error message is sent instead.
 *
 * @param detector - Detector handed to the HealthChecker.
 * @returns Async request handler; the request argument is not used.
 */
function healthCheckMiddleware(detector) {
	const checker = new HealthChecker(detector);
	return async (_req, res) => {
		try {
			const report = await checker.check({
				testDetection: true,
				checkPerformance: true,
				performanceThreshold: 100,
				memoryThreshold: 100
			});
			// A degraded service still answers 200; only other states map to 503.
			const isServing = report.status === "healthy" || report.status === "degraded";
			res.status(isServing ? 200 : 503).json(report);
		} catch (error) {
			const failure = {
				status: "unhealthy",
				timestamp: new Date().toISOString(),
				error: error.message
			};
			res.status(503).json(failure);
		}
	};
}
|
|
15793
|
+
var HealthChecker;
var init_HealthCheck = __esmMin((() => {
	/**
	 * Runs health probes against a detector object.
	 *
	 * The detector is expected to offer `detect(text)`, `getPatterns()` and
	 * `getCacheStats()`; those are the only members this class calls.
	 * Individual probes return `{ status, message, value?, threshold? }`
	 * where status is "pass", "warn" or "fail".
	 */
	HealthChecker = class {
		/**
		 * @param detector - Detector to probe; the construction time is
		 *   recorded so uptime can be reported later.
		 */
		constructor(detector) {
			this.detector = detector;
			this.initTime = Date.now();
		}
		/**
		 * Run every probe and assemble a full report.
		 *
		 * @param options - Optional switches: `testDetection` and
		 *   `checkPerformance` (both skipped only when exactly `false`),
		 *   `performanceThreshold` and `memoryThreshold`.
		 * @returns Report with overall `status`, ISO `timestamp`, per-probe
		 *   `checks`, `metrics`, and the messages of failed / warned probes
		 *   in `errors` / `warnings`. Never rejects: an unexpected exception
		 *   turns the report "unhealthy" and is recorded in `errors`.
		 */
		async check(options = {}) {
			const passed = (message) => ({
				status: "pass",
				message
			});
			const report = {
				status: "healthy",
				timestamp: new Date().toISOString(),
				checks: {
					detector: passed("Detector initialized"),
					patterns: passed("Patterns loaded"),
					performance: passed("Performance acceptable"),
					memory: passed("Memory usage normal")
				},
				metrics: {
					totalPatterns: 0,
					compiledPatterns: 0,
					cacheEnabled: false,
					uptime: Date.now() - this.initTime
				},
				errors: [],
				warnings: []
			};
			try {
				const checks = report.checks;
				checks.detector = await this.checkDetector(options);
				checks.patterns = await this.checkPatterns();
				if (options.checkPerformance !== false) {
					checks.performance = await this.checkPerformance(options.performanceThreshold);
				}
				checks.memory = await this.checkMemory(options.memoryThreshold);
				report.metrics = this.collectMetrics();
				report.status = this.determineOverallStatus(checks);
				for (const outcome of Object.values(checks)) {
					if (outcome.status === "fail") report.errors.push(outcome.message);
					else if (outcome.status === "warn") report.warnings.push(outcome.message);
				}
			} catch (error) {
				report.status = "unhealthy";
				report.errors.push(`Health check failed: ${error.message}`);
			}
			return report;
		}
		/**
		 * Probe the detector with a fixed sample sentence containing an
		 * e-mail address (skipped when `options.testDetection === false`).
		 *
		 * @returns "fail" when the detector throws or returns no `detections`
		 *   property, "warn" when the list is empty, otherwise "pass".
		 */
		async checkDetector(options) {
			try {
				const runProbe = options.testDetection !== false;
				if (runProbe) {
					const probe = await this.detector.detect("Test email: test@example.com");
					if (!probe?.detections) {
						return {
							status: "fail",
							message: "Detector returned invalid result"
						};
					}
					if (probe.detections.length === 0) {
						return {
							status: "warn",
							message: "Test detection found no PII (expected at least 1)"
						};
					}
				}
				return {
					status: "pass",
					message: "Detector functioning correctly"
				};
			} catch (error) {
				return {
					status: "fail",
					message: `Detector check failed: ${error.message}`
				};
			}
		}
		/**
		 * Check how many patterns the detector reports.
		 *
		 * @returns "fail" for none, "warn" for fewer than 10, otherwise
		 *   "pass"; `value` carries the count.
		 */
		async checkPatterns() {
			try {
				const loaded = this.detector.getPatterns();
				const count = loaded ? loaded.length : 0;
				if (count === 0) {
					return {
						status: "fail",
						message: "No patterns loaded",
						value: 0,
						threshold: 1
					};
				}
				if (count < 10) {
					return {
						status: "warn",
						message: "Very few patterns loaded (expected more)",
						value: count,
						threshold: 10
					};
				}
				return {
					status: "pass",
					message: `${count} patterns loaded`,
					value: count
				};
			} catch (error) {
				return {
					status: "fail",
					message: `Pattern check failed: ${error.message}`
				};
			}
		}
		/**
		 * Time one detection over a short fixed sample.
		 *
		 * @param threshold - Duration in milliseconds above which the probe
		 *   warns; above twice this value it fails. Defaults to 100.
		 */
		async checkPerformance(threshold = 100) {
			try {
				const sample = "Test: john@example.com, phone: 555-123-4567, IP: 192.168.1.1";
				const startedAt = performance.now();
				await this.detector.detect(sample);
				const elapsed = performance.now() - startedAt;
				let status = "pass";
				let message;
				if (elapsed > threshold * 2) {
					status = "fail";
					message = `Performance degraded: ${elapsed.toFixed(2)}ms`;
				} else if (elapsed > threshold) {
					status = "warn";
					message = `Performance slower than expected: ${elapsed.toFixed(2)}ms`;
				} else {
					message = `Performance good: ${elapsed.toFixed(2)}ms`;
				}
				return {
					status,
					message,
					value: elapsed,
					threshold
				};
			} catch (error) {
				return {
					status: "fail",
					message: `Performance check failed: ${error.message}`
				};
			}
		}
		/**
		 * Compare the current heap usage with a threshold.
		 *
		 * @param threshold - Heap size in MB above which the probe warns;
		 *   above twice this value it fails. Defaults to 100.
		 * @returns "pass" without a value when `process.memoryUsage` is not
		 *   available; "warn" when reading the usage throws.
		 */
		async checkMemory(threshold = 100) {
			try {
				const hasMemoryApi = typeof process !== "undefined" && Boolean(process.memoryUsage);
				if (!hasMemoryApi) {
					return {
						status: "pass",
						message: "Memory check skipped (not in Node.js)"
					};
				}
				const heapUsedMB = process.memoryUsage().heapUsed / 1024 / 1024;
				let status = "pass";
				let message;
				if (heapUsedMB > threshold * 2) {
					status = "fail";
					message = `High memory usage: ${heapUsedMB.toFixed(2)}MB`;
				} else if (heapUsedMB > threshold) {
					status = "warn";
					message = `Elevated memory usage: ${heapUsedMB.toFixed(2)}MB`;
				} else {
					message = `Memory usage normal: ${heapUsedMB.toFixed(2)}MB`;
				}
				return {
					status,
					message,
					value: heapUsedMB,
					threshold
				};
			} catch (error) {
				return {
					status: "warn",
					message: `Memory check skipped: ${error.message}`
				};
			}
		}
		/**
		 * Snapshot pattern count, cache statistics and uptime.
		 * `compiledPatterns` is reported as the same number as `totalPatterns`.
		 */
		collectMetrics() {
			const patternCount = this.detector.getPatterns().length;
			const cache = this.detector.getCacheStats();
			return {
				totalPatterns: patternCount,
				compiledPatterns: patternCount,
				cacheSize: cache.size,
				cacheEnabled: cache.enabled,
				uptime: Date.now() - this.initTime
			};
		}
		/**
		 * Fold probe outcomes into one status: any "fail" gives "unhealthy",
		 * otherwise any "warn" gives "degraded", otherwise "healthy".
		 */
		determineOverallStatus(checks) {
			const seen = new Set(Object.values(checks).map((outcome) => outcome.status));
			if (seen.has("fail")) return "unhealthy";
			return seen.has("warn") ? "degraded" : "healthy";
		}
		/**
		 * Cheap liveness probe: only verifies that at least one pattern is
		 * loaded. Never rejects.
		 */
		async quickCheck() {
			try {
				const hasPatterns = this.detector.getPatterns().length !== 0;
				if (hasPatterns) {
					return {
						status: "healthy",
						message: "OK"
					};
				}
				return {
					status: "unhealthy",
					message: "No patterns loaded"
				};
			} catch (error) {
				return {
					status: "unhealthy",
					message: `Error: ${error.message}`
				};
			}
		}
		/**
		 * Collect debugging information: a fixed version string, pattern
		 * totals, cache statistics, uptime and the current time.
		 * `patterns.types` counts distinct prefixes of `pattern.type` up to
		 * the first underscore.
		 */
		getSystemInfo() {
			const patterns = this.detector.getPatterns();
			const cache = this.detector.getCacheStats();
			const typePrefixes = new Set(patterns.map((pattern) => pattern.type.split("_")[0]));
			return {
				version: "1.0.0",
				patterns: {
					total: patterns.length,
					types: typePrefixes.size
				},
				cache: {
					enabled: cache.enabled,
					size: cache.size,
					maxSize: cache.maxSize
				},
				uptime: Date.now() - this.initTime,
				timestamp: new Date().toISOString()
			};
		}
	};
}));
|
|
16041
|
+
|
|
16042
|
+
//#endregion
|
|
16043
|
+
//#region src/document/OCRProcessor.ts
|
|
16044
|
+
/**
 * Factory wrapper around the OCRProcessor constructor.
 *
 * @returns A new OCRProcessor built with no arguments.
 */
function createOCRProcessor() {
	const processor = new OCRProcessor();
	return processor;
}
|
|
16050
|
+
var OCRProcessor;
var init_OCRProcessor = __esmMin((() => {
	/**
	 * Thin wrapper around the optional `tesseract.js` package.
	 *
	 * The package is loaded lazily in the constructor; when it cannot be
	 * loaded every OCR method throws a descriptive error and `isAvailable()`
	 * returns false.
	 */
	OCRProcessor = class {
		constructor() {
			try {
				this.tesseract = require("tesseract.js");
			} catch {
				// tesseract.js is an optional dependency: leave `this.tesseract`
				// unset so isAvailable() reports false and the OCR methods throw
				// an install hint instead of failing at construction time.
			}
		}
		/**
		 * Extract text from an image buffer using OCR.
		 *
		 * @param buffer - Image data passed to `worker.recognize`.
		 * @param options - Optional `language` (string or array, joined with
		 *   "+", default "eng"), `oem` (default 3) and `psm` (applied as
		 *   `tessedit_pageseg_mode` when not undefined).
		 * @returns `{ text, confidence, processingTime }`; processingTime is
		 *   in milliseconds rounded to two decimals.
		 * @throws Error when tesseract.js is missing or recognition fails.
		 */
		async recognizeText(buffer, options) {
			if (!this.tesseract) throw new Error("[OCRProcessor] OCR support requires tesseract.js. Install with: npm install tesseract.js");
			const startTime = performance.now();
			try {
				const language = Array.isArray(options?.language) ? options.language.join("+") : options?.language || "eng";
				// `??` rather than `||`: an explicit `oem: 0` must reach
				// tesseract.js instead of being replaced by the default.
				const worker = await this.tesseract.createWorker(language, options?.oem ?? 3);
				let result;
				try {
					if (options?.psm !== void 0) await worker.setParameters({ tessedit_pageseg_mode: options.psm });
					result = await worker.recognize(buffer);
				} catch (error) {
					// Release the worker on failure too, but keep reporting the
					// original problem rather than a secondary terminate error.
					try {
						await worker.terminate();
					} catch {}
					throw error;
				}
				await worker.terminate();
				const endTime = performance.now();
				const processingTime = Math.round((endTime - startTime) * 100) / 100;
				return {
					text: result.data.text || "",
					confidence: result.data.confidence || 0,
					processingTime
				};
			} catch (error) {
				throw new Error(`[OCRProcessor] OCR recognition failed: ${error.message}`);
			}
		}
		/**
		 * Check if OCR is available (tesseract.js could be loaded).
		 */
		isAvailable() {
			return !!this.tesseract;
		}
		/**
		 * Create a scheduler for batch OCR processing and store it on the
		 * instance. Any scheduler created earlier is terminated first.
		 *
		 * @param workerCount - Number of "eng" workers to attach (default 4).
		 * @returns The new scheduler.
		 * @throws Error when tesseract.js is missing; rethrows a worker
		 *   start-up failure after tearing the partial scheduler down.
		 */
		async createScheduler(workerCount = 4) {
			if (!this.tesseract) throw new Error("[OCRProcessor] OCR support requires tesseract.js. Install with: npm install tesseract.js");
			if (this.scheduler) await this.scheduler.terminate();
			const scheduler = this.tesseract.createScheduler();
			this.scheduler = scheduler;
			try {
				for (let i = 0; i < workerCount; i++) {
					scheduler.addWorker(await this.tesseract.createWorker("eng"));
				}
			} catch (error) {
				// Do not keep a half-built scheduler (and the workers already
				// attached to it) around when a later worker fails to start.
				try {
					await scheduler.terminate();
				} catch {}
				if (this.scheduler === scheduler) this.scheduler = void 0;
				throw error;
			}
			return scheduler;
		}
		/**
		 * Batch process multiple images through a freshly created scheduler,
		 * which is terminated again before this method settles.
		 *
		 * @param buffers - Images to recognise; results keep this order.
		 * @param _options - Currently unused (workers always use "eng").
		 * @returns One `{ text, confidence, processingTime }` per buffer.
		 * @throws Error when tesseract.js is missing or any job fails.
		 */
		async recognizeBatch(buffers, _options) {
			if (!this.tesseract) throw new Error("[OCRProcessor] OCR support requires tesseract.js. Install with: npm install tesseract.js");
			const scheduler = await this.createScheduler();
			try {
				return await Promise.all(buffers.map(async (buffer) => {
					const startTime = performance.now();
					const result = await scheduler.addJob("recognize", buffer);
					const endTime = performance.now();
					return {
						text: result.data.text || "",
						confidence: result.data.confidence || 0,
						processingTime: Math.round((endTime - startTime) * 100) / 100
					};
				}));
			} catch (error) {
				throw new Error(`[OCRProcessor] Batch OCR failed: ${error.message}`);
			} finally {
				// Single teardown path for both success and failure.
				await scheduler.terminate();
				this.scheduler = void 0;
			}
		}
		/**
		 * Terminate any scheduler still stored on the instance.
		 */
		async cleanup() {
			if (this.scheduler) {
				await this.scheduler.terminate();
				this.scheduler = void 0;
			}
		}
	};
}));
|
|
16142
|
+
|
|
16143
|
+
//#endregion
|
|
16144
|
+
//#region src/document/JsonProcessor.ts
|
|
16145
|
+
/**
 * Factory wrapper around the JsonProcessor constructor.
 *
 * @returns A new JsonProcessor built with no arguments.
 */
function createJsonProcessor() {
	const processor = new JsonProcessor();
	return processor;
}
|
|
16151
|
+
var JsonProcessor;
var init_JsonProcessor = __esmMin((() => {
	/**
	 * Walks parsed JSON, runs a detector over every primitive value and
	 * produces a redacted copy.
	 *
	 * Value locations are expressed as paths such as `user.email` or
	 * `items[0].name`. The detector only needs an async `detect(text)`
	 * returning `{ detections }`.
	 */
	JsonProcessor = class {
		constructor() {
			this.defaultOptions = {
				maxDepth: 100,
				scanKeys: false,
				alwaysRedact: [],
				skipPaths: [],
				piiIndicatorKeys: [
					"email",
					"e-mail",
					"mail",
					"phone",
					"tel",
					"telephone",
					"mobile",
					"ssn",
					"social_security",
					"address",
					"street",
					"city",
					"zip",
					"postal",
					"name",
					"firstname",
					"lastname",
					"fullname",
					"password",
					"pwd",
					"secret",
					"token",
					"key",
					"card",
					"credit_card",
					"creditcard",
					"account",
					"iban",
					"swift",
					"passport",
					"license",
					"licence"
				],
				preserveStructure: true
			};
		}
		/**
		 * Parse JSON from a string or from anything with `toString("utf-8")`.
		 *
		 * @throws Error prefixed with "[JsonProcessor] Invalid JSON" on bad input.
		 */
		parse(input) {
			try {
				const text = typeof input === "string" ? input : input.toString("utf-8");
				return JSON.parse(text);
			} catch (error) {
				throw new Error(`[JsonProcessor] Invalid JSON: ${error.message}`);
			}
		}
		/**
		 * Detect PII in already-parsed JSON data.
		 *
		 * For every primitive value (visited once): paths matching
		 * `skipPaths` are ignored, paths matching `alwaysRedact` become a
		 * synthetic SENSITIVE_FIELD detection, otherwise the stringified value
		 * (and, with `scanKeys`, the key) is sent to `detector.detect`.
		 *
		 * @param data - Parsed JSON value.
		 * @param detector - Object with an async `detect(text)` method.
		 * @param options - Overrides merged over `this.defaultOptions`.
		 * @returns `{ original, redacted, detections, redactionMap, stats,
		 *   pathsDetected, matchesByPath }`; `original` and `redacted` are
		 *   JSON strings.
		 */
		async detect(data, detector, options) {
			const opts = {
				...this.defaultOptions,
				...options
			};
			const pathsDetected = [];
			const matchesByPath = {};
			const allDetections = [];
			const promises = [];
			this.traverse(data, "", opts, (path, value, key) => {
				promises.push((async () => {
					if (this.shouldSkip(path, opts.skipPaths)) return;
					if (this.shouldAlwaysRedact(path, opts.alwaysRedact)) {
						const detection = {
							type: "SENSITIVE_FIELD",
							value: String(value),
							placeholder: `[SENSITIVE_FIELD]`,
							position: [0, String(value).length],
							severity: "high",
							confidence: 1
						};
						matchesByPath[path] = [detection];
						pathsDetected.push(path);
						allDetections.push(detection);
						return;
					}
					if (opts.scanKeys && key) {
						const keyResult = await detector.detect(key);
						if (keyResult.detections.length > 0) {
							const keyPath = `${path}.__key__`;
							matchesByPath[keyPath] = keyResult.detections;
							pathsDetected.push(keyPath);
							allDetections.push(...keyResult.detections);
						}
					}
					const valueStr = String(value);
					const result = await detector.detect(valueStr);
					if (result.detections.length > 0) {
						const boostedDetections = this.boostConfidenceFromKey(result.detections, key, opts.piiIndicatorKeys);
						matchesByPath[path] = boostedDetections;
						pathsDetected.push(path);
						allDetections.push(...boostedDetections);
					}
				})());
			});
			await Promise.all(promises);
			const original = JSON.stringify(data);
			const redacted = this.redact(data, {
				original,
				redacted: original,
				detections: allDetections,
				redactionMap: {},
				stats: { piiCount: allDetections.length },
				pathsDetected,
				matchesByPath
			}, opts);
			const redactionMap = {};
			allDetections.forEach((det) => {
				redactionMap[det.placeholder] = det.value;
			});
			return {
				original,
				redacted: typeof redacted === "string" ? redacted : JSON.stringify(redacted),
				detections: allDetections,
				redactionMap,
				stats: { piiCount: allDetections.length },
				pathsDetected,
				matchesByPath
			};
		}
		/**
		 * Redact PII in JSON data.
		 *
		 * With `preserveStructure` (the default) every path listed in
		 * `detectionResult.pathsDetected` is replaced by a type-matching
		 * placeholder. Otherwise the pretty-printed document is redacted by
		 * text offsets and parsed again.
		 */
		redact(data, detectionResult, options) {
			const opts = {
				...this.defaultOptions,
				...options
			};
			if (!opts.preserveStructure) {
				// NOTE(review): detect() obtains positions by running the detector
				// on individual values, while redactText() slices the whole
				// serialized document with them — presumably these offsets do not
				// line up; confirm before relying on preserveStructure: false.
				return this.parse(this.redactText(JSON.stringify(data, null, 2), detectionResult));
			}
			return this.redactPreservingStructure(data, detectionResult.pathsDetected);
		}
		/**
		 * Return a copy of `data` in which each listed path is replaced by a
		 * neutral value of the same kind: "[REDACTED]" for strings, 0 for
		 * numbers, false for booleans, null / [] / {} for those shapes.
		 */
		redactPreservingStructure(data, pathsToRedact) {
			const pathSet = new Set(pathsToRedact);
			const redactValue = (value, currentPath) => {
				if (pathSet.has(currentPath)) {
					if (typeof value === "string") return "[REDACTED]";
					else if (typeof value === "number") return 0;
					else if (typeof value === "boolean") return false;
					else if (value === null) return null;
					else if (Array.isArray(value)) return [];
					else if (typeof value === "object") return {};
					return "[REDACTED]";
				}
				if (Array.isArray(value)) return value.map((item, index) => redactValue(item, `${currentPath}[${index}]`));
				if (value !== null && typeof value === "object") {
					const result = {};
					for (const [key, val] of Object.entries(value)) result[key] = redactValue(val, currentPath ? `${currentPath}.${key}` : key);
					return result;
				}
				return value;
			};
			return redactValue(data, "");
		}
		/**
		 * Simple text-based redaction (fallback): replaces each detection's
		 * `[start, end)` span in `text` with its placeholder, working from
		 * the highest start offset down so earlier offsets stay valid.
		 */
		redactText(text, detectionResult) {
			let redacted = text;
			const sortedDetections = [...detectionResult.detections].sort((a, b) => b.position[0] - a.position[0]);
			for (const detection of sortedDetections) {
				const [start, end] = detection.position;
				redacted = redacted.slice(0, start) + detection.placeholder + redacted.slice(end);
			}
			return redacted;
		}
		/**
		 * Walk a JSON structure and invoke `callback(path, value, key)`
		 * exactly once per primitive value (string, number, boolean).
		 * `key` is only supplied for object members.
		 *
		 * A primitive leaf counts as one nesting level below its container,
		 * so the depth limit applies to leaves the same way it applies to
		 * nested containers.
		 *
		 * @throws Error when nesting exceeds `options.maxDepth`.
		 */
		traverse(obj, path, options, callback, depth = 0) {
			const assertDepth = (level) => {
				if (level > options.maxDepth) throw new Error(`[JsonProcessor] Maximum depth (${options.maxDepth}) exceeded`);
			};
			assertDepth(depth);
			if (obj === null || obj === void 0) return;
			if (this.isPrimitive(obj)) {
				// Only reached for a primitive root (or a direct call): nested
				// primitives are reported by their container below, never by
				// recursing into them, so each value is visited a single time.
				callback(path, obj);
				return;
			}
			if (Array.isArray(obj)) {
				obj.forEach((item, index) => {
					const itemPath = path ? `${path}[${index}]` : `[${index}]`;
					if (this.isPrimitive(item)) {
						assertDepth(depth + 1);
						callback(itemPath, item);
					} else this.traverse(item, itemPath, options, callback, depth + 1);
				});
				return;
			}
			if (typeof obj === "object") {
				for (const [key, value] of Object.entries(obj)) {
					const valuePath = path ? `${path}.${key}` : key;
					if (this.isPrimitive(value)) {
						assertDepth(depth + 1);
						callback(valuePath, value, key);
					} else this.traverse(value, valuePath, options, callback, depth + 1);
				}
			}
		}
		/**
		 * Check if value is primitive (string, number, boolean).
		 */
		isPrimitive(value) {
			return typeof value === "string" || typeof value === "number" || typeof value === "boolean";
		}
		/**
		 * Test `path` against a list of path patterns.
		 *
		 * A pattern matches when it equals the path, or when it matches after
		 * each `*` is read as "one or more characters other than a dot".
		 * Every other character is matched literally, so `.` and `[0]` in a
		 * pattern mean exactly those characters and patterns can never be
		 * invalid regular expressions.
		 */
		matchesPathPattern(path, patterns) {
			return patterns.some((pattern) => {
				if (path === pattern) return true;
				const literal = pattern.replace(/[.+?^${}()|[\]\\]/g, "\\$&");
				const source = literal.replace(/\*/g, "[^.]+");
				return new RegExp(`^${source}$`).test(path);
			});
		}
		/**
		 * Check if path should be skipped (see matchesPathPattern for the
		 * pattern syntax).
		 */
		shouldSkip(path, skipPaths) {
			return this.matchesPathPattern(path, skipPaths);
		}
		/**
		 * Check if path should always be redacted (see matchesPathPattern for
		 * the pattern syntax).
		 */
		shouldAlwaysRedact(path, alwaysRedact) {
			return this.matchesPathPattern(path, alwaysRedact);
		}
		/**
		 * Boost confidence if the key name indicates PII: when the lowercased
		 * key contains any indicator, each detection is copied with its
		 * confidence (0.5 when missing or 0) multiplied by 1.2 and capped at 1.
		 */
		boostConfidenceFromKey(detections, key, piiIndicatorKeys) {
			if (!key) return detections;
			const keyLower = key.toLowerCase();
			if (!piiIndicatorKeys.some((indicator) => keyLower.includes(indicator.toLowerCase()))) return detections;
			return detections.map((detection) => ({
				...detection,
				confidence: Math.min(1, (detection.confidence || .5) * 1.2)
			}));
		}
		/**
		 * Extract all string values (and, with `scanKeys`, the keys of
		 * primitive members) joined by single spaces.
		 */
		extractText(data, options) {
			const opts = {
				...this.defaultOptions,
				...options
			};
			const textParts = [];
			this.traverse(data, "", opts, (_path, value, key) => {
				if (opts.scanKeys && key) textParts.push(key);
				if (typeof value === "string") textParts.push(value);
			});
			return textParts.join(" ");
		}
		/**
		 * Validate a JSON buffer/string; true when `parse` succeeds.
		 */
		isValid(input) {
			try {
				this.parse(input);
				return true;
			} catch {
				return false;
			}
		}
		/**
		 * Parse JSON Lines (JSONL): split on "\n", skip blank lines and parse
		 * every remaining line.
		 *
		 * @throws Error naming the 1-based line number within the input
		 *   (blank lines included) of the first line that fails to parse.
		 */
		parseJsonLines(input) {
			const text = typeof input === "string" ? input : input.toString("utf-8");
			const documents = [];
			text.split("\n").forEach((line, index) => {
				if (line.trim().length === 0) return;
				try {
					documents.push(JSON.parse(line));
				} catch (error) {
					throw new Error(`[JsonProcessor] Invalid JSON at line ${index + 1}: ${error.message}`);
				}
			});
			return documents;
		}
		/**
		 * Detect PII in JSON Lines format: one `detect` result per document,
		 * in input order.
		 */
		async detectJsonLines(input, detector, options) {
			const documents = this.parseJsonLines(input);
			return Promise.all(documents.map((doc) => this.detect(doc, detector, options)));
		}
	};
}));
|
|
16435
|
+
|
|
16436
|
+
//#endregion
|
|
16437
|
+
//#region src/document/CsvProcessor.ts
|
|
16438
|
+
/**
 * Factory wrapper around the CsvProcessor constructor.
 *
 * @returns A new CsvProcessor built with no arguments.
 */
function createCsvProcessor() {
	const processor = new CsvProcessor();
	return processor;
}
|
|
16444
|
+
var CsvProcessor;
|
|
16445
|
+
var init_CsvProcessor = __esmMin((() => {
|
|
16446
|
+
CsvProcessor = class {
|
|
16447
|
+
constructor() {
|
|
16448
|
+
this.defaultOptions = {
|
|
16449
|
+
quote: "\"",
|
|
16450
|
+
escape: "\"",
|
|
16451
|
+
skipEmptyLines: true,
|
|
16452
|
+
piiIndicatorNames: [
|
|
16453
|
+
"email",
|
|
16454
|
+
"e-mail",
|
|
16455
|
+
"mail",
|
|
16456
|
+
"email_address",
|
|
16457
|
+
"phone",
|
|
16458
|
+
"tel",
|
|
16459
|
+
"telephone",
|
|
16460
|
+
"mobile",
|
|
16461
|
+
"phone_number",
|
|
16462
|
+
"ssn",
|
|
16463
|
+
"social_security",
|
|
16464
|
+
"social_security_number",
|
|
16465
|
+
"address",
|
|
16466
|
+
"street",
|
|
16467
|
+
"street_address",
|
|
16468
|
+
"city",
|
|
16469
|
+
"zip",
|
|
16470
|
+
"zipcode",
|
|
16471
|
+
"postal",
|
|
16472
|
+
"postcode",
|
|
16473
|
+
"name",
|
|
16474
|
+
"firstname",
|
|
16475
|
+
"first_name",
|
|
16476
|
+
"lastname",
|
|
16477
|
+
"last_name",
|
|
16478
|
+
"fullname",
|
|
16479
|
+
"full_name",
|
|
16480
|
+
"password",
|
|
16481
|
+
"pwd",
|
|
16482
|
+
"secret",
|
|
16483
|
+
"token",
|
|
16484
|
+
"api_key",
|
|
16485
|
+
"card",
|
|
16486
|
+
"credit_card",
|
|
16487
|
+
"creditcard",
|
|
16488
|
+
"card_number",
|
|
16489
|
+
"account",
|
|
16490
|
+
"account_number",
|
|
16491
|
+
"iban",
|
|
16492
|
+
"swift",
|
|
16493
|
+
"passport",
|
|
16494
|
+
"passport_number",
|
|
16495
|
+
"license",
|
|
16496
|
+
"licence",
|
|
16497
|
+
"driver_license",
|
|
16498
|
+
"dob",
|
|
16499
|
+
"date_of_birth",
|
|
16500
|
+
"birth_date",
|
|
16501
|
+
"birthdate"
|
|
16502
|
+
],
|
|
16503
|
+
treatFirstRowAsHeader: true
|
|
16504
|
+
};
|
|
16505
|
+
}
|
|
16506
|
+
/**
|
|
16507
|
+
* Parse CSV from buffer or string
|
|
16508
|
+
*/
|
|
16509
|
+
parse(input, options) {
|
|
16510
|
+
const opts = {
|
|
16511
|
+
...this.defaultOptions,
|
|
16512
|
+
...options
|
|
16513
|
+
};
|
|
16514
|
+
const text = typeof input === "string" ? input : input.toString("utf-8");
|
|
16515
|
+
const delimiter = opts.delimiter || this.detectDelimiter(text);
|
|
16516
|
+
const lines = text.split(/\r?\n/);
|
|
16517
|
+
const rows = [];
|
|
16518
|
+
let rowIndex = 0;
|
|
16519
|
+
for (let i = 0; i < lines.length; i++) {
|
|
16520
|
+
const line = lines[i];
|
|
16521
|
+
if (opts.skipEmptyLines && line.trim().length === 0) continue;
|
|
16522
|
+
if (opts.maxRows !== void 0 && rowIndex >= opts.maxRows) break;
|
|
16523
|
+
const values = this.parseRow(line, delimiter, opts.quote, opts.escape);
|
|
16524
|
+
rows.push({
|
|
16525
|
+
index: rowIndex,
|
|
16526
|
+
values
|
|
16527
|
+
});
|
|
16528
|
+
rowIndex++;
|
|
16529
|
+
}
|
|
16530
|
+
return rows;
|
|
16531
|
+
}
|
|
16532
|
+
/**
|
|
16533
|
+
* Detect PII in CSV data
|
|
16534
|
+
*/
|
|
16535
|
+
async detect(input, detector, options) {
|
|
16536
|
+
const opts = {
|
|
16537
|
+
...this.defaultOptions,
|
|
16538
|
+
...options
|
|
16539
|
+
};
|
|
16540
|
+
const rows = this.parse(input, options);
|
|
16541
|
+
if (rows.length === 0) {
|
|
16542
|
+
const original = typeof input === "string" ? input : input.toString("utf-8");
|
|
16543
|
+
return {
|
|
16544
|
+
original,
|
|
16545
|
+
redacted: original,
|
|
16546
|
+
detections: [],
|
|
16547
|
+
redactionMap: {},
|
|
16548
|
+
stats: { piiCount: 0 },
|
|
16549
|
+
rowCount: 0,
|
|
16550
|
+
columnCount: 0,
|
|
16551
|
+
columnStats: {},
|
|
16552
|
+
matchesByCell: []
|
|
16553
|
+
};
|
|
16554
|
+
}
|
|
16555
|
+
const hasHeader = opts.hasHeader !== void 0 ? opts.hasHeader : this.detectHeader(rows);
|
|
16556
|
+
const headers = hasHeader && rows.length > 0 ? rows[0].values : void 0;
|
|
16557
|
+
const dataRows = hasHeader ? rows.slice(1) : rows;
|
|
16558
|
+
const columnCount = rows[0].values.length;
|
|
16559
|
+
const columnNameToIndex = /* @__PURE__ */ new Map();
|
|
16560
|
+
if (headers) headers.forEach((header, index) => {
|
|
16561
|
+
columnNameToIndex.set(header.toLowerCase().trim(), index);
|
|
16562
|
+
});
|
|
16563
|
+
const alwaysRedactCols = new Set(opts.alwaysRedactColumns || []);
|
|
16564
|
+
if (opts.alwaysRedactColumnNames && headers) opts.alwaysRedactColumnNames.forEach((name) => {
|
|
16565
|
+
const index = columnNameToIndex.get(name.toLowerCase().trim());
|
|
16566
|
+
if (index !== void 0) alwaysRedactCols.add(index);
|
|
16567
|
+
});
|
|
16568
|
+
const skipCols = new Set(opts.skipColumns || []);
|
|
16569
|
+
const columnStats = {};
|
|
16570
|
+
const matchesByCell = [];
|
|
16571
|
+
const allDetections = [];
|
|
16572
|
+
for (let col = 0; col < columnCount; col++) columnStats[col] = {
|
|
16573
|
+
columnIndex: col,
|
|
16574
|
+
columnName: headers?.[col],
|
|
16575
|
+
piiCount: 0,
|
|
16576
|
+
piiPercentage: 0,
|
|
16577
|
+
piiTypes: []
|
|
16578
|
+
};
|
|
16579
|
+
for (const row of dataRows) for (let col = 0; col < row.values.length; col++) {
|
|
16580
|
+
if (skipCols.has(col)) continue;
|
|
16581
|
+
const cellValue = row.values[col];
|
|
16582
|
+
if (alwaysRedactCols.has(col)) {
|
|
16583
|
+
const detection = {
|
|
16584
|
+
type: "SENSITIVE_COLUMN",
|
|
16585
|
+
value: cellValue,
|
|
16586
|
+
placeholder: `[SENSITIVE_COLUMN_${col}]`,
|
|
16587
|
+
position: [0, cellValue.length],
|
|
16588
|
+
severity: "high",
|
|
16589
|
+
confidence: 1
|
|
16590
|
+
};
|
|
16591
|
+
matchesByCell.push({
|
|
16592
|
+
row: row.index,
|
|
16593
|
+
column: col,
|
|
16594
|
+
columnName: headers?.[col],
|
|
16595
|
+
value: cellValue,
|
|
16596
|
+
matches: [detection]
|
|
16597
|
+
});
|
|
16598
|
+
allDetections.push(detection);
|
|
16599
|
+
columnStats[col].piiCount++;
|
|
16600
|
+
continue;
|
|
16601
|
+
}
|
|
16602
|
+
const result = await detector.detect(cellValue);
|
|
16603
|
+
if (result.detections.length > 0) {
|
|
16604
|
+
const boostedDetections = this.boostConfidenceFromColumnName(result.detections, headers?.[col], opts.piiIndicatorNames || []);
|
|
16605
|
+
matchesByCell.push({
|
|
16606
|
+
row: row.index,
|
|
16607
|
+
column: col,
|
|
16608
|
+
columnName: headers?.[col],
|
|
16609
|
+
value: cellValue,
|
|
16610
|
+
matches: boostedDetections
|
|
16611
|
+
});
|
|
16612
|
+
allDetections.push(...boostedDetections);
|
|
16613
|
+
columnStats[col].piiCount += boostedDetections.length;
|
|
16614
|
+
const columnTypes = new Set(columnStats[col].piiTypes);
|
|
16615
|
+
boostedDetections.forEach((d) => columnTypes.add(d.type));
|
|
16616
|
+
columnStats[col].piiTypes = Array.from(columnTypes);
|
|
16617
|
+
}
|
|
16618
|
+
}
|
|
16619
|
+
for (let col = 0; col < columnCount; col++) {
|
|
16620
|
+
const rowsWithPii = matchesByCell.filter((m) => m.column === col).length;
|
|
16621
|
+
columnStats[col].piiPercentage = dataRows.length > 0 ? rowsWithPii / dataRows.length * 100 : 0;
|
|
16622
|
+
}
|
|
16623
|
+
const original = typeof input === "string" ? input : input.toString("utf-8");
|
|
16624
|
+
const redacted = this.redact(original, {
|
|
16625
|
+
original,
|
|
16626
|
+
redacted: original,
|
|
16627
|
+
detections: allDetections,
|
|
16628
|
+
redactionMap: {},
|
|
16629
|
+
stats: { piiCount: allDetections.length },
|
|
16630
|
+
rowCount: dataRows.length,
|
|
16631
|
+
columnCount,
|
|
16632
|
+
headers,
|
|
16633
|
+
columnStats,
|
|
16634
|
+
matchesByCell
|
|
16635
|
+
}, opts);
|
|
16636
|
+
const redactionMap = {};
|
|
16637
|
+
allDetections.forEach((det) => {
|
|
16638
|
+
redactionMap[det.placeholder] = det.value;
|
|
16639
|
+
});
|
|
16640
|
+
return {
|
|
16641
|
+
original,
|
|
16642
|
+
redacted,
|
|
16643
|
+
detections: allDetections,
|
|
16644
|
+
redactionMap,
|
|
16645
|
+
stats: { piiCount: allDetections.length },
|
|
16646
|
+
rowCount: dataRows.length,
|
|
16647
|
+
columnCount,
|
|
16648
|
+
headers: headers?.filter((h) => h !== void 0),
|
|
16649
|
+
columnStats,
|
|
16650
|
+
matchesByCell
|
|
16651
|
+
};
|
|
16652
|
+
}
|
|
16653
|
+
/**
|
|
16654
|
+
* Redact PII in CSV data
|
|
16655
|
+
*/
|
|
16656
|
+
redact(input, detectionResult, options) {
|
|
16657
|
+
const opts = {
|
|
16658
|
+
...this.defaultOptions,
|
|
16659
|
+
...options
|
|
16660
|
+
};
|
|
16661
|
+
const rows = this.parse(input, options);
|
|
16662
|
+
if (rows.length === 0) return "";
|
|
16663
|
+
const delimiter = opts.delimiter || this.detectDelimiter(typeof input === "string" ? input : input.toString("utf-8"));
|
|
16664
|
+
const hasHeader = detectionResult.headers !== void 0;
|
|
16665
|
+
const redactionMap = /* @__PURE__ */ new Map();
|
|
16666
|
+
for (const cellMatch of detectionResult.matchesByCell) {
|
|
16667
|
+
if (!redactionMap.has(cellMatch.row)) redactionMap.set(cellMatch.row, /* @__PURE__ */ new Map());
|
|
16668
|
+
redactionMap.get(cellMatch.row).set(cellMatch.column, "[REDACTED]");
|
|
16669
|
+
}
|
|
16670
|
+
const outputRows = [];
|
|
16671
|
+
for (let i = 0; i < rows.length; i++) {
|
|
16672
|
+
const row = rows[i];
|
|
16673
|
+
if (hasHeader && i === 0) outputRows.push(this.formatRow(row.values, delimiter, opts.quote));
|
|
16674
|
+
else {
|
|
16675
|
+
const rowIndex = hasHeader ? i - 1 : i;
|
|
16676
|
+
const redactedValues = row.values.map((value, colIndex) => {
|
|
16677
|
+
return redactionMap.get(rowIndex)?.get(colIndex) || value;
|
|
16678
|
+
});
|
|
16679
|
+
outputRows.push(this.formatRow(redactedValues, delimiter, opts.quote));
|
|
16680
|
+
}
|
|
16681
|
+
}
|
|
16682
|
+
return outputRows.join("\n");
|
|
16683
|
+
}
|
|
16684
|
+
/**
|
|
16685
|
+
* Parse a single CSV row
|
|
16686
|
+
*/
|
|
16687
|
+
parseRow(line, delimiter, quote, _escape) {
|
|
16688
|
+
const values = [];
|
|
16689
|
+
let current = "";
|
|
16690
|
+
let inQuotes = false;
|
|
16691
|
+
let i = 0;
|
|
16692
|
+
while (i < line.length) {
|
|
16693
|
+
const char = line[i];
|
|
16694
|
+
const nextChar = line[i + 1];
|
|
16695
|
+
if (char === quote) if (inQuotes && nextChar === quote) {
|
|
16696
|
+
current += quote;
|
|
16697
|
+
i += 2;
|
|
16698
|
+
} else {
|
|
16699
|
+
inQuotes = !inQuotes;
|
|
16700
|
+
i++;
|
|
16701
|
+
}
|
|
16702
|
+
else if (char === delimiter && !inQuotes) {
|
|
16703
|
+
values.push(current);
|
|
16704
|
+
current = "";
|
|
16705
|
+
i++;
|
|
16706
|
+
} else {
|
|
16707
|
+
current += char;
|
|
16708
|
+
i++;
|
|
16709
|
+
}
|
|
16710
|
+
}
|
|
16711
|
+
values.push(current);
|
|
16712
|
+
return values;
|
|
16713
|
+
}
|
|
16714
|
+
/**
|
|
16715
|
+
* Format a row as CSV
|
|
16716
|
+
*/
|
|
16717
|
+
formatRow(values, delimiter, quote) {
|
|
16718
|
+
return values.map((value) => {
|
|
16719
|
+
if (value.includes(delimiter) || value.includes(quote) || value.includes("\n")) return `${quote}${value.replace(new RegExp(quote, "g"), quote + quote)}${quote}`;
|
|
16720
|
+
return value;
|
|
16721
|
+
}).join(delimiter);
|
|
16722
|
+
}
|
|
16723
|
+
/**
|
|
16724
|
+
* Auto-detect CSV delimiter
|
|
16725
|
+
*/
|
|
16726
|
+
detectDelimiter(text) {
|
|
16727
|
+
const delimiters = [
|
|
16728
|
+
",",
|
|
16729
|
+
" ",
|
|
16730
|
+
";",
|
|
16731
|
+
"|"
|
|
16732
|
+
];
|
|
16733
|
+
const lines = text.split(/\r?\n/).slice(0, 5);
|
|
16734
|
+
let bestDelimiter = ",";
|
|
16735
|
+
let bestScore = 0;
|
|
16736
|
+
for (const delimiter of delimiters) {
|
|
16737
|
+
const counts = lines.map((line) => {
|
|
16738
|
+
let count = 0;
|
|
16739
|
+
let inQuotes = false;
|
|
16740
|
+
for (const char of line) {
|
|
16741
|
+
if (char === "\"") inQuotes = !inQuotes;
|
|
16742
|
+
if (char === delimiter && !inQuotes) count++;
|
|
16743
|
+
}
|
|
16744
|
+
return count;
|
|
16745
|
+
});
|
|
16746
|
+
if (counts.length > 0 && counts[0] > 0) {
|
|
16747
|
+
const avg = counts.reduce((a, b) => a + b, 0) / counts.length;
|
|
16748
|
+
const score = avg / (counts.reduce((sum, c) => sum + Math.pow(c - avg, 2), 0) / counts.length + 1);
|
|
16749
|
+
if (score > bestScore) {
|
|
16750
|
+
bestScore = score;
|
|
16751
|
+
bestDelimiter = delimiter;
|
|
16752
|
+
}
|
|
16753
|
+
}
|
|
16754
|
+
}
|
|
16755
|
+
return bestDelimiter;
|
|
16756
|
+
}
|
|
16757
|
+
/**
|
|
16758
|
+
* Detect if first row is likely a header
|
|
16759
|
+
*/
|
|
16760
|
+
detectHeader(rows) {
|
|
16761
|
+
if (rows.length < 2) return false;
|
|
16762
|
+
const firstRow = rows[0].values;
|
|
16763
|
+
const secondRow = rows[1].values;
|
|
16764
|
+
if (firstRow.reduce((sum, v) => sum + v.length, 0) / firstRow.length > secondRow.reduce((sum, v) => sum + v.length, 0) / secondRow.length * 1.5) return false;
|
|
16765
|
+
const firstRowNumeric = firstRow.filter((v) => !isNaN(Number(v)) && v.trim() !== "").length;
|
|
16766
|
+
return firstRow.length - firstRowNumeric >= firstRowNumeric;
|
|
16767
|
+
}
|
|
16768
|
+
/**
|
|
16769
|
+
* Boost confidence if column name indicates PII
|
|
16770
|
+
*/
|
|
16771
|
+
boostConfidenceFromColumnName(detections, columnName, piiIndicatorNames) {
|
|
16772
|
+
if (!columnName) return detections;
|
|
16773
|
+
const nameLower = columnName.toLowerCase().trim();
|
|
16774
|
+
if (!piiIndicatorNames.some((indicator) => nameLower.includes(indicator.toLowerCase()))) return detections;
|
|
16775
|
+
return detections.map((detection) => ({
|
|
16776
|
+
...detection,
|
|
16777
|
+
confidence: Math.min(1, (detection.confidence || .5) * 1.2)
|
|
16778
|
+
}));
|
|
16779
|
+
}
|
|
16780
|
+
/**
|
|
16781
|
+
* Extract all cell values as text
|
|
16782
|
+
*/
|
|
16783
|
+
extractText(input, options) {
|
|
16784
|
+
const rows = this.parse(input, options);
|
|
16785
|
+
const textParts = [];
|
|
16786
|
+
for (const row of rows) for (const value of row.values) if (value.trim().length > 0) textParts.push(value);
|
|
16787
|
+
return textParts.join(" ");
|
|
16788
|
+
}
|
|
16789
|
+
/**
|
|
16790
|
+
* Get column statistics without full PII detection
|
|
16791
|
+
*/
|
|
16792
|
+
getColumnInfo(input, options) {
|
|
16793
|
+
const rows = this.parse(input, options);
|
|
16794
|
+
if (rows.length === 0) return {
|
|
16795
|
+
columnCount: 0,
|
|
16796
|
+
rowCount: 0,
|
|
16797
|
+
sampleRows: []
|
|
16798
|
+
};
|
|
16799
|
+
const opts = {
|
|
16800
|
+
...this.defaultOptions,
|
|
16801
|
+
...options
|
|
16802
|
+
};
|
|
16803
|
+
const hasHeader = opts.hasHeader !== void 0 ? opts.hasHeader : this.detectHeader(rows);
|
|
16804
|
+
const headers = hasHeader && rows.length > 0 ? rows[0].values : void 0;
|
|
16805
|
+
const dataRows = hasHeader ? rows.slice(1) : rows;
|
|
16806
|
+
const sampleRows = dataRows.slice(0, 5).map((r) => r.values);
|
|
16807
|
+
return {
|
|
16808
|
+
columnCount: rows[0].values.length,
|
|
16809
|
+
rowCount: dataRows.length,
|
|
16810
|
+
headers,
|
|
16811
|
+
sampleRows
|
|
16812
|
+
};
|
|
16813
|
+
}
|
|
16814
|
+
};
|
|
16815
|
+
}));
|
|
16816
|
+
|
|
16817
|
+
//#endregion
|
|
16818
|
+
//#region src/document/XlsxProcessor.ts
|
|
16819
|
+
/**
 * Factory for XlsxProcessor.
 *
 * @returns {XlsxProcessor} A newly constructed processor.
 */
function createXlsxProcessor() {
	const processor = new XlsxProcessor();
	return processor;
}
|
|
16825
|
+
var XlsxProcessor;
|
|
16826
|
+
var init_XlsxProcessor = __esmMin((() => {
|
|
16827
|
+
XlsxProcessor = class {
|
|
16828
|
+
constructor() {
|
|
16829
|
+
this.defaultOptions = {
|
|
16830
|
+
piiIndicatorNames: [
|
|
16831
|
+
"email",
|
|
16832
|
+
"e-mail",
|
|
16833
|
+
"mail",
|
|
16834
|
+
"email_address",
|
|
16835
|
+
"phone",
|
|
16836
|
+
"tel",
|
|
16837
|
+
"telephone",
|
|
16838
|
+
"mobile",
|
|
16839
|
+
"phone_number",
|
|
16840
|
+
"ssn",
|
|
16841
|
+
"social_security",
|
|
16842
|
+
"social_security_number",
|
|
16843
|
+
"address",
|
|
16844
|
+
"street",
|
|
16845
|
+
"street_address",
|
|
16846
|
+
"city",
|
|
16847
|
+
"zip",
|
|
16848
|
+
"zipcode",
|
|
16849
|
+
"postal",
|
|
16850
|
+
"postcode",
|
|
16851
|
+
"name",
|
|
16852
|
+
"firstname",
|
|
16853
|
+
"first_name",
|
|
16854
|
+
"lastname",
|
|
16855
|
+
"last_name",
|
|
16856
|
+
"fullname",
|
|
16857
|
+
"full_name",
|
|
16858
|
+
"password",
|
|
16859
|
+
"pwd",
|
|
16860
|
+
"secret",
|
|
16861
|
+
"token",
|
|
16862
|
+
"api_key",
|
|
16863
|
+
"card",
|
|
16864
|
+
"credit_card",
|
|
16865
|
+
"creditcard",
|
|
16866
|
+
"card_number",
|
|
16867
|
+
"account",
|
|
16868
|
+
"account_number",
|
|
16869
|
+
"iban",
|
|
16870
|
+
"swift",
|
|
16871
|
+
"passport",
|
|
16872
|
+
"passport_number",
|
|
16873
|
+
"license",
|
|
16874
|
+
"licence",
|
|
16875
|
+
"driver_license",
|
|
16876
|
+
"dob",
|
|
16877
|
+
"date_of_birth",
|
|
16878
|
+
"birth_date",
|
|
16879
|
+
"birthdate"
|
|
16880
|
+
],
|
|
16881
|
+
preserveFormatting: true,
|
|
16882
|
+
preserveFormulas: true
|
|
16883
|
+
};
|
|
16884
|
+
try {
|
|
16885
|
+
this.xlsx = require("xlsx");
|
|
16886
|
+
} catch {}
|
|
16887
|
+
}
|
|
16888
|
+
/**
|
|
16889
|
+
* Check if XLSX support is available
|
|
16890
|
+
*/
|
|
16891
|
+
isAvailable() {
|
|
16892
|
+
return !!this.xlsx;
|
|
16893
|
+
}
|
|
16894
|
+
/**
|
|
16895
|
+
* Parse XLSX from buffer
|
|
16896
|
+
*/
|
|
16897
|
+
parse(buffer) {
|
|
16898
|
+
if (!this.xlsx) throw new Error("[XlsxProcessor] XLSX support requires xlsx package. Install with: npm install xlsx");
|
|
16899
|
+
try {
|
|
16900
|
+
return this.xlsx.read(buffer, {
|
|
16901
|
+
type: "buffer",
|
|
16902
|
+
cellFormula: true,
|
|
16903
|
+
cellStyles: true
|
|
16904
|
+
});
|
|
16905
|
+
} catch (error) {
|
|
16906
|
+
throw new Error(`[XlsxProcessor] Failed to parse XLSX: ${error.message}`);
|
|
16907
|
+
}
|
|
16908
|
+
}
|
|
16909
|
+
/**
|
|
16910
|
+
* Detect PII in XLSX data
|
|
16911
|
+
*/
|
|
16912
|
+
async detect(buffer, detector, options) {
|
|
16913
|
+
if (!this.xlsx) throw new Error("[XlsxProcessor] XLSX support requires xlsx package. Install with: npm install xlsx");
|
|
16914
|
+
const opts = {
|
|
16915
|
+
...this.defaultOptions,
|
|
16916
|
+
...options
|
|
16917
|
+
};
|
|
16918
|
+
const workbook = this.parse(buffer);
|
|
16919
|
+
const sheetNames = this.getSheetNamesToProcess(workbook, opts);
|
|
16920
|
+
const sheetResults = [];
|
|
16921
|
+
const allDetections = [];
|
|
16922
|
+
const allTypes = /* @__PURE__ */ new Set();
|
|
16923
|
+
for (let sheetIndex = 0; sheetIndex < sheetNames.length; sheetIndex++) {
|
|
16924
|
+
const sheetName = sheetNames[sheetIndex];
|
|
16925
|
+
const sheet = workbook.Sheets[sheetName];
|
|
16926
|
+
const sheetResult = await this.detectSheet(sheet, sheetName, sheetIndex, detector, opts);
|
|
16927
|
+
sheetResults.push(sheetResult);
|
|
16928
|
+
allDetections.push(...sheetResult.matchesByCell.flatMap((c) => c.matches));
|
|
16929
|
+
sheetResult.matchesByCell.forEach((cell) => {
|
|
16930
|
+
cell.matches.forEach((det) => allTypes.add(det.type));
|
|
16931
|
+
});
|
|
16932
|
+
}
|
|
16933
|
+
const original = this.extractText(buffer, options);
|
|
16934
|
+
const redactedBuffer = this.redact(buffer, {
|
|
16935
|
+
original,
|
|
16936
|
+
redacted: original,
|
|
16937
|
+
detections: allDetections,
|
|
16938
|
+
redactionMap: {},
|
|
16939
|
+
stats: { piiCount: allDetections.length },
|
|
16940
|
+
sheetResults,
|
|
16941
|
+
sheetCount: sheetResults.length
|
|
16942
|
+
}, options);
|
|
16943
|
+
const redacted = this.extractText(redactedBuffer, options);
|
|
16944
|
+
const redactionMap = {};
|
|
16945
|
+
allDetections.forEach((det) => {
|
|
16946
|
+
redactionMap[det.placeholder] = det.value;
|
|
16947
|
+
});
|
|
16948
|
+
return {
|
|
16949
|
+
original,
|
|
16950
|
+
redacted,
|
|
16951
|
+
detections: allDetections,
|
|
16952
|
+
redactionMap,
|
|
16953
|
+
stats: { piiCount: allDetections.length },
|
|
16954
|
+
sheetResults,
|
|
16955
|
+
sheetCount: sheetResults.length
|
|
16956
|
+
};
|
|
16957
|
+
}
|
|
16958
|
+
/**
|
|
16959
|
+
* Detect PII in a single sheet
|
|
16960
|
+
*/
|
|
16961
|
+
async detectSheet(sheet, sheetName, sheetIndex, detector, options) {
|
|
16962
|
+
const range = this.xlsx.utils.decode_range(sheet["!ref"] || "A1");
|
|
16963
|
+
const startRow = range.s.r;
|
|
16964
|
+
const endRow = options.maxRows !== void 0 ? Math.min(range.e.r, startRow + options.maxRows - 1) : range.e.r;
|
|
16965
|
+
const startCol = range.s.c;
|
|
16966
|
+
const endCol = range.e.c;
|
|
16967
|
+
const columnCount = endCol - startCol + 1;
|
|
16968
|
+
const hasHeader = options.hasHeader !== void 0 ? options.hasHeader : this.detectHeader(sheet, range);
|
|
16969
|
+
const headers = hasHeader ? this.getRowValues(sheet, startRow, startCol, endCol) : void 0;
|
|
16970
|
+
const dataStartRow = hasHeader ? startRow + 1 : startRow;
|
|
16971
|
+
const columnNameToIndex = /* @__PURE__ */ new Map();
|
|
16972
|
+
if (headers) headers.forEach((header, index) => {
|
|
16973
|
+
if (header) columnNameToIndex.set(header.toLowerCase().trim(), index);
|
|
16974
|
+
});
|
|
16975
|
+
const alwaysRedactCols = new Set(options.alwaysRedactColumns || []);
|
|
16976
|
+
if (options.alwaysRedactColumnNames && headers) options.alwaysRedactColumnNames.forEach((name) => {
|
|
16977
|
+
const index = columnNameToIndex.get(name.toLowerCase().trim());
|
|
16978
|
+
if (index !== void 0) alwaysRedactCols.add(index);
|
|
16979
|
+
});
|
|
16980
|
+
const skipCols = new Set(options.skipColumns || []);
|
|
16981
|
+
const columnStats = {};
|
|
16982
|
+
for (let col = 0; col <= endCol - startCol; col++) columnStats[col] = {
|
|
16983
|
+
columnIndex: col,
|
|
16984
|
+
columnLetter: this.columnToLetter(col),
|
|
16985
|
+
columnName: headers?.[col],
|
|
16986
|
+
piiCount: 0,
|
|
16987
|
+
piiPercentage: 0,
|
|
16988
|
+
piiTypes: []
|
|
16989
|
+
};
|
|
16990
|
+
const matchesByCell = [];
|
|
16991
|
+
for (let row = dataStartRow; row <= endRow; row++) for (let col = startCol; col <= endCol; col++) {
|
|
16992
|
+
const colIndex = col - startCol;
|
|
16993
|
+
if (skipCols.has(colIndex)) continue;
|
|
16994
|
+
const cellRef = this.xlsx.utils.encode_cell({
|
|
16995
|
+
r: row,
|
|
16996
|
+
c: col
|
|
16997
|
+
});
|
|
16998
|
+
const cell = sheet[cellRef];
|
|
16999
|
+
if (!cell) continue;
|
|
17000
|
+
const cellValue = this.getCellValue(cell);
|
|
17001
|
+
if (!cellValue) continue;
|
|
17002
|
+
const cellFormula = cell.f;
|
|
17003
|
+
if (alwaysRedactCols.has(colIndex)) {
|
|
17004
|
+
const detection = {
|
|
17005
|
+
type: "SENSITIVE_COLUMN",
|
|
17006
|
+
value: cellValue,
|
|
17007
|
+
placeholder: `[SENSITIVE_COLUMN_${colIndex}]`,
|
|
17008
|
+
position: [0, cellValue.length],
|
|
17009
|
+
severity: "high",
|
|
17010
|
+
confidence: 1
|
|
17011
|
+
};
|
|
17012
|
+
matchesByCell.push({
|
|
17013
|
+
cell: cellRef,
|
|
17014
|
+
row: row + 1,
|
|
17015
|
+
column: colIndex,
|
|
17016
|
+
columnLetter: this.columnToLetter(colIndex),
|
|
17017
|
+
columnName: headers?.[colIndex],
|
|
17018
|
+
value: cellValue,
|
|
17019
|
+
formula: cellFormula,
|
|
17020
|
+
matches: [detection]
|
|
17021
|
+
});
|
|
17022
|
+
columnStats[colIndex].piiCount++;
|
|
17023
|
+
continue;
|
|
17024
|
+
}
|
|
17025
|
+
const result = await detector.detect(cellValue);
|
|
17026
|
+
if (result.detections.length > 0) {
|
|
17027
|
+
const boostedDetections = this.boostConfidenceFromColumnName(result.detections, headers?.[colIndex], options.piiIndicatorNames || []);
|
|
17028
|
+
matchesByCell.push({
|
|
17029
|
+
cell: cellRef,
|
|
17030
|
+
row: row + 1,
|
|
17031
|
+
column: colIndex,
|
|
17032
|
+
columnLetter: this.columnToLetter(colIndex),
|
|
17033
|
+
columnName: headers?.[colIndex],
|
|
17034
|
+
value: cellValue,
|
|
17035
|
+
formula: cellFormula,
|
|
17036
|
+
matches: boostedDetections
|
|
17037
|
+
});
|
|
17038
|
+
columnStats[colIndex].piiCount += boostedDetections.length;
|
|
17039
|
+
const columnTypes = new Set(columnStats[colIndex].piiTypes);
|
|
17040
|
+
boostedDetections.forEach((d) => columnTypes.add(d.type));
|
|
17041
|
+
columnStats[colIndex].piiTypes = Array.from(columnTypes);
|
|
17042
|
+
}
|
|
17043
|
+
}
|
|
17044
|
+
const dataRowCount = endRow - dataStartRow + 1;
|
|
17045
|
+
for (let col = 0; col <= endCol - startCol; col++) {
|
|
17046
|
+
const rowsWithPii = matchesByCell.filter((m) => m.column === col).length;
|
|
17047
|
+
columnStats[col].piiPercentage = dataRowCount > 0 ? rowsWithPii / dataRowCount * 100 : 0;
|
|
17048
|
+
}
|
|
17049
|
+
return {
|
|
17050
|
+
sheetName,
|
|
17051
|
+
sheetIndex,
|
|
17052
|
+
rowCount: dataRowCount,
|
|
17053
|
+
columnCount,
|
|
17054
|
+
headers: headers?.filter((h) => h !== void 0),
|
|
17055
|
+
columnStats,
|
|
17056
|
+
matchesByCell
|
|
17057
|
+
};
|
|
17058
|
+
}
|
|
17059
|
+
/**
|
|
17060
|
+
* Redact PII in XLSX data
|
|
17061
|
+
*/
|
|
17062
|
+
redact(buffer, detectionResult, options) {
|
|
17063
|
+
if (!this.xlsx) throw new Error("[XlsxProcessor] XLSX support requires xlsx package. Install with: npm install xlsx");
|
|
17064
|
+
const opts = {
|
|
17065
|
+
...this.defaultOptions,
|
|
17066
|
+
...options
|
|
17067
|
+
};
|
|
17068
|
+
const workbook = this.parse(buffer);
|
|
17069
|
+
for (const sheetResult of detectionResult.sheetResults) {
|
|
17070
|
+
const sheet = workbook.Sheets[sheetResult.sheetName];
|
|
17071
|
+
for (const cellMatch of sheetResult.matchesByCell) {
|
|
17072
|
+
const cell = sheet[cellMatch.cell];
|
|
17073
|
+
if (!cell) continue;
|
|
17074
|
+
cell.v = "[REDACTED]";
|
|
17075
|
+
cell.w = "[REDACTED]";
|
|
17076
|
+
if (!opts.preserveFormulas) delete cell.f;
|
|
17077
|
+
cell.t = "s";
|
|
17078
|
+
}
|
|
17079
|
+
}
|
|
17080
|
+
return this.xlsx.write(workbook, {
|
|
17081
|
+
type: "buffer",
|
|
17082
|
+
bookType: "xlsx"
|
|
17083
|
+
});
|
|
17084
|
+
}
|
|
17085
|
+
/**
|
|
17086
|
+
* Get cell value as string
|
|
17087
|
+
*/
|
|
17088
|
+
getCellValue(cell) {
|
|
17089
|
+
if (!cell) return "";
|
|
17090
|
+
if (cell.w !== void 0) return String(cell.w);
|
|
17091
|
+
if (cell.v !== void 0) return String(cell.v);
|
|
17092
|
+
return "";
|
|
17093
|
+
}
|
|
17094
|
+
/**
|
|
17095
|
+
* Get row values
|
|
17096
|
+
*/
|
|
17097
|
+
getRowValues(sheet, row, startCol, endCol) {
|
|
17098
|
+
const values = [];
|
|
17099
|
+
for (let col = startCol; col <= endCol; col++) {
|
|
17100
|
+
const cell = sheet[this.xlsx.utils.encode_cell({
|
|
17101
|
+
r: row,
|
|
17102
|
+
c: col
|
|
17103
|
+
})];
|
|
17104
|
+
values.push(cell ? this.getCellValue(cell) : void 0);
|
|
17105
|
+
}
|
|
17106
|
+
return values;
|
|
17107
|
+
}
|
|
17108
|
+
/**
|
|
17109
|
+
* Detect if first row is likely a header
|
|
17110
|
+
*/
|
|
17111
|
+
detectHeader(sheet, range) {
|
|
17112
|
+
const firstRow = this.getRowValues(sheet, range.s.r, range.s.c, range.e.c);
|
|
17113
|
+
const secondRow = range.s.r + 1 <= range.e.r ? this.getRowValues(sheet, range.s.r + 1, range.s.c, range.e.c) : null;
|
|
17114
|
+
if (!secondRow) return false;
|
|
17115
|
+
const firstRowValues = firstRow.filter((v) => v !== void 0);
|
|
17116
|
+
const secondRowValues = secondRow.filter((v) => v !== void 0);
|
|
17117
|
+
if (firstRowValues.length === 0 || secondRowValues.length === 0) return false;
|
|
17118
|
+
if (firstRowValues.reduce((sum, v) => sum + v.length, 0) / firstRowValues.length > secondRowValues.reduce((sum, v) => sum + v.length, 0) / secondRowValues.length * 1.5) return false;
|
|
17119
|
+
const firstRowNumeric = firstRowValues.filter((v) => !isNaN(Number(v)) && v.trim() !== "").length;
|
|
17120
|
+
return firstRowValues.length - firstRowNumeric >= firstRowNumeric;
|
|
17121
|
+
}
|
|
17122
|
+
/**
|
|
17123
|
+
* Convert column index to letter (0 = A, 25 = Z, 26 = AA)
|
|
17124
|
+
*/
|
|
17125
|
+
columnToLetter(col) {
|
|
17126
|
+
let letter = "";
|
|
17127
|
+
while (col >= 0) {
|
|
17128
|
+
letter = String.fromCharCode(col % 26 + 65) + letter;
|
|
17129
|
+
col = Math.floor(col / 26) - 1;
|
|
17130
|
+
}
|
|
17131
|
+
return letter;
|
|
17132
|
+
}
|
|
17133
|
+
/**
|
|
17134
|
+
* Get sheet names to process based on options
|
|
17135
|
+
*/
|
|
17136
|
+
getSheetNamesToProcess(workbook, options) {
|
|
17137
|
+
const allSheetNames = workbook.SheetNames;
|
|
17138
|
+
if (options.sheets && options.sheets.length > 0) return options.sheets.filter((name) => allSheetNames.includes(name));
|
|
17139
|
+
if (options.sheetIndices && options.sheetIndices.length > 0) return options.sheetIndices.filter((index) => index >= 0 && index < allSheetNames.length).map((index) => allSheetNames[index]);
|
|
17140
|
+
return allSheetNames;
|
|
17141
|
+
}
|
|
17142
|
+
/**
|
|
17143
|
+
* Boost confidence if column name indicates PII
|
|
17144
|
+
*/
|
|
17145
|
+
boostConfidenceFromColumnName(detections, columnName, piiIndicatorNames) {
|
|
17146
|
+
if (!columnName) return detections;
|
|
17147
|
+
const nameLower = columnName.toLowerCase().trim();
|
|
17148
|
+
if (!piiIndicatorNames.some((indicator) => nameLower.includes(indicator.toLowerCase()))) return detections;
|
|
17149
|
+
return detections.map((detection) => ({
|
|
17150
|
+
...detection,
|
|
17151
|
+
confidence: Math.min(1, (detection.confidence || .5) * 1.2)
|
|
17152
|
+
}));
|
|
17153
|
+
}
|
|
17154
|
+
/**
|
|
17155
|
+
* Extract all cell values as text
|
|
17156
|
+
*/
|
|
17157
|
+
extractText(buffer, options) {
|
|
17158
|
+
if (!this.xlsx) throw new Error("[XlsxProcessor] XLSX support requires xlsx package. Install with: npm install xlsx");
|
|
17159
|
+
const workbook = this.parse(buffer);
|
|
17160
|
+
const opts = {
|
|
17161
|
+
...this.defaultOptions,
|
|
17162
|
+
...options
|
|
17163
|
+
};
|
|
17164
|
+
const sheetNames = this.getSheetNamesToProcess(workbook, opts);
|
|
17165
|
+
const textParts = [];
|
|
17166
|
+
for (const sheetName of sheetNames) {
|
|
17167
|
+
const sheet = workbook.Sheets[sheetName];
|
|
17168
|
+
const range = this.xlsx.utils.decode_range(sheet["!ref"] || "A1");
|
|
17169
|
+
for (let row = range.s.r; row <= range.e.r; row++) for (let col = range.s.c; col <= range.e.c; col++) {
|
|
17170
|
+
const cell = sheet[this.xlsx.utils.encode_cell({
|
|
17171
|
+
r: row,
|
|
17172
|
+
c: col
|
|
17173
|
+
})];
|
|
17174
|
+
if (cell) {
|
|
17175
|
+
const value = this.getCellValue(cell);
|
|
17176
|
+
if (value.trim().length > 0) textParts.push(value);
|
|
17177
|
+
}
|
|
17178
|
+
}
|
|
17179
|
+
}
|
|
17180
|
+
return textParts.join(" ");
|
|
17181
|
+
}
|
|
17182
|
+
/**
|
|
17183
|
+
* Get workbook metadata
|
|
17184
|
+
*/
|
|
17185
|
+
getMetadata(buffer) {
|
|
17186
|
+
if (!this.xlsx) throw new Error("[XlsxProcessor] XLSX support requires xlsx package. Install with: npm install xlsx");
|
|
17187
|
+
const workbook = this.parse(buffer);
|
|
17188
|
+
return {
|
|
17189
|
+
sheetNames: workbook.SheetNames,
|
|
17190
|
+
sheetCount: workbook.SheetNames.length
|
|
17191
|
+
};
|
|
17192
|
+
}
|
|
17193
|
+
};
|
|
17194
|
+
}));
|
|
17195
|
+
|
|
17196
|
+
//#endregion
|
|
17197
|
+
//#region src/document/DocumentProcessor.ts
|
|
17198
|
+
/**
 * Factory for DocumentProcessor.
 *
 * @returns {DocumentProcessor} A newly constructed processor.
 */
function createDocumentProcessor() {
	const processor = new DocumentProcessor();
	return processor;
}
|
|
17204
|
+
var DocumentProcessor;
|
|
17205
|
+
var init_DocumentProcessor = __esmMin((() => {
|
|
17206
|
+
init_OCRProcessor();
|
|
17207
|
+
init_JsonProcessor();
|
|
17208
|
+
init_CsvProcessor();
|
|
17209
|
+
init_XlsxProcessor();
|
|
17210
|
+
DocumentProcessor = class {
|
|
17211
|
+
constructor() {
|
|
17212
|
+
try {
|
|
17213
|
+
this.pdfParse = require("pdf-parse");
|
|
17214
|
+
} catch {}
|
|
17215
|
+
try {
|
|
17216
|
+
this.mammoth = require("mammoth");
|
|
17217
|
+
} catch {}
|
|
17218
|
+
this.ocrProcessor = new OCRProcessor();
|
|
17219
|
+
this.jsonProcessor = new JsonProcessor();
|
|
17220
|
+
this.csvProcessor = new CsvProcessor();
|
|
17221
|
+
this.xlsxProcessor = new XlsxProcessor();
|
|
17222
|
+
}
|
|
17223
|
+
/**
|
|
17224
|
+
* Extract text from document buffer
|
|
17225
|
+
*/
|
|
17226
|
+
async extractText(buffer, options) {
|
|
17227
|
+
const format = options?.format || this.detectFormat(buffer);
|
|
17228
|
+
if (!format) throw new Error("[DocumentProcessor] Unable to detect document format. Supported: PDF, DOCX, TXT, images (with OCR)");
|
|
17229
|
+
const maxSize = options?.maxSize || 50 * 1024 * 1024;
|
|
17230
|
+
if (buffer.length > maxSize) throw new Error(`[DocumentProcessor] Document size (${buffer.length} bytes) exceeds maximum (${maxSize} bytes)`);
|
|
17231
|
+
switch (format) {
|
|
17232
|
+
case "pdf": return this.extractPdfText(buffer, options);
|
|
17233
|
+
case "docx": return this.extractDocxText(buffer, options);
|
|
17234
|
+
case "txt": return buffer.toString("utf-8");
|
|
17235
|
+
case "image": return this.extractImageText(buffer, options);
|
|
17236
|
+
case "json": return this.extractJsonText(buffer, options);
|
|
17237
|
+
case "csv": return this.extractCsvText(buffer, options);
|
|
17238
|
+
case "xlsx": return this.extractXlsxText(buffer, options);
|
|
17239
|
+
default: throw new Error(`[DocumentProcessor] Unsupported format: ${format}`);
|
|
17240
|
+
}
|
|
17241
|
+
}
|
|
17242
|
+
/**
|
|
17243
|
+
* Get document metadata
|
|
17244
|
+
*/
|
|
17245
|
+
async getMetadata(buffer, options) {
|
|
17246
|
+
const format = options?.format || this.detectFormat(buffer);
|
|
17247
|
+
if (!format) throw new Error("[DocumentProcessor] Unable to detect document format");
|
|
17248
|
+
switch (format) {
|
|
17249
|
+
case "pdf": return this.getPdfMetadata(buffer, options);
|
|
17250
|
+
case "docx": return this.getDocxMetadata(buffer, options);
|
|
17251
|
+
case "txt": return {
|
|
17252
|
+
format: "txt",
|
|
17253
|
+
pages: void 0
|
|
17254
|
+
};
|
|
17255
|
+
case "image": return this.getImageMetadata(buffer, options);
|
|
17256
|
+
case "json": return this.getJsonMetadata(buffer, options);
|
|
17257
|
+
case "csv": return this.getCsvMetadata(buffer, options);
|
|
17258
|
+
case "xlsx": return this.getXlsxMetadata(buffer, options);
|
|
17259
|
+
default: throw new Error(`[DocumentProcessor] Unsupported format: ${format}`);
|
|
17260
|
+
}
|
|
17261
|
+
}
|
|
17262
|
+
/**
|
|
17263
|
+
* Detect document format from buffer
|
|
17264
|
+
*/
|
|
17265
|
+
detectFormat(buffer) {
|
|
17266
|
+
if (buffer.length < 4) return null;
|
|
17267
|
+
if (buffer.toString("utf-8", 0, 4) === "%PDF") return "pdf";
|
|
17268
|
+
if (buffer.length >= 8 && buffer[0] === 137 && buffer[1] === 80 && buffer[2] === 78 && buffer[3] === 71) return "image";
|
|
17269
|
+
if (buffer[0] === 255 && buffer[1] === 216 && buffer[2] === 255) return "image";
|
|
17270
|
+
if (buffer[0] === 73 && buffer[1] === 73 && buffer[2] === 42 && buffer[3] === 0 || buffer[0] === 77 && buffer[1] === 77 && buffer[2] === 0 && buffer[3] === 42) return "image";
|
|
17271
|
+
if (buffer[0] === 66 && buffer[1] === 77) return "image";
|
|
17272
|
+
if (buffer.length >= 12 && buffer[0] === 82 && buffer[1] === 73 && buffer[2] === 70 && buffer[3] === 70 && buffer[8] === 87 && buffer[9] === 69 && buffer[10] === 66 && buffer[11] === 80) return "image";
|
|
17273
|
+
if (buffer[0] === 80 && buffer[1] === 75) {
|
|
17274
|
+
const zipHeader = buffer.toString("utf-8", 0, Math.min(500, buffer.length));
|
|
17275
|
+
if (zipHeader.includes("word/") || zipHeader.includes("[Content_Types].xml")) return "docx";
|
|
17276
|
+
if (zipHeader.includes("xl/")) return "xlsx";
|
|
17277
|
+
}
|
|
17278
|
+
const text = buffer.toString("utf-8");
|
|
17279
|
+
const trimmed = text.trim();
|
|
17280
|
+
if (trimmed.startsWith("{") && trimmed.endsWith("}") || trimmed.startsWith("[") && trimmed.endsWith("]")) {
|
|
17281
|
+
if (this.jsonProcessor.isValid(buffer)) return "json";
|
|
17282
|
+
}
|
|
17283
|
+
const lines = text.split(/\r?\n/).slice(0, 5);
|
|
17284
|
+
if (lines.length >= 2) for (const delimiter of [
|
|
17285
|
+
",",
|
|
17286
|
+
" ",
|
|
17287
|
+
";",
|
|
17288
|
+
"|"
|
|
17289
|
+
]) {
|
|
17290
|
+
const counts = lines.map((line) => (line.match(new RegExp(delimiter, "g")) || []).length);
|
|
17291
|
+
if (counts[0] > 0 && counts.every((c) => c === counts[0])) return "csv";
|
|
17292
|
+
}
|
|
17293
|
+
const sample = buffer.slice(0, Math.min(1e3, buffer.length));
|
|
17294
|
+
if (sample.filter((byte) => byte < 32 && byte !== 9 && byte !== 10 && byte !== 13).length < sample.length * .1) return "txt";
|
|
17295
|
+
return null;
|
|
17296
|
+
}
|
|
17297
|
+
/**
|
|
17298
|
+
* Check if format is supported
|
|
17299
|
+
*/
|
|
17300
|
+
isFormatSupported(format) {
|
|
17301
|
+
switch (format) {
|
|
17302
|
+
case "pdf": return !!this.pdfParse;
|
|
17303
|
+
case "docx": return !!this.mammoth;
|
|
17304
|
+
case "txt": return true;
|
|
17305
|
+
case "image": return this.ocrProcessor.isAvailable();
|
|
17306
|
+
case "json": return true;
|
|
17307
|
+
case "csv": return true;
|
|
17308
|
+
case "xlsx": return this.xlsxProcessor.isAvailable();
|
|
17309
|
+
default: return false;
|
|
17310
|
+
}
|
|
17311
|
+
}
|
|
17312
|
+
/**
|
|
17313
|
+
* Extract text from PDF
|
|
17314
|
+
*/
|
|
17315
|
+
async extractPdfText(buffer, options) {
|
|
17316
|
+
if (!this.pdfParse) throw new Error("[DocumentProcessor] PDF support requires pdf-parse. Install with: npm install pdf-parse");
|
|
17317
|
+
try {
|
|
17318
|
+
const data = await this.pdfParse(buffer, {
|
|
17319
|
+
password: options?.password,
|
|
17320
|
+
max: options?.pages ? Math.max(...options.pages) : void 0
|
|
17321
|
+
});
|
|
17322
|
+
if (options?.pages) return data.text;
|
|
17323
|
+
return data.text || "";
|
|
17324
|
+
} catch (error) {
|
|
17325
|
+
throw new Error(`[DocumentProcessor] PDF extraction failed: ${error.message}`);
|
|
17326
|
+
}
|
|
17327
|
+
}
|
|
17328
|
+
/**
|
|
17329
|
+
* Extract text from DOCX
|
|
17330
|
+
*/
|
|
17331
|
+
async extractDocxText(buffer, _options) {
|
|
17332
|
+
if (!this.mammoth) throw new Error("[DocumentProcessor] DOCX support requires mammoth. Install with: npm install mammoth");
|
|
17333
|
+
try {
|
|
17334
|
+
return (await this.mammoth.extractRawText({ buffer })).value || "";
|
|
17335
|
+
} catch (error) {
|
|
17336
|
+
throw new Error(`[DocumentProcessor] DOCX extraction failed: ${error.message}`);
|
|
17337
|
+
}
|
|
17338
|
+
}
|
|
17339
|
+
/**
|
|
17340
|
+
* Get PDF metadata
|
|
17341
|
+
*/
|
|
17342
|
+
async getPdfMetadata(buffer, _options) {
|
|
17343
|
+
if (!this.pdfParse) throw new Error("[DocumentProcessor] PDF support requires pdf-parse. Install with: npm install pdf-parse");
|
|
17344
|
+
try {
|
|
17345
|
+
const data = await this.pdfParse(buffer, { password: _options?.password });
|
|
17346
|
+
return {
|
|
17347
|
+
format: "pdf",
|
|
17348
|
+
pages: data.numpages,
|
|
17349
|
+
title: data.info?.Title,
|
|
17350
|
+
author: data.info?.Author,
|
|
17351
|
+
creationDate: data.info?.CreationDate ? new Date(data.info.CreationDate) : void 0,
|
|
17352
|
+
modifiedDate: data.info?.ModDate ? new Date(data.info.ModDate) : void 0,
|
|
17353
|
+
custom: data.info
|
|
17354
|
+
};
|
|
17355
|
+
} catch (error) {
|
|
17356
|
+
throw new Error(`[DocumentProcessor] PDF metadata extraction failed: ${error.message}`);
|
|
17357
|
+
}
|
|
17358
|
+
}
|
|
17359
|
+
/**
|
|
17360
|
+
* Get DOCX metadata
|
|
17361
|
+
*/
|
|
17362
|
+
async getDocxMetadata(_buffer, _options) {
|
|
17363
|
+
return {
|
|
17364
|
+
format: "docx",
|
|
17365
|
+
pages: void 0
|
|
17366
|
+
};
|
|
17367
|
+
}
|
|
17368
|
+
/**
|
|
17369
|
+
* Extract text from image using OCR
|
|
17370
|
+
*/
|
|
17371
|
+
async extractImageText(buffer, options) {
|
|
17372
|
+
if (!this.ocrProcessor.isAvailable()) throw new Error("[DocumentProcessor] Image/OCR support requires tesseract.js. Install with: npm install tesseract.js");
|
|
17373
|
+
try {
|
|
17374
|
+
return (await this.ocrProcessor.recognizeText(buffer, options?.ocrOptions)).text;
|
|
17375
|
+
} catch (error) {
|
|
17376
|
+
throw new Error(`[DocumentProcessor] Image text extraction failed: ${error.message}`);
|
|
17377
|
+
}
|
|
17378
|
+
}
|
|
17379
|
+
/**
|
|
17380
|
+
* Get image metadata
|
|
17381
|
+
*/
|
|
17382
|
+
async getImageMetadata(buffer, options) {
|
|
17383
|
+
if (!this.ocrProcessor.isAvailable()) return {
|
|
17384
|
+
format: "image",
|
|
17385
|
+
pages: void 0,
|
|
17386
|
+
usedOCR: false
|
|
17387
|
+
};
|
|
17388
|
+
try {
|
|
17389
|
+
return {
|
|
17390
|
+
format: "image",
|
|
17391
|
+
pages: void 0,
|
|
17392
|
+
usedOCR: true,
|
|
17393
|
+
ocrConfidence: (await this.ocrProcessor.recognizeText(buffer, options?.ocrOptions)).confidence
|
|
17394
|
+
};
|
|
17395
|
+
} catch {
|
|
17396
|
+
return {
|
|
17397
|
+
format: "image",
|
|
17398
|
+
pages: void 0,
|
|
17399
|
+
usedOCR: false
|
|
17400
|
+
};
|
|
17401
|
+
}
|
|
17402
|
+
}
|
|
17403
|
+
/**
|
|
17404
|
+
* Extract text from JSON
|
|
17405
|
+
*/
|
|
17406
|
+
async extractJsonText(buffer, _options) {
|
|
17407
|
+
try {
|
|
17408
|
+
return this.jsonProcessor.extractText(buffer);
|
|
17409
|
+
} catch (error) {
|
|
17410
|
+
throw new Error(`[DocumentProcessor] JSON extraction failed: ${error.message}`);
|
|
17411
|
+
}
|
|
17412
|
+
}
|
|
17413
|
+
/**
|
|
17414
|
+
* Extract text from CSV
|
|
17415
|
+
*/
|
|
17416
|
+
async extractCsvText(buffer, _options) {
|
|
17417
|
+
try {
|
|
17418
|
+
return this.csvProcessor.extractText(buffer);
|
|
17419
|
+
} catch (error) {
|
|
17420
|
+
throw new Error(`[DocumentProcessor] CSV extraction failed: ${error.message}`);
|
|
17421
|
+
}
|
|
17422
|
+
}
|
|
17423
|
+
/**
|
|
17424
|
+
* Extract text from XLSX
|
|
17425
|
+
*/
|
|
17426
|
+
async extractXlsxText(buffer, _options) {
|
|
17427
|
+
if (!this.xlsxProcessor.isAvailable()) throw new Error("[DocumentProcessor] XLSX support requires xlsx package. Install with: npm install xlsx");
|
|
17428
|
+
try {
|
|
17429
|
+
return this.xlsxProcessor.extractText(buffer);
|
|
17430
|
+
} catch (error) {
|
|
17431
|
+
throw new Error(`[DocumentProcessor] XLSX extraction failed: ${error.message}`);
|
|
17432
|
+
}
|
|
17433
|
+
}
|
|
17434
|
+
/**
|
|
17435
|
+
* Get JSON metadata
|
|
17436
|
+
*/
|
|
17437
|
+
async getJsonMetadata(buffer, _options) {
|
|
17438
|
+
try {
|
|
17439
|
+
const data = this.jsonProcessor.parse(buffer);
|
|
17440
|
+
const isArray = Array.isArray(data);
|
|
17441
|
+
return {
|
|
17442
|
+
format: "json",
|
|
17443
|
+
pages: void 0,
|
|
17444
|
+
custom: {
|
|
17445
|
+
isArray,
|
|
17446
|
+
itemCount: isArray ? data.length : Object.keys(data).length
|
|
17447
|
+
}
|
|
17448
|
+
};
|
|
17449
|
+
} catch {
|
|
17450
|
+
return {
|
|
17451
|
+
format: "json",
|
|
17452
|
+
pages: void 0
|
|
17453
|
+
};
|
|
17454
|
+
}
|
|
17455
|
+
}
|
|
17456
|
+
/**
|
|
17457
|
+
* Get CSV metadata
|
|
17458
|
+
*/
|
|
17459
|
+
async getCsvMetadata(buffer, _options) {
|
|
17460
|
+
try {
|
|
17461
|
+
const info = this.csvProcessor.getColumnInfo(buffer);
|
|
17462
|
+
return {
|
|
17463
|
+
format: "csv",
|
|
17464
|
+
pages: void 0,
|
|
17465
|
+
custom: {
|
|
17466
|
+
rowCount: info.rowCount,
|
|
17467
|
+
columnCount: info.columnCount,
|
|
17468
|
+
headers: info.headers
|
|
17469
|
+
}
|
|
17470
|
+
};
|
|
17471
|
+
} catch {
|
|
17472
|
+
return {
|
|
17473
|
+
format: "csv",
|
|
17474
|
+
pages: void 0
|
|
17475
|
+
};
|
|
17476
|
+
}
|
|
17477
|
+
}
|
|
17478
|
+
/**
|
|
17479
|
+
* Get XLSX metadata
|
|
17480
|
+
*/
|
|
17481
|
+
async getXlsxMetadata(buffer, _options) {
|
|
17482
|
+
if (!this.xlsxProcessor.isAvailable()) return {
|
|
17483
|
+
format: "xlsx",
|
|
17484
|
+
pages: void 0
|
|
17485
|
+
};
|
|
17486
|
+
try {
|
|
17487
|
+
const metadata = this.xlsxProcessor.getMetadata(buffer);
|
|
17488
|
+
return {
|
|
17489
|
+
format: "xlsx",
|
|
17490
|
+
pages: void 0,
|
|
17491
|
+
custom: {
|
|
17492
|
+
sheetNames: metadata.sheetNames,
|
|
17493
|
+
sheetCount: metadata.sheetCount
|
|
17494
|
+
}
|
|
17495
|
+
};
|
|
17496
|
+
} catch {
|
|
17497
|
+
return {
|
|
17498
|
+
format: "xlsx",
|
|
17499
|
+
pages: void 0
|
|
17500
|
+
};
|
|
17501
|
+
}
|
|
17502
|
+
}
|
|
17503
|
+
/**
|
|
17504
|
+
* Get OCR processor instance
|
|
17505
|
+
*/
|
|
17506
|
+
getOCRProcessor() {
|
|
17507
|
+
return this.ocrProcessor;
|
|
17508
|
+
}
|
|
17509
|
+
/**
|
|
17510
|
+
* Get JSON processor instance
|
|
17511
|
+
*/
|
|
17512
|
+
getJsonProcessor() {
|
|
17513
|
+
return this.jsonProcessor;
|
|
17514
|
+
}
|
|
17515
|
+
/**
|
|
17516
|
+
* Get CSV processor instance
|
|
17517
|
+
*/
|
|
17518
|
+
getCsvProcessor() {
|
|
17519
|
+
return this.csvProcessor;
|
|
17520
|
+
}
|
|
17521
|
+
/**
|
|
17522
|
+
* Get XLSX processor instance
|
|
17523
|
+
*/
|
|
17524
|
+
getXlsxProcessor() {
|
|
17525
|
+
return this.xlsxProcessor;
|
|
17526
|
+
}
|
|
17527
|
+
};
|
|
17528
|
+
}));
|
|
17529
|
+
|
|
17530
|
+
//#endregion
|
|
17531
|
+
//#region src/document/index.ts
|
|
17532
|
+
// Export map for the src/document/index.ts region. Each entry is a thunk that
// returns the current binding, so the value is read when the export is
// accessed, not when this map is built. This object is what the lazy
// `(init_document(), document_exports)` loads elsewhere in the file resolve to.
// NOTE(review): presumably __exportAll installs these thunks as getters —
// confirm against the bundler runtime helper.
var document_exports = /* @__PURE__ */ __exportAll({
|
|
17533
|
+
CsvProcessor: () => CsvProcessor,
|
|
17534
|
+
DocumentProcessor: () => DocumentProcessor,
|
|
17535
|
+
JsonProcessor: () => JsonProcessor,
|
|
17536
|
+
OCRProcessor: () => OCRProcessor,
|
|
17537
|
+
XlsxProcessor: () => XlsxProcessor,
|
|
17538
|
+
createCsvProcessor: () => createCsvProcessor,
|
|
17539
|
+
createDocumentProcessor: () => createDocumentProcessor,
|
|
17540
|
+
createJsonProcessor: () => createJsonProcessor,
|
|
17541
|
+
createOCRProcessor: () => createOCRProcessor,
|
|
17542
|
+
createXlsxProcessor: () => createXlsxProcessor
|
|
17543
|
+
});
|
|
17544
|
+
// Initializer for the document module: runs the initializer of each document
// processor module in the order listed below.
// NOTE(review): presumably __esmMin makes this body run at most once —
// confirm against the bundler runtime helper.
var init_document = __esmMin((() => {
|
|
17545
|
+
init_DocumentProcessor();
|
|
17546
|
+
init_OCRProcessor();
|
|
17547
|
+
init_JsonProcessor();
|
|
17548
|
+
init_CsvProcessor();
|
|
17549
|
+
init_XlsxProcessor();
|
|
17550
|
+
}));
|
|
17551
|
+
|
|
17552
|
+
//#endregion
|
|
17553
|
+
//#region src/workers/WorkerPool.ts
|
|
17554
|
+
/**
|
|
17555
|
+
* Worker thread pool for parallel processing
|
|
17556
|
+
*/
|
|
17557
|
+
/**
|
|
17558
|
+
* Create a worker pool instance
|
|
17559
|
+
*/
|
|
17560
|
+
function createWorkerPool(config) {
  // Thin factory: builds a pool from the given configuration and hands it back.
  // The pool still has to be initialized by the caller before it has workers.
  const pool = new WorkerPool(config);
  return pool;
}
|
|
17563
|
+
var WorkerPool;
var init_WorkerPool = __esmMin((() => {
  /**
   * Pool of worker threads with a bounded FIFO task queue.
   *
   * Each task is posted to one idle worker; the worker's reply message is
   * expected to carry `result`, `error` and `processingTime` fields.
   * Every promise returned by `execute()` is settled: it resolves with the
   * worker's result, or rejects when the worker reports an error, crashes,
   * exits, or the pool is terminated.
   */
  WorkerPool = class {
    /**
     * @param {object} [config]
     * @param {number} [config.numWorkers] - defaults to the number of CPUs
     * @param {number} [config.maxQueueSize] - defaults to 100
     * @param {number} [config.idleTimeout] - defaults to 30000; stored only, not read by this class
     */
    constructor(config = {}) {
      this.workers = [];
      this.availableWorkers = [];
      this.taskQueue = [];
      this.totalProcessingTime = 0;
      // Set while terminate() is shutting workers down, so their exit is not
      // reported as a failure and no replacements are spawned.
      this.terminating = false;
      this.config = {
        numWorkers: config.numWorkers || (0, os.cpus)().length,
        maxQueueSize: config.maxQueueSize || 100,
        idleTimeout: config.idleTimeout || 3e4
      };
      this.stats = {
        activeWorkers: 0,
        idleWorkers: 0,
        queueSize: 0,
        totalProcessed: 0,
        totalErrors: 0,
        avgProcessingTime: 0
      };
      this.workerPath = (0, path.join)(__dirname, "worker.js");
    }
    /**
     * Initialize worker pool by spawning `config.numWorkers` workers.
     */
    async initialize() {
      this.terminating = false;
      for (let i = 0; i < this.config.numWorkers; i++) await this.createWorker();
    }
    /**
     * Create a new worker, wire its events and register it as idle.
     * @returns the created worker
     */
    async createWorker() {
      const worker = new worker_threads.Worker(this.workerPath);
      worker.on("message", (result) => {
        this.handleWorkerResult(worker, result);
      });
      worker.on("error", (error) => {
        console.error("[WorkerPool] Worker error:", error);
        this.stats.totalErrors++;
        // Settle the task this worker was running; otherwise its promise never resolves.
        this.failCurrentTask(worker, error instanceof Error ? error : new Error(String(error)));
        this.removeWorker(worker);
        if (this.terminating) return;
        // Replace the crashed worker, then give queued tasks a chance to run on it.
        // NOTE(review): a worker script that fails on every start is respawned
        // without limit, as before.
        this.createWorker().then(() => this.processQueue()).catch((spawnError) => {
          console.error("[WorkerPool] Failed to replace worker:", spawnError);
        });
      });
      worker.on("exit", (code) => {
        // Exits triggered by terminate() are expected and not logged.
        if (code !== 0 && !this.terminating) console.error(`[WorkerPool] Worker exited with code ${code}`);
        // No-op when the "error" handler (or terminate) already settled the task.
        this.failCurrentTask(worker, new Error(`[WorkerPool] Worker exited with code ${code} before completing its task`));
        this.removeWorker(worker);
      });
      this.workers.push(worker);
      this.availableWorkers.push(worker);
      this.stats.idleWorkers++;
      return worker;
    }
    /**
     * Execute a task on the worker pool.
     * @param task - value posted to the worker
     * @returns promise for the worker's result
     * @throws (as a rejection) when the queue already holds `maxQueueSize` tasks
     */
    async execute(task) {
      if (this.taskQueue.length >= this.config.maxQueueSize) throw new Error(`[WorkerPool] Queue is full (max: ${this.config.maxQueueSize})`);
      return new Promise((resolve, reject) => {
        this.taskQueue.push({
          task,
          resolve,
          reject
        });
        this.stats.queueSize = this.taskQueue.length;
        this.processQueue();
      });
    }
    /**
     * Process task queue: hand queued tasks to idle workers until either runs out.
     */
    processQueue() {
      while (this.taskQueue.length > 0 && this.availableWorkers.length > 0) {
        const worker = this.availableWorkers.shift();
        const { task, resolve, reject } = this.taskQueue.shift();
        this.stats.idleWorkers--;
        this.stats.activeWorkers++;
        this.stats.queueSize = this.taskQueue.length;
        worker.__currentTask = {
          resolve,
          reject,
          startTime: Date.now()
        };
        worker.postMessage(task);
      }
    }
    /**
     * Handle worker result: update statistics, return the worker to the idle
     * list and settle the task's promise.
     */
    handleWorkerResult(worker, result) {
      const currentTask = worker.__currentTask;
      if (!currentTask) return;
      const outcome = result ?? {};
      // Fall back to wall-clock time when the worker did not report a duration,
      // so the running average cannot become NaN.
      const elapsed = Number.isFinite(outcome.processingTime) ? outcome.processingTime : Date.now() - currentTask.startTime;
      this.stats.activeWorkers--;
      this.stats.idleWorkers++;
      this.stats.totalProcessed++;
      this.totalProcessingTime += elapsed;
      this.stats.avgProcessingTime = this.totalProcessingTime / this.stats.totalProcessed;
      this.availableWorkers.push(worker);
      delete worker.__currentTask;
      if (outcome.error) {
        this.stats.totalErrors++;
        currentTask.reject(new Error(outcome.error));
      } else currentTask.resolve(outcome.result);
      this.processQueue();
    }
    /**
     * Reject the task a worker was running, if any, and release its
     * "active" slot in the statistics. Internal helper.
     */
    failCurrentTask(worker, error) {
      const currentTask = worker.__currentTask;
      if (!currentTask) return;
      delete worker.__currentTask;
      this.stats.activeWorkers--;
      currentTask.reject(error);
    }
    /**
     * Remove worker from pool (both the full list and the idle list).
     */
    removeWorker(worker) {
      const index = this.workers.indexOf(worker);
      if (index !== -1) this.workers.splice(index, 1);
      const availableIndex = this.availableWorkers.indexOf(worker);
      if (availableIndex !== -1) {
        this.availableWorkers.splice(availableIndex, 1);
        this.stats.idleWorkers--;
      }
    }
    /**
     * Get pool statistics (a shallow copy).
     */
    getStats() {
      return { ...this.stats };
    }
    /**
     * Terminate all workers. Tasks that are still running or queued are
     * rejected so that no caller is left waiting on a promise that can
     * never settle.
     */
    async terminate() {
      this.terminating = true;
      const workers = this.workers;
      const queued = this.taskQueue;
      this.workers = [];
      this.availableWorkers = [];
      this.taskQueue = [];
      const reason = new Error("[WorkerPool] Pool terminated before the task completed");
      for (const worker of workers) this.failCurrentTask(worker, reason);
      for (const { reject } of queued) reject(reason);
      this.stats.activeWorkers = 0;
      this.stats.idleWorkers = 0;
      this.stats.queueSize = 0;
      await Promise.all(workers.map((worker) => worker.terminate()));
    }
  };
}));
|
|
17700
|
+
|
|
17701
|
+
//#endregion
|
|
17702
|
+
//#region src/workers/index.ts
|
|
17703
|
+
// Export map for the src/workers/index.ts region. Each entry is a thunk that
// returns the current binding, so the value is read when the export is
// accessed. This object is what the lazy `(init_workers(), workers_exports)`
// loads elsewhere in the file resolve to.
// NOTE(review): presumably __exportAll installs these thunks as getters —
// confirm against the bundler runtime helper.
var workers_exports = /* @__PURE__ */ __exportAll({
|
|
17704
|
+
WorkerPool: () => WorkerPool,
|
|
17705
|
+
createWorkerPool: () => createWorkerPool
|
|
17706
|
+
});
|
|
17707
|
+
// Initializer for the workers module: runs the WorkerPool module initializer.
// NOTE(review): presumably __esmMin makes this body run at most once —
// confirm against the bundler runtime helper.
var init_workers = __esmMin((() => {
|
|
17708
|
+
init_WorkerPool();
|
|
17709
|
+
}));
|
|
17710
|
+
|
|
15757
17711
|
//#endregion
|
|
15758
17712
|
//#region src/detector.ts
|
|
15759
17713
|
var OpenRedaction = class OpenRedaction {
|
|
@@ -16403,14 +18357,14 @@ var OpenRedaction = class OpenRedaction {
|
|
|
16403
18357
|
* Run health check
|
|
16404
18358
|
*/
|
|
16405
18359
|
async healthCheck(options) {
|
|
16406
|
-
const { HealthChecker } = await Promise.resolve().then(() =>
|
|
18360
|
+
const { HealthChecker } = await Promise.resolve().then(() => (init_HealthCheck(), HealthCheck_exports));
|
|
16407
18361
|
return new HealthChecker(this).check(options);
|
|
16408
18362
|
}
|
|
16409
18363
|
/**
|
|
16410
18364
|
* Quick health check (minimal overhead)
|
|
16411
18365
|
*/
|
|
16412
18366
|
async quickHealthCheck() {
|
|
16413
|
-
const { HealthChecker } = await Promise.resolve().then(() =>
|
|
18367
|
+
const { HealthChecker } = await Promise.resolve().then(() => (init_HealthCheck(), HealthCheck_exports));
|
|
16414
18368
|
return new HealthChecker(this).quickCheck();
|
|
16415
18369
|
}
|
|
16416
18370
|
/**
|
|
@@ -16421,7 +18375,7 @@ var OpenRedaction = class OpenRedaction {
|
|
|
16421
18375
|
*/
|
|
16422
18376
|
async detectDocument(buffer, options) {
|
|
16423
18377
|
if (this.rbacManager && !this.rbacManager.hasPermission("detection:detect")) throw new Error("[OpenRedaction] Permission denied: detection:detect required");
|
|
16424
|
-
const { createDocumentProcessor } = await Promise.resolve().then(() =>
|
|
18378
|
+
const { createDocumentProcessor } = await Promise.resolve().then(() => (init_document(), document_exports));
|
|
16425
18379
|
const processor = createDocumentProcessor();
|
|
16426
18380
|
const extractionStart = performance.now();
|
|
16427
18381
|
const text = await processor.extractText(buffer, options);
|
|
@@ -16450,7 +18404,7 @@ var OpenRedaction = class OpenRedaction {
|
|
|
16450
18404
|
* Significantly faster for processing many texts
|
|
16451
18405
|
*/
|
|
16452
18406
|
static async detectBatch(texts, options) {
|
|
16453
|
-
const { createWorkerPool } = await Promise.resolve().then(() =>
|
|
18407
|
+
const { createWorkerPool } = await Promise.resolve().then(() => (init_workers(), workers_exports));
|
|
16454
18408
|
const pool = createWorkerPool({ numWorkers: options?.numWorkers });
|
|
16455
18409
|
try {
|
|
16456
18410
|
await pool.initialize();
|
|
@@ -16470,7 +18424,7 @@ var OpenRedaction = class OpenRedaction {
|
|
|
16470
18424
|
* Efficient for processing many documents at once
|
|
16471
18425
|
*/
|
|
16472
18426
|
static async detectDocumentsBatch(buffers, options) {
|
|
16473
|
-
const { createWorkerPool } = await Promise.resolve().then(() =>
|
|
18427
|
+
const { createWorkerPool } = await Promise.resolve().then(() => (init_workers(), workers_exports));
|
|
16474
18428
|
const pool = createWorkerPool({ numWorkers: options?.numWorkers });
|
|
16475
18429
|
try {
|
|
16476
18430
|
await pool.initialize();
|
|
@@ -16489,6 +18443,7 @@ var OpenRedaction = class OpenRedaction {
|
|
|
16489
18443
|
|
|
16490
18444
|
//#endregion
|
|
16491
18445
|
//#region src/streaming/StreamingDetector.ts
|
|
18446
|
+
init_document();
|
|
16492
18447
|
/**
|
|
16493
18448
|
* Streaming detector for large documents
|
|
16494
18449
|
*/
|
|
@@ -16637,6 +18592,7 @@ function createStreamingDetector(detector, options) {
|
|
|
16637
18592
|
|
|
16638
18593
|
//#endregion
|
|
16639
18594
|
//#region src/batch/BatchProcessor.ts
|
|
18595
|
+
init_workers();
|
|
16640
18596
|
/**
|
|
16641
18597
|
* Batch processor for processing multiple documents
|
|
16642
18598
|
*/
|
|
@@ -18554,6 +20510,7 @@ function createAPIServer(config) {
|
|
|
18554
20510
|
//#endregion
|
|
18555
20511
|
//#region src/index.ts
|
|
18556
20512
|
init_ConfigExporter();
|
|
20513
|
+
init_HealthCheck();
|
|
18557
20514
|
|
|
18558
20515
|
//#endregion
|
|
18559
20516
|
exports.ADMIN_ROLE = ADMIN_ROLE;
|
|
@@ -18565,21 +20522,21 @@ exports.ConfigExporter = ConfigExporter;
|
|
|
18565
20522
|
exports.ConfigLoader = ConfigLoader;
|
|
18566
20523
|
exports.ConsoleAuditLogger = ConsoleAuditLogger;
|
|
18567
20524
|
exports.ContextRulesEngine = ContextRulesEngine;
|
|
18568
|
-
exports.CsvProcessor =
|
|
20525
|
+
exports.CsvProcessor = CsvProcessor;
|
|
18569
20526
|
exports.DEFAULT_DOMAIN_VOCABULARIES = DEFAULT_DOMAIN_VOCABULARIES;
|
|
18570
20527
|
exports.DEFAULT_PROXIMITY_RULES = DEFAULT_PROXIMITY_RULES;
|
|
18571
20528
|
exports.DEFAULT_SEVERITY_MAP = DEFAULT_SEVERITY_MAP;
|
|
18572
20529
|
exports.DEFAULT_TIER_QUOTAS = DEFAULT_TIER_QUOTAS;
|
|
18573
|
-
exports.DocumentProcessor =
|
|
20530
|
+
exports.DocumentProcessor = DocumentProcessor;
|
|
18574
20531
|
exports.ExplainAPI = ExplainAPI;
|
|
18575
20532
|
exports.GRAFANA_DASHBOARD_TEMPLATE = GRAFANA_DASHBOARD_TEMPLATE;
|
|
18576
|
-
exports.HealthChecker =
|
|
20533
|
+
exports.HealthChecker = HealthChecker;
|
|
18577
20534
|
exports.InMemoryAuditLogger = InMemoryAuditLogger;
|
|
18578
20535
|
exports.InMemoryMetricsCollector = InMemoryMetricsCollector;
|
|
18579
|
-
exports.JsonProcessor =
|
|
20536
|
+
exports.JsonProcessor = JsonProcessor;
|
|
18580
20537
|
exports.LocalLearningStore = LocalLearningStore;
|
|
18581
20538
|
exports.NERDetector = NERDetector;
|
|
18582
|
-
exports.OCRProcessor =
|
|
20539
|
+
exports.OCRProcessor = OCRProcessor;
|
|
18583
20540
|
exports.OPERATOR_ROLE = OPERATOR_ROLE;
|
|
18584
20541
|
exports.OpenRedaction = OpenRedaction;
|
|
18585
20542
|
exports.OpenRedactionError = OpenRedactionError;
|
|
@@ -18599,10 +20556,8 @@ exports.TenantQuotaExceededError = TenantQuotaExceededError;
|
|
|
18599
20556
|
exports.TenantSuspendedError = TenantSuspendedError;
|
|
18600
20557
|
exports.VIEWER_ROLE = VIEWER_ROLE;
|
|
18601
20558
|
exports.WebhookManager = WebhookManager;
|
|
18602
|
-
exports.WorkerPool =
|
|
18603
|
-
exports.XlsxProcessor =
|
|
18604
|
-
exports.__toCommonJS = __toCommonJS;
|
|
18605
|
-
exports.__toESM = __toESM;
|
|
20559
|
+
exports.WorkerPool = WorkerPool;
|
|
20560
|
+
exports.XlsxProcessor = XlsxProcessor;
|
|
18606
20561
|
exports.allPatterns = allPatterns;
|
|
18607
20562
|
exports.analyzeContextFeatures = analyzeContextFeatures;
|
|
18608
20563
|
exports.analyzeFullContext = analyzeFullContext;
|
|
@@ -18620,18 +20575,18 @@ exports.createCacheDisabledError = createCacheDisabledError;
|
|
|
18620
20575
|
exports.createConfigLoadError = createConfigLoadError;
|
|
18621
20576
|
exports.createConfigPreset = createConfigPreset;
|
|
18622
20577
|
exports.createContextRulesEngine = createContextRulesEngine;
|
|
18623
|
-
exports.createCsvProcessor =
|
|
20578
|
+
exports.createCsvProcessor = createCsvProcessor;
|
|
18624
20579
|
exports.createCustomRole = createCustomRole;
|
|
18625
|
-
exports.createDocumentProcessor =
|
|
20580
|
+
exports.createDocumentProcessor = createDocumentProcessor;
|
|
18626
20581
|
exports.createExplainAPI = createExplainAPI;
|
|
18627
|
-
exports.createHealthChecker =
|
|
20582
|
+
exports.createHealthChecker = createHealthChecker;
|
|
18628
20583
|
exports.createHighMemoryError = createHighMemoryError;
|
|
18629
20584
|
exports.createInvalidPatternError = createInvalidPatternError;
|
|
18630
|
-
exports.createJsonProcessor =
|
|
20585
|
+
exports.createJsonProcessor = createJsonProcessor;
|
|
18631
20586
|
exports.createLearningDisabledError = createLearningDisabledError;
|
|
18632
20587
|
exports.createMultiPassDisabledError = createMultiPassDisabledError;
|
|
18633
20588
|
exports.createNERDetector = createNERDetector;
|
|
18634
|
-
exports.createOCRProcessor =
|
|
20589
|
+
exports.createOCRProcessor = createOCRProcessor;
|
|
18635
20590
|
exports.createOptimizationDisabledError = createOptimizationDisabledError;
|
|
18636
20591
|
exports.createPersistentAuditLogger = createPersistentAuditLogger;
|
|
18637
20592
|
exports.createPriorityOptimizer = createPriorityOptimizer;
|
|
@@ -18644,8 +20599,8 @@ exports.createStreamingDetector = createStreamingDetector;
|
|
|
18644
20599
|
exports.createTenantManager = createTenantManager;
|
|
18645
20600
|
exports.createValidationError = createValidationError;
|
|
18646
20601
|
exports.createWebhookManager = createWebhookManager;
|
|
18647
|
-
exports.createWorkerPool =
|
|
18648
|
-
exports.createXlsxProcessor =
|
|
20602
|
+
exports.createWorkerPool = createWorkerPool;
|
|
20603
|
+
exports.createXlsxProcessor = createXlsxProcessor;
|
|
18649
20604
|
exports.defaultPasses = defaultPasses;
|
|
18650
20605
|
exports.detectPII = detectPII;
|
|
18651
20606
|
exports.detectionsOverlap = detectionsOverlap;
|
|
@@ -18664,7 +20619,7 @@ exports.getPreset = getPreset;
|
|
|
18664
20619
|
exports.getSeverity = getSeverity;
|
|
18665
20620
|
exports.governmentPatterns = governmentPatterns;
|
|
18666
20621
|
exports.groupPatternsByPass = groupPatternsByPass;
|
|
18667
|
-
exports.healthCheckMiddleware =
|
|
20622
|
+
exports.healthCheckMiddleware = healthCheckMiddleware;
|
|
18668
20623
|
exports.healthcarePreset = healthcarePreset;
|
|
18669
20624
|
exports.healthcareResearchPreset = healthcareResearchPreset;
|
|
18670
20625
|
exports.hipaaPreset = hipaaPreset;
|