secure-redact 1.0.2 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +11 -11
- package/dist/assets/advanced.worker-DSVrF0gl.js +890 -0
- package/dist/assets/nlp.worker-u7Lr_A3c.js +521 -0
- package/dist/assets/ocr.worker-D5s6dY7M.js +139 -0
- package/dist/secure-redact.es.js +7027 -8622
- package/dist/secure-redact.es.js.map +1 -1
- package/dist/secure-redact.umd.js +33 -1583
- package/dist/secure-redact.umd.js.map +1 -1
- package/package.json +1 -1
|
@@ -0,0 +1,521 @@
|
|
|
1
|
+
// Lower-case given names; the name detector (M) looks words up here after
// stripping everything except ASCII letters.
const f = /* @__PURE__ */ new Set([
  "aarav", "aditi", "aditya", "akash", "amit", "amita", "ananya", "anil",
  "anita", "anjali", "ankita", "arjun", "arun", "aruna", "ashok", "bhavna",
  "chandra", "deepak", "deepika", "dev", "devika", "dhruv", "dinesh", "divya",
  "ganesh", "gaurav", "geeta", "hari", "harish", "indira", "isha", "jagdish",
  "kamala", "karan", "kavita", "kishore", "krishna", "kumar", "lakshmi", "mahesh",
  "manish", "meera", "mohan", "mohit", "nandini", "naresh", "neha", "nikhil",
  "nisha", "pankaj", "pooja", "prakash", "priya", "rahul", "rajesh", "rajiv",
  "raman", "ramesh", "rani", "ravi", "rekha", "rohit", "sachin", "sandeep",
  "sanjay", "sapna", "saroj", "seema", "shanti", "sharma", "shivani", "shobha",
  "shreya", "sita", "sneha", "sunil", "sunita", "suresh", "swati", "tanvi",
  "usha", "varun", "vijay", "vikram", "vinod", "vishal", "vivek", "yash",
  "yogesh", "john", "james", "robert", "michael", "william", "david", "richard",
  "joseph", "thomas", "charles", "mary", "patricia", "jennifer", "linda", "elizabeth",
  "barbara", "susan", "jessica", "sarah", "karen", "mohammed", "ahmed", "ali",
  "hassan", "hussein", "omar", "fatima", "aisha", "zainab", "khadija"
]);

// Lower-case family names; consulted by the name detector (M) together with `f`.
const m = /* @__PURE__ */ new Set([
  "sharma", "verma", "gupta", "singh", "kumar", "patel", "joshi", "mishra",
  "agarwal", "mehta", "reddy", "rao", "nair", "menon", "pillai", "iyer",
  "iyengar", "mukherjee", "chatterjee", "banerjee", "das", "bose", "sen", "ghosh",
  "roy", "dutta", "sinha", "jain", "shah", "desai", "kulkarni", "patil",
  "deshpande", "kaur", "gill", "bajwa", "chopra", "kapoor", "malhotra", "khanna",
  "saxena", "pandey", "tiwari", "dubey", "trivedi", "dwivedi", "shukla", "chauhan",
  "yadav", "thakur", "smith", "johnson", "williams", "brown", "jones", "davis",
  "miller", "wilson", "moore", "taylor"
]);

// Lower-case region names (some span several words); the address detector (A)
// matches them against runs of consecutive words.
const j = /* @__PURE__ */ new Set([
  "andhra pradesh", "arunachal pradesh", "assam", "bihar", "chhattisgarh", "goa",
  "gujarat", "haryana", "himachal pradesh", "jharkhand", "karnataka", "kerala",
  "madhya pradesh", "maharashtra", "manipur", "meghalaya", "mizoram", "nagaland",
  "odisha", "punjab", "rajasthan", "sikkim", "tamil nadu", "telangana",
  "tripura", "uttar pradesh", "uttarakhand", "west bengal", "delhi", "chandigarh",
  "puducherry", "jammu and kashmir", "ladakh"
]);

// Medical vocabulary used by the medical-term detector (S). Note the mixed
// casing: acronyms are stored upper-case, everything else lower-case.
const u = /* @__PURE__ */ new Set([
  "diabetes", "hypertension", "asthma", "cancer", "HIV", "AIDS", "tuberculosis", "TB",
  "hepatitis", "malaria", "dengue", "cholesterol", "thyroid", "arthritis", "epilepsy",
  "pneumonia", "bronchitis", "anemia", "leukemia", "lymphoma", "insulin", "metformin",
  "blood pressure", "heart disease", "kidney disease", "liver disease", "lung disease",
  "chemotherapy", "radiation", "surgery", "biopsy", "diagnosis", "prognosis",
  "prescription", "medication", "dosage", "allergic", "allergy", "positive", "negative",
  "report", "pathology", "radiology", "MRI", "CT scan", "X-ray", "ultrasound", "ECG",
  "EKG", "patient", "hospital", "clinic", "doctor", "physician", "surgeon"
]);

// Address keywords; the address detector (A) treats a 6-digit number preceded
// by one of these words as a stronger address signal.
const I = /* @__PURE__ */ new Set([
  "road", "rd", "street", "st", "avenue", "ave", "lane", "ln",
  "nagar", "colony", "sector", "block", "plot", "flat", "floor", "building",
  "bldg", "apartment", "apt", "house", "no", "near", "opposite", "opp",
  "behind", "beside", "next to", "main", "cross", "layout", "extension", "extn",
  "phase", "village", "town", "city", "district", "taluk", "tehsil", "post",
  "pin", "pincode", "zip"
]);
|
|
317
|
+
/**
 * Creates an identifier for a detected entity.
 *
 * The id is the literal prefix "nlp_" followed by up to nine base-36
 * characters taken from Math.random(); it is not cryptographically random
 * and uniqueness is only probabilistic.
 *
 * @returns {string} An id such as "nlp_k3j9x0a1b".
 */
function p() {
  const randomDigits = Math.random().toString(36);
  // Skip the leading "0." and keep at most nine characters.
  const suffix = randomDigits.slice(2, 11);
  return "nlp_" + suffix;
}
|
|
320
|
+
/**
 * Detects person names by dictionary lookup.
 *
 * A word starts a name when its letters-only, lower-cased form (at least two
 * characters) is in the given-name set `f` or the surname set `m`. The name is
 * then extended by the following word if that word is in either set, and once
 * more if the word after that is in `m`. Words consumed this way are skipped.
 *
 * Confidence is 0.65 for one word, 0.82 for two, 0.88 for three, plus 0.05
 * when the first word starts with an ASCII capital, capped at 0.95.
 *
 * @param {Array<{text: string, bbox: {x: number, y: number, w: number, h: number}}>} a
 *   Words in reading order.
 * @returns {Array<object>} One "NAME" entity per detected name.
 */
function M(a) {
  const normalize = (word) => word.text.replace(/[^a-zA-Z]/g, "").toLowerCase();
  const isNameToken = (token) => f.has(token) || m.has(token);
  const entities = [];

  for (let e = 0; e < a.length; e++) {
    const token = normalize(a[e]);
    if (token.length < 2 || !isNameToken(token)) continue;

    // Remember where the name starts instead of re-deriving it later by
    // counting spaces in the joined value (which breaks if a token itself
    // contains a space).
    const start = e;
    let confidence = 0.65;

    if (e + 1 < a.length && isNameToken(normalize(a[e + 1]))) {
      confidence = 0.82;
      e++;
    }
    if (e + 1 < a.length && m.has(normalize(a[e + 1]))) {
      confidence = 0.88;
      e++;
    }

    const parts = a.slice(start, e + 1);
    if (/[A-Z]/.test(parts[0].text[0])) confidence += 0.05;

    // A single word keeps an exact copy of its box. For merged words the box
    // is the union of all parts, so a name that wraps onto the next line (or
    // whose words differ in height) is still fully covered and never ends up
    // with a negative width.
    let bbox = { ...parts[0].bbox };
    if (parts.length > 1) {
      const left = Math.min(...parts.map((w) => w.bbox.x));
      const top = Math.min(...parts.map((w) => w.bbox.y));
      const right = Math.max(...parts.map((w) => w.bbox.x + w.bbox.w));
      const bottom = Math.max(...parts.map((w) => w.bbox.y + w.bbox.h));
      bbox = { ...bbox, x: left, y: top, w: right - left, h: bottom - top };
    }

    entities.push({
      id: p(),
      type: "NAME",
      value: parts.map((w) => w.text).join(" "),
      confidence: Math.min(confidence, 0.95),
      bbox,
      masked: true,
      layer: 2
    });
  }
  return entities;
}
|
|
349
|
+
/**
 * Detects address fragments in two passes.
 *
 * Pass 1: every stand-alone six-digit number in [110001, 855117] (looks like
 * the Indian PIN-code range — confirm) is taken together with up to eight
 * words before it. The window is reported when it contains an address keyword
 * from `I` (confidence 0.78) or is at least three words long (confidence 0.55).
 *
 * Pass 2: every run of consecutive words that spells a region name from `j`
 * (case-insensitive, punctuation ignored) is reported with confidence 0.72.
 *
 * @param {Array<{text: string, bbox: {x: number, y: number, w: number, h: number, pageIndex?: number}}>} a
 *   Words in reading order.
 * @returns {Array<object>} "ADDRESS" entities.
 */
function A(a) {
  const entities = [];

  // Case is preserved on purpose: the pattern below only involves digits, and
  // the offset-to-word lookup (k) searches this string for each word's
  // original text. Lower-casing it made every capitalised word unfindable,
  // which could map a PIN code to the wrong word or drop it.
  const joined = a.map((word) => word.text).join(" ");

  // Smallest box enclosing all given words, so multi-line spans are fully
  // covered and never get a negative width.
  const unionBox = (words) => {
    const left = Math.min(...words.map((w) => w.bbox.x));
    const top = Math.min(...words.map((w) => w.bbox.y));
    const right = Math.max(...words.map((w) => w.bbox.x + w.bbox.w));
    const bottom = Math.max(...words.map((w) => w.bbox.y + w.bbox.h));
    return {
      x: left,
      y: top,
      w: right - left,
      h: bottom - top,
      pageIndex: words[0].bbox.pageIndex
    };
  };

  for (const match of joined.matchAll(/\b\d{6}\b/g)) {
    const pin = Number.parseInt(match[0], 10);
    if (pin < 110001 || pin > 855117) continue;

    const pinWordIndex = k(a, match.index, joined);
    if (pinWordIndex < 0) continue;

    const windowWords = a.slice(Math.max(0, pinWordIndex - 8), pinWordIndex + 1);
    const hasKeyword = windowWords.some(
      (w) => I.has(w.text.toLowerCase().replace(/[^a-z]/g, ""))
    );
    if (!hasKeyword && windowWords.length < 3) continue;

    entities.push({
      id: p(),
      type: "ADDRESS",
      value: windowWords.map((w) => w.text).join(" "),
      confidence: hasKeyword ? 0.78 : 0.55,
      bbox: unionBox(windowWords),
      masked: true,
      layer: 2
    });
  }

  for (const region of j) {
    const span = region.split(" ").length;
    for (let s = 0; s + span <= a.length; s++) {
      const run = a.slice(s, s + span);
      const candidate = run
        .map((w) => w.text.toLowerCase().replace(/[^a-z ]/g, ""))
        .join(" ");
      if (candidate !== region) continue;

      entities.push({
        id: p(),
        type: "ADDRESS",
        value: run.map((w) => w.text).join(" "),
        confidence: 0.72,
        bbox: unionBox(run),
        masked: true,
        layer: 2
      });
    }
  }
  return entities;
}
|
|
405
|
+
/**
 * Detects medical vocabulary from the set `u`.
 *
 * Each word is reported (confidence 0.75) when it matches a vocabulary term;
 * independently, each pair of adjacent words is reported (confidence 0.8) when
 * it spells a two-word term, in which case the second word is skipped.
 *
 * Matching is case-insensitive and tolerant of punctuation attached to a
 * word. The vocabulary stores acronyms in upper case ("HIV", "CT scan"), so
 * comparing lower-cased input against it verbatim made "CT scan" unmatchable
 * and missed tokens such as "HIV," or "(MRI)".
 *
 * @param {Array<{text: string, bbox: {x: number, y: number, w: number, h: number, pageIndex?: number}}>} a
 *   Words in reading order.
 * @returns {Array<object>} "MEDICAL" entities.
 */
function S(a) {
  const entities = [];

  // Lower-cased view of the vocabulary; it is small, so rebuilding it per call
  // is cheap and keeps this function independent of how `u` is cased.
  const vocabulary = new Set([...u].map((term) => term.toLowerCase()));

  // "X-ray." -> "xray": catches terms with stray characters inside the word.
  const lettersOnly = (text) => text.toLowerCase().replace(/[^a-z]/g, "");
  // "X-ray." -> "x-ray": only strips punctuation around the word.
  const trimmed = (text) => text.toLowerCase().replace(/^[^a-z0-9]+|[^a-z0-9]+$/g, "");

  for (let e = 0; e < a.length; e++) {
    const word = a[e];
    if (vocabulary.has(lettersOnly(word.text)) || vocabulary.has(trimmed(word.text))) {
      entities.push({
        id: p(),
        type: "MEDICAL",
        value: word.text,
        confidence: 0.75,
        bbox: { ...word.bbox },
        masked: true,
        layer: 2
      });
    }

    if (e + 1 >= a.length) continue;

    const next = a[e + 1];
    const phrase = word.text + " " + next.text;
    const isTerm =
      vocabulary.has(phrase.toLowerCase()) ||
      vocabulary.has(trimmed(word.text) + " " + trimmed(next.text));
    if (!isTerm) continue;

    entities.push({
      id: p(),
      type: "MEDICAL",
      value: phrase,
      confidence: 0.8,
      bbox: {
        x: word.bbox.x,
        y: word.bbox.y,
        w: next.bbox.x + next.bbox.w - word.bbox.x,
        h: Math.max(word.bbox.h, next.bbox.h),
        pageIndex: word.bbox.pageIndex
      },
      masked: true,
      layer: 2
    });
    e++; // the second word of the phrase has been consumed
  }
  return entities;
}
|
|
438
|
+
/**
 * Detects e-mail addresses, one word at a time.
 *
 * The pattern is searched for inside each word rather than required to match
 * the whole word, so addresses with attached punctuation or a prefix
 * ("<bob@example.org>,", "Email:carol@mail.co.in.") are found as well. The
 * reported value is the address itself; the box is the whole word's box.
 * For a word that consists solely of an address the result is unchanged.
 *
 * @param {Array<{text: string, bbox: object}>} a Words to scan.
 * @returns {Array<object>} "EMAIL" entities with confidence 0.95.
 */
function E(a) {
  const entities = [];
  const emailPattern = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/;

  for (const word of a) {
    const found = emailPattern.exec(word.text);
    if (found === null) continue;

    entities.push({
      id: p(),
      type: "EMAIL",
      value: found[0],
      confidence: 0.95,
      bbox: { ...word.bbox },
      masked: true,
      layer: 2
    });
  }
  return entities;
}
|
|
452
|
+
/**
 * Detects dates that may be a date of birth.
 *
 * Candidates have the form D/M/YYYY (separators "/", "-" or "."), with day
 * 1-31, month 1-12 and year 1920-2010. Confidence is 0.9 when one of the
 * birth-related hints (dob, date of birth, born, ...) occurs in the 30
 * characters before the date, otherwise 0.6.
 *
 * @param {Array<{text: string, bbox: {x: number, y: number, w: number, h: number, pageIndex?: number}}>} a
 *   Words in reading order.
 * @returns {Array<object>} "DOB" entities.
 */
function L(a) {
  const results = [];
  const joined = a.map((word) => word.text).join(" ");
  const birthHint = /\b(dob|date of birth|birth date|born|birthday|d\.o\.b)\b/;

  for (const hit of joined.matchAll(/\b(\d{1,2})[\/\-.](\d{1,2})[\/\-.](\d{4})\b/g)) {
    const day = parseInt(hit[1]);
    const month = parseInt(hit[2]);
    const year = parseInt(hit[3]);

    const dayOk = day >= 1 && day <= 31;
    const monthOk = month >= 1 && month <= 12;
    const yearOk = year >= 1920 && year <= 2010;
    if (!(dayOk && monthOk && yearOk)) continue;

    // Look for a hint in the text immediately preceding the date.
    const contextStart = Math.max(0, hit.index - 30);
    const context = joined.substring(contextStart, hit.index).toLowerCase();
    const hinted = birthHint.test(context);

    const firstWord = k(a, hit.index, joined);
    if (firstWord < 0) continue;

    // Gather the words that the matched text spans.
    const matchEnd = hit.index + hit[0].length;
    const covered = [];
    let position = hit.index;
    let index = firstWord;
    while (index < a.length && position < matchEnd) {
      covered.push(a[index]);
      position += a[index].text.length + 1;
      index += 1;
    }
    if (covered.length === 0) continue;

    const head = covered[0];
    const tail = covered[covered.length - 1];
    results.push({
      id: p(),
      type: "DOB",
      value: hit[0],
      confidence: hinted ? 0.9 : 0.6,
      bbox: {
        x: head.bbox.x,
        y: head.bbox.y,
        w: tail.bbox.x + tail.bbox.w - head.bbox.x,
        h: Math.max(...covered.map((word) => word.bbox.h)),
        pageIndex: head.bbox.pageIndex
      },
      masked: true,
      layer: 2
    });
  }
  return results;
}
|
|
487
|
+
/**
 * Maps a character offset in the joined text back to the index of the word
 * that contains it.
 *
 * Words are located left to right, each search starting where the previous
 * word ended. A word is first searched for verbatim; if that fails it is
 * searched for case-insensitively, because one caller passes a lower-cased
 * copy of the text. A word that cannot be found at all is skipped without
 * moving the cursor — previously the -1 from indexOf was used as if it were a
 * real position, which rewound the cursor and caused later words to be
 * matched against earlier occurrences (or not at all).
 *
 * @param {Array<{text: string}>} a Words in the order they were joined.
 * @param {number} n Character offset into `e`.
 * @param {string} e The words' text joined into one string.
 * @returns {number} Index of the word covering offset `n`, or -1 if none does
 *   (for example when `n` falls on a separator).
 */
function k(a, n, e) {
  let lowered = null; // lower-cased haystack, built only if a fallback search is needed
  let cursor = 0;

  for (let t = 0; t < a.length; t++) {
    const text = a[t].text;
    let start = e.indexOf(text, cursor);
    if (start < 0) {
      lowered ??= e.toLowerCase();
      start = lowered.indexOf(text.toLowerCase(), cursor);
    }
    if (start < 0) continue;

    if (start <= n && n < start + text.length) return t;
    cursor = start + text.length;
  }
  return -1;
}
|
|
497
|
+
/**
 * Worker entry point. Reacts to messages of type "NLP_ANALYZE" by running the
 * name, address, medical, e-mail and date-of-birth detectors (in that order)
 * over `words`, stamping every entity's box with the message's page index
 * (0 when absent) and replying with "NLP_RESULT". Any exception is reported
 * back as "NLP_ERROR". Other message types are ignored.
 */
self.onmessage = (event) => {
  const { type, words, pageIndex } = event.data;
  if (type !== "NLP_ANALYZE") return;

  try {
    const entities = [];
    entities.push(...M(words));
    entities.push(...A(words));
    entities.push(...S(words));
    entities.push(...E(words));
    entities.push(...L(words));

    const page = pageIndex ?? 0;
    entities.forEach((entity) => {
      entity.bbox.pageIndex = page;
    });

    self.postMessage({ type: "NLP_RESULT", entities });
  } catch (failure) {
    const error = failure instanceof Error ? failure.message : "NLP analysis failed";
    self.postMessage({ type: "NLP_ERROR", error });
  }
};
|
|
521
|
+
//# sourceMappingURL=nlp.worker-u7Lr_A3c.js.map
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
/**
 * Converts pixel data to grayscale in place.
 *
 * The buffer is read in groups of four values; the first three of each group
 * are replaced by their rounded weighted sum (0.299, 0.587, 0.114) and the
 * fourth is left untouched.
 *
 * @param {Uint8ClampedArray|number[]} t Pixel buffer, four values per pixel.
 */
function d(t) {
  let base = 0;
  while (base < t.length) {
    const red = t[base];
    const green = t[base + 1];
    const blue = t[base + 2];
    const luma = Math.round(0.299 * red + 0.587 * green + 0.114 * blue);
    t[base] = luma;
    t[base + 1] = luma;
    t[base + 2] = luma;
    base += 4;
  }
}
|
|
12
|
+
/**
 * Stretches contrast in place so that the darkest value maps to 0 and the
 * brightest to 255.
 *
 * Only the first value of each four-value group is inspected; the stretched
 * result is written to the first three values of the group. A buffer whose
 * inspected values are all equal is left unchanged.
 *
 * @param {Uint8ClampedArray|number[]} t Pixel buffer, four values per pixel.
 */
function y(t) {
  let darkest = 255;
  let brightest = 0;
  for (let i = 0; i < t.length; i += 4) {
    const value = t[i];
    if (value < darkest) darkest = value;
    if (value > brightest) brightest = value;
  }

  const range = brightest - darkest;
  if (range === 0) return;

  for (let i = 0; i < t.length; i += 4) {
    const stretched = Math.round((t[i] - darkest) / range * 255);
    t[i] = stretched;
    t[i + 1] = stretched;
    t[i + 2] = stretched;
  }
}
|
|
25
|
+
/**
 * Computes a binarisation threshold with Otsu's method (the split that
 * maximises between-class variance of the intensity histogram).
 *
 * Only the first value of each four-value group is read, so the buffer is
 * expected to be grayscale already.
 *
 * The returned value is the lowest intensity of the BRIGHT class, i.e. it is
 * meant for an inclusive comparison `value >= threshold` such as the one the
 * binariser `w` performs. The previous version returned the highest intensity
 * of the dark class instead; combined with `>=` that pushed the dark class's
 * top level to white, and for a pure black-and-white image (threshold 0)
 * turned the whole page white.
 *
 * @param {Uint8ClampedArray|number[]} t Pixel buffer, four values per pixel,
 *   intensities 0-255.
 * @returns {number} Threshold in 1..255, or 0 when the image has no two
 *   distinct intensities to separate (including an empty buffer).
 */
function R(t) {
  const histogram = new Array(256).fill(0);
  const pixelCount = t.length / 4;
  for (let i = 0; i < t.length; i += 4) {
    histogram[t[i]]++;
  }

  let weightedTotal = 0;
  for (let level = 0; level < 256; level++) {
    weightedTotal += level * histogram[level];
  }

  let darkSum = 0;
  let darkCount = 0;
  let bestVariance = 0;
  let bestLevel = 0;
  for (let level = 0; level < 256; level++) {
    darkCount += histogram[level];
    if (darkCount === 0) continue;

    const brightCount = pixelCount - darkCount;
    if (brightCount === 0) break;

    darkSum += level * histogram[level];
    const darkMean = darkSum / darkCount;
    const brightMean = (weightedTotal - darkSum) / brightCount;
    const variance = darkCount * brightCount * (darkMean - brightMean) * (darkMean - brightMean);
    if (variance > bestVariance) {
      bestVariance = variance;
      bestLevel = level;
    }
  }

  // bestLevel is the last intensity that belongs to the dark class; the
  // inclusive threshold for the bright class is one above it.
  return bestVariance > 0 ? bestLevel + 1 : 0;
}
|
|
43
|
+
/**
 * Binarises pixel data in place: a pixel whose first value is greater than or
 * equal to the threshold becomes 255, any other pixel becomes 0. The result is
 * written to the first three values of each four-value group; the fourth is
 * left as is.
 *
 * @param {Uint8ClampedArray|number[]} t Pixel buffer, four values per pixel.
 * @param {number} e Inclusive threshold.
 */
function w(t, e) {
  for (let base = 0; base < t.length; base += 4) {
    let level = 0;
    if (t[base] >= e) {
      level = 255;
    }
    t[base] = level;
    t[base + 1] = level;
    t[base + 2] = level;
  }
}
|
|
49
|
+
/**
 * Picks a page-segmentation mode from how much of the image is "ink".
 *
 * A pixel counts as ink when the mean of its first three values is below 240.
 * The ink ratio (ink pixels / (e * r)) selects the mode:
 *   ratio < 0.2  -> 11
 *   ratio > 0.5  -> 6
 *   ratio > 0.3  -> 4
 *   otherwise    -> 3
 * The caller in this file passes the result to Tesseract as
 * `tessedit_pageseg_mode`.
 *
 * @param {Uint8ClampedArray|number[]} t Pixel buffer, four values per pixel.
 * @param {number} e Image width in pixels.
 * @param {number} r Image height in pixels.
 * @returns {number} One of 11, 6, 4 or 3.
 */
function O(t, e, r) {
  const area = e * r;
  let inkPixels = 0;
  for (let base = 0; base < t.length; base += 4) {
    const mean = (t[base] + t[base + 1] + t[base + 2]) / 3;
    if (mean < 240) {
      inkPixels++;
    }
  }

  const ratio = inkPixels / area;
  if (ratio < 0.2) return 11;
  if (ratio > 0.5) return 6;
  if (ratio > 0.3) return 4;
  return 3;
}
|
|
57
|
+
/**
 * Prepares an image for OCR and chooses a page-segmentation mode.
 *
 * The image is decoded, drawn onto an OffscreenCanvas, converted to grayscale
 * (d), contrast-stretched (y), analysed for a segmentation mode (O), binarised
 * with an automatically derived threshold (R, w) and re-encoded as PNG.
 *
 * This is best effort: if a 2D context is unavailable or any step throws, the
 * original blob is returned with mode 3 and a warning is logged.
 *
 * The decoded bitmap is released in `finally`, so it is also closed when a
 * step after decoding fails (previously it leaked on that path).
 *
 * @param {Blob} t Image to preprocess.
 * @returns {Promise<{blob: Blob, psm: number}>} The image to feed to OCR and
 *   the page-segmentation mode to use with it.
 */
async function k(t) {
  let bitmap = null;
  try {
    bitmap = await createImageBitmap(t);
    const canvas = new OffscreenCanvas(bitmap.width, bitmap.height);
    const context = canvas.getContext("2d", { willReadFrequently: true });
    if (!context) {
      return { blob: t, psm: 3 };
    }

    context.drawImage(bitmap, 0, 0);
    const imageData = context.getImageData(0, 0, bitmap.width, bitmap.height);
    const pixels = imageData.data;

    d(pixels);
    y(pixels);
    // The segmentation mode is derived before binarisation, from the
    // grayscale, contrast-stretched pixels.
    const psm = O(pixels, bitmap.width, bitmap.height);
    const threshold = R(pixels);
    w(pixels, threshold);
    context.putImageData(imageData, 0, 0);

    const blob = await canvas.convertToBlob({ type: "image/png" });
    return { blob, psm };
  } catch (e) {
    console.warn("[OCR Worker] Preprocessing failed, using original image:", e);
    return { blob: t, psm: 3 };
  } finally {
    bitmap?.close();
  }
}
|
|
73
|
+
// Promise for the shared OCR worker, or null while none has been requested
// (or after the last attempt to create one failed).
let h = null;

/**
 * Returns the shared OCR worker, creating it on first use.
 *
 * The worker is created through `createWorker("eng", ...)` on the `i` export
 * of the lazily imported "./index-C62fEJ4q.js" chunk; its logger forwards
 * progress to the host as "OCR_PROGRESS" messages.
 *
 * The in-flight promise is cached rather than the finished worker: the cache
 * used to be filled only after both awaits had completed, so calls arriving
 * in the meantime each created (and leaked) their own worker. A failed
 * attempt clears the cache so that a later call can retry.
 *
 * @returns {Promise<object>} The worker instance.
 */
async function C() {
  if (!h) {
    const pending = import("./index-C62fEJ4q.js").then((chunk) =>
      chunk.i.createWorker("eng", void 0, {
        logger: (e) => {
          self.postMessage({
            type: "OCR_PROGRESS",
            progress: e.progress,
            message: e.status
          });
        }
      })
    );
    h = pending;
    // Do not keep a rejected promise around; callers still see the rejection
    // through the promise returned below.
    pending.catch(() => {
      if (h === pending) h = null;
    });
  }
  return h;
}
|
|
87
|
+
/**
 * Worker entry point. Reacts to messages of type "OCR_START": wraps the
 * received buffer in a Blob (typed as PNG when the declared file type is
 * PDF), preprocesses it, runs recognition with the detected page-segmentation
 * mode and replies with "OCR_RESULT" containing the recognised words, the
 * full text and the page index (0 when absent). Any exception is reported
 * back as "OCR_ERROR". Other message types are ignored.
 */
self.onmessage = async (event) => {
  const { type, fileBuffer, fileType, pageIndex } = event.data;
  if (type !== "OCR_START") return;

  try {
    const worker = await C();

    const mimeType = fileType === "application/pdf" ? "image/png" : fileType;
    const source = new Blob([fileBuffer], { type: mimeType });
    self.postMessage({
      type: "OCR_PROGRESS",
      progress: 0.1,
      message: "Preprocessing image..."
    });

    const { blob: prepared, psm } = await k(source);
    console.log(`[OCR Worker] Detected optimal PSM: ${psm}`);
    await worker.setParameters({
      tessedit_pageseg_mode: psm.toString()
    });

    const result = await worker.recognize(prepared, {}, { text: true, blocks: true });

    const words = [];
    // Keeps only entries that carry both text and a bounding box, converting
    // the corner-based box into x/y/width/height.
    const collect = (entry) => {
      if (!entry || !entry.text || !entry.bbox) return;
      const { x0, y0, x1, y1 } = entry.bbox;
      words.push({
        text: entry.text,
        confidence: (entry.confidence ?? 0) / 100,
        bbox: {
          x: x0,
          y: y0,
          w: x1 - x0,
          h: y1 - y0,
          pageIndex: pageIndex ?? 0
        }
      });
    };

    // Prefer the block hierarchy; fall back to the flat word list.
    if (result.data.blocks && result.data.blocks.length > 0) {
      for (const block of result.data.blocks) {
        for (const paragraph of block.paragraphs ?? []) {
          for (const line of paragraph.lines ?? []) {
            for (const word of line.words ?? []) {
              collect(word);
            }
          }
        }
      }
    } else if (result.data.words && result.data.words.length > 0) {
      for (const word of result.data.words) {
        collect(word);
      }
    }
    console.log(`[OCR Worker] Extracted ${words.length} words from Tesseract`);

    let fullText = result.data.text ?? "";
    if (!fullText.trim() && words.length > 0) {
      fullText = words.map((word) => word.text).join(" ");
      console.log("[OCR Worker] Reconstructed fullText from words");
    }
    console.log(`[OCR Worker] fullText length: ${fullText.length}`);

    self.postMessage({
      type: "OCR_RESULT",
      words,
      fullText,
      pageIndex: pageIndex ?? 0
    });
  } catch (failure) {
    self.postMessage({
      type: "OCR_ERROR",
      error: failure instanceof Error ? failure.message : "OCR processing failed"
    });
  }
};
|
|
139
|
+
//# sourceMappingURL=ocr.worker-D5s6dY7M.js.map
|