@yigitahmetsahin/captcha-solver 2.0.1 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,51 +1,522 @@
1
- // src/solver.ts
2
- import { generateText } from "ai";
1
// Bundler-generated runtime helpers (the shape suggests esbuild output — not confirmed from this file).
var __defProp = Object.defineProperty;
var __getOwnPropNames = Object.getOwnPropertyNames;

// Wraps a one-entry object such as `{ "src/x.ts"() { ... } }` into a lazy
// initialiser: the entry runs at most once and its return value is cached.
var __esm = (fn, res) => function __init() {
  if (fn) {
    const [entryName] = __getOwnPropNames(fn);
    const initialise = fn[entryName];
    // Clear `fn` BEFORE running the entry so a re-entrant call made while the
    // entry is still executing returns the cached value instead of re-running it.
    fn = 0;
    res = initialise(0);
  }
  return res;
};

// Defines one enumerable getter on `target` for every enumerable key of `all`.
var __export = (target, all) => {
  for (var name in all) {
    const getter = all[name];
    __defProp(target, name, { get: getter, enumerable: true });
  }
};
3
10
 
4
11
  // src/preprocess.ts
5
12
  import fs from "fs";
6
13
  import path from "path";
7
14
  import sharp from "sharp";
8
/**
 * Run the preprocessing pipeline and return the resulting image as a
 * base64-encoded string.
 *
 * @param input   Forwarded unchanged to `preprocessCaptchaToBuffer`.
 * @param options Forwarded unchanged to `preprocessCaptchaToBuffer`.
 * @returns {Promise<string>} base64 encoding of the processed buffer
 */
async function preprocessCaptcha(input, options) {
  const processed = await preprocessCaptchaToBuffer(input, options);
  const encoded = processed.toString("base64");
  return encoded;
}
12
/**
 * Preprocess a captcha image with `sharp` and return the result as a PNG buffer.
 *
 * Pipeline order: optional top-only pre-crop → median/blur/greyscale →
 * upscale → optional post-blur / normalise → contrast (linear, centred on the
 * image mean) and sharpen → optional threshold → optional resize to
 * `targetWidth` → crop → optional negate → optional white padding.
 *
 * @param input   A file path (string, resolved with `path.resolve`) or any
 *                other value accepted by `sharp()`.
 * @param options Optional settings; every field has a default:
 *   - preCropHeight (1): fraction (0..1 exclusive) of the height to keep from the top
 *   - median (0), blur (1.5), greyscale (true)
 *   - scale (4), upscaleKernel ("lanczos3")
 *   - postBlur (0), normalise (false)
 *   - contrast (3), sharpen (true)
 *   - threshold (false): a number enables binarisation at that level
 *   - targetWidth (unset): resize to this width after enhancement
 *   - crop ("auto"): "none" | "auto" | "legacy" | { left, top, right, bottom } fractions
 *   - negate (false)
 *   - padding (true): false = none, number = uniform, otherwise 20 vertical / 30 horizontal
 * @returns {Promise<Buffer>} PNG-encoded image
 */
async function preprocessCaptchaToBuffer(input, options) {
  const {
    preCropHeight = 1,
    median = 0,
    blur = 1.5,
    greyscale = true,
    scale = 4,
    upscaleKernel = "lanczos3",
    postBlur = 0,
    normalise = false,
    contrast = 3,
    sharpen = true,
    threshold = false,
    negate = false,
    crop = "auto",
    padding = true,
    targetWidth
  } = options ?? {};
  const sharpenOptions = { sigma: 1, m1: 2, m2: 1 };

  let source = typeof input === "string" ? path.resolve(input) : input;
  const metadata = await sharp(source).metadata();
  const origW = metadata.width;
  let origH = metadata.height;

  if (preCropHeight < 1 && preCropHeight > 0) {
    // Keep at least one row: a tiny fraction of a short image would otherwise
    // floor to 0 and make the extract call fail.
    const keepH = Math.max(1, Math.floor(origH * preCropHeight));
    source = await sharp(source).extract({ left: 0, top: 0, width: origW, height: keepH }).toBuffer();
    origH = keepH;
  }

  let pipeline = sharp(source);
  if (median > 0) pipeline = pipeline.median(median);
  if (blur > 0) pipeline = pipeline.blur(blur);
  if (greyscale) pipeline = pipeline.greyscale();
  const smoothed = await pipeline.toBuffer();

  // Round so that a fractional `scale` still yields integer pixel dimensions.
  const upscaled = await sharp(smoothed)
    .resize(Math.round(origW * scale), Math.round(origH * scale), { kernel: upscaleKernel })
    .toBuffer();

  let postProcessed = upscaled;
  if (postBlur > 0) {
    postProcessed = await sharp(upscaled).blur(postBlur).toBuffer();
  }
  if (normalise) {
    postProcessed = await sharp(postProcessed).normalise().toBuffer();
  }

  let enhanced;
  if (contrast !== 1) {
    // Linear stretch around the first channel's mean: out = contrast * in + mean * (1 - contrast).
    const stats = await sharp(postProcessed).stats();
    const mean = stats.channels[0].mean;
    let pipe = sharp(postProcessed).linear(contrast, mean * (1 - contrast));
    if (sharpen) pipe = pipe.sharpen(sharpenOptions);
    enhanced = await pipe.toBuffer();
  } else {
    enhanced = sharpen ? await sharp(postProcessed).sharpen(sharpenOptions).toBuffer() : postProcessed;
  }

  if (typeof threshold === "number") {
    enhanced = await sharp(enhanced).threshold(threshold).toBuffer();
  }
  if (targetWidth && targetWidth > 0) {
    enhanced = await sharp(enhanced).resize(targetWidth, null, { kernel: "lanczos3" }).toBuffer();
  }

  let cropped;
  if (crop === "none") {
    cropped = enhanced;
  } else if (crop === "auto") {
    cropped = await autoCrop(enhanced);
  } else {
    const fractions = crop === "legacy" ? LEGACY_CROP : crop;
    // Measure the image that is actually being cropped. After a `targetWidth`
    // resize it is no longer origW*scale × origH*scale, so fractions applied to
    // those numbers would select the wrong (or an out-of-bounds) rectangle.
    const { width: enhancedW, height: enhancedH } = await sharp(enhanced).metadata();
    const cropLeft = Math.floor(enhancedW * fractions.left);
    const cropTop = Math.floor(enhancedH * fractions.top);
    const cropRight = Math.floor(enhancedW * fractions.right);
    const cropBottom = Math.floor(enhancedH * fractions.bottom);
    cropped = await sharp(enhanced)
      .extract({
        left: cropLeft,
        top: cropTop,
        width: cropRight - cropLeft,
        height: cropBottom - cropTop
      })
      .toBuffer();
  }

  const final = negate ? await sharp(cropped).negate().toBuffer() : cropped;
  if (padding === false) {
    return sharp(final).png().toBuffer();
  }
  // A numeric padding applies on all sides; any other non-false value uses the defaults.
  const pad = typeof padding === "number" ? padding : void 0;
  const vPad = pad ?? 20;
  const hPad = pad ?? 30;
  return sharp(final).extend({
    top: vPad,
    bottom: vPad,
    left: hPad,
    right: hPad,
    background: { r: 255, g: 255, b: 255 }
  }).png().toBuffer();
}
107
/**
 * Best-effort border trim using sharp's `trim` with a threshold of 30.
 *
 * Returns the trimmed image data when the trimmed result is larger than
 * 2×2 pixels. If trimming throws, or the result is 2 pixels or smaller in
 * either dimension, the input is returned unchanged.
 */
async function autoCrop(enhanced) {
  try {
    const { data, info } = await sharp(enhanced)
      .trim({ threshold: 30 })
      .toBuffer({ resolveWithObject: true });
    const bigEnough = info.width > 2 && info.height > 2;
    if (bigEnough) {
      return data;
    }
  } catch {
    // Trimming is optional: on any failure fall through to the untrimmed image.
  }
  return enhanced;
}
38
119
/**
 * Synchronously read a file and return its contents base64-encoded.
 *
 * @param imagePath Path passed to `fs.readFileSync`.
 * @returns {string} base64 encoding of the file's bytes
 */
function imageToBase64(imagePath) {
  return fs.readFileSync(imagePath, { encoding: "base64" });
}
123
+ var LEGACY_CROP;
124
+ var init_preprocess = __esm({
125
+ "src/preprocess.ts"() {
126
+ "use strict";
127
+ LEGACY_CROP = { left: 0.1, top: 0.02, right: 0.9, bottom: 0.6 };
128
+ }
129
+ });
130
+
131
+ // src/tesseract.ts
132
+ var tesseract_exports = {};
133
+ __export(tesseract_exports, {
134
+ TESSERACT_VARIANTS: () => TESSERACT_VARIANTS,
135
+ createTesseractReader: () => createTesseractReader
136
+ });
137
/**
 * Create an OCR reader backed by a `tesseract.js` worker.
 *
 * `tesseract.js` is loaded with a dynamic import; when that import fails the
 * function resolves to `null` so callers can skip OCR.
 *
 * The worker is created for "eng" and configured with an A-Z/0-9 whitelist
 * and page segmentation mode "7" (single line).
 *
 * @returns {Promise<null | {
 *   recognize(image): Promise<string>,
 *   recognizeMulti(input, variants): Promise<string[]>,
 *   dispose(): Promise<void>
 * }>}
 */
async function createTesseractReader() {
  let createWorker;
  try {
    ({ createWorker } = await import("tesseract.js"));
  } catch {
    return null;
  }

  const worker = await createWorker("eng");
  try {
    await worker.setParameters({
      tessedit_char_whitelist: "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789",
      tessedit_pageseg_mode: "7"
      // PSM.SINGLE_LINE
    });
  } catch (err) {
    // Do not leak the worker when configuration fails; the original error is
    // what the caller needs to see, so a terminate failure is ignored here.
    await worker.terminate().catch(() => {});
    throw err;
  }

  // Keep only uppercase letters and digits from the recognised text.
  const cleanText = (text) => text.trim().replace(/[^A-Z0-9]/g, "");

  return {
    /** Recognise one image and return the cleaned text. */
    async recognize(image) {
      const { data } = await worker.recognize(image);
      return cleanText(data.text);
    },
    /**
     * Preprocess `input` once per entry of `variants` and recognise each
     * rendering. Reads whose cleaned length is outside 2..8 are dropped, and a
     * variant that throws is skipped so the remaining variants still run.
     */
    async recognizeMulti(input, variants) {
      const results = [];
      for (const opts of variants) {
        try {
          const buf = await preprocessCaptchaToBuffer(input, opts);
          const { data } = await worker.recognize(buf);
          const clean = cleanText(data.text);
          if (clean.length >= 2 && clean.length <= 8) {
            results.push(clean);
          }
        } catch {
          // Best effort: one failing variant must not discard the others.
        }
      }
      return results;
    },
    /** Terminate the underlying worker. */
    async dispose() {
      await worker.terminate();
    }
  };
}
176
+ var TESSERACT_VARIANTS;
177
+ var init_tesseract = __esm({
178
+ "src/tesseract.ts"() {
179
+ "use strict";
180
+ init_preprocess();
181
+ TESSERACT_VARIANTS = [
182
+ // Variant 1: standard enhanced
183
+ {
184
+ blur: 1.5,
185
+ greyscale: true,
186
+ scale: 4,
187
+ contrast: 3,
188
+ sharpen: true,
189
+ crop: "auto",
190
+ padding: true
191
+ },
192
+ // Variant 2: enhanced + negated
193
+ {
194
+ blur: 1.5,
195
+ greyscale: true,
196
+ scale: 4,
197
+ contrast: 3,
198
+ sharpen: true,
199
+ negate: true,
200
+ crop: "auto",
201
+ padding: true
202
+ }
203
+ ];
204
+ }
205
+ });
42
206
 
43
207
  // src/solver.ts
44
- var PROMPT = `You are an assistant helping a visually impaired person read distorted text from an image.
45
- The text contains uppercase letters A-Z and/or digits 0-9.
46
- A thin vertical stroke is the digit 1. Never read it as the letter I or L.
47
- A round closed shape is the letter O, not the letter D.
48
- Output ONLY the exact characters you read, nothing else.`;
208
+ init_preprocess();
209
+ import { generateText } from "ai";
210
+
211
+ // src/disambiguate.ts
212
+ import sharp2 from "sharp";
213
/**
 * Re-examine positions voted as "2" or "Z" using shape features measured on a
 * binarised rendering of the captcha, and overwrite them in place when the
 * features point to another character.
 *
 * A "2"/"Z" position is re-examined when its vote tally contains at least one
 * "6", "L" or "1", or when the whole result holds three or more "2"/"Z".
 *
 * @param {string[]} result        Voted characters; mutated in place.
 * @param {Map<string, number>[]} rankedByPos Per-position vote tallies. May be
 *                                 shorter than `result`; missing tallies count
 *                                 as "no alternative votes".
 * @param binaryImage              Image input accepted by `sharp`.
 * @returns {Promise<void>}
 */
async function disambiguateResult(result, rankedByPos, binaryImage) {
  const isTwoOrZ = (c) => c === "2" || c === "Z";
  // `result` is not modified during the scan below, so count once up front.
  const twoZCount = result.filter(isTwoOrZ).length;

  const ambiguousPositions = [];
  for (let pos = 0; pos < result.length; pos++) {
    if (!isTwoOrZ(result[pos])) continue;
    const ranked = rankedByPos[pos];
    const hasAlt = ["6", "L", "1"].some((alt) => (ranked?.get(alt) ?? 0) >= 1);
    if (hasAlt || twoZCount >= 3) {
      ambiguousPositions.push(pos);
    }
  }
  if (ambiguousPositions.length === 0) return;

  // Analyse only the horizontal band from 12% to 88% of the image height.
  const meta = await sharp2(binaryImage).metadata();
  const fullW = meta.width;
  const fullH = meta.height;
  const cropTop = Math.floor(fullH * 0.12);
  const cropH = Math.floor(fullH * 0.76);
  const { data, info } = await sharp2(binaryImage)
    .extract({ left: 0, top: cropTop, width: fullW, height: cropH })
    .greyscale()
    .negate()
    .raw()
    .toBuffer({ resolveWithObject: true });
  const w = info.width;
  const h = info.height;
  const pixels = new Uint8Array(data);

  const regions = segmentCharacters(pixels, w, h, result.length);
  if (!regions || regions.length !== result.length) return;

  for (const pos of ambiguousPositions) {
    const features = analyseCharacter(pixels, w, h, regions[pos]);
    const newChar = classifyFromFeatures(features, result[pos]);
    if (newChar) {
      result[pos] = newChar;
    }
  }
}
249
/**
 * Split a row-major greyscale image into `expectedCount` side-by-side
 * character regions. Pixels with value >= 128 are treated as ink.
 *
 * Steps: measure per-column ink density, find the horizontal content extent
 * (columns with density > 0.05), smooth the density profile, pick the deepest
 * local minima as split points, and fall back to equal-width slices when too
 * few minima qualify.
 *
 * @param {Uint8Array} pixels  Row-major pixel values, `w * h` entries.
 * @param {number} w           Image width.
 * @param {number} h           Image height.
 * @param {number} expectedCount Number of regions to produce.
 * @returns {{left:number,right:number,top:number,bottom:number}[]}
 *          Half-open boxes (`right`/`bottom` exclusive). A slice that contains
 *          no ink gets an empty box with `top === bottom === 0`.
 */
function segmentCharacters(pixels, w, h, expectedCount) {
  if (!(expectedCount >= 1)) return [];

  // Fraction of ink pixels in each column.
  const colDensity = new Float64Array(w);
  for (let x = 0; x < w; x++) {
    let count = 0;
    for (let y = 0; y < h; y++) {
      if (pixels[y * w + x] >= 128) count++;
    }
    colDensity[x] = count / h;
  }

  // Horizontal extent of the content; the whole width when no column qualifies.
  let contentLeft = 0;
  let contentRight = w;
  for (let x = 0; x < w; x++) {
    if (colDensity[x] > 0.05) {
      contentLeft = x;
      break;
    }
  }
  for (let x = w - 1; x >= 0; x--) {
    if (colDensity[x] > 0.05) {
      contentRight = x + 1;
      break;
    }
  }

  // Box-filter the density profile (±15 columns, clipped to the content extent).
  const smoothW = 15;
  const smoothed = new Float64Array(w);
  for (let x = contentLeft; x < contentRight; x++) {
    let sum = 0;
    let count = 0;
    for (let dx = -smoothW; dx <= smoothW; dx++) {
      const nx = x + dx;
      if (nx >= contentLeft && nx < contentRight) {
        sum += colDensity[nx];
        count++;
      }
    }
    smoothed[x] = sum / count;
  }

  // Largest smoothed value in [from, to); -Infinity for an empty range.
  const rangeMax = (from, to) => {
    let best = -Infinity;
    for (let i = from; i < to; i++) {
      if (smoothed[i] > best) best = smoothed[i];
    }
    return best;
  };

  // Candidate split points: local minima at least 0.01 below the peaks within
  // 40 columns on both sides, excluding a margin at each end of the content.
  const charWidth = (contentRight - contentLeft) / expectedCount;
  const margin = Math.floor(charWidth * 0.6);
  const searchLeft = contentLeft + margin;
  const searchRight = contentRight - margin;
  const valleys = [];
  for (let x = searchLeft + 1; x < searchRight - 1; x++) {
    if (smoothed[x] <= smoothed[x - 1] && smoothed[x] <= smoothed[x + 1]) {
      const leftMax = rangeMax(Math.max(searchLeft, x - 40), x);
      const rightMax = rangeMax(x + 1, Math.min(searchRight, x + 41));
      const depth = Math.min(leftMax, rightMax) - smoothed[x];
      if (depth > 0.01) {
        valleys.push({ x, depth });
      }
    }
  }

  // Take the deepest valleys first, keeping splits more than 0.6 character widths apart.
  valleys.sort((a, b) => b.depth - a.depth);
  const splits = [];
  const minDist = charWidth * 0.6;
  for (const v of valleys) {
    if (splits.length >= expectedCount - 1) break;
    if (splits.every((s) => Math.abs(s - v.x) > minDist)) {
      splits.push(v.x);
    }
  }

  // Not enough usable valleys: discard them and slice the content evenly.
  if (splits.length < expectedCount - 1) {
    const step = (contentRight - contentLeft) / expectedCount;
    splits.length = 0;
    for (let i = 1; i < expectedCount; i++) {
      splits.push(Math.floor(contentLeft + step * i));
    }
  }
  splits.sort((a, b) => a - b);

  const boundaries = [contentLeft, ...splits, contentRight];
  return boundaries.slice(0, expectedCount).map((start, idx) => {
    const end = boundaries[idx + 1];
    // Rows are scanned top to bottom, so the first inked row is the top edge
    // and the last one seen is the bottom edge.
    let top = -1;
    let bottom = -1;
    for (let y = 0; y < h; y++) {
      for (let x = start; x < end; x++) {
        if (pixels[y * w + x] >= 128) {
          if (top === -1) top = y;
          bottom = y;
          break;
        }
      }
    }
    if (top === -1) {
      // No ink in this slice: report an empty box rather than an inverted one.
      return { left: start, right: end, top: 0, bottom: 0 };
    }
    return { left: start, right: end, top, bottom: bottom + 1 };
  });
}
336
/**
 * Count enclosed background areas ("holes") inside a character region.
 * Pixels with value >= 128 are ink; everything else is background.
 *
 * Background connected (4-neighbourhood) to the region's border is discarded
 * first. Each remaining background component larger than 0.5% of the region
 * area counts as a hole, and is classified as "top" or "bottom" by comparing
 * the mean row of its pixels with the region's vertical midpoint.
 *
 * @param {Uint8Array} pixels Row-major image pixels.
 * @param {number} imgW       Width of the full image (row stride).
 * @param {{left:number,right:number,top:number,bottom:number}} region
 *        Half-open box inside the image.
 * @returns {{count:number, hasBottom:boolean, hasTop:boolean}}
 *          All zero/false when the region is narrower or shorter than 3 pixels.
 */
function detectHoles(pixels, imgW, region) {
  const rw = region.right - region.left;
  const rh = region.bottom - region.top;
  if (rw < 3 || rh < 3) return { count: 0, hasBottom: false, hasTop: false };

  // Region-local ink mask: 1 = ink, 0 = background.
  const grid = new Uint8Array(rw * rh);
  for (let ly = 0; ly < rh; ly++) {
    for (let lx = 0; lx < rw; lx++) {
      const px = pixels[(region.top + ly) * imgW + (region.left + lx)];
      grid[ly * rw + lx] = px >= 128 ? 1 : 0;
    }
  }

  const visited = new Uint8Array(rw * rh);
  const NEIGHBOURS = [
    [0, 1],
    [0, -1],
    [1, 0],
    [-1, 0]
  ];

  // Queue `idx` if it is an unvisited background cell.
  const enqueue = (idx, queue) => {
    if (grid[idx] === 0 && !visited[idx]) {
      visited[idx] = 1;
      queue.push(idx);
    }
  };

  // Breadth-first fill over background cells starting from the queued seeds.
  // Returns the number of cells reached and the sum of their row indices.
  const floodFill = (queue) => {
    let area = 0;
    let sumY = 0;
    for (let qi = 0; qi < queue.length; qi++) {
      const idx = queue[qi];
      const x = idx % rw;
      const y = Math.floor(idx / rw);
      area++;
      sumY += y;
      for (const [dx, dy] of NEIGHBOURS) {
        const nx = x + dx;
        const ny = y + dy;
        if (nx < 0 || nx >= rw || ny < 0 || ny >= rh) continue;
        enqueue(ny * rw + nx, queue);
      }
    }
    return { area, sumY };
  };

  // Pass 1: mark every background cell reachable from the region border.
  const borderQueue = [];
  for (let lx = 0; lx < rw; lx++) {
    enqueue(lx, borderQueue);
    enqueue((rh - 1) * rw + lx, borderQueue);
  }
  for (let ly = 0; ly < rh; ly++) {
    enqueue(ly * rw, borderQueue);
    enqueue(ly * rw + rw - 1, borderQueue);
  }
  floodFill(borderQueue);

  // Pass 2: any background still unvisited is enclosed by ink.
  const minHoleArea = rw * rh * 5e-3;
  const midY = rh / 2;
  let holeCount = 0;
  let hasBottom = false;
  let hasTop = false;
  for (let idx = 0; idx < grid.length; idx++) {
    if (grid[idx] !== 0 || visited[idx]) continue;
    const holeQueue = [];
    enqueue(idx, holeQueue);
    const { area, sumY } = floodFill(holeQueue);
    if (area > minHoleArea) {
      holeCount++;
      if (sumY / area >= midY) hasBottom = true;
      else hasTop = true;
    }
  }
  return { count: holeCount, hasBottom, hasTop };
}
440
/**
 * Measure simple shape features of one character region.
 * Pixels with value >= 128 are ink.
 *
 * @param {Uint8Array} pixels Row-major image pixels.
 * @param {number} imgW       Width of the full image (row stride).
 * @param {number} _imgH      Unused.
 * @param {{left:number,right:number,top:number,bottom:number}} region
 *        Half-open box inside the image.
 * @returns {{
 *   hasHoleBottom: boolean, hasHoleTop: boolean, holeCount: number,
 *   aspectRatio: number,            // region height / width
 *   bottomHorizontalExtent: number, // ink span of the bottom band, as a fraction of width
 *   topHorizontalExtent: number,    // ink span of the top band, as a fraction of width
 *   topCurvature: boolean           // > 15% ink in the right half of the top band
 * }}
 */
function analyseCharacter(pixels, imgW, _imgH, region) {
  const rw = region.right - region.left;
  const rh = region.bottom - region.top;
  const holes = detectHoles(pixels, imgW, region);
  const aspectRatio = rh / Math.max(rw, 1);

  // Top/bottom bands are a quarter of the height, at least 3 rows — but never
  // taller than the region itself, so the scans stay inside the region.
  const bandH = Math.max(0, Math.min(rh, Math.max(3, Math.floor(rh * 0.25))));
  const isInk = (lx, ly) => pixels[(region.top + ly) * imgW + (region.left + lx)] >= 128;

  // Horizontal ink span (rightmost - leftmost inked column) within rows
  // [fromY, toY), as a fraction of the region width; 0 when the span is empty
  // or a single column.
  const bandExtent = (fromY, toY) => {
    let minX = rw;
    let maxX = 0;
    for (let lx = 0; lx < rw; lx++) {
      for (let ly = fromY; ly < toY; ly++) {
        if (isInk(lx, ly)) {
          if (lx < minX) minX = lx;
          if (lx > maxX) maxX = lx;
        }
      }
    }
    return maxX > minX ? (maxX - minX) / rw : 0;
  };
  const topHorizontalExtent = bandExtent(0, bandH);
  const bottomHorizontalExtent = bandExtent(rh - bandH, rh);

  // Ink density in the right half of the top band.
  const rightHalf = Math.floor(rw / 2);
  let topRightInk = 0;
  let topRightTotal = 0;
  for (let ly = 0; ly < bandH; ly++) {
    for (let lx = rightHalf; lx < rw; lx++) {
      topRightTotal++;
      if (isInk(lx, ly)) topRightInk++;
    }
  }
  const topCurvature = topRightTotal > 0 && topRightInk / topRightTotal > 0.15;

  return {
    hasHoleBottom: holes.hasBottom,
    hasHoleTop: holes.hasTop,
    holeCount: holes.count,
    aspectRatio,
    bottomHorizontalExtent,
    topHorizontalExtent,
    topCurvature
  };
}
488
/**
 * Map measured shape features to a replacement character.
 *
 * Rules, in order:
 *  - a hole in the bottom half only            → "6"
 *  - two or more holes                         → "8"
 *  - a hole in the top half only               → null (no change)
 *  - no holes, tall (ratio > 1.8), sparse top-right → "1"
 *  - no holes, bottom span > 0.5 of the width, more than 1.15× the top span,
 *    and ratio > 0.8                           → "L"
 *  - anything else                             → null (no change)
 *
 * @param features    Object produced by `analyseCharacter`.
 * @param _votedChar  Unused.
 * @returns {string|null}
 */
function classifyFromFeatures(features, _votedChar) {
  const {
    hasHoleBottom,
    hasHoleTop,
    holeCount,
    aspectRatio,
    bottomHorizontalExtent: bottomSpan,
    topHorizontalExtent: topSpan,
    topCurvature
  } = features;

  if (hasHoleBottom && !hasHoleTop) return "6";
  if (holeCount >= 2) return "8";
  if (hasHoleTop && !hasHoleBottom) return null;
  // Both remaining rules apply only to shapes without holes.
  if (holeCount !== 0) return null;

  const tallAndPlain = aspectRatio > 1.8 && !topCurvature;
  if (tallAndPlain) return "1";

  const wideFoot = bottomSpan > 0.5 && bottomSpan > topSpan * 1.15 && aspectRatio > 0.8;
  return wideFoot ? "L" : null;
}
506
+
507
+ // src/solver.ts
508
+ var PROMPT = `Read the 4 distorted characters in these images. Two processed versions shown.
509
+ The text uses UPPERCASE A-Z and digits 0-9 only. No lowercase.
510
+
511
+ WARNING: The dithered rendering makes many characters appear as "2". Before writing "2", check:
512
+ - Could it be "6"? (has closed loop at bottom)
513
+ - Could it be "L"? (has vertical stem + horizontal foot, 90\xB0 corner)
514
+ - Could it be "1"? (thin vertical stroke, no curve)
515
+ - Could it be "Z"? (all straight lines, sharp angles)
516
+
517
+ Also watch for: O/0 have curved sides (not D which has flat left); B has two bumps (not D with one curve); X is two crossing diagonals (not K with vertical bar); G has horizontal bar inside (not C).
518
+
519
+ Output ONLY the 4 characters.`;
49
520
  var DEFAULT_MODELS = {
50
521
  openai: "gpt-4o",
51
522
  anthropic: "claude-sonnet-4-20250514",
@@ -71,7 +542,7 @@ async function resolveModel(apiKey, provider, modelId) {
71
542
  );
72
543
  }
73
544
  }
74
- var CONFUSION_GROUPS = {
545
+ var LEGACY_CONFUSION_GROUPS = {
75
546
  "1": "1",
76
547
  I: "1",
77
548
  L: "1",
@@ -83,7 +554,23 @@ var CONFUSION_GROUPS = {
83
554
  Z: "Z",
84
555
  "2": "Z"
85
556
  };
86
- function majorityVote(attempts, expectedLength) {
557
+ var DITHER_CONFUSION_GROUPS = {
558
+ D: "O",
559
+ O: "O",
560
+ I: "1",
561
+ "1": "1",
562
+ K: "X",
563
+ X: "X",
564
+ A: "X",
565
+ C: "G",
566
+ G: "G",
567
+ "9": "8",
568
+ "8": "8",
569
+ Y: "X",
570
+ E: "5",
571
+ "5": "5"
572
+ };
573
+ function majorityVote(attempts, expectedLength, groups) {
87
574
  let filtered = expectedLength ? attempts.filter((a) => a.length === expectedLength) : attempts;
88
575
  if (filtered.length === 0) {
89
576
  filtered = attempts;
@@ -103,30 +590,127 @@ function majorityVote(attempts, expectedLength) {
103
590
  }
104
591
  const sameLenAttempts = filtered.filter((a) => a.length === bestLen);
105
592
  if (sameLenAttempts.length === 0) return filtered[0];
593
+ const useGroups = groups && typeof groups === "object" ? groups : void 0;
106
594
  const result = [];
595
+ const rankedByPos = [];
107
596
  for (let pos = 0; pos < bestLen; pos++) {
108
597
  const charCounts = /* @__PURE__ */ new Map();
109
598
  for (const a of sameLenAttempts) {
110
599
  const ch = a[pos];
111
600
  charCounts.set(ch, (charCounts.get(ch) ?? 0) + 1);
112
601
  }
113
- const groupCounts = /* @__PURE__ */ new Map();
114
- for (const [ch, count] of charCounts) {
115
- const canonical = CONFUSION_GROUPS[ch] ?? ch;
116
- groupCounts.set(canonical, (groupCounts.get(canonical) ?? 0) + count);
117
- }
118
- let bestGroup = "";
119
- let bestGroupCount = 0;
120
- for (const [canonical, count] of groupCounts) {
121
- if (count > bestGroupCount) {
122
- bestGroup = canonical;
123
- bestGroupCount = count;
602
+ if (useGroups) {
603
+ const groupCounts = /* @__PURE__ */ new Map();
604
+ for (const [ch, count] of charCounts) {
605
+ const canonical = useGroups[ch] ?? ch;
606
+ groupCounts.set(canonical, (groupCounts.get(canonical) ?? 0) + count);
607
+ }
608
+ rankedByPos.push(groupCounts);
609
+ let bestGroup = "";
610
+ let bestGroupCount = 0;
611
+ for (const [canonical, count] of groupCounts) {
612
+ if (count > bestGroupCount) {
613
+ bestGroup = canonical;
614
+ bestGroupCount = count;
615
+ }
616
+ }
617
+ result.push(bestGroup);
618
+ } else {
619
+ rankedByPos.push(charCounts);
620
+ let bestChar = "";
621
+ let bestCharCount = 0;
622
+ for (const [ch, count] of charCounts) {
623
+ if (count > bestCharCount) {
624
+ bestChar = ch;
625
+ bestCharCount = count;
626
+ }
627
+ }
628
+ result.push(bestChar);
629
+ }
630
+ }
631
+ if (bestLen >= 4) {
632
+ const charFreq = /* @__PURE__ */ new Map();
633
+ for (const ch of result) {
634
+ charFreq.set(ch, (charFreq.get(ch) ?? 0) + 1);
635
+ }
636
+ for (const [ch, freq] of charFreq) {
637
+ if (freq < 3) continue;
638
+ let strongestPos = -1;
639
+ let strongestCount = 0;
640
+ for (let pos = 0; pos < bestLen; pos++) {
641
+ if (result[pos] !== ch) continue;
642
+ const count = rankedByPos[pos].get(ch) ?? 0;
643
+ if (count > strongestCount) {
644
+ strongestCount = count;
645
+ strongestPos = pos;
646
+ }
647
+ }
648
+ for (let pos = 0; pos < bestLen; pos++) {
649
+ if (result[pos] !== ch || pos === strongestPos) continue;
650
+ const ranked = rankedByPos[pos];
651
+ const usedChars = new Set(result);
652
+ let bestUnique = "";
653
+ let bestUniqueCount = 0;
654
+ let bestAny = "";
655
+ let bestAnyCount = 0;
656
+ for (const [c, count] of ranked) {
657
+ if (c === ch) continue;
658
+ if (count > bestAnyCount) {
659
+ bestAny = c;
660
+ bestAnyCount = count;
661
+ }
662
+ if (!usedChars.has(c) && count > bestUniqueCount) {
663
+ bestUnique = c;
664
+ bestUniqueCount = count;
665
+ }
666
+ }
667
+ const sub = bestUniqueCount >= 2 ? bestUnique : bestAnyCount >= 2 ? bestAny : "";
668
+ if (sub) {
669
+ result[pos] = sub;
670
+ }
124
671
  }
125
672
  }
126
- result.push(bestGroup);
127
673
  }
128
674
  return result.join("");
129
675
  }
676
/**
 * Per-position majority vote that also returns the vote tallies.
 *
 * Attempts are first restricted to `expectedLength` (when given and at least
 * one attempt matches), then to the most common length. For each position the
 * most frequent character wins; ties go to the character seen first.
 *
 * @param {string[]} attempts        Candidate readings.
 * @param {number} [expectedLength]  Preferred reading length.
 * @param {object|false} [groups]    Optional char → canonical-char map applied
 *                                   before counting; non-objects are ignored.
 * @returns {{result: string[], rankedByPos: Map<string, number>[]}}
 */
function majorityVoteDetailed(attempts, expectedLength, groups) {
  const matching = expectedLength ? attempts.filter((a) => a.length === expectedLength) : attempts;
  const pool = matching.length === 0 ? attempts : matching;
  if (pool.length === 0) return { result: [], rankedByPos: [] };

  // Most common length among the pool (first seen wins ties).
  const lengthTally = /* @__PURE__ */ new Map();
  pool.forEach((a) => lengthTally.set(a.length, (lengthTally.get(a.length) ?? 0) + 1));
  let bestLen = 0;
  let bestCount = 0;
  lengthTally.forEach((count, len) => {
    if (count > bestCount) {
      bestLen = len;
      bestCount = count;
    }
  });

  const candidates = pool.filter((a) => a.length === bestLen);
  if (candidates.length === 0) return { result: [...pool[0]], rankedByPos: [] };

  const canonical = groups && typeof groups === "object" ? (ch) => groups[ch] ?? ch : (ch) => ch;

  // One tally per character position.
  const rankedByPos = Array.from({ length: bestLen }, (_, pos) => {
    const counts = /* @__PURE__ */ new Map();
    for (const a of candidates) {
      const ch = canonical(a[pos]);
      counts.set(ch, (counts.get(ch) ?? 0) + 1);
    }
    return counts;
  });

  const result = rankedByPos.map((counts) => {
    let winner = "";
    let winnerCount = 0;
    counts.forEach((count, ch) => {
      if (count > winnerCount) {
        winner = ch;
        winnerCount = count;
      }
    });
    return winner;
  });

  return { result, rankedByPos };
}
130
714
  function sumOptional(a, b) {
131
715
  if (a === void 0 && b === void 0) return void 0;
132
716
  return (a ?? 0) + (b ?? 0);
@@ -219,30 +803,212 @@ var Solver = class {
219
803
  * @returns Solved text, per-attempt answers, and token usage
220
804
  */
221
805
  async solve(input, options = {}) {
222
- const { numAttempts = 5, expectedLength, maxRetries = 2, verbose = true } = options;
806
+ const {
807
+ numAttempts = 9,
808
+ expectedLength,
809
+ maxRetries = 2,
810
+ verbose = true,
811
+ confusionGroups = false,
812
+ preprocess,
813
+ useTesseract = true,
814
+ useDisambiguation = true
815
+ } = options;
223
816
  const model = await this.getModel();
224
- const imageBuffer = await preprocessCaptchaToBuffer(input);
225
- const results = await Promise.all(
226
- Array.from({ length: numAttempts }, () => this.singleAttempt(model, imageBuffer, maxRetries))
227
- );
228
- const valid = results.filter((r) => r !== null);
817
+ const [enhancedBuffer, heavyCleanBuffer, mediumCleanBuffer] = await Promise.all([
818
+ preprocessCaptchaToBuffer(input, preprocess),
819
+ preprocessCaptchaToBuffer(input, {
820
+ blur: 0,
821
+ greyscale: true,
822
+ scale: 8,
823
+ upscaleKernel: "nearest",
824
+ postBlur: 15,
825
+ normalise: true,
826
+ contrast: 1,
827
+ sharpen: false,
828
+ threshold: 140,
829
+ negate: true,
830
+ crop: "none",
831
+ targetWidth: 800,
832
+ padding: 20
833
+ }),
834
+ preprocessCaptchaToBuffer(input, {
835
+ blur: 0,
836
+ greyscale: true,
837
+ scale: 8,
838
+ upscaleKernel: "nearest",
839
+ postBlur: 8,
840
+ normalise: true,
841
+ contrast: 1,
842
+ sharpen: false,
843
+ threshold: 120,
844
+ negate: true,
845
+ crop: "none",
846
+ targetWidth: 800,
847
+ padding: 20
848
+ })
849
+ ]);
850
+ const halfN = Math.ceil(numAttempts / 2);
851
+ const visionResults = await Promise.all([
852
+ ...Array.from(
853
+ { length: halfN },
854
+ () => this.singleAttempt(model, enhancedBuffer, heavyCleanBuffer, maxRetries)
855
+ ),
856
+ ...Array.from(
857
+ { length: numAttempts - halfN },
858
+ () => this.singleAttempt(model, enhancedBuffer, mediumCleanBuffer, maxRetries)
859
+ )
860
+ ]);
861
+ const valid = visionResults.filter((r) => r !== null);
229
862
  if (verbose) {
230
863
  valid.forEach((r, i) => console.log(` Attempt ${i + 1}: ${r.text}`));
231
864
  }
232
865
  const attempts = valid.map((r) => r.text);
233
866
  const attemptUsages = valid.map((r) => r.usage);
867
+ if (useTesseract) {
868
+ try {
869
+ const reader = await this.getTesseractReader();
870
+ if (reader) {
871
+ const { TESSERACT_VARIANTS: TESSERACT_VARIANTS2 } = await Promise.resolve().then(() => (init_tesseract(), tesseract_exports));
872
+ const tessReads = await reader.recognizeMulti(input, TESSERACT_VARIANTS2);
873
+ for (const read of tessReads) {
874
+ attempts.push(read);
875
+ if (verbose) console.log(` Tesseract: ${read}`);
876
+ }
877
+ }
878
+ } catch {
879
+ }
880
+ }
881
+ const correctionAttempts = Math.min(3, Math.floor(numAttempts / 3));
882
+ if (correctionAttempts > 0 && attempts.length > 0) {
883
+ const initialVote = majorityVote(attempts, expectedLength, confusionGroups);
884
+ const suspiciousCount = [...initialVote].filter((c) => c === "2" || c === "Z").length;
885
+ if (suspiciousCount >= 2 && initialVote.length === (expectedLength ?? initialVote.length)) {
886
+ const corrPrompt = this.buildCorrectionPrompt(initialVote);
887
+ if (corrPrompt) {
888
+ const corrections = await Promise.all(
889
+ Array.from(
890
+ { length: correctionAttempts },
891
+ () => this.selfCorrect(model, enhancedBuffer, heavyCleanBuffer, initialVote, corrPrompt)
892
+ )
893
+ );
894
+ for (const c of corrections) {
895
+ if (c) {
896
+ for (let w = 0; w < 5; w++) attempts.push(c.text);
897
+ if (verbose) console.log(` Corrected: ${c.text}`);
898
+ }
899
+ }
900
+ }
901
+ }
902
+ }
234
903
  const usage = aggregateUsage(attemptUsages);
235
904
  if (attempts.length === 0) {
236
905
  if (verbose) console.log(" All attempts failed!");
237
906
  return { text: "", attempts, usage, attemptUsages };
238
907
  }
239
- return { text: majorityVote(attempts, expectedLength), attempts, usage, attemptUsages };
908
+ const { result, rankedByPos } = majorityVoteDetailed(attempts, expectedLength, confusionGroups);
909
+ if (useDisambiguation && result.length > 0 && rankedByPos.length > 0) {
910
+ try {
911
+ await disambiguateResult(result, rankedByPos, heavyCleanBuffer);
912
+ const lightCleanBuffer = await preprocessCaptchaToBuffer(input, {
913
+ median: 3,
914
+ blur: 0,
915
+ greyscale: true,
916
+ scale: 4,
917
+ postBlur: 3,
918
+ normalise: true,
919
+ contrast: 1,
920
+ sharpen: false,
921
+ threshold: 128,
922
+ crop: "none",
923
+ padding: 20
924
+ });
925
+ await disambiguateResult(result, rankedByPos, lightCleanBuffer);
926
+ } catch {
927
+ }
928
+ }
929
+ const finalText = majorityVote(
930
+ [...attempts, result.join("")],
931
+ // include disambiguated result as an extra "vote"
932
+ expectedLength,
933
+ confusionGroups
934
+ );
935
+ return {
936
+ text: finalText,
937
+ attempts,
938
+ usage,
939
+ attemptUsages
940
+ };
941
+ }
942
+ _tesseractReader = void 0;
943
+ async getTesseractReader() {
944
+ if (this._tesseractReader !== void 0) return this._tesseractReader;
945
+ try {
946
+ const { createTesseractReader: createTesseractReader2 } = await Promise.resolve().then(() => (init_tesseract(), tesseract_exports));
947
+ this._tesseractReader = await createTesseractReader2();
948
+ } catch {
949
+ this._tesseractReader = null;
950
+ }
951
+ return this._tesseractReader;
952
+ }
953
+ /** Clean up resources (Tesseract worker). */
954
+ async dispose() {
955
+ if (this._tesseractReader) {
956
+ await this._tesseractReader.dispose();
957
+ this._tesseractReader = null;
958
+ }
959
+ }
960
+ buildCorrectionPrompt(initial) {
961
+ const checks = [...initial].map((c, pos) => {
962
+ if (c !== "2" && c !== "Z") return null;
963
+ if (pos === 0)
964
+ return `Pos ${pos + 1} ("${c}"): thin stroke \u2192 "1"? closed loop at bottom \u2192 "6"? vertical+foot \u2192 "L"?`;
965
+ if (pos < initial.length - 1)
966
+ return `Pos ${pos + 1} ("${c}"): vertical + horizontal foot \u2192 "L"? thin stroke \u2192 "1"? loop \u2192 "6"?`;
967
+ return `Pos ${pos + 1} ("${c}"): curved top \u2192 keep "2"; straight angles \u2192 "Z"`;
968
+ }).filter(Boolean);
969
+ if (!checks.length) return null;
970
+ const prefix = [...initial].filter((c) => c === "2" || c === "Z").length >= 3 ? `"${initial}" has many similar chars \u2014 unusual for a captcha.
971
+ ` : "";
972
+ return `${prefix}Recheck:
973
+ ${checks.join("\n")}
974
+ Only change with clear evidence. Output ONLY the corrected 4 characters.`;
975
+ }
976
+ async selfCorrect(model, primaryBuffer, secondaryBuffer, initial, correctionPrompt) {
977
+ try {
978
+ const { text } = await generateText({
979
+ model,
980
+ messages: [
981
+ {
982
+ role: "user",
983
+ content: [
984
+ { type: "text", text: PROMPT },
985
+ { type: "image", image: primaryBuffer },
986
+ { type: "image", image: secondaryBuffer }
987
+ ]
988
+ },
989
+ { role: "assistant", content: initial },
990
+ {
991
+ role: "user",
992
+ content: [
993
+ { type: "text", text: correctionPrompt },
994
+ { type: "image", image: primaryBuffer }
995
+ ]
996
+ }
997
+ ],
998
+ temperature: 0.3,
999
+ maxOutputTokens: 32
1000
+ });
1001
+ const cleaned = text.trim().replace(/[^A-Za-z0-9]/g, "").toUpperCase();
1002
+ return cleaned.length >= 2 && cleaned.length <= 8 ? { text: cleaned } : null;
1003
+ } catch {
1004
+ return null;
1005
+ }
240
1006
  }
241
1007
  /**
242
1008
  * Make a single API call to read the captcha.
243
1009
  * Retries up to `maxRetries` times on failure.
244
1010
  */
245
- async singleAttempt(model, imageBuffer, maxRetries) {
1011
+ async singleAttempt(model, primaryBuffer, secondaryBuffer, maxRetries) {
246
1012
  for (let retry = 0; retry <= maxRetries; retry++) {
247
1013
  try {
248
1014
  const { text, usage } = await generateText({
@@ -252,7 +1018,8 @@ var Solver = class {
252
1018
  role: "user",
253
1019
  content: [
254
1020
  { type: "text", text: PROMPT },
255
- { type: "image", image: imageBuffer }
1021
+ { type: "image", image: primaryBuffer },
1022
+ { type: "image", image: secondaryBuffer }
256
1023
  ]
257
1024
  }
258
1025
  ],
@@ -261,11 +1028,29 @@ var Solver = class {
261
1028
  });
262
1029
  const raw = text.trim();
263
1030
  const lower = raw.toLowerCase();
264
- if (lower.includes("sorry") || lower.includes("can't help") || lower.includes("cannot help") || lower.includes("unable to") || lower.includes("i can't") || raw.length > 20) {
1031
+ if (lower.includes("sorry") || lower.includes("can't help") || lower.includes("cannot help") || lower.includes("unable to") || lower.includes("i can't")) {
265
1032
  return null;
266
1033
  }
267
- const cleaned = raw.toUpperCase().replace(/[^A-Z0-9]/g, "");
268
- return cleaned ? { text: cleaned, usage } : null;
1034
+ let answer = "";
1035
+ const allAlpha = raw.replace(/[^A-Za-z0-9]/g, "").toUpperCase();
1036
+ if (allAlpha.length <= 10) {
1037
+ answer = allAlpha;
1038
+ } else {
1039
+ const lines = raw.split(/\n/).reverse();
1040
+ for (const line of lines) {
1041
+ const tokens = line.trim().split(/\s+/);
1042
+ for (let ti = tokens.length - 1; ti >= 0; ti--) {
1043
+ const clean = tokens[ti].replace(/[^A-Za-z0-9]/g, "").toUpperCase();
1044
+ if (clean.length >= 2 && clean.length <= 8) {
1045
+ answer = clean;
1046
+ break;
1047
+ }
1048
+ }
1049
+ if (answer) break;
1050
+ }
1051
+ if (!answer) answer = allAlpha.slice(-8);
1052
+ }
1053
+ return answer ? { text: answer, usage } : null;
269
1054
  } catch (_err) {
270
1055
  if (retry < maxRetries) {
271
1056
  await new Promise((r) => setTimeout(r, 1e3 * (retry + 1)));
@@ -277,9 +1062,20 @@ var Solver = class {
277
1062
  return null;
278
1063
  }
279
1064
  };
1065
+
1066
+ // src/index.ts
1067
+ init_preprocess();
1068
+ init_tesseract();
280
1069
  export {
1070
+ DITHER_CONFUSION_GROUPS,
1071
+ LEGACY_CONFUSION_GROUPS,
281
1072
  Solver,
1073
+ TESSERACT_VARIANTS,
1074
+ createTesseractReader,
1075
+ disambiguateResult,
282
1076
  imageToBase64,
1077
+ majorityVote,
1078
+ majorityVoteDetailed,
283
1079
  preprocessCaptcha,
284
1080
  preprocessCaptchaToBuffer
285
1081
  };