@yigitahmetsahin/captcha-solver 2.0.1 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -1,6 +1,78 @@
1
1
  import { LanguageModelUsage, LanguageModel } from 'ai';
2
2
  export { LanguageModelUsage } from 'ai';
3
3
 
4
+ interface CropFractions {
5
+ /** Fraction from left edge (0–1, default: 0.1) */
6
+ left: number;
7
+ /** Fraction from top edge (0–1, default: 0.02) */
8
+ top: number;
9
+ /** Right crop boundary, as a fraction of the width measured from the left edge (0–1, default: 0.9) */
10
+ right: number;
11
+ /** Bottom crop boundary, as a fraction of the height measured from the top edge (0–1, default: 0.6) */
12
+ bottom: number;
13
+ }
14
+ interface PreprocessOptions {
15
+ /**
16
+ * Fraction of image height to keep from the top, cropping the bottom (default: 1.0, no pre-crop).
17
+ * Useful for removing dark bands at the bottom of dithered captchas.
18
+ */
19
+ preCropHeight?: number;
20
+ /** Median filter size, applied at original resolution before other processing (default: 0, off). Must be an odd number. */
21
+ median?: number;
22
+ /** Gaussian blur radius (default: 1.5). Set to 0 to skip. */
23
+ blur?: number;
24
+ /** Convert to greyscale (default: true) */
25
+ greyscale?: boolean;
26
+ /** Upscale factor (default: 4) */
27
+ scale?: number;
28
+ /** Upscale interpolation kernel (default: 'lanczos3') */
29
+ upscaleKernel?: 'lanczos3' | 'nearest' | 'cubic' | 'mitchell';
30
+ /** Gaussian blur applied AFTER upscaling — use large values (10-20) for dither removal (default: 0, off) */
31
+ postBlur?: number;
32
+ /** Normalise (stretch histogram to full range) before contrast/threshold (default: false) */
33
+ normalise?: boolean;
34
+ /** Contrast multiplier around image mean (default: 3.0). Set to 1 to skip. */
35
+ contrast?: number;
36
+ /** Enable unsharp-mask sharpening (default: true) */
37
+ sharpen?: boolean;
38
+ /** Binary threshold (0-255). Applied after contrast. (default: false, off) */
39
+ threshold?: number | false;
40
+ /** Invert colors (negate) after processing (default: false) */
41
+ negate?: boolean;
42
+ /**
43
+ * Crop mode (default: 'auto'):
44
+ * - 'auto' – trim whitespace after contrast enhancement, with margin
45
+ * - 'legacy' – fixed-percentage crop (original behavior)
46
+ * - 'none' – skip cropping
47
+ * - CropFractions – custom crop percentages
48
+ */
49
+ crop?: 'auto' | 'legacy' | 'none' | CropFractions;
50
+ /** Add white padding around the result (default: true). Pass false to skip, or a number for custom px. */
51
+ padding?: boolean | number;
52
+ /** Resize final image to this width in pixels, maintaining aspect ratio (default: none). Useful for downscaling after high-res processing. */
53
+ targetWidth?: number;
54
+ }
55
+ /**
56
+ * Preprocess a captcha image and return a base64-encoded PNG string.
57
+ */
58
+ declare function preprocessCaptcha(input: string | Buffer, options?: PreprocessOptions): Promise<string>;
59
+ /**
60
+ * Preprocess a captcha image and return the resulting PNG as a raw Buffer.
61
+ *
62
+ * Pipeline:
63
+ * 1. Gaussian blur in color space (smooths dither pattern)
64
+ * 2. Grayscale conversion
65
+ * 3. Upscale by `scale` using `upscaleKernel` (Lanczos by default)
66
+ * 4. Contrast boost around image mean + sharpen
67
+ * 5. Crop (auto-detect, legacy fixed, none, or custom)
68
+ * 6. Add white padding
69
+ */
70
+ declare function preprocessCaptchaToBuffer(input: string | Buffer, options?: PreprocessOptions): Promise<Buffer>;
71
+ /**
72
+ * Read an image file and return its base64-encoded content.
73
+ */
74
+ declare function imageToBase64(imagePath: string): string;
75
+
4
76
  type Provider = 'openai' | 'anthropic' | 'google';
5
77
  interface SolverOptions {
6
78
  /** AI provider to use when constructing the model from an API key (default: "openai") */
@@ -9,7 +81,7 @@ interface SolverOptions {
9
81
  model?: string;
10
82
  }
11
83
  interface SolveOptions {
12
- /** Number of voting attempts (default: 5) */
84
+ /** Number of voting attempts (default: 7) */
13
85
  numAttempts?: number;
14
86
  /** Expected captcha length — results of other lengths are discarded */
15
87
  expectedLength?: number;
@@ -17,6 +89,19 @@ interface SolveOptions {
17
89
  maxRetries?: number;
18
90
  /** Whether to log attempt details (default: true) */
19
91
  verbose?: boolean;
92
+ /**
93
+ * Confusion groups for majority voting.
94
+ * Pass a Record<string, string> to merge visually similar characters,
95
+ * or `false` to disable (default: false).
96
+ * Use LEGACY_CONFUSION_GROUPS to restore pre-3.0 behavior.
97
+ */
98
+ confusionGroups?: Record<string, string> | false;
99
+ /** Preprocessing options passed to the image pipeline */
100
+ preprocess?: PreprocessOptions;
101
+ /** Use Tesseract OCR as an additional voter (default: true if tesseract.js is installed) */
102
+ useTesseract?: boolean;
103
+ /** Use programmatic hole-detection to disambiguate 2/6/L/1 (default: true) */
104
+ useDisambiguation?: boolean;
20
105
  }
21
106
  interface SolveResult {
22
107
  /** The solved captcha text (majority-voted) */
@@ -28,6 +113,38 @@ interface SolveResult {
28
113
  /** Per-attempt usage breakdown */
29
114
  attemptUsages: LanguageModelUsage[];
30
115
  }
116
+ /**
117
+ * Pre-3.0 confusion groups that merge visually similar characters.
118
+ * Opt-in via `{ confusionGroups: LEGACY_CONFUSION_GROUPS }`.
119
+ *
120
+ * Maps: 1/I/L → '1', O/D/0 → 'O', S/5 → 'S', Z/2 → 'Z'
121
+ */
122
+ declare const LEGACY_CONFUSION_GROUPS: Record<string, string>;
123
+ /**
124
+ * Confusion groups optimised for dithered / halftone captchas.
125
+ * Vision models systematically misread certain characters in dithered rendering.
126
+ *
127
+ * Maps: D→'O', I→'1', K/A→'X', C→'G', 9→'8', Y→'X', E→'5'
128
+ */
129
+ declare const DITHER_CONFUSION_GROUPS: Record<string, string>;
130
+ /**
131
+ * Character-level majority vote across multiple attempts.
132
+ * When `groups` is provided, visually similar characters are merged
133
+ * during counting (e.g. 1/I/L all count toward '1').
134
+ *
135
+ * After voting, a repetition penalty is applied: if any character appears
136
+ * 3+ times in the result (unlikely in real captchas), positions with that
137
+ * character are reconsidered using the next-best alternative.
138
+ */
139
+ declare function majorityVote(attempts: string[], expectedLength?: number, groups?: Record<string, string> | false): string;
140
+ /**
141
+ * Raw character-level majority vote WITHOUT repetition penalty.
142
+ * Returns per-position vote maps for disambiguation.
143
+ */
144
+ declare function majorityVoteDetailed(attempts: string[], expectedLength?: number, groups?: Record<string, string> | false): {
145
+ result: string[];
146
+ rankedByPos: Map<string, number>[];
147
+ };
31
148
  declare class Solver {
32
149
  private _model;
33
150
  private _pendingModel;
@@ -58,6 +175,12 @@ declare class Solver {
58
175
  * @returns Solved text, per-attempt answers, and token usage
59
176
  */
60
177
  solve(input: string | Buffer, options?: SolveOptions): Promise<SolveResult>;
178
+ private _tesseractReader;
179
+ private getTesseractReader;
180
+ /** Clean up resources (Tesseract worker). */
181
+ dispose(): Promise<void>;
182
+ private buildCorrectionPrompt;
183
+ private selfCorrect;
61
184
  /**
62
185
  * Make a single API call to read the captcha.
63
186
  * Retries up to `maxRetries` times on failure.
@@ -65,29 +188,30 @@ declare class Solver {
65
188
  private singleAttempt;
66
189
  }
67
190
 
191
+ interface TesseractReader {
192
+ recognize: (image: Buffer) => Promise<string>;
193
+ recognizeMulti: (input: string | Buffer, variants: PreprocessOptions[]) => Promise<string[]>;
194
+ dispose: () => Promise<void>;
195
+ }
68
196
  /**
69
- * Preprocess a captcha image using sharp (libvips).
70
- *
71
- * Pipeline:
72
- * 1. Gaussian blur in color space (smooths dither pattern)
73
- * 2. Grayscale conversion
74
- * 3. Upscale 4× with Lanczos
75
- * 4. Contrast boost (3× around image mean) + sharpen
76
- * 5. Crop decorative borders
77
- * 6. Add white padding
78
- *
79
- * Accepts a file path or a raw image Buffer.
80
- * Returns a base64-encoded PNG string.
197
+ * Create a Tesseract OCR reader. Returns null if tesseract.js is not installed.
198
+ * The reader uses PSM_SINGLE_LINE and an A-Z0-9 whitelist.
81
199
  */
82
- declare function preprocessCaptcha(input: string | Buffer): Promise<string>;
200
+ declare function createTesseractReader(): Promise<TesseractReader | null>;
83
201
  /**
84
- * Same preprocessing pipeline as `preprocessCaptcha`, but returns the
85
- * resulting PNG as a raw Buffer (useful for AI SDK image content parts).
202
+ * Default preprocessing variants for Tesseract OCR.
203
+ * Different blur/threshold levels produce diverse reads.
86
204
  */
87
- declare function preprocessCaptchaToBuffer(input: string | Buffer): Promise<Buffer>;
205
+ declare const TESSERACT_VARIANTS: PreprocessOptions[];
206
+
88
207
  /**
89
- * Read an image file and return its base64-encoded content.
208
+ * Disambiguate characters in a voted result using deterministic image features.
209
+ * Only acts on positions voted as "2" or "Z" where alternatives like 6/L/1 received votes.
210
+ *
211
+ * @param result - The voted character array (mutable, modified in place)
212
+ * @param rankedByPos - Per-position vote counts, as returned by majorityVoteDetailed
213
+ * @param binaryImage - The preprocessed binary image buffer (dark text on white, from threshold+negate)
90
214
  */
91
- declare function imageToBase64(imagePath: string): string;
215
+ declare function disambiguateResult(result: string[], rankedByPos: Map<string, number>[], binaryImage: Buffer): Promise<void>;
92
216
 
93
- export { type Provider, type SolveOptions, type SolveResult, Solver, type SolverOptions, imageToBase64, preprocessCaptcha, preprocessCaptchaToBuffer };
217
+ export { type CropFractions, DITHER_CONFUSION_GROUPS, LEGACY_CONFUSION_GROUPS, type PreprocessOptions, type Provider, type SolveOptions, type SolveResult, Solver, type SolverOptions, TESSERACT_VARIANTS, type TesseractReader, createTesseractReader, disambiguateResult, imageToBase64, majorityVote, majorityVoteDetailed, preprocessCaptcha, preprocessCaptchaToBuffer };