@yigitahmetsahin/captcha-solver 1.0.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -9,7 +9,7 @@ AI-powered captcha solver using image preprocessing and OpenAI vision models wit
9
9
  ## Features
10
10
 
11
11
  - **AI Vision OCR** - Uses OpenAI vision models (o3, gpt-4o, etc.) to read distorted captcha text
12
- - **Image Preprocessing** - PIL-based pipeline: grayscale, blur, upscale, contrast/sharpness enhancement, cropping
12
+ - **Image Preprocessing** - Sharp/libvips pipeline: grayscale, blur, upscale, contrast/sharpness enhancement, cropping
13
13
  - **Majority Voting** - Runs multiple attempts and uses character-level majority voting for accuracy
14
14
  - **Configurable** - Adjustable model, attempt count, expected length, and verbosity
15
15
  - **TypeScript** - Full type safety with strict mode
@@ -17,7 +17,6 @@ AI-powered captcha solver using image preprocessing and OpenAI vision models wit
17
17
  ## Prerequisites
18
18
 
19
19
  - Node.js >= 18
20
- - Python 3 with PIL/Pillow (`pip install Pillow`)
21
20
  - OpenAI API key
22
21
 
23
22
  ## Installation
@@ -82,7 +81,7 @@ npm run benchmark
82
81
 
83
82
  ## How It Works
84
83
 
85
- 1. **Preprocessing** - The image is processed through a PIL pipeline:
84
+ 1. **Preprocessing** - The image is processed through a sharp (libvips) pipeline:
86
85
  - Convert to grayscale
87
86
  - Apply Gaussian blur to smooth noise
88
87
  - Upscale 4x with Lanczos interpolation
package/dist/index.cjs CHANGED
@@ -30,49 +30,49 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
30
30
  // src/index.ts
31
31
  var index_exports = {};
32
32
  __export(index_exports, {
33
+ Solver: () => Solver,
33
34
  imageToBase64: () => imageToBase64,
34
35
  preprocessCaptcha: () => preprocessCaptcha,
35
- solveCaptchaImage: () => solveCaptchaImage
36
+ preprocessCaptchaToBuffer: () => preprocessCaptchaToBuffer
36
37
  });
37
38
  module.exports = __toCommonJS(index_exports);
38
39
 
39
40
  // src/solver.ts
40
- var import_openai = __toESM(require("openai"), 1);
41
+ var import_ai = require("ai");
41
42
 
42
43
  // src/preprocess.ts
43
44
  var import_fs = __toESM(require("fs"), 1);
44
- var import_child_process = require("child_process");
45
45
  var import_path = __toESM(require("path"), 1);
46
- var PYTHON_SCRIPT = `
47
- import sys, base64, io
48
- from PIL import Image, ImageFilter, ImageEnhance, ImageOps
49
-
50
- image_path = sys.argv[1]
51
- img = Image.open(image_path)
52
- img = ImageOps.grayscale(img)
53
- img = img.filter(ImageFilter.GaussianBlur(radius=1.2))
54
- img = img.resize((img.width * 4, img.height * 4), Image.LANCZOS)
55
- img = ImageEnhance.Contrast(img).enhance(3.0)
56
- img = ImageEnhance.Sharpness(img).enhance(2.0)
57
- w, h = img.size
58
- img = img.crop((int(w * 0.10), int(h * 0.02), int(w * 0.90), int(h * 0.60)))
59
- padded = Image.new('L', (img.width + 60, img.height + 40), 255)
60
- padded.paste(img, (30, 20))
61
- padded = padded.convert('RGB')
62
- buf = io.BytesIO()
63
- padded.save(buf, format='PNG')
64
- sys.stdout.buffer.write(base64.b64encode(buf.getvalue()))
65
- `;
66
- async function preprocessCaptcha(imagePath) {
67
- const absPath = import_path.default.resolve(imagePath);
68
- const scriptPath = "/tmp/_captcha_preprocess.py";
69
- import_fs.default.writeFileSync(scriptPath, PYTHON_SCRIPT);
70
- const result = (0, import_child_process.execSync)(`python3 "${scriptPath}" "${absPath}"`, {
71
- maxBuffer: 10 * 1024 * 1024,
72
- // 10MB
73
- encoding: "utf-8"
74
- });
75
- return result.trim();
46
+ var import_sharp = __toESM(require("sharp"), 1);
47
+ async function preprocessCaptcha(input) {
48
+ const buf = await preprocessCaptchaToBuffer(input);
49
+ return buf.toString("base64");
50
+ }
51
+ async function preprocessCaptchaToBuffer(input) {
52
+ const source = typeof input === "string" ? import_path.default.resolve(input) : input;
53
+ const metadata = await (0, import_sharp.default)(source).metadata();
54
+ const origW = metadata.width;
55
+ const origH = metadata.height;
56
+ const smoothed = await (0, import_sharp.default)(source).blur(1.5).greyscale().toBuffer();
57
+ const upscaled = await (0, import_sharp.default)(smoothed).resize(origW * 4, origH * 4, { kernel: "lanczos3" }).toBuffer();
58
+ const stats = await (0, import_sharp.default)(upscaled).stats();
59
+ const mean = stats.channels[0].mean;
60
+ const enhanced = await (0, import_sharp.default)(upscaled).linear(3, mean * (1 - 3)).sharpen({ sigma: 1, m1: 2, m2: 1 }).toBuffer();
61
+ const scaledW = origW * 4;
62
+ const scaledH = origH * 4;
63
+ const cropLeft = Math.floor(scaledW * 0.1);
64
+ const cropTop = Math.floor(scaledH * 0.02);
65
+ const cropRight = Math.floor(scaledW * 0.9);
66
+ const cropBottom = Math.floor(scaledH * 0.6);
67
+ const cropW = cropRight - cropLeft;
68
+ const cropH = cropBottom - cropTop;
69
+ return (0, import_sharp.default)(enhanced).extract({ left: cropLeft, top: cropTop, width: cropW, height: cropH }).extend({
70
+ top: 20,
71
+ bottom: 20,
72
+ left: 30,
73
+ right: 30,
74
+ background: { r: 255, g: 255, b: 255 }
75
+ }).png().toBuffer();
76
76
  }
77
77
  function imageToBase64(imagePath) {
78
78
  const buffer = import_fs.default.readFileSync(imagePath);
@@ -82,50 +82,46 @@ function imageToBase64(imagePath) {
82
82
  // src/solver.ts
83
83
  var PROMPT = `You are an assistant helping a visually impaired person read distorted text from an image.
84
84
  The text contains uppercase letters A-Z and/or digits 0-9.
85
- A thin vertical stroke is likely the digit 1, not the letter I.
85
+ A thin vertical stroke is the digit 1. Never read it as the letter I or L.
86
86
  A round closed shape is the letter O, not the letter D.
87
87
  Output ONLY the exact characters you read, nothing else.`;
88
- async function singleAttempt(client, base64Image, model, maxRetries) {
89
- for (let retry = 0; retry <= maxRetries; retry++) {
90
- try {
91
- const isReasoningModel = model.startsWith("o");
92
- const tokenParam = isReasoningModel ? { max_completion_tokens: 2e3 } : { max_tokens: 256 };
93
- const response = await client.chat.completions.create({
94
- model,
95
- messages: [
96
- {
97
- role: "user",
98
- content: [
99
- { type: "text", text: PROMPT },
100
- {
101
- type: "image_url",
102
- image_url: {
103
- url: `data:image/png;base64,${base64Image}`
104
- }
105
- }
106
- ]
107
- }
108
- ],
109
- temperature: 1,
110
- ...tokenParam
111
- });
112
- const raw = response.choices[0]?.message?.content?.trim() ?? "";
113
- const lower = raw.toLowerCase();
114
- if (lower.includes("sorry") || lower.includes("can't help") || lower.includes("cannot help") || lower.includes("unable to") || lower.includes("i can't") || raw.length > 20) {
115
- return null;
116
- }
117
- const cleaned = raw.toUpperCase().replace(/[^A-Z0-9]/g, "");
118
- return cleaned || null;
119
- } catch (_err) {
120
- if (retry < maxRetries) {
121
- await new Promise((r) => setTimeout(r, 1e3 * (retry + 1)));
122
- continue;
123
- }
124
- return null;
88
+ var DEFAULT_MODELS = {
89
+ openai: "gpt-4o",
90
+ anthropic: "claude-sonnet-4-20250514",
91
+ google: "gemini-2.0-flash"
92
+ };
93
+ async function resolveModel(apiKey, provider, modelId) {
94
+ switch (provider) {
95
+ case "openai": {
96
+ const { createOpenAI } = await import("@ai-sdk/openai");
97
+ return createOpenAI({ apiKey })(modelId);
98
+ }
99
+ case "anthropic": {
100
+ const { createAnthropic } = await import("@ai-sdk/anthropic");
101
+ return createAnthropic({ apiKey })(modelId);
102
+ }
103
+ case "google": {
104
+ const { createGoogleGenerativeAI } = await import("@ai-sdk/google");
105
+ return createGoogleGenerativeAI({ apiKey })(modelId);
125
106
  }
107
+ default:
108
+ throw new Error(
109
+ `Unknown provider "${provider}". Install the matching @ai-sdk/* package and pass the model directly.`
110
+ );
126
111
  }
127
- return null;
128
112
  }
113
+ var CONFUSION_GROUPS = {
114
+ "1": "1",
115
+ I: "1",
116
+ L: "1",
117
+ O: "O",
118
+ D: "O",
119
+ "0": "O",
120
+ S: "S",
121
+ "5": "S",
122
+ Z: "Z",
123
+ "2": "Z"
124
+ };
129
125
  function majorityVote(attempts, expectedLength) {
130
126
  let filtered = expectedLength ? attempts.filter((a) => a.length === expectedLength) : attempts;
131
127
  if (filtered.length === 0) {
@@ -153,46 +149,132 @@ function majorityVote(attempts, expectedLength) {
153
149
  const ch = a[pos];
154
150
  charCounts.set(ch, (charCounts.get(ch) ?? 0) + 1);
155
151
  }
156
- let bestChar = "";
157
- let bestCharCount = 0;
152
+ const groupCounts = /* @__PURE__ */ new Map();
158
153
  for (const [ch, count] of charCounts) {
159
- if (count > bestCharCount) {
160
- bestChar = ch;
161
- bestCharCount = count;
154
+ const canonical = CONFUSION_GROUPS[ch] ?? ch;
155
+ groupCounts.set(canonical, (groupCounts.get(canonical) ?? 0) + count);
156
+ }
157
+ let bestGroup = "";
158
+ let bestGroupCount = 0;
159
+ for (const [canonical, count] of groupCounts) {
160
+ if (count > bestGroupCount) {
161
+ bestGroup = canonical;
162
+ bestGroupCount = count;
162
163
  }
163
164
  }
164
- result.push(bestChar);
165
+ result.push(bestGroup);
165
166
  }
166
167
  return result.join("");
167
168
  }
168
- async function solveCaptchaImage(imagePath, options = {}) {
169
- const { model = "o3", numAttempts = 5, expectedLength, maxRetries = 2, verbose = true } = options;
170
- const client = new import_openai.default({ apiKey: process.env.OPENAI_API_KEY });
171
- const base64Processed = await preprocessCaptcha(imagePath);
172
- const attempts = [];
173
- const maxTotalCalls = numAttempts + 4;
174
- let callCount = 0;
175
- while (attempts.length < numAttempts && callCount < maxTotalCalls) {
176
- callCount++;
177
- const result = await singleAttempt(client, base64Processed, model, maxRetries);
178
- if (result) {
179
- attempts.push(result);
180
- if (verbose) console.log(` Attempt ${attempts.length}: ${result}`);
169
+ var Solver = class {
170
+ _model = null;
171
+ _pendingModel = null;
172
+ /**
173
+ * Create a captcha solver.
174
+ *
175
+ * @example
176
+ * // Simple defaults to OpenAI gpt-4o
177
+ * const solver = new Solver('sk-...');
178
+ *
179
+ * @example
180
+ * // Specify provider and model
181
+ * const solver = new Solver('sk-ant-...', { provider: 'anthropic', model: 'claude-sonnet-4-20250514' });
182
+ *
183
+ * @example
184
+ * // Pass an AI SDK model directly
185
+ * import { createOpenAI } from '@ai-sdk/openai';
186
+ * const openai = createOpenAI({ apiKey: 'sk-...' });
187
+ * const solver = new Solver(openai('gpt-4o'));
188
+ */
189
+ constructor(keyOrModel, options) {
190
+ if (typeof keyOrModel === "string") {
191
+ const provider = options?.provider ?? "openai";
192
+ const modelId = options?.model ?? DEFAULT_MODELS[provider];
193
+ this._pendingModel = resolveModel(keyOrModel, provider, modelId);
181
194
  } else {
182
- if (verbose) console.log(` Call ${callCount}: (refused/failed, retrying...)`);
195
+ this._model = keyOrModel;
183
196
  }
184
197
  }
185
- if (attempts.length === 0) {
186
- if (verbose) console.log(" All attempts failed!");
187
- return "";
198
+ async getModel() {
199
+ if (this._model) return this._model;
200
+ this._model = await this._pendingModel;
201
+ this._pendingModel = null;
202
+ return this._model;
188
203
  }
189
- const answer = majorityVote(attempts, expectedLength);
190
- return answer;
191
- }
204
+ /**
205
+ * Solve a captcha image.
206
+ *
207
+ * @param input - File path (string) or raw image Buffer
208
+ * @param options - Solve options (attempts, expected length, etc.)
209
+ * @returns The captcha text
210
+ */
211
+ async solve(input, options = {}) {
212
+ const { numAttempts = 5, expectedLength, maxRetries = 2, verbose = true } = options;
213
+ const model = await this.getModel();
214
+ const imageBuffer = await preprocessCaptchaToBuffer(input);
215
+ const attempts = [];
216
+ const maxTotalCalls = numAttempts + 4;
217
+ let callCount = 0;
218
+ while (attempts.length < numAttempts && callCount < maxTotalCalls) {
219
+ callCount++;
220
+ const result = await this.singleAttempt(model, imageBuffer, maxRetries);
221
+ if (result) {
222
+ attempts.push(result);
223
+ if (verbose) console.log(` Attempt ${attempts.length}: ${result}`);
224
+ } else {
225
+ if (verbose) console.log(` Call ${callCount}: (refused/failed, retrying...)`);
226
+ }
227
+ }
228
+ if (attempts.length === 0) {
229
+ if (verbose) console.log(" All attempts failed!");
230
+ return "";
231
+ }
232
+ return majorityVote(attempts, expectedLength);
233
+ }
234
+ /**
235
+ * Make a single API call to read the captcha.
236
+ * Retries up to `maxRetries` times on failure.
237
+ */
238
+ async singleAttempt(model, imageBuffer, maxRetries) {
239
+ for (let retry = 0; retry <= maxRetries; retry++) {
240
+ try {
241
+ const { text } = await (0, import_ai.generateText)({
242
+ model,
243
+ messages: [
244
+ {
245
+ role: "user",
246
+ content: [
247
+ { type: "text", text: PROMPT },
248
+ { type: "image", image: imageBuffer }
249
+ ]
250
+ }
251
+ ],
252
+ temperature: 1,
253
+ maxOutputTokens: 256
254
+ });
255
+ const raw = text.trim();
256
+ const lower = raw.toLowerCase();
257
+ if (lower.includes("sorry") || lower.includes("can't help") || lower.includes("cannot help") || lower.includes("unable to") || lower.includes("i can't") || raw.length > 20) {
258
+ return null;
259
+ }
260
+ const cleaned = raw.toUpperCase().replace(/[^A-Z0-9]/g, "");
261
+ return cleaned || null;
262
+ } catch (_err) {
263
+ if (retry < maxRetries) {
264
+ await new Promise((r) => setTimeout(r, 1e3 * (retry + 1)));
265
+ continue;
266
+ }
267
+ return null;
268
+ }
269
+ }
270
+ return null;
271
+ }
272
+ };
192
273
  // Annotate the CommonJS export names for ESM import in node:
193
274
  0 && (module.exports = {
275
+ Solver,
194
276
  imageToBase64,
195
277
  preprocessCaptcha,
196
- solveCaptchaImage
278
+ preprocessCaptchaToBuffer
197
279
  });
198
280
  //# sourceMappingURL=index.cjs.map
@@ -1 +1 @@
1
- {"version":3,"sources":["../src/index.ts","../src/solver.ts","../src/preprocess.ts"],"sourcesContent":["export { solveCaptchaImage } from './solver.js';\nexport { preprocessCaptcha, imageToBase64 } from './preprocess.js';\n","import OpenAI from 'openai';\nimport { preprocessCaptcha } from './preprocess.js';\n\nconst PROMPT = `You are an assistant helping a visually impaired person read distorted text from an image.\nThe text contains uppercase letters A-Z and/or digits 0-9.\nA thin vertical stroke is likely the digit 1, not the letter I.\nA round closed shape is the letter O, not the letter D.\nOutput ONLY the exact characters you read, nothing else.`;\n\ninterface SolverOptions {\n /** OpenAI model to use (default: \"o3\") */\n model?: string;\n /** Number of voting attempts (default: 5) */\n numAttempts?: number;\n /** Expected captcha length — results of other lengths are discarded (default: undefined = no filter) */\n expectedLength?: number;\n /** Max retries per attempt on API failure (default: 2) */\n maxRetries?: number;\n /** Whether to log attempt details (default: true) */\n verbose?: boolean;\n}\n\n/**\n * Make a single API call to read the captcha.\n * Retries up to `maxRetries` times on failure.\n */\nasync function singleAttempt(\n client: OpenAI,\n base64Image: string,\n model: string,\n maxRetries: number\n): Promise<string | null> {\n for (let retry = 0; retry <= maxRetries; retry++) {\n try {\n // Reasoning models (o3, o4-mini) use max_completion_tokens;\n // Standard models (gpt-4o, gpt-4.1, gpt-5.4-mini) use max_tokens.\n const isReasoningModel = model.startsWith('o');\n const tokenParam = isReasoningModel ? { max_completion_tokens: 2000 } : { max_tokens: 256 };\n\n const response = await client.chat.completions.create({\n model,\n messages: [\n {\n role: 'user',\n content: [\n { type: 'text', text: PROMPT },\n {\n type: 'image_url',\n image_url: {\n url: `data:image/png;base64,${base64Image}`,\n },\n },\n ],\n },\n ],\n temperature: 1,\n ...tokenParam,\n });\n\n const raw = response.choices[0]?.message?.content?.trim() ?? '';\n\n // Detect refusals\n const lower = raw.toLowerCase();\n if (\n lower.includes('sorry') ||\n lower.includes(\"can't help\") ||\n lower.includes('cannot help') ||\n lower.includes('unable to') ||\n lower.includes(\"i can't\") ||\n raw.length > 20\n ) {\n return null; // Model refused — don't count as an attempt\n }\n\n // Clean: keep only uppercase letters and digits\n const cleaned = raw.toUpperCase().replace(/[^A-Z0-9]/g, '');\n return cleaned || null;\n } catch (_err) {\n if (retry < maxRetries) {\n // Wait briefly before retry\n await new Promise((r) => setTimeout(r, 1000 * (retry + 1)));\n continue;\n }\n return null;\n }\n }\n return null;\n}\n\n/**\n * Character-level majority vote across multiple attempts.\n */\nfunction majorityVote(attempts: string[], expectedLength?: number): string {\n // Filter to expected length if specified\n let filtered = expectedLength ? attempts.filter((a) => a.length === expectedLength) : attempts;\n\n // If length filter removed everything, fall back to most common length\n if (filtered.length === 0) {\n filtered = attempts;\n }\n\n if (filtered.length === 0) return '';\n\n // Find most common length\n const lenCounts = new Map<number, number>();\n for (const a of filtered) {\n lenCounts.set(a.length, (lenCounts.get(a.length) ?? 0) + 1);\n }\n let bestLen = 0;\n let bestCount = 0;\n for (const [len, count] of lenCounts) {\n if (count > bestCount) {\n bestLen = len;\n bestCount = count;\n }\n }\n\n const sameLenAttempts = filtered.filter((a) => a.length === bestLen);\n if (sameLenAttempts.length === 0) return filtered[0];\n\n // Vote per character position\n const result: string[] = [];\n for (let pos = 0; pos < bestLen; pos++) {\n const charCounts = new Map<string, number>();\n for (const a of sameLenAttempts) {\n const ch = a[pos];\n charCounts.set(ch, (charCounts.get(ch) ?? 0) + 1);\n }\n let bestChar = '';\n let bestCharCount = 0;\n for (const [ch, count] of charCounts) {\n if (count > bestCharCount) {\n bestChar = ch;\n bestCharCount = count;\n }\n }\n result.push(bestChar);\n }\n\n return result.join('');\n}\n\n/**\n * Solve a captcha image using OpenAI vision + preprocessing + majority voting.\n */\nexport async function solveCaptchaImage(\n imagePath: string,\n options: SolverOptions = {}\n): Promise<string> {\n const { model = 'o3', numAttempts = 5, expectedLength, maxRetries = 2, verbose = true } = options;\n\n const client = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });\n\n // Preprocess the image\n const base64Processed = await preprocessCaptcha(imagePath);\n\n // Run attempts — retry refusals/failures to guarantee numAttempts valid results\n const attempts: string[] = [];\n const maxTotalCalls = numAttempts + 4; // allow up to 4 extra calls for refusals\n let callCount = 0;\n while (attempts.length < numAttempts && callCount < maxTotalCalls) {\n callCount++;\n const result = await singleAttempt(client, base64Processed, model, maxRetries);\n if (result) {\n attempts.push(result);\n if (verbose) console.log(` Attempt ${attempts.length}: ${result}`);\n } else {\n if (verbose) console.log(` Call ${callCount}: (refused/failed, retrying...)`);\n }\n }\n\n if (attempts.length === 0) {\n if (verbose) console.log(' All attempts failed!');\n return '';\n }\n\n // Majority vote\n const answer = majorityVote(attempts, expectedLength);\n return answer;\n}\n","import fs from 'fs';\nimport { execSync } from 'child_process';\nimport path from 'path';\n\n// Inline Python script for image preprocessing\n// Uses PIL which produces optimal results for captcha OCR\nconst PYTHON_SCRIPT = `\nimport sys, base64, io\nfrom PIL import Image, ImageFilter, ImageEnhance, ImageOps\n\nimage_path = sys.argv[1]\nimg = Image.open(image_path)\nimg = ImageOps.grayscale(img)\nimg = img.filter(ImageFilter.GaussianBlur(radius=1.2))\nimg = img.resize((img.width * 4, img.height * 4), Image.LANCZOS)\nimg = ImageEnhance.Contrast(img).enhance(3.0)\nimg = ImageEnhance.Sharpness(img).enhance(2.0)\nw, h = img.size\nimg = img.crop((int(w * 0.10), int(h * 0.02), int(w * 0.90), int(h * 0.60)))\npadded = Image.new('L', (img.width + 60, img.height + 40), 255)\npadded.paste(img, (30, 20))\npadded = padded.convert('RGB')\nbuf = io.BytesIO()\npadded.save(buf, format='PNG')\nsys.stdout.buffer.write(base64.b64encode(buf.getvalue()))\n`;\n\n/**\n * Preprocess a captcha image using PIL (via Python subprocess).\n *\n * Pipeline:\n * 1. Grayscale\n * 2. Gaussian blur (radius=1.2) to smooth dither pattern\n * 3. Upscale 4x with Lanczos\n * 4. Contrast 3x + Sharpness 2x (PIL enhancement — preserves soft gradients)\n * 5. Crop decorative borders\n * 6. Add white padding\n *\n * Returns a base64-encoded PNG string.\n */\nexport async function preprocessCaptcha(imagePath: string): Promise<string> {\n const absPath = path.resolve(imagePath);\n\n // Write the Python script to a temp file\n const scriptPath = '/tmp/_captcha_preprocess.py';\n fs.writeFileSync(scriptPath, PYTHON_SCRIPT);\n\n // Execute Python and capture base64 output\n const result = execSync(`python3 \"${scriptPath}\" \"${absPath}\"`, {\n maxBuffer: 10 * 1024 * 1024, // 10MB\n encoding: 'utf-8',\n });\n\n return result.trim();\n}\n\n/**\n * Read an image file and return its base64-encoded content.\n */\nexport function imageToBase64(imagePath: string): string {\n const buffer = fs.readFileSync(imagePath);\n return buffer.toString('base64');\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACAA,oBAAmB;;;ACAnB,gBAAe;AACf,2BAAyB;AACzB,kBAAiB;AAIjB,IAAM,gBAAgB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAkCtB,eAAsB,kBAAkB,WAAoC;AAC1E,QAAM,UAAU,YAAAA,QAAK,QAAQ,SAAS;AAGtC,QAAM,aAAa;AACnB,YAAAC,QAAG,cAAc,YAAY,aAAa;AAG1C,QAAM,aAAS,+BAAS,YAAY,UAAU,MAAM,OAAO,KAAK;AAAA,IAC9D,WAAW,KAAK,OAAO;AAAA;AAAA,IACvB,UAAU;AAAA,EACZ,CAAC;AAED,SAAO,OAAO,KAAK;AACrB;AAKO,SAAS,cAAc,WAA2B;AACvD,QAAM,SAAS,UAAAA,QAAG,aAAa,SAAS;AACxC,SAAO,OAAO,SAAS,QAAQ;AACjC;;;AD3DA,IAAM,SAAS;AAAA;AAAA;AAAA;AAAA;AAuBf,eAAe,cACb,QACA,aACA,OACA,YACwB;AACxB,WAAS,QAAQ,GAAG,SAAS,YAAY,SAAS;AAChD,QAAI;AAGF,YAAM,mBAAmB,MAAM,WAAW,GAAG;AAC7C,YAAM,aAAa,mBAAmB,EAAE,uBAAuB,IAAK,IAAI,EAAE,YAAY,IAAI;AAE1F,YAAM,WAAW,MAAM,OAAO,KAAK,YAAY,OAAO;AAAA,QACpD;AAAA,QACA,UAAU;AAAA,UACR;AAAA,YACE,MAAM;AAAA,YACN,SAAS;AAAA,cACP,EAAE,MAAM,QAAQ,MAAM,OAAO;AAAA,cAC7B;AAAA,gBACE,MAAM;AAAA,gBACN,WAAW;AAAA,kBACT,KAAK,yBAAyB,WAAW;AAAA,gBAC3C;AAAA,cACF;AAAA,YACF;AAAA,UACF;AAAA,QACF;AAAA,QACA,aAAa;AAAA,QACb,GAAG;AAAA,MACL,CAAC;AAED,YAAM,MAAM,SAAS,QAAQ,CAAC,GAAG,SAAS,SAAS,KAAK,KAAK;AAG7D,YAAM,QAAQ,IAAI,YAAY;AAC9B,UACE,MAAM,SAAS,OAAO,KACtB,MAAM,SAAS,YAAY,KAC3B,MAAM,SAAS,aAAa,KAC5B,MAAM,SAAS,WAAW,KAC1B,MAAM,SAAS,SAAS,KACxB,IAAI,SAAS,IACb;AACA,eAAO;AAAA,MACT;AAGA,YAAM,UAAU,IAAI,YAAY,EAAE,QAAQ,cAAc,EAAE;AAC1D,aAAO,WAAW;AAAA,IACpB,SAAS,MAAM;AACb,UAAI,QAAQ,YAAY;AAEtB,cAAM,IAAI,QAAQ,CAAC,MAAM,WAAW,GAAG,OAAQ,QAAQ,EAAE,CAAC;AAC1D;AAAA,MACF;AACA,aAAO;AAAA,IACT;AAAA,EACF;AACA,SAAO;AACT;AAKA,SAAS,aAAa,UAAoB,gBAAiC;AAEzE,MAAI,WAAW,iBAAiB,SAAS,OAAO,CAAC,MAAM,EAAE,WAAW,cAAc,IAAI;AAGtF,MAAI,SAAS,WAAW,GAAG;AACzB,eAAW;AAAA,EACb;AAEA,MAAI,SAAS,WAAW,EAAG,QAAO;AAGlC,QAAM,YAAY,oBAAI,IAAoB;AAC1C,aAAW,KAAK,UAAU;AACxB,cAAU,IAAI,EAAE,SAAS,UAAU,IAAI,EAAE,MAAM,KAAK,KAAK,CAAC;AAAA,EAC5D;AACA,MAAI,UAAU;AACd,MAAI,YAAY;AAChB,aAAW,CAAC,KAAK,KAAK,KAAK,WAAW;AACpC,QAAI,QAAQ,WAAW;AACrB,gBAAU;AACV,kBAAY;AAAA,IACd;AAAA,EACF;AAEA,QAAM,kBAAkB,SAAS,OAAO,CAAC,MAAM,EAAE,WAAW,OAAO;AACnE,MAAI,gBAAgB,WAAW,EAAG,QAAO,SAAS,CAAC;AAGnD,QAAM,SAAmB,CAAC;AAC1B,WAAS,MAAM,GAAG,MAAM,SAAS,OAAO;AACtC,UAAM,aAAa,oBAAI,IAAoB;AAC3C,eAAW,KAAK,iBAAiB;AAC/B,YAAM,KAAK,EAAE,GAAG;AAChB,iBAAW,IAAI,KAAK,WAAW,IAAI,EAAE,KAAK,KAAK,CAAC;AAAA,IAClD;AACA,QAAI,WAAW;AACf,QAAI,gBAAgB;AACpB,eAAW,CAAC,IAAI,KAAK,KAAK,YAAY;AACpC,UAAI,QAAQ,eAAe;AACzB,mBAAW;AACX,wBAAgB;AAAA,MAClB;AAAA,IACF;AACA,WAAO,KAAK,QAAQ;AAAA,EACtB;AAEA,SAAO,OAAO,KAAK,EAAE;AACvB;AAKA,eAAsB,kBACpB,WACA,UAAyB,CAAC,GACT;AACjB,QAAM,EAAE,QAAQ,MAAM,cAAc,GAAG,gBAAgB,aAAa,GAAG,UAAU,KAAK,IAAI;AAE1F,QAAM,SAAS,IAAI,cAAAC,QAAO,EAAE,QAAQ,QAAQ,IAAI,eAAe,CAAC;AAGhE,QAAM,kBAAkB,MAAM,kBAAkB,SAAS;AAGzD,QAAM,WAAqB,CAAC;AAC5B,QAAM,gBAAgB,cAAc;AACpC,MAAI,YAAY;AAChB,SAAO,SAAS,SAAS,eAAe,YAAY,eAAe;AACjE;AACA,UAAM,SAAS,MAAM,cAAc,QAAQ,iBAAiB,OAAO,UAAU;AAC7E,QAAI,QAAQ;AACV,eAAS,KAAK,MAAM;AACpB,UAAI,QAAS,SAAQ,IAAI,aAAa,SAAS,MAAM,KAAK,MAAM,EAAE;AAAA,IACpE,OAAO;AACL,UAAI,QAAS,SAAQ,IAAI,UAAU,SAAS,iCAAiC;AAAA,IAC/E;AAAA,EACF;AAEA,MAAI,SAAS,WAAW,GAAG;AACzB,QAAI,QAAS,SAAQ,IAAI,wBAAwB;AACjD,WAAO;AAAA,EACT;AAGA,QAAM,SAAS,aAAa,UAAU,cAAc;AACpD,SAAO;AACT;","names":["path","fs","OpenAI"]}
1
+ {"version":3,"sources":["../src/index.ts","../src/solver.ts","../src/preprocess.ts"],"sourcesContent":["export { Solver } from './solver.js';\nexport type { SolverOptions, SolveOptions, Provider } from './solver.js';\nexport { preprocessCaptcha, preprocessCaptchaToBuffer, imageToBase64 } from './preprocess.js';\n","import type { LanguageModel } from 'ai';\nimport { generateText } from 'ai';\nimport { preprocessCaptchaToBuffer } from './preprocess.js';\n\nconst PROMPT = `You are an assistant helping a visually impaired person read distorted text from an image.\nThe text contains uppercase letters A-Z and/or digits 0-9.\nA thin vertical stroke is the digit 1. Never read it as the letter I or L.\nA round closed shape is the letter O, not the letter D.\nOutput ONLY the exact characters you read, nothing else.`;\n\n// ── Types ────────────────────────────────────────────────────────────\n\nexport type Provider = 'openai' | 'anthropic' | 'google';\n\nexport interface SolverOptions {\n /** AI provider to use when constructing the model from an API key (default: \"openai\") */\n provider?: Provider;\n /** Model ID passed to the provider (default: \"gpt-4o\") */\n model?: string;\n}\n\nexport interface SolveOptions {\n /** Number of voting attempts (default: 5) */\n numAttempts?: number;\n /** Expected captcha length — results of other lengths are discarded */\n expectedLength?: number;\n /** Max retries per attempt on API failure (default: 2) */\n maxRetries?: number;\n /** Whether to log attempt details (default: true) */\n verbose?: boolean;\n}\n\n// ── Provider resolution ──────────────────────────────────────────────\n\nconst DEFAULT_MODELS: Record<Provider, string> = {\n openai: 'gpt-4o',\n anthropic: 'claude-sonnet-4-20250514',\n google: 'gemini-2.0-flash',\n};\n\nasync function resolveModel(\n apiKey: string,\n provider: Provider,\n modelId: string\n): Promise<LanguageModel> {\n switch (provider) {\n case 'openai': {\n const { createOpenAI } = await import('@ai-sdk/openai');\n return createOpenAI({ apiKey })(modelId);\n }\n case 'anthropic': {\n // @ts-expect-error — optional peer dependency\n const { createAnthropic } = await import('@ai-sdk/anthropic');\n return createAnthropic({ apiKey })(modelId);\n }\n case 'google': {\n // @ts-expect-error — optional peer dependency\n const { createGoogleGenerativeAI } = await import('@ai-sdk/google');\n return createGoogleGenerativeAI({ apiKey })(modelId);\n }\n default:\n throw new Error(\n `Unknown provider \"${provider}\". Install the matching @ai-sdk/* package and pass the model directly.`\n );\n }\n}\n\n// ── Confusion groups ─────────────────────────────────────────────────\n\n/**\n * Characters the model commonly misreads as each other.\n * Each group maps to its canonical (most likely correct) character.\n */\nconst CONFUSION_GROUPS: Record<string, string> = {\n '1': '1',\n I: '1',\n L: '1',\n O: 'O',\n D: 'O',\n '0': 'O',\n S: 'S',\n '5': 'S',\n Z: 'Z',\n '2': 'Z',\n};\n\n// ── Majority voting ──────────────────────────────────────────────────\n\n/**\n * Character-level majority vote across multiple attempts.\n * Uses confusion-aware voting: characters that the model commonly\n * confuses (e.g. 1/I/L, O/D/0) are grouped together during counting.\n */\nfunction majorityVote(attempts: string[], expectedLength?: number): string {\n let filtered = expectedLength ? attempts.filter((a) => a.length === expectedLength) : attempts;\n\n if (filtered.length === 0) {\n filtered = attempts;\n }\n if (filtered.length === 0) return '';\n\n // Find most common length\n const lenCounts = new Map<number, number>();\n for (const a of filtered) {\n lenCounts.set(a.length, (lenCounts.get(a.length) ?? 0) + 1);\n }\n let bestLen = 0;\n let bestCount = 0;\n for (const [len, count] of lenCounts) {\n if (count > bestCount) {\n bestLen = len;\n bestCount = count;\n }\n }\n\n const sameLenAttempts = filtered.filter((a) => a.length === bestLen);\n if (sameLenAttempts.length === 0) return filtered[0];\n\n // Vote per character position with confusion-aware grouping\n const result: string[] = [];\n for (let pos = 0; pos < bestLen; pos++) {\n const charCounts = new Map<string, number>();\n for (const a of sameLenAttempts) {\n const ch = a[pos];\n charCounts.set(ch, (charCounts.get(ch) ?? 0) + 1);\n }\n\n const groupCounts = new Map<string, number>();\n for (const [ch, count] of charCounts) {\n const canonical = CONFUSION_GROUPS[ch] ?? ch;\n groupCounts.set(canonical, (groupCounts.get(canonical) ?? 0) + count);\n }\n\n let bestGroup = '';\n let bestGroupCount = 0;\n for (const [canonical, count] of groupCounts) {\n if (count > bestGroupCount) {\n bestGroup = canonical;\n bestGroupCount = count;\n }\n }\n\n result.push(bestGroup);\n }\n\n return result.join('');\n}\n\n// ── Solver class ─────────────────────────────────────────────────────\n\nexport class Solver {\n private _model: LanguageModel | null = null;\n private _pendingModel: Promise<LanguageModel> | null = null;\n\n /**\n * Create a captcha solver.\n *\n * @example\n * // Simple — defaults to OpenAI gpt-4o\n * const solver = new Solver('sk-...');\n *\n * @example\n * // Specify provider and model\n * const solver = new Solver('sk-ant-...', { provider: 'anthropic', model: 'claude-sonnet-4-20250514' });\n *\n * @example\n * // Pass an AI SDK model directly\n * import { createOpenAI } from '@ai-sdk/openai';\n * const openai = createOpenAI({ apiKey: 'sk-...' });\n * const solver = new Solver(openai('gpt-4o'));\n */\n constructor(keyOrModel: string | LanguageModel, options?: SolverOptions) {\n if (typeof keyOrModel === 'string') {\n const provider = options?.provider ?? 'openai';\n const modelId = options?.model ?? DEFAULT_MODELS[provider];\n // Lazily resolve the model on first use\n this._pendingModel = resolveModel(keyOrModel, provider, modelId);\n } else {\n this._model = keyOrModel;\n }\n }\n\n private async getModel(): Promise<LanguageModel> {\n if (this._model) return this._model;\n this._model = await this._pendingModel!;\n this._pendingModel = null;\n return this._model;\n }\n\n /**\n * Solve a captcha image.\n *\n * @param input - File path (string) or raw image Buffer\n * @param options - Solve options (attempts, expected length, etc.)\n * @returns The captcha text\n */\n async solve(input: string | Buffer, options: SolveOptions = {}): Promise<string> {\n const { numAttempts = 5, expectedLength, maxRetries = 2, verbose = true } = options;\n\n const model = await this.getModel();\n const imageBuffer = await preprocessCaptchaToBuffer(input);\n\n // Run attempts — retry refusals/failures to guarantee numAttempts valid results\n const attempts: string[] = [];\n const maxTotalCalls = numAttempts + 4;\n let callCount = 0;\n\n while (attempts.length < numAttempts && callCount < maxTotalCalls) {\n callCount++;\n const result = await this.singleAttempt(model, imageBuffer, maxRetries);\n if (result) {\n attempts.push(result);\n if (verbose) console.log(` Attempt ${attempts.length}: ${result}`);\n } else {\n if (verbose) console.log(` Call ${callCount}: (refused/failed, retrying...)`);\n }\n }\n\n if (attempts.length === 0) {\n if (verbose) console.log(' All attempts failed!');\n return '';\n }\n\n return majorityVote(attempts, expectedLength);\n }\n\n /**\n * Make a single API call to read the captcha.\n * Retries up to `maxRetries` times on failure.\n */\n private async singleAttempt(\n model: LanguageModel,\n imageBuffer: Buffer,\n maxRetries: number\n ): Promise<string | null> {\n for (let retry = 0; retry <= maxRetries; retry++) {\n try {\n const { text } = await generateText({\n model,\n messages: [\n {\n role: 'user',\n content: [\n { type: 'text', text: PROMPT },\n { type: 'image', image: imageBuffer },\n ],\n },\n ],\n temperature: 1,\n maxOutputTokens: 256,\n });\n\n const raw = text.trim();\n\n // Detect refusals\n const lower = raw.toLowerCase();\n if (\n lower.includes('sorry') ||\n lower.includes(\"can't help\") ||\n lower.includes('cannot help') ||\n lower.includes('unable to') ||\n lower.includes(\"i can't\") ||\n raw.length > 20\n ) {\n return null;\n }\n\n // Clean: keep only uppercase letters and digits\n const cleaned = raw.toUpperCase().replace(/[^A-Z0-9]/g, '');\n return cleaned || null;\n } catch (_err) {\n if (retry < maxRetries) {\n await new Promise((r) => setTimeout(r, 1000 * (retry + 1)));\n continue;\n }\n return null;\n }\n }\n return null;\n }\n}\n","import fs from 'fs';\nimport path from 'path';\nimport sharp from 'sharp';\n\n/**\n * Preprocess a captcha image using sharp (libvips).\n *\n * Pipeline:\n * 1. Gaussian blur in color space (smooths dither pattern)\n * 2. Grayscale conversion\n * 3. Upscale 4× with Lanczos\n * 4. Contrast boost (3× around image mean) + sharpen\n * 5. Crop decorative borders\n * 6. Add white padding\n *\n * Accepts a file path or a raw image Buffer.\n * Returns a base64-encoded PNG string.\n */\nexport async function preprocessCaptcha(input: string | Buffer): Promise<string> {\n const buf = await preprocessCaptchaToBuffer(input);\n return buf.toString('base64');\n}\n\n/**\n * Same preprocessing pipeline as `preprocessCaptcha`, but returns the\n * resulting PNG as a raw Buffer (useful for AI SDK image content parts).\n */\nexport async function preprocessCaptchaToBuffer(input: string | Buffer): Promise<Buffer> {\n const source = typeof input === 'string' ? path.resolve(input) : input;\n\n // Read original dimensions for crop/resize calculations\n const metadata = await sharp(source).metadata();\n const origW = metadata.width!;\n const origH = metadata.height!;\n\n // Step 1-2: Blur in color space (smooths dither pattern) → greyscale\n // Separate from resize to prevent pipeline reordering\n const smoothed = await sharp(source).blur(1.5).greyscale().toBuffer();\n\n // Step 3: Upscale 4× with Lanczos\n const upscaled = await sharp(smoothed)\n .resize(origW * 4, origH * 4, { kernel: 'lanczos3' })\n .toBuffer();\n\n // Step 4: Contrast 3× around actual image mean + sharpen\n // Matches PIL's ImageEnhance.Contrast: output = factor*input + mean*(1-factor)\n const stats = await sharp(upscaled).stats();\n const mean = stats.channels[0].mean;\n const enhanced = await sharp(upscaled)\n .linear(3.0, mean * (1 - 3.0))\n .sharpen({ sigma: 1.0, m1: 2.0, m2: 1.0 })\n .toBuffer();\n\n // Step 5: Crop decorative borders\n // Remove 10% left/right, 2% top, 40% bottom (keep top 60%)\n // Math.floor matches Python's int() truncation\n const scaledW = origW * 4;\n const scaledH = origH * 4;\n const cropLeft = Math.floor(scaledW * 0.1);\n const cropTop = Math.floor(scaledH * 0.02);\n const cropRight = Math.floor(scaledW * 0.9);\n const cropBottom = Math.floor(scaledH * 0.6);\n const cropW = cropRight - cropLeft;\n const cropH = cropBottom - cropTop;\n\n // Step 5-6: Crop → add white padding → output PNG\n return sharp(enhanced)\n .extract({ left: cropLeft, top: cropTop, width: cropW, height: cropH })\n .extend({\n top: 20,\n bottom: 20,\n left: 30,\n right: 30,\n background: { r: 255, g: 255, b: 255 },\n })\n .png()\n .toBuffer();\n}\n\n/**\n * Read an image file and return its base64-encoded content.\n */\nexport function imageToBase64(imagePath: string): string {\n const buffer = fs.readFileSync(imagePath);\n return buffer.toString('base64');\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACCA,gBAA6B;;;ACD7B,gBAAe;AACf,kBAAiB;AACjB,mBAAkB;AAgBlB,eAAsB,kBAAkB,OAAyC;AAC/E,QAAM,MAAM,MAAM,0BAA0B,KAAK;AACjD,SAAO,IAAI,SAAS,QAAQ;AAC9B;AAMA,eAAsB,0BAA0B,OAAyC;AACvF,QAAM,SAAS,OAAO,UAAU,WAAW,YAAAA,QAAK,QAAQ,KAAK,IAAI;AAGjE,QAAM,WAAW,UAAM,aAAAC,SAAM,MAAM,EAAE,SAAS;AAC9C,QAAM,QAAQ,SAAS;AACvB,QAAM,QAAQ,SAAS;AAIvB,QAAM,WAAW,UAAM,aAAAA,SAAM,MAAM,EAAE,KAAK,GAAG,EAAE,UAAU,EAAE,SAAS;AAGpE,QAAM,WAAW,UAAM,aAAAA,SAAM,QAAQ,EAClC,OAAO,QAAQ,GAAG,QAAQ,GAAG,EAAE,QAAQ,WAAW,CAAC,EACnD,SAAS;AAIZ,QAAM,QAAQ,UAAM,aAAAA,SAAM,QAAQ,EAAE,MAAM;AAC1C,QAAM,OAAO,MAAM,SAAS,CAAC,EAAE;AAC/B,QAAM,WAAW,UAAM,aAAAA,SAAM,QAAQ,EAClC,OAAO,GAAK,QAAQ,IAAI,EAAI,EAC5B,QAAQ,EAAE,OAAO,GAAK,IAAI,GAAK,IAAI,EAAI,CAAC,EACxC,SAAS;AAKZ,QAAM,UAAU,QAAQ;AACxB,QAAM,UAAU,QAAQ;AACxB,QAAM,WAAW,KAAK,MAAM,UAAU,GAAG;AACzC,QAAM,UAAU,KAAK,MAAM,UAAU,IAAI;AACzC,QAAM,YAAY,KAAK,MAAM,UAAU,GAAG;AAC1C,QAAM,aAAa,KAAK,MAAM,UAAU,GAAG;AAC3C,QAAM,QAAQ,YAAY;AAC1B,QAAM,QAAQ,aAAa;AAG3B,aAAO,aAAAA,SAAM,QAAQ,EAClB,QAAQ,EAAE,MAAM,UAAU,KAAK,SAAS,OAAO,OAAO,QAAQ,MAAM,CAAC,EACrE,OAAO;AAAA,IACN,KAAK;AAAA,IACL,QAAQ;AAAA,IACR,MAAM;AAAA,IACN,OAAO;AAAA,IACP,YAAY,EAAE,GAAG,KAAK,GAAG,KAAK,GAAG,IAAI;AAAA,EACvC,CAAC,EACA,IAAI,EACJ,SAAS;AACd;AAKO,SAAS,cAAc,WAA2B;AACvD,QAAM,SAAS,UAAAC,QAAG,aAAa,SAAS;AACxC,SAAO,OAAO,SAAS,QAAQ;AACjC;;;ADjFA,IAAM,SAAS;AAAA;AAAA;AAAA;AAAA;AA8Bf,IAAM,iBAA2C;AAAA,EAC/C,QAAQ;AAAA,EACR,WAAW;AAAA,EACX,QAAQ;AACV;AAEA,eAAe,aACb,QACA,UACA,SACwB;AACxB,UAAQ,UAAU;AAAA,IAChB,KAAK,UAAU;AACb,YAAM,EAAE,aAAa,IAAI,MAAM,OAAO,gBAAgB;AACtD,aAAO,aAAa,EAAE,OAAO,CAAC,EAAE,OAAO;AAAA,IACzC;AAAA,IACA,KAAK,aAAa;AAEhB,YAAM,EAAE,gBAAgB,IAAI,MAAM,OAAO,mBAAmB;AAC5D,aAAO,gBAAgB,EAAE,OAAO,CAAC,EAAE,OAAO;AAAA,IAC5C;AAAA,IACA,KAAK,UAAU;AAEb,YAAM,EAAE,yBAAyB,IAAI,MAAM,OAAO,gBAAgB;AAClE,aAAO,yBAAyB,EAAE,OAAO,CAAC,EAAE,OAAO;AAAA,IACrD;AAAA,IACA;AACE,YAAM,IAAI;AAAA,QACR,qBAAqB,QAAQ;AAAA,MAC/B;AAAA,EACJ;AACF;AAQA,IAAM,mBAA2C;AAAA,EAC/C,KAAK;AAAA,EACL,GAAG;AAAA,EACH,GAAG;AAAA,EACH,GAAG;AAAA,EACH,GAAG;AAAA,EACH,KAAK;AAAA,EACL,GAAG;AAAA,EACH,KAAK;AAAA,EACL,GAAG;AAAA,EACH,KAAK;AACP;AASA,SAAS,aAAa,UAAoB,gBAAiC;AACzE,MAAI,WAAW,iBAAiB,SAAS,OAAO,CAAC,MAAM,EAAE,WAAW,cAAc,IAAI;AAEtF,MAAI,SAAS,WAAW,GAAG;AACzB,eAAW;AAAA,EACb;AACA,MAAI,SAAS,WAAW,EAAG,QAAO;AAGlC,QAAM,YAAY,oBAAI,IAAoB;AAC1C,aAAW,KAAK,UAAU;AACxB,cAAU,IAAI,EAAE,SAAS,UAAU,IAAI,EAAE,MAAM,KAAK,KAAK,CAAC;AAAA,EAC5D;AACA,MAAI,UAAU;AACd,MAAI,YAAY;AAChB,aAAW,CAAC,KAAK,KAAK,KAAK,WAAW;AACpC,QAAI,QAAQ,WAAW;AACrB,gBAAU;AACV,kBAAY;AAAA,IACd;AAAA,EACF;AAEA,QAAM,kBAAkB,SAAS,OAAO,CAAC,MAAM,EAAE,WAAW,OAAO;AACnE,MAAI,gBAAgB,WAAW,EAAG,QAAO,SAAS,CAAC;AAGnD,QAAM,SAAmB,CAAC;AAC1B,WAAS,MAAM,GAAG,MAAM,SAAS,OAAO;AACtC,UAAM,aAAa,oBAAI,IAAoB;AAC3C,eAAW,KAAK,iBAAiB;AAC/B,YAAM,KAAK,EAAE,GAAG;AAChB,iBAAW,IAAI,KAAK,WAAW,IAAI,EAAE,KAAK,KAAK,CAAC;AAAA,IAClD;AAEA,UAAM,cAAc,oBAAI,IAAoB;AAC5C,eAAW,CAAC,IAAI,KAAK,KAAK,YAAY;AACpC,YAAM,YAAY,iBAAiB,EAAE,KAAK;AAC1C,kBAAY,IAAI,YAAY,YAAY,IAAI,SAAS,KAAK,KAAK,KAAK;AAAA,IACtE;AAEA,QAAI,YAAY;AAChB,QAAI,iBAAiB;AACrB,eAAW,CAAC,WAAW,KAAK,KAAK,aAAa;AAC5C,UAAI,QAAQ,gBAAgB;AAC1B,oBAAY;AACZ,yBAAiB;AAAA,MACnB;AAAA,IACF;AAEA,WAAO,KAAK,SAAS;AAAA,EACvB;AAEA,SAAO,OAAO,KAAK,EAAE;AACvB;AAIO,IAAM,SAAN,MAAa;AAAA,EACV,SAA+B;AAAA,EAC/B,gBAA+C;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAmBvD,YAAY,YAAoC,SAAyB;AACvE,QAAI,OAAO,eAAe,UAAU;AAClC,YAAM,WAAW,SAAS,YAAY;AACtC,YAAM,UAAU,SAAS,SAAS,eAAe,QAAQ;AAEzD,WAAK,gBAAgB,aAAa,YAAY,UAAU,OAAO;AAAA,IACjE,OAAO;AACL,WAAK,SAAS;AAAA,IAChB;AAAA,EACF;AAAA,EAEA,MAAc,WAAmC;AAC/C,QAAI,KAAK,OAAQ,QAAO,KAAK;AAC7B,SAAK,SAAS,MAAM,KAAK;AACzB,SAAK,gBAAgB;AACrB,WAAO,KAAK;AAAA,EACd;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EASA,MAAM,MAAM,OAAwB,UAAwB,CAAC,GAAoB;AAC/E,UAAM,EAAE,cAAc,GAAG,gBAAgB,aAAa,GAAG,UAAU,KAAK,IAAI;AAE5E,UAAM,QAAQ,MAAM,KAAK,SAAS;AAClC,UAAM,cAAc,MAAM,0BAA0B,KAAK;AAGzD,UAAM,WAAqB,CAAC;AAC5B,UAAM,gBAAgB,cAAc;AACpC,QAAI,YAAY;AAEhB,WAAO,SAAS,SAAS,eAAe,YAAY,eAAe;AACjE;AACA,YAAM,SAAS,MAAM,KAAK,cAAc,OAAO,aAAa,UAAU;AACtE,UAAI,QAAQ;AACV,iBAAS,KAAK,MAAM;AACpB,YAAI,QAAS,SAAQ,IAAI,aAAa,SAAS,MAAM,KAAK,MAAM,EAAE;AAAA,MACpE,OAAO;AACL,YAAI,QAAS,SAAQ,IAAI,UAAU,SAAS,iCAAiC;AAAA,MAC/E;AAAA,IACF;AAEA,QAAI,SAAS,WAAW,GAAG;AACzB,UAAI,QAAS,SAAQ,IAAI,wBAAwB;AACjD,aAAO;AAAA,IACT;AAEA,WAAO,aAAa,UAAU,cAAc;AAAA,EAC9C;AAAA;AAAA;AAAA;AAAA;AAAA,EAMA,MAAc,cACZ,OACA,aACA,YACwB;AACxB,aAAS,QAAQ,GAAG,SAAS,YAAY,SAAS;AAChD,UAAI;AACF,cAAM,EAAE,KAAK,IAAI,UAAM,wBAAa;AAAA,UAClC;AAAA,UACA,UAAU;AAAA,YACR;AAAA,cACE,MAAM;AAAA,cACN,SAAS;AAAA,gBACP,EAAE,MAAM,QAAQ,MAAM,OAAO;AAAA,gBAC7B,EAAE,MAAM,SAAS,OAAO,YAAY;AAAA,cACtC;AAAA,YACF;AAAA,UACF;AAAA,UACA,aAAa;AAAA,UACb,iBAAiB;AAAA,QACnB,CAAC;AAED,cAAM,MAAM,KAAK,KAAK;AAGtB,cAAM,QAAQ,IAAI,YAAY;AAC9B,YACE,MAAM,SAAS,OAAO,KACtB,MAAM,SAAS,YAAY,KAC3B,MAAM,SAAS,aAAa,KAC5B,MAAM,SAAS,WAAW,KAC1B,MAAM,SAAS,SAAS,KACxB,IAAI,SAAS,IACb;AACA,iBAAO;AAAA,QACT;AAGA,cAAM,UAAU,IAAI,YAAY,EAAE,QAAQ,cAAc,EAAE;AAC1D,eAAO,WAAW;AAAA,MACpB,SAAS,MAAM;AACb,YAAI,QAAQ,YAAY;AACtB,gBAAM,IAAI,QAAQ,CAAC,MAAM,WAAW,GAAG,OAAQ,QAAQ,EAAE,CAAC;AAC1D;AAAA,QACF;AACA,eAAO;AAAA,MACT;AAAA,IACF;AACA,WAAO;AAAA,EACT;AACF;","names":["path","sharp","fs"]}
package/dist/index.d.cts CHANGED
@@ -1,37 +1,82 @@
1
+ import { LanguageModel } from 'ai';
2
+
3
+ type Provider = 'openai' | 'anthropic' | 'google';
1
4
  interface SolverOptions {
2
- /** OpenAI model to use (default: "o3") */
5
+ /** AI provider to use when constructing the model from an API key (default: "openai") */
6
+ provider?: Provider;
7
+ /** Model ID passed to the provider (default: "gpt-4o") */
3
8
  model?: string;
9
+ }
10
+ interface SolveOptions {
4
11
  /** Number of voting attempts (default: 5) */
5
12
  numAttempts?: number;
6
- /** Expected captcha length — results of other lengths are discarded (default: undefined = no filter) */
13
+ /** Expected captcha length — results of other lengths are discarded */
7
14
  expectedLength?: number;
8
15
  /** Max retries per attempt on API failure (default: 2) */
9
16
  maxRetries?: number;
10
17
  /** Whether to log attempt details (default: true) */
11
18
  verbose?: boolean;
12
19
  }
13
- /**
14
- * Solve a captcha image using OpenAI vision + preprocessing + majority voting.
15
- */
16
- declare function solveCaptchaImage(imagePath: string, options?: SolverOptions): Promise<string>;
20
+ declare class Solver {
21
+ private _model;
22
+ private _pendingModel;
23
+ /**
24
+ * Create a captcha solver.
25
+ *
26
+ * @example
27
+ * // Simple — defaults to OpenAI gpt-4o
28
+ * const solver = new Solver('sk-...');
29
+ *
30
+ * @example
31
+ * // Specify provider and model
32
+ * const solver = new Solver('sk-ant-...', { provider: 'anthropic', model: 'claude-sonnet-4-20250514' });
33
+ *
34
+ * @example
35
+ * // Pass an AI SDK model directly
36
+ * import { createOpenAI } from '@ai-sdk/openai';
37
+ * const openai = createOpenAI({ apiKey: 'sk-...' });
38
+ * const solver = new Solver(openai('gpt-4o'));
39
+ */
40
+ constructor(keyOrModel: string | LanguageModel, options?: SolverOptions);
41
+ private getModel;
42
+ /**
43
+ * Solve a captcha image.
44
+ *
45
+ * @param input - File path (string) or raw image Buffer
46
+ * @param options - Solve options (attempts, expected length, etc.)
47
+ * @returns The captcha text
48
+ */
49
+ solve(input: string | Buffer, options?: SolveOptions): Promise<string>;
50
+ /**
51
+ * Make a single API call to read the captcha.
52
+ * Retries up to `maxRetries` times on failure.
53
+ */
54
+ private singleAttempt;
55
+ }
17
56
 
18
57
  /**
19
- * Preprocess a captcha image using PIL (via Python subprocess).
58
+ * Preprocess a captcha image using sharp (libvips).
20
59
  *
21
60
  * Pipeline:
22
- * 1. Grayscale
23
- * 2. Gaussian blur (radius=1.2) to smooth dither pattern
24
- * 3. Upscale 4x with Lanczos
25
- * 4. Contrast 3x + Sharpness 2x (PIL enhancement preserves soft gradients)
61
+ * 1. Gaussian blur in color space (smooths dither pattern)
62
+ * 2. Grayscale conversion
63
+ * 3. Upscale with Lanczos
64
+ * 4. Contrast boost ( around image mean) + sharpen
26
65
  * 5. Crop decorative borders
27
66
  * 6. Add white padding
28
67
  *
68
+ * Accepts a file path or a raw image Buffer.
29
69
  * Returns a base64-encoded PNG string.
30
70
  */
31
- declare function preprocessCaptcha(imagePath: string): Promise<string>;
71
+ declare function preprocessCaptcha(input: string | Buffer): Promise<string>;
72
+ /**
73
+ * Same preprocessing pipeline as `preprocessCaptcha`, but returns the
74
+ * resulting PNG as a raw Buffer (useful for AI SDK image content parts).
75
+ */
76
+ declare function preprocessCaptchaToBuffer(input: string | Buffer): Promise<Buffer>;
32
77
  /**
33
78
  * Read an image file and return its base64-encoded content.
34
79
  */
35
80
  declare function imageToBase64(imagePath: string): string;
36
81
 
37
- export { imageToBase64, preprocessCaptcha, solveCaptchaImage };
82
+ export { type Provider, type SolveOptions, Solver, type SolverOptions, imageToBase64, preprocessCaptcha, preprocessCaptchaToBuffer };
package/dist/index.d.ts CHANGED
@@ -1,37 +1,82 @@
1
+ import { LanguageModel } from 'ai';
2
+
3
+ type Provider = 'openai' | 'anthropic' | 'google';
1
4
  interface SolverOptions {
2
- /** OpenAI model to use (default: "o3") */
5
+ /** AI provider to use when constructing the model from an API key (default: "openai") */
6
+ provider?: Provider;
7
+ /** Model ID passed to the provider (default: "gpt-4o") */
3
8
  model?: string;
9
+ }
10
+ interface SolveOptions {
4
11
  /** Number of voting attempts (default: 5) */
5
12
  numAttempts?: number;
6
- /** Expected captcha length — results of other lengths are discarded (default: undefined = no filter) */
13
+ /** Expected captcha length — results of other lengths are discarded */
7
14
  expectedLength?: number;
8
15
  /** Max retries per attempt on API failure (default: 2) */
9
16
  maxRetries?: number;
10
17
  /** Whether to log attempt details (default: true) */
11
18
  verbose?: boolean;
12
19
  }
13
- /**
14
- * Solve a captcha image using OpenAI vision + preprocessing + majority voting.
15
- */
16
- declare function solveCaptchaImage(imagePath: string, options?: SolverOptions): Promise<string>;
20
+ declare class Solver {
21
+ private _model;
22
+ private _pendingModel;
23
+ /**
24
+ * Create a captcha solver.
25
+ *
26
+ * @example
27
+ * // Simple — defaults to OpenAI gpt-4o
28
+ * const solver = new Solver('sk-...');
29
+ *
30
+ * @example
31
+ * // Specify provider and model
32
+ * const solver = new Solver('sk-ant-...', { provider: 'anthropic', model: 'claude-sonnet-4-20250514' });
33
+ *
34
+ * @example
35
+ * // Pass an AI SDK model directly
36
+ * import { createOpenAI } from '@ai-sdk/openai';
37
+ * const openai = createOpenAI({ apiKey: 'sk-...' });
38
+ * const solver = new Solver(openai('gpt-4o'));
39
+ */
40
+ constructor(keyOrModel: string | LanguageModel, options?: SolverOptions);
41
+ private getModel;
42
+ /**
43
+ * Solve a captcha image.
44
+ *
45
+ * @param input - File path (string) or raw image Buffer
46
+ * @param options - Solve options (attempts, expected length, etc.)
47
+ * @returns The captcha text
48
+ */
49
+ solve(input: string | Buffer, options?: SolveOptions): Promise<string>;
50
+ /**
51
+ * Make a single API call to read the captcha.
52
+ * Retries up to `maxRetries` times on failure.
53
+ */
54
+ private singleAttempt;
55
+ }
17
56
 
18
57
  /**
19
- * Preprocess a captcha image using PIL (via Python subprocess).
58
+ * Preprocess a captcha image using sharp (libvips).
20
59
  *
21
60
  * Pipeline:
22
- * 1. Grayscale
23
- * 2. Gaussian blur (radius=1.2) to smooth dither pattern
24
- * 3. Upscale 4x with Lanczos
25
- * 4. Contrast 3x + Sharpness 2x (PIL enhancement preserves soft gradients)
61
+ * 1. Gaussian blur in color space (smooths dither pattern)
62
+ * 2. Grayscale conversion
63
+ * 3. Upscale with Lanczos
64
+ * 4. Contrast boost ( around image mean) + sharpen
26
65
  * 5. Crop decorative borders
27
66
  * 6. Add white padding
28
67
  *
68
+ * Accepts a file path or a raw image Buffer.
29
69
  * Returns a base64-encoded PNG string.
30
70
  */
31
- declare function preprocessCaptcha(imagePath: string): Promise<string>;
71
+ declare function preprocessCaptcha(input: string | Buffer): Promise<string>;
72
+ /**
73
+ * Same preprocessing pipeline as `preprocessCaptcha`, but returns the
74
+ * resulting PNG as a raw Buffer (useful for AI SDK image content parts).
75
+ */
76
+ declare function preprocessCaptchaToBuffer(input: string | Buffer): Promise<Buffer>;
32
77
  /**
33
78
  * Read an image file and return its base64-encoded content.
34
79
  */
35
80
  declare function imageToBase64(imagePath: string): string;
36
81
 
37
- export { imageToBase64, preprocessCaptcha, solveCaptchaImage };
82
+ export { type Provider, type SolveOptions, Solver, type SolverOptions, imageToBase64, preprocessCaptcha, preprocessCaptchaToBuffer };
package/dist/index.js CHANGED
@@ -1,40 +1,39 @@
1
1
  // src/solver.ts
2
- import OpenAI from "openai";
2
+ import { generateText } from "ai";
3
3
 
4
4
  // src/preprocess.ts
5
5
  import fs from "fs";
6
- import { execSync } from "child_process";
7
6
  import path from "path";
8
- var PYTHON_SCRIPT = `
9
- import sys, base64, io
10
- from PIL import Image, ImageFilter, ImageEnhance, ImageOps
11
-
12
- image_path = sys.argv[1]
13
- img = Image.open(image_path)
14
- img = ImageOps.grayscale(img)
15
- img = img.filter(ImageFilter.GaussianBlur(radius=1.2))
16
- img = img.resize((img.width * 4, img.height * 4), Image.LANCZOS)
17
- img = ImageEnhance.Contrast(img).enhance(3.0)
18
- img = ImageEnhance.Sharpness(img).enhance(2.0)
19
- w, h = img.size
20
- img = img.crop((int(w * 0.10), int(h * 0.02), int(w * 0.90), int(h * 0.60)))
21
- padded = Image.new('L', (img.width + 60, img.height + 40), 255)
22
- padded.paste(img, (30, 20))
23
- padded = padded.convert('RGB')
24
- buf = io.BytesIO()
25
- padded.save(buf, format='PNG')
26
- sys.stdout.buffer.write(base64.b64encode(buf.getvalue()))
27
- `;
28
- async function preprocessCaptcha(imagePath) {
29
- const absPath = path.resolve(imagePath);
30
- const scriptPath = "/tmp/_captcha_preprocess.py";
31
- fs.writeFileSync(scriptPath, PYTHON_SCRIPT);
32
- const result = execSync(`python3 "${scriptPath}" "${absPath}"`, {
33
- maxBuffer: 10 * 1024 * 1024,
34
- // 10MB
35
- encoding: "utf-8"
36
- });
37
- return result.trim();
7
+ import sharp from "sharp";
8
+ async function preprocessCaptcha(input) {
9
+ const buf = await preprocessCaptchaToBuffer(input);
10
+ return buf.toString("base64");
11
+ }
12
+ async function preprocessCaptchaToBuffer(input) {
13
+ const source = typeof input === "string" ? path.resolve(input) : input;
14
+ const metadata = await sharp(source).metadata();
15
+ const origW = metadata.width;
16
+ const origH = metadata.height;
17
+ const smoothed = await sharp(source).blur(1.5).greyscale().toBuffer();
18
+ const upscaled = await sharp(smoothed).resize(origW * 4, origH * 4, { kernel: "lanczos3" }).toBuffer();
19
+ const stats = await sharp(upscaled).stats();
20
+ const mean = stats.channels[0].mean;
21
+ const enhanced = await sharp(upscaled).linear(3, mean * (1 - 3)).sharpen({ sigma: 1, m1: 2, m2: 1 }).toBuffer();
22
+ const scaledW = origW * 4;
23
+ const scaledH = origH * 4;
24
+ const cropLeft = Math.floor(scaledW * 0.1);
25
+ const cropTop = Math.floor(scaledH * 0.02);
26
+ const cropRight = Math.floor(scaledW * 0.9);
27
+ const cropBottom = Math.floor(scaledH * 0.6);
28
+ const cropW = cropRight - cropLeft;
29
+ const cropH = cropBottom - cropTop;
30
+ return sharp(enhanced).extract({ left: cropLeft, top: cropTop, width: cropW, height: cropH }).extend({
31
+ top: 20,
32
+ bottom: 20,
33
+ left: 30,
34
+ right: 30,
35
+ background: { r: 255, g: 255, b: 255 }
36
+ }).png().toBuffer();
38
37
  }
39
38
  function imageToBase64(imagePath) {
40
39
  const buffer = fs.readFileSync(imagePath);
@@ -44,50 +43,46 @@ function imageToBase64(imagePath) {
44
43
  // src/solver.ts
45
44
  var PROMPT = `You are an assistant helping a visually impaired person read distorted text from an image.
46
45
  The text contains uppercase letters A-Z and/or digits 0-9.
47
- A thin vertical stroke is likely the digit 1, not the letter I.
46
+ A thin vertical stroke is the digit 1. Never read it as the letter I or L.
48
47
  A round closed shape is the letter O, not the letter D.
49
48
  Output ONLY the exact characters you read, nothing else.`;
50
- async function singleAttempt(client, base64Image, model, maxRetries) {
51
- for (let retry = 0; retry <= maxRetries; retry++) {
52
- try {
53
- const isReasoningModel = model.startsWith("o");
54
- const tokenParam = isReasoningModel ? { max_completion_tokens: 2e3 } : { max_tokens: 256 };
55
- const response = await client.chat.completions.create({
56
- model,
57
- messages: [
58
- {
59
- role: "user",
60
- content: [
61
- { type: "text", text: PROMPT },
62
- {
63
- type: "image_url",
64
- image_url: {
65
- url: `data:image/png;base64,${base64Image}`
66
- }
67
- }
68
- ]
69
- }
70
- ],
71
- temperature: 1,
72
- ...tokenParam
73
- });
74
- const raw = response.choices[0]?.message?.content?.trim() ?? "";
75
- const lower = raw.toLowerCase();
76
- if (lower.includes("sorry") || lower.includes("can't help") || lower.includes("cannot help") || lower.includes("unable to") || lower.includes("i can't") || raw.length > 20) {
77
- return null;
78
- }
79
- const cleaned = raw.toUpperCase().replace(/[^A-Z0-9]/g, "");
80
- return cleaned || null;
81
- } catch (_err) {
82
- if (retry < maxRetries) {
83
- await new Promise((r) => setTimeout(r, 1e3 * (retry + 1)));
84
- continue;
85
- }
86
- return null;
49
+ var DEFAULT_MODELS = {
50
+ openai: "gpt-4o",
51
+ anthropic: "claude-sonnet-4-20250514",
52
+ google: "gemini-2.0-flash"
53
+ };
54
+ async function resolveModel(apiKey, provider, modelId) {
55
+ switch (provider) {
56
+ case "openai": {
57
+ const { createOpenAI } = await import("@ai-sdk/openai");
58
+ return createOpenAI({ apiKey })(modelId);
59
+ }
60
+ case "anthropic": {
61
+ const { createAnthropic } = await import("@ai-sdk/anthropic");
62
+ return createAnthropic({ apiKey })(modelId);
63
+ }
64
+ case "google": {
65
+ const { createGoogleGenerativeAI } = await import("@ai-sdk/google");
66
+ return createGoogleGenerativeAI({ apiKey })(modelId);
87
67
  }
68
+ default:
69
+ throw new Error(
70
+ `Unknown provider "${provider}". Install the matching @ai-sdk/* package and pass the model directly.`
71
+ );
88
72
  }
89
- return null;
90
73
  }
74
+ var CONFUSION_GROUPS = {
75
+ "1": "1",
76
+ I: "1",
77
+ L: "1",
78
+ O: "O",
79
+ D: "O",
80
+ "0": "O",
81
+ S: "S",
82
+ "5": "S",
83
+ Z: "Z",
84
+ "2": "Z"
85
+ };
91
86
  function majorityVote(attempts, expectedLength) {
92
87
  let filtered = expectedLength ? attempts.filter((a) => a.length === expectedLength) : attempts;
93
88
  if (filtered.length === 0) {
@@ -115,45 +110,131 @@ function majorityVote(attempts, expectedLength) {
115
110
  const ch = a[pos];
116
111
  charCounts.set(ch, (charCounts.get(ch) ?? 0) + 1);
117
112
  }
118
- let bestChar = "";
119
- let bestCharCount = 0;
113
+ const groupCounts = /* @__PURE__ */ new Map();
120
114
  for (const [ch, count] of charCounts) {
121
- if (count > bestCharCount) {
122
- bestChar = ch;
123
- bestCharCount = count;
115
+ const canonical = CONFUSION_GROUPS[ch] ?? ch;
116
+ groupCounts.set(canonical, (groupCounts.get(canonical) ?? 0) + count);
117
+ }
118
+ let bestGroup = "";
119
+ let bestGroupCount = 0;
120
+ for (const [canonical, count] of groupCounts) {
121
+ if (count > bestGroupCount) {
122
+ bestGroup = canonical;
123
+ bestGroupCount = count;
124
124
  }
125
125
  }
126
- result.push(bestChar);
126
+ result.push(bestGroup);
127
127
  }
128
128
  return result.join("");
129
129
  }
130
- async function solveCaptchaImage(imagePath, options = {}) {
131
- const { model = "o3", numAttempts = 5, expectedLength, maxRetries = 2, verbose = true } = options;
132
- const client = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
133
- const base64Processed = await preprocessCaptcha(imagePath);
134
- const attempts = [];
135
- const maxTotalCalls = numAttempts + 4;
136
- let callCount = 0;
137
- while (attempts.length < numAttempts && callCount < maxTotalCalls) {
138
- callCount++;
139
- const result = await singleAttempt(client, base64Processed, model, maxRetries);
140
- if (result) {
141
- attempts.push(result);
142
- if (verbose) console.log(` Attempt ${attempts.length}: ${result}`);
130
+ var Solver = class {
131
+ _model = null;
132
+ _pendingModel = null;
133
+ /**
134
+ * Create a captcha solver.
135
+ *
136
+ * @example
137
+ * // Simple defaults to OpenAI gpt-4o
138
+ * const solver = new Solver('sk-...');
139
+ *
140
+ * @example
141
+ * // Specify provider and model
142
+ * const solver = new Solver('sk-ant-...', { provider: 'anthropic', model: 'claude-sonnet-4-20250514' });
143
+ *
144
+ * @example
145
+ * // Pass an AI SDK model directly
146
+ * import { createOpenAI } from '@ai-sdk/openai';
147
+ * const openai = createOpenAI({ apiKey: 'sk-...' });
148
+ * const solver = new Solver(openai('gpt-4o'));
149
+ */
150
+ constructor(keyOrModel, options) {
151
+ if (typeof keyOrModel === "string") {
152
+ const provider = options?.provider ?? "openai";
153
+ const modelId = options?.model ?? DEFAULT_MODELS[provider];
154
+ this._pendingModel = resolveModel(keyOrModel, provider, modelId);
143
155
  } else {
144
- if (verbose) console.log(` Call ${callCount}: (refused/failed, retrying...)`);
156
+ this._model = keyOrModel;
145
157
  }
146
158
  }
147
- if (attempts.length === 0) {
148
- if (verbose) console.log(" All attempts failed!");
149
- return "";
159
+ async getModel() {
160
+ if (this._model) return this._model;
161
+ this._model = await this._pendingModel;
162
+ this._pendingModel = null;
163
+ return this._model;
150
164
  }
151
- const answer = majorityVote(attempts, expectedLength);
152
- return answer;
153
- }
165
+ /**
166
+ * Solve a captcha image.
167
+ *
168
+ * @param input - File path (string) or raw image Buffer
169
+ * @param options - Solve options (attempts, expected length, etc.)
170
+ * @returns The captcha text
171
+ */
172
+ async solve(input, options = {}) {
173
+ const { numAttempts = 5, expectedLength, maxRetries = 2, verbose = true } = options;
174
+ const model = await this.getModel();
175
+ const imageBuffer = await preprocessCaptchaToBuffer(input);
176
+ const attempts = [];
177
+ const maxTotalCalls = numAttempts + 4;
178
+ let callCount = 0;
179
+ while (attempts.length < numAttempts && callCount < maxTotalCalls) {
180
+ callCount++;
181
+ const result = await this.singleAttempt(model, imageBuffer, maxRetries);
182
+ if (result) {
183
+ attempts.push(result);
184
+ if (verbose) console.log(` Attempt ${attempts.length}: ${result}`);
185
+ } else {
186
+ if (verbose) console.log(` Call ${callCount}: (refused/failed, retrying...)`);
187
+ }
188
+ }
189
+ if (attempts.length === 0) {
190
+ if (verbose) console.log(" All attempts failed!");
191
+ return "";
192
+ }
193
+ return majorityVote(attempts, expectedLength);
194
+ }
195
+ /**
196
+ * Make a single API call to read the captcha.
197
+ * Retries up to `maxRetries` times on failure.
198
+ */
199
+ async singleAttempt(model, imageBuffer, maxRetries) {
200
+ for (let retry = 0; retry <= maxRetries; retry++) {
201
+ try {
202
+ const { text } = await generateText({
203
+ model,
204
+ messages: [
205
+ {
206
+ role: "user",
207
+ content: [
208
+ { type: "text", text: PROMPT },
209
+ { type: "image", image: imageBuffer }
210
+ ]
211
+ }
212
+ ],
213
+ temperature: 1,
214
+ maxOutputTokens: 256
215
+ });
216
+ const raw = text.trim();
217
+ const lower = raw.toLowerCase();
218
+ if (lower.includes("sorry") || lower.includes("can't help") || lower.includes("cannot help") || lower.includes("unable to") || lower.includes("i can't") || raw.length > 20) {
219
+ return null;
220
+ }
221
+ const cleaned = raw.toUpperCase().replace(/[^A-Z0-9]/g, "");
222
+ return cleaned || null;
223
+ } catch (_err) {
224
+ if (retry < maxRetries) {
225
+ await new Promise((r) => setTimeout(r, 1e3 * (retry + 1)));
226
+ continue;
227
+ }
228
+ return null;
229
+ }
230
+ }
231
+ return null;
232
+ }
233
+ };
154
234
  export {
235
+ Solver,
155
236
  imageToBase64,
156
237
  preprocessCaptcha,
157
- solveCaptchaImage
238
+ preprocessCaptchaToBuffer
158
239
  };
159
240
  //# sourceMappingURL=index.js.map
package/dist/index.js.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"sources":["../src/solver.ts","../src/preprocess.ts"],"sourcesContent":["import OpenAI from 'openai';\nimport { preprocessCaptcha } from './preprocess.js';\n\nconst PROMPT = `You are an assistant helping a visually impaired person read distorted text from an image.\nThe text contains uppercase letters A-Z and/or digits 0-9.\nA thin vertical stroke is likely the digit 1, not the letter I.\nA round closed shape is the letter O, not the letter D.\nOutput ONLY the exact characters you read, nothing else.`;\n\ninterface SolverOptions {\n /** OpenAI model to use (default: \"o3\") */\n model?: string;\n /** Number of voting attempts (default: 5) */\n numAttempts?: number;\n /** Expected captcha length — results of other lengths are discarded (default: undefined = no filter) */\n expectedLength?: number;\n /** Max retries per attempt on API failure (default: 2) */\n maxRetries?: number;\n /** Whether to log attempt details (default: true) */\n verbose?: boolean;\n}\n\n/**\n * Make a single API call to read the captcha.\n * Retries up to `maxRetries` times on failure.\n */\nasync function singleAttempt(\n client: OpenAI,\n base64Image: string,\n model: string,\n maxRetries: number\n): Promise<string | null> {\n for (let retry = 0; retry <= maxRetries; retry++) {\n try {\n // Reasoning models (o3, o4-mini) use max_completion_tokens;\n // Standard models (gpt-4o, gpt-4.1, gpt-5.4-mini) use max_tokens.\n const isReasoningModel = model.startsWith('o');\n const tokenParam = isReasoningModel ? { max_completion_tokens: 2000 } : { max_tokens: 256 };\n\n const response = await client.chat.completions.create({\n model,\n messages: [\n {\n role: 'user',\n content: [\n { type: 'text', text: PROMPT },\n {\n type: 'image_url',\n image_url: {\n url: `data:image/png;base64,${base64Image}`,\n },\n },\n ],\n },\n ],\n temperature: 1,\n ...tokenParam,\n });\n\n const raw = response.choices[0]?.message?.content?.trim() ?? '';\n\n // Detect refusals\n const lower = raw.toLowerCase();\n if (\n lower.includes('sorry') ||\n lower.includes(\"can't help\") ||\n lower.includes('cannot help') ||\n lower.includes('unable to') ||\n lower.includes(\"i can't\") ||\n raw.length > 20\n ) {\n return null; // Model refused — don't count as an attempt\n }\n\n // Clean: keep only uppercase letters and digits\n const cleaned = raw.toUpperCase().replace(/[^A-Z0-9]/g, '');\n return cleaned || null;\n } catch (_err) {\n if (retry < maxRetries) {\n // Wait briefly before retry\n await new Promise((r) => setTimeout(r, 1000 * (retry + 1)));\n continue;\n }\n return null;\n }\n }\n return null;\n}\n\n/**\n * Character-level majority vote across multiple attempts.\n */\nfunction majorityVote(attempts: string[], expectedLength?: number): string {\n // Filter to expected length if specified\n let filtered = expectedLength ? attempts.filter((a) => a.length === expectedLength) : attempts;\n\n // If length filter removed everything, fall back to most common length\n if (filtered.length === 0) {\n filtered = attempts;\n }\n\n if (filtered.length === 0) return '';\n\n // Find most common length\n const lenCounts = new Map<number, number>();\n for (const a of filtered) {\n lenCounts.set(a.length, (lenCounts.get(a.length) ?? 0) + 1);\n }\n let bestLen = 0;\n let bestCount = 0;\n for (const [len, count] of lenCounts) {\n if (count > bestCount) {\n bestLen = len;\n bestCount = count;\n }\n }\n\n const sameLenAttempts = filtered.filter((a) => a.length === bestLen);\n if (sameLenAttempts.length === 0) return filtered[0];\n\n // Vote per character position\n const result: string[] = [];\n for (let pos = 0; pos < bestLen; pos++) {\n const charCounts = new Map<string, number>();\n for (const a of sameLenAttempts) {\n const ch = a[pos];\n charCounts.set(ch, (charCounts.get(ch) ?? 0) + 1);\n }\n let bestChar = '';\n let bestCharCount = 0;\n for (const [ch, count] of charCounts) {\n if (count > bestCharCount) {\n bestChar = ch;\n bestCharCount = count;\n }\n }\n result.push(bestChar);\n }\n\n return result.join('');\n}\n\n/**\n * Solve a captcha image using OpenAI vision + preprocessing + majority voting.\n */\nexport async function solveCaptchaImage(\n imagePath: string,\n options: SolverOptions = {}\n): Promise<string> {\n const { model = 'o3', numAttempts = 5, expectedLength, maxRetries = 2, verbose = true } = options;\n\n const client = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });\n\n // Preprocess the image\n const base64Processed = await preprocessCaptcha(imagePath);\n\n // Run attempts — retry refusals/failures to guarantee numAttempts valid results\n const attempts: string[] = [];\n const maxTotalCalls = numAttempts + 4; // allow up to 4 extra calls for refusals\n let callCount = 0;\n while (attempts.length < numAttempts && callCount < maxTotalCalls) {\n callCount++;\n const result = await singleAttempt(client, base64Processed, model, maxRetries);\n if (result) {\n attempts.push(result);\n if (verbose) console.log(` Attempt ${attempts.length}: ${result}`);\n } else {\n if (verbose) console.log(` Call ${callCount}: (refused/failed, retrying...)`);\n }\n }\n\n if (attempts.length === 0) {\n if (verbose) console.log(' All attempts failed!');\n return '';\n }\n\n // Majority vote\n const answer = majorityVote(attempts, expectedLength);\n return answer;\n}\n","import fs from 'fs';\nimport { execSync } from 'child_process';\nimport path from 'path';\n\n// Inline Python script for image preprocessing\n// Uses PIL which produces optimal results for captcha OCR\nconst PYTHON_SCRIPT = `\nimport sys, base64, io\nfrom PIL import Image, ImageFilter, ImageEnhance, ImageOps\n\nimage_path = sys.argv[1]\nimg = Image.open(image_path)\nimg = ImageOps.grayscale(img)\nimg = img.filter(ImageFilter.GaussianBlur(radius=1.2))\nimg = img.resize((img.width * 4, img.height * 4), Image.LANCZOS)\nimg = ImageEnhance.Contrast(img).enhance(3.0)\nimg = ImageEnhance.Sharpness(img).enhance(2.0)\nw, h = img.size\nimg = img.crop((int(w * 0.10), int(h * 0.02), int(w * 0.90), int(h * 0.60)))\npadded = Image.new('L', (img.width + 60, img.height + 40), 255)\npadded.paste(img, (30, 20))\npadded = padded.convert('RGB')\nbuf = io.BytesIO()\npadded.save(buf, format='PNG')\nsys.stdout.buffer.write(base64.b64encode(buf.getvalue()))\n`;\n\n/**\n * Preprocess a captcha image using PIL (via Python subprocess).\n *\n * Pipeline:\n * 1. Grayscale\n * 2. Gaussian blur (radius=1.2) to smooth dither pattern\n * 3. Upscale 4x with Lanczos\n * 4. Contrast 3x + Sharpness 2x (PIL enhancement — preserves soft gradients)\n * 5. Crop decorative borders\n * 6. Add white padding\n *\n * Returns a base64-encoded PNG string.\n */\nexport async function preprocessCaptcha(imagePath: string): Promise<string> {\n const absPath = path.resolve(imagePath);\n\n // Write the Python script to a temp file\n const scriptPath = '/tmp/_captcha_preprocess.py';\n fs.writeFileSync(scriptPath, PYTHON_SCRIPT);\n\n // Execute Python and capture base64 output\n const result = execSync(`python3 \"${scriptPath}\" \"${absPath}\"`, {\n maxBuffer: 10 * 1024 * 1024, // 10MB\n encoding: 'utf-8',\n });\n\n return result.trim();\n}\n\n/**\n * Read an image file and return its base64-encoded content.\n */\nexport function imageToBase64(imagePath: string): string {\n const buffer = fs.readFileSync(imagePath);\n return buffer.toString('base64');\n}\n"],"mappings":";AAAA,OAAO,YAAY;;;ACAnB,OAAO,QAAQ;AACf,SAAS,gBAAgB;AACzB,OAAO,UAAU;AAIjB,IAAM,gBAAgB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAkCtB,eAAsB,kBAAkB,WAAoC;AAC1E,QAAM,UAAU,KAAK,QAAQ,SAAS;AAGtC,QAAM,aAAa;AACnB,KAAG,cAAc,YAAY,aAAa;AAG1C,QAAM,SAAS,SAAS,YAAY,UAAU,MAAM,OAAO,KAAK;AAAA,IAC9D,WAAW,KAAK,OAAO;AAAA;AAAA,IACvB,UAAU;AAAA,EACZ,CAAC;AAED,SAAO,OAAO,KAAK;AACrB;AAKO,SAAS,cAAc,WAA2B;AACvD,QAAM,SAAS,GAAG,aAAa,SAAS;AACxC,SAAO,OAAO,SAAS,QAAQ;AACjC;;;AD3DA,IAAM,SAAS;AAAA;AAAA;AAAA;AAAA;AAuBf,eAAe,cACb,QACA,aACA,OACA,YACwB;AACxB,WAAS,QAAQ,GAAG,SAAS,YAAY,SAAS;AAChD,QAAI;AAGF,YAAM,mBAAmB,MAAM,WAAW,GAAG;AAC7C,YAAM,aAAa,mBAAmB,EAAE,uBAAuB,IAAK,IAAI,EAAE,YAAY,IAAI;AAE1F,YAAM,WAAW,MAAM,OAAO,KAAK,YAAY,OAAO;AAAA,QACpD;AAAA,QACA,UAAU;AAAA,UACR;AAAA,YACE,MAAM;AAAA,YACN,SAAS;AAAA,cACP,EAAE,MAAM,QAAQ,MAAM,OAAO;AAAA,cAC7B;AAAA,gBACE,MAAM;AAAA,gBACN,WAAW;AAAA,kBACT,KAAK,yBAAyB,WAAW;AAAA,gBAC3C;AAAA,cACF;AAAA,YACF;AAAA,UACF;AAAA,QACF;AAAA,QACA,aAAa;AAAA,QACb,GAAG;AAAA,MACL,CAAC;AAED,YAAM,MAAM,SAAS,QAAQ,CAAC,GAAG,SAAS,SAAS,KAAK,KAAK;AAG7D,YAAM,QAAQ,IAAI,YAAY;AAC9B,UACE,MAAM,SAAS,OAAO,KACtB,MAAM,SAAS,YAAY,KAC3B,MAAM,SAAS,aAAa,KAC5B,MAAM,SAAS,WAAW,KAC1B,MAAM,SAAS,SAAS,KACxB,IAAI,SAAS,IACb;AACA,eAAO;AAAA,MACT;AAGA,YAAM,UAAU,IAAI,YAAY,EAAE,QAAQ,cAAc,EAAE;AAC1D,aAAO,WAAW;AAAA,IACpB,SAAS,MAAM;AACb,UAAI,QAAQ,YAAY;AAEtB,cAAM,IAAI,QAAQ,CAAC,MAAM,WAAW,GAAG,OAAQ,QAAQ,EAAE,CAAC;AAC1D;AAAA,MACF;AACA,aAAO;AAAA,IACT;AAAA,EACF;AACA,SAAO;AACT;AAKA,SAAS,aAAa,UAAoB,gBAAiC;AAEzE,MAAI,WAAW,iBAAiB,SAAS,OAAO,CAAC,MAAM,EAAE,WAAW,cAAc,IAAI;AAGtF,MAAI,SAAS,WAAW,GAAG;AACzB,eAAW;AAAA,EACb;AAEA,MAAI,SAAS,WAAW,EAAG,QAAO;AAGlC,QAAM,YAAY,oBAAI,IAAoB;AAC1C,aAAW,KAAK,UAAU;AACxB,cAAU,IAAI,EAAE,SAAS,UAAU,IAAI,EAAE,MAAM,KAAK,KAAK,CAAC;AAAA,EAC5D;AACA,MAAI,UAAU;AACd,MAAI,YAAY;AAChB,aAAW,CAAC,KAAK,KAAK,KAAK,WAAW;AACpC,QAAI,QAAQ,WAAW;AACrB,gBAAU;AACV,kBAAY;AAAA,IACd;AAAA,EACF;AAEA,QAAM,kBAAkB,SAAS,OAAO,CAAC,MAAM,EAAE,WAAW,OAAO;AACnE,MAAI,gBAAgB,WAAW,EAAG,QAAO,SAAS,CAAC;AAGnD,QAAM,SAAmB,CAAC;AAC1B,WAAS,MAAM,GAAG,MAAM,SAAS,OAAO;AACtC,UAAM,aAAa,oBAAI,IAAoB;AAC3C,eAAW,KAAK,iBAAiB;AAC/B,YAAM,KAAK,EAAE,GAAG;AAChB,iBAAW,IAAI,KAAK,WAAW,IAAI,EAAE,KAAK,KAAK,CAAC;AAAA,IAClD;AACA,QAAI,WAAW;AACf,QAAI,gBAAgB;AACpB,eAAW,CAAC,IAAI,KAAK,KAAK,YAAY;AACpC,UAAI,QAAQ,eAAe;AACzB,mBAAW;AACX,wBAAgB;AAAA,MAClB;AAAA,IACF;AACA,WAAO,KAAK,QAAQ;AAAA,EACtB;AAEA,SAAO,OAAO,KAAK,EAAE;AACvB;AAKA,eAAsB,kBACpB,WACA,UAAyB,CAAC,GACT;AACjB,QAAM,EAAE,QAAQ,MAAM,cAAc,GAAG,gBAAgB,aAAa,GAAG,UAAU,KAAK,IAAI;AAE1F,QAAM,SAAS,IAAI,OAAO,EAAE,QAAQ,QAAQ,IAAI,eAAe,CAAC;AAGhE,QAAM,kBAAkB,MAAM,kBAAkB,SAAS;AAGzD,QAAM,WAAqB,CAAC;AAC5B,QAAM,gBAAgB,cAAc;AACpC,MAAI,YAAY;AAChB,SAAO,SAAS,SAAS,eAAe,YAAY,eAAe;AACjE;AACA,UAAM,SAAS,MAAM,cAAc,QAAQ,iBAAiB,OAAO,UAAU;AAC7E,QAAI,QAAQ;AACV,eAAS,KAAK,MAAM;AACpB,UAAI,QAAS,SAAQ,IAAI,aAAa,SAAS,MAAM,KAAK,MAAM,EAAE;AAAA,IACpE,OAAO;AACL,UAAI,QAAS,SAAQ,IAAI,UAAU,SAAS,iCAAiC;AAAA,IAC/E;AAAA,EACF;AAEA,MAAI,SAAS,WAAW,GAAG;AACzB,QAAI,QAAS,SAAQ,IAAI,wBAAwB;AACjD,WAAO;AAAA,EACT;AAGA,QAAM,SAAS,aAAa,UAAU,cAAc;AACpD,SAAO;AACT;","names":[]}
1
+ {"version":3,"sources":["../src/solver.ts","../src/preprocess.ts"],"sourcesContent":["import type { LanguageModel } from 'ai';\nimport { generateText } from 'ai';\nimport { preprocessCaptchaToBuffer } from './preprocess.js';\n\nconst PROMPT = `You are an assistant helping a visually impaired person read distorted text from an image.\nThe text contains uppercase letters A-Z and/or digits 0-9.\nA thin vertical stroke is the digit 1. Never read it as the letter I or L.\nA round closed shape is the letter O, not the letter D.\nOutput ONLY the exact characters you read, nothing else.`;\n\n// ── Types ────────────────────────────────────────────────────────────\n\nexport type Provider = 'openai' | 'anthropic' | 'google';\n\nexport interface SolverOptions {\n /** AI provider to use when constructing the model from an API key (default: \"openai\") */\n provider?: Provider;\n /** Model ID passed to the provider (default: \"gpt-4o\") */\n model?: string;\n}\n\nexport interface SolveOptions {\n /** Number of voting attempts (default: 5) */\n numAttempts?: number;\n /** Expected captcha length — results of other lengths are discarded */\n expectedLength?: number;\n /** Max retries per attempt on API failure (default: 2) */\n maxRetries?: number;\n /** Whether to log attempt details (default: true) */\n verbose?: boolean;\n}\n\n// ── Provider resolution ──────────────────────────────────────────────\n\nconst DEFAULT_MODELS: Record<Provider, string> = {\n openai: 'gpt-4o',\n anthropic: 'claude-sonnet-4-20250514',\n google: 'gemini-2.0-flash',\n};\n\nasync function resolveModel(\n apiKey: string,\n provider: Provider,\n modelId: string\n): Promise<LanguageModel> {\n switch (provider) {\n case 'openai': {\n const { createOpenAI } = await import('@ai-sdk/openai');\n return createOpenAI({ apiKey })(modelId);\n }\n case 'anthropic': {\n // @ts-expect-error — optional peer dependency\n const { createAnthropic } = await import('@ai-sdk/anthropic');\n return createAnthropic({ apiKey })(modelId);\n }\n case 'google': {\n // @ts-expect-error — optional peer dependency\n const { createGoogleGenerativeAI } = await import('@ai-sdk/google');\n return createGoogleGenerativeAI({ apiKey })(modelId);\n }\n default:\n throw new Error(\n `Unknown provider \"${provider}\". Install the matching @ai-sdk/* package and pass the model directly.`\n );\n }\n}\n\n// ── Confusion groups ─────────────────────────────────────────────────\n\n/**\n * Characters the model commonly misreads as each other.\n * Each group maps to its canonical (most likely correct) character.\n */\nconst CONFUSION_GROUPS: Record<string, string> = {\n '1': '1',\n I: '1',\n L: '1',\n O: 'O',\n D: 'O',\n '0': 'O',\n S: 'S',\n '5': 'S',\n Z: 'Z',\n '2': 'Z',\n};\n\n// ── Majority voting ──────────────────────────────────────────────────\n\n/**\n * Character-level majority vote across multiple attempts.\n * Uses confusion-aware voting: characters that the model commonly\n * confuses (e.g. 1/I/L, O/D/0) are grouped together during counting.\n */\nfunction majorityVote(attempts: string[], expectedLength?: number): string {\n let filtered = expectedLength ? attempts.filter((a) => a.length === expectedLength) : attempts;\n\n if (filtered.length === 0) {\n filtered = attempts;\n }\n if (filtered.length === 0) return '';\n\n // Find most common length\n const lenCounts = new Map<number, number>();\n for (const a of filtered) {\n lenCounts.set(a.length, (lenCounts.get(a.length) ?? 0) + 1);\n }\n let bestLen = 0;\n let bestCount = 0;\n for (const [len, count] of lenCounts) {\n if (count > bestCount) {\n bestLen = len;\n bestCount = count;\n }\n }\n\n const sameLenAttempts = filtered.filter((a) => a.length === bestLen);\n if (sameLenAttempts.length === 0) return filtered[0];\n\n // Vote per character position with confusion-aware grouping\n const result: string[] = [];\n for (let pos = 0; pos < bestLen; pos++) {\n const charCounts = new Map<string, number>();\n for (const a of sameLenAttempts) {\n const ch = a[pos];\n charCounts.set(ch, (charCounts.get(ch) ?? 0) + 1);\n }\n\n const groupCounts = new Map<string, number>();\n for (const [ch, count] of charCounts) {\n const canonical = CONFUSION_GROUPS[ch] ?? ch;\n groupCounts.set(canonical, (groupCounts.get(canonical) ?? 0) + count);\n }\n\n let bestGroup = '';\n let bestGroupCount = 0;\n for (const [canonical, count] of groupCounts) {\n if (count > bestGroupCount) {\n bestGroup = canonical;\n bestGroupCount = count;\n }\n }\n\n result.push(bestGroup);\n }\n\n return result.join('');\n}\n\n// ── Solver class ─────────────────────────────────────────────────────\n\nexport class Solver {\n private _model: LanguageModel | null = null;\n private _pendingModel: Promise<LanguageModel> | null = null;\n\n /**\n * Create a captcha solver.\n *\n * @example\n * // Simple — defaults to OpenAI gpt-4o\n * const solver = new Solver('sk-...');\n *\n * @example\n * // Specify provider and model\n * const solver = new Solver('sk-ant-...', { provider: 'anthropic', model: 'claude-sonnet-4-20250514' });\n *\n * @example\n * // Pass an AI SDK model directly\n * import { createOpenAI } from '@ai-sdk/openai';\n * const openai = createOpenAI({ apiKey: 'sk-...' });\n * const solver = new Solver(openai('gpt-4o'));\n */\n constructor(keyOrModel: string | LanguageModel, options?: SolverOptions) {\n if (typeof keyOrModel === 'string') {\n const provider = options?.provider ?? 'openai';\n const modelId = options?.model ?? DEFAULT_MODELS[provider];\n // Lazily resolve the model on first use\n this._pendingModel = resolveModel(keyOrModel, provider, modelId);\n } else {\n this._model = keyOrModel;\n }\n }\n\n private async getModel(): Promise<LanguageModel> {\n if (this._model) return this._model;\n this._model = await this._pendingModel!;\n this._pendingModel = null;\n return this._model;\n }\n\n /**\n * Solve a captcha image.\n *\n * @param input - File path (string) or raw image Buffer\n * @param options - Solve options (attempts, expected length, etc.)\n * @returns The captcha text\n */\n async solve(input: string | Buffer, options: SolveOptions = {}): Promise<string> {\n const { numAttempts = 5, expectedLength, maxRetries = 2, verbose = true } = options;\n\n const model = await this.getModel();\n const imageBuffer = await preprocessCaptchaToBuffer(input);\n\n // Run attempts — retry refusals/failures to guarantee numAttempts valid results\n const attempts: string[] = [];\n const maxTotalCalls = numAttempts + 4;\n let callCount = 0;\n\n while (attempts.length < numAttempts && callCount < maxTotalCalls) {\n callCount++;\n const result = await this.singleAttempt(model, imageBuffer, maxRetries);\n if (result) {\n attempts.push(result);\n if (verbose) console.log(` Attempt ${attempts.length}: ${result}`);\n } else {\n if (verbose) console.log(` Call ${callCount}: (refused/failed, retrying...)`);\n }\n }\n\n if (attempts.length === 0) {\n if (verbose) console.log(' All attempts failed!');\n return '';\n }\n\n return majorityVote(attempts, expectedLength);\n }\n\n /**\n * Make a single API call to read the captcha.\n * Retries up to `maxRetries` times on failure.\n */\n private async singleAttempt(\n model: LanguageModel,\n imageBuffer: Buffer,\n maxRetries: number\n ): Promise<string | null> {\n for (let retry = 0; retry <= maxRetries; retry++) {\n try {\n const { text } = await generateText({\n model,\n messages: [\n {\n role: 'user',\n content: [\n { type: 'text', text: PROMPT },\n { type: 'image', image: imageBuffer },\n ],\n },\n ],\n temperature: 1,\n maxOutputTokens: 256,\n });\n\n const raw = text.trim();\n\n // Detect refusals\n const lower = raw.toLowerCase();\n if (\n lower.includes('sorry') ||\n lower.includes(\"can't help\") ||\n lower.includes('cannot help') ||\n lower.includes('unable to') ||\n lower.includes(\"i can't\") ||\n raw.length > 20\n ) {\n return null;\n }\n\n // Clean: keep only uppercase letters and digits\n const cleaned = raw.toUpperCase().replace(/[^A-Z0-9]/g, '');\n return cleaned || null;\n } catch (_err) {\n if (retry < maxRetries) {\n await new Promise((r) => setTimeout(r, 1000 * (retry + 1)));\n continue;\n }\n return null;\n }\n }\n return null;\n }\n}\n","import fs from 'fs';\nimport path from 'path';\nimport sharp from 'sharp';\n\n/**\n * Preprocess a captcha image using sharp (libvips).\n *\n * Pipeline:\n * 1. Gaussian blur in color space (smooths dither pattern)\n * 2. Grayscale conversion\n * 3. Upscale 4× with Lanczos\n * 4. Contrast boost (3× around image mean) + sharpen\n * 5. Crop decorative borders\n * 6. Add white padding\n *\n * Accepts a file path or a raw image Buffer.\n * Returns a base64-encoded PNG string.\n */\nexport async function preprocessCaptcha(input: string | Buffer): Promise<string> {\n const buf = await preprocessCaptchaToBuffer(input);\n return buf.toString('base64');\n}\n\n/**\n * Same preprocessing pipeline as `preprocessCaptcha`, but returns the\n * resulting PNG as a raw Buffer (useful for AI SDK image content parts).\n */\nexport async function preprocessCaptchaToBuffer(input: string | Buffer): Promise<Buffer> {\n const source = typeof input === 'string' ? path.resolve(input) : input;\n\n // Read original dimensions for crop/resize calculations\n const metadata = await sharp(source).metadata();\n const origW = metadata.width!;\n const origH = metadata.height!;\n\n // Step 1-2: Blur in color space (smooths dither pattern) → greyscale\n // Separate from resize to prevent pipeline reordering\n const smoothed = await sharp(source).blur(1.5).greyscale().toBuffer();\n\n // Step 3: Upscale 4× with Lanczos\n const upscaled = await sharp(smoothed)\n .resize(origW * 4, origH * 4, { kernel: 'lanczos3' })\n .toBuffer();\n\n // Step 4: Contrast 3× around actual image mean + sharpen\n // Matches PIL's ImageEnhance.Contrast: output = factor*input + mean*(1-factor)\n const stats = await sharp(upscaled).stats();\n const mean = stats.channels[0].mean;\n const enhanced = await sharp(upscaled)\n .linear(3.0, mean * (1 - 3.0))\n .sharpen({ sigma: 1.0, m1: 2.0, m2: 1.0 })\n .toBuffer();\n\n // Step 5: Crop decorative borders\n // Remove 10% left/right, 2% top, 40% bottom (keep top 60%)\n // Math.floor matches Python's int() truncation\n const scaledW = origW * 4;\n const scaledH = origH * 4;\n const cropLeft = Math.floor(scaledW * 0.1);\n const cropTop = Math.floor(scaledH * 0.02);\n const cropRight = Math.floor(scaledW * 0.9);\n const cropBottom = Math.floor(scaledH * 0.6);\n const cropW = cropRight - cropLeft;\n const cropH = cropBottom - cropTop;\n\n // Step 5-6: Crop → add white padding → output PNG\n return sharp(enhanced)\n .extract({ left: cropLeft, top: cropTop, width: cropW, height: cropH })\n .extend({\n top: 20,\n bottom: 20,\n left: 30,\n right: 30,\n background: { r: 255, g: 255, b: 255 },\n })\n .png()\n .toBuffer();\n}\n\n/**\n * Read an image file and return its base64-encoded content.\n */\nexport function imageToBase64(imagePath: string): string {\n const buffer = fs.readFileSync(imagePath);\n return buffer.toString('base64');\n}\n"],"mappings":";AACA,SAAS,oBAAoB;;;ACD7B,OAAO,QAAQ;AACf,OAAO,UAAU;AACjB,OAAO,WAAW;AAgBlB,eAAsB,kBAAkB,OAAyC;AAC/E,QAAM,MAAM,MAAM,0BAA0B,KAAK;AACjD,SAAO,IAAI,SAAS,QAAQ;AAC9B;AAMA,eAAsB,0BAA0B,OAAyC;AACvF,QAAM,SAAS,OAAO,UAAU,WAAW,KAAK,QAAQ,KAAK,IAAI;AAGjE,QAAM,WAAW,MAAM,MAAM,MAAM,EAAE,SAAS;AAC9C,QAAM,QAAQ,SAAS;AACvB,QAAM,QAAQ,SAAS;AAIvB,QAAM,WAAW,MAAM,MAAM,MAAM,EAAE,KAAK,GAAG,EAAE,UAAU,EAAE,SAAS;AAGpE,QAAM,WAAW,MAAM,MAAM,QAAQ,EAClC,OAAO,QAAQ,GAAG,QAAQ,GAAG,EAAE,QAAQ,WAAW,CAAC,EACnD,SAAS;AAIZ,QAAM,QAAQ,MAAM,MAAM,QAAQ,EAAE,MAAM;AAC1C,QAAM,OAAO,MAAM,SAAS,CAAC,EAAE;AAC/B,QAAM,WAAW,MAAM,MAAM,QAAQ,EAClC,OAAO,GAAK,QAAQ,IAAI,EAAI,EAC5B,QAAQ,EAAE,OAAO,GAAK,IAAI,GAAK,IAAI,EAAI,CAAC,EACxC,SAAS;AAKZ,QAAM,UAAU,QAAQ;AACxB,QAAM,UAAU,QAAQ;AACxB,QAAM,WAAW,KAAK,MAAM,UAAU,GAAG;AACzC,QAAM,UAAU,KAAK,MAAM,UAAU,IAAI;AACzC,QAAM,YAAY,KAAK,MAAM,UAAU,GAAG;AAC1C,QAAM,aAAa,KAAK,MAAM,UAAU,GAAG;AAC3C,QAAM,QAAQ,YAAY;AAC1B,QAAM,QAAQ,aAAa;AAG3B,SAAO,MAAM,QAAQ,EAClB,QAAQ,EAAE,MAAM,UAAU,KAAK,SAAS,OAAO,OAAO,QAAQ,MAAM,CAAC,EACrE,OAAO;AAAA,IACN,KAAK;AAAA,IACL,QAAQ;AAAA,IACR,MAAM;AAAA,IACN,OAAO;AAAA,IACP,YAAY,EAAE,GAAG,KAAK,GAAG,KAAK,GAAG,IAAI;AAAA,EACvC,CAAC,EACA,IAAI,EACJ,SAAS;AACd;AAKO,SAAS,cAAc,WAA2B;AACvD,QAAM,SAAS,GAAG,aAAa,SAAS;AACxC,SAAO,OAAO,SAAS,QAAQ;AACjC;;;ADjFA,IAAM,SAAS;AAAA;AAAA;AAAA;AAAA;AA8Bf,IAAM,iBAA2C;AAAA,EAC/C,QAAQ;AAAA,EACR,WAAW;AAAA,EACX,QAAQ;AACV;AAEA,eAAe,aACb,QACA,UACA,SACwB;AACxB,UAAQ,UAAU;AAAA,IAChB,KAAK,UAAU;AACb,YAAM,EAAE,aAAa,IAAI,MAAM,OAAO,gBAAgB;AACtD,aAAO,aAAa,EAAE,OAAO,CAAC,EAAE,OAAO;AAAA,IACzC;AAAA,IACA,KAAK,aAAa;AAEhB,YAAM,EAAE,gBAAgB,IAAI,MAAM,OAAO,mBAAmB;AAC5D,aAAO,gBAAgB,EAAE,OAAO,CAAC,EAAE,OAAO;AAAA,IAC5C;AAAA,IACA,KAAK,UAAU;AAEb,YAAM,EAAE,yBAAyB,IAAI,MAAM,OAAO,gBAAgB;AAClE,aAAO,yBAAyB,EAAE,OAAO,CAAC,EAAE,OAAO;AAAA,IACrD;AAAA,IACA;AACE,YAAM,IAAI;AAAA,QACR,qBAAqB,QAAQ;AAAA,MAC/B;AAAA,EACJ;AACF;AAQA,IAAM,mBAA2C;AAAA,EAC/C,KAAK;AAAA,EACL,GAAG;AAAA,EACH,GAAG;AAAA,EACH,GAAG;AAAA,EACH,GAAG;AAAA,EACH,KAAK;AAAA,EACL,GAAG;AAAA,EACH,KAAK;AAAA,EACL,GAAG;AAAA,EACH,KAAK;AACP;AASA,SAAS,aAAa,UAAoB,gBAAiC;AACzE,MAAI,WAAW,iBAAiB,SAAS,OAAO,CAAC,MAAM,EAAE,WAAW,cAAc,IAAI;AAEtF,MAAI,SAAS,WAAW,GAAG;AACzB,eAAW;AAAA,EACb;AACA,MAAI,SAAS,WAAW,EAAG,QAAO;AAGlC,QAAM,YAAY,oBAAI,IAAoB;AAC1C,aAAW,KAAK,UAAU;AACxB,cAAU,IAAI,EAAE,SAAS,UAAU,IAAI,EAAE,MAAM,KAAK,KAAK,CAAC;AAAA,EAC5D;AACA,MAAI,UAAU;AACd,MAAI,YAAY;AAChB,aAAW,CAAC,KAAK,KAAK,KAAK,WAAW;AACpC,QAAI,QAAQ,WAAW;AACrB,gBAAU;AACV,kBAAY;AAAA,IACd;AAAA,EACF;AAEA,QAAM,kBAAkB,SAAS,OAAO,CAAC,MAAM,EAAE,WAAW,OAAO;AACnE,MAAI,gBAAgB,WAAW,EAAG,QAAO,SAAS,CAAC;AAGnD,QAAM,SAAmB,CAAC;AAC1B,WAAS,MAAM,GAAG,MAAM,SAAS,OAAO;AACtC,UAAM,aAAa,oBAAI,IAAoB;AAC3C,eAAW,KAAK,iBAAiB;AAC/B,YAAM,KAAK,EAAE,GAAG;AAChB,iBAAW,IAAI,KAAK,WAAW,IAAI,EAAE,KAAK,KAAK,CAAC;AAAA,IAClD;AAEA,UAAM,cAAc,oBAAI,IAAoB;AAC5C,eAAW,CAAC,IAAI,KAAK,KAAK,YAAY;AACpC,YAAM,YAAY,iBAAiB,EAAE,KAAK;AAC1C,kBAAY,IAAI,YAAY,YAAY,IAAI,SAAS,KAAK,KAAK,KAAK;AAAA,IACtE;AAEA,QAAI,YAAY;AAChB,QAAI,iBAAiB;AACrB,eAAW,CAAC,WAAW,KAAK,KAAK,aAAa;AAC5C,UAAI,QAAQ,gBAAgB;AAC1B,oBAAY;AACZ,yBAAiB;AAAA,MACnB;AAAA,IACF;AAEA,WAAO,KAAK,SAAS;AAAA,EACvB;AAEA,SAAO,OAAO,KAAK,EAAE;AACvB;AAIO,IAAM,SAAN,MAAa;AAAA,EACV,SAA+B;AAAA,EAC/B,gBAA+C;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAmBvD,YAAY,YAAoC,SAAyB;AACvE,QAAI,OAAO,eAAe,UAAU;AAClC,YAAM,WAAW,SAAS,YAAY;AACtC,YAAM,UAAU,SAAS,SAAS,eAAe,QAAQ;AAEzD,WAAK,gBAAgB,aAAa,YAAY,UAAU,OAAO;AAAA,IACjE,OAAO;AACL,WAAK,SAAS;AAAA,IAChB;AAAA,EACF;AAAA,EAEA,MAAc,WAAmC;AAC/C,QAAI,KAAK,OAAQ,QAAO,KAAK;AAC7B,SAAK,SAAS,MAAM,KAAK;AACzB,SAAK,gBAAgB;AACrB,WAAO,KAAK;AAAA,EACd;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EASA,MAAM,MAAM,OAAwB,UAAwB,CAAC,GAAoB;AAC/E,UAAM,EAAE,cAAc,GAAG,gBAAgB,aAAa,GAAG,UAAU,KAAK,IAAI;AAE5E,UAAM,QAAQ,MAAM,KAAK,SAAS;AAClC,UAAM,cAAc,MAAM,0BAA0B,KAAK;AAGzD,UAAM,WAAqB,CAAC;AAC5B,UAAM,gBAAgB,cAAc;AACpC,QAAI,YAAY;AAEhB,WAAO,SAAS,SAAS,eAAe,YAAY,eAAe;AACjE;AACA,YAAM,SAAS,MAAM,KAAK,cAAc,OAAO,aAAa,UAAU;AACtE,UAAI,QAAQ;AACV,iBAAS,KAAK,MAAM;AACpB,YAAI,QAAS,SAAQ,IAAI,aAAa,SAAS,MAAM,KAAK,MAAM,EAAE;AAAA,MACpE,OAAO;AACL,YAAI,QAAS,SAAQ,IAAI,UAAU,SAAS,iCAAiC;AAAA,MAC/E;AAAA,IACF;AAEA,QAAI,SAAS,WAAW,GAAG;AACzB,UAAI,QAAS,SAAQ,IAAI,wBAAwB;AACjD,aAAO;AAAA,IACT;AAEA,WAAO,aAAa,UAAU,cAAc;AAAA,EAC9C;AAAA;AAAA;AAAA;AAAA;AAAA,EAMA,MAAc,cACZ,OACA,aACA,YACwB;AACxB,aAAS,QAAQ,GAAG,SAAS,YAAY,SAAS;AAChD,UAAI;AACF,cAAM,EAAE,KAAK,IAAI,MAAM,aAAa;AAAA,UAClC;AAAA,UACA,UAAU;AAAA,YACR;AAAA,cACE,MAAM;AAAA,cACN,SAAS;AAAA,gBACP,EAAE,MAAM,QAAQ,MAAM,OAAO;AAAA,gBAC7B,EAAE,MAAM,SAAS,OAAO,YAAY;AAAA,cACtC;AAAA,YACF;AAAA,UACF;AAAA,UACA,aAAa;AAAA,UACb,iBAAiB;AAAA,QACnB,CAAC;AAED,cAAM,MAAM,KAAK,KAAK;AAGtB,cAAM,QAAQ,IAAI,YAAY;AAC9B,YACE,MAAM,SAAS,OAAO,KACtB,MAAM,SAAS,YAAY,KAC3B,MAAM,SAAS,aAAa,KAC5B,MAAM,SAAS,WAAW,KAC1B,MAAM,SAAS,SAAS,KACxB,IAAI,SAAS,IACb;AACA,iBAAO;AAAA,QACT;AAGA,cAAM,UAAU,IAAI,YAAY,EAAE,QAAQ,cAAc,EAAE;AAC1D,eAAO,WAAW;AAAA,MACpB,SAAS,MAAM;AACb,YAAI,QAAQ,YAAY;AACtB,gBAAM,IAAI,QAAQ,CAAC,MAAM,WAAW,GAAG,OAAQ,QAAQ,EAAE,CAAC;AAC1D;AAAA,QACF;AACA,eAAO;AAAA,MACT;AAAA,IACF;AACA,WAAO;AAAA,EACT;AACF;","names":[]}
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@yigitahmetsahin/captcha-solver",
3
- "version": "1.0.1",
4
- "description": "AI-powered captcha solver using image preprocessing and OpenAI vision models",
3
+ "version": "1.2.0",
4
+ "description": "AI-powered captcha solver using image preprocessing and multi-provider vision models (Vercel AI SDK)",
5
5
  "main": "dist/index.js",
6
6
  "module": "dist/index.mjs",
7
7
  "types": "dist/index.d.ts",
@@ -32,7 +32,10 @@
32
32
  "captcha",
33
33
  "solver",
34
34
  "ocr",
35
+ "ai-sdk",
35
36
  "openai",
37
+ "anthropic",
38
+ "google",
36
39
  "vision",
37
40
  "image-processing",
38
41
  "typescript"
@@ -48,11 +51,12 @@
48
51
  },
49
52
  "homepage": "https://github.com/yigitahmetsahin/captcha-solver#readme",
50
53
  "dependencies": {
54
+ "ai": "^6.0.146",
51
55
  "dotenv": "^16.4.7",
52
- "openai": "^4.77.0",
53
56
  "sharp": "^0.33.5"
54
57
  },
55
58
  "devDependencies": {
59
+ "@ai-sdk/openai": "^3.0.50",
56
60
  "@eslint/js": "^9.39.2",
57
61
  "@types/node": "^22.10.0",
58
62
  "@vitest/coverage-v8": "^4.0.18",
@@ -65,6 +69,22 @@
65
69
  "typescript-eslint": "^8.53.1",
66
70
  "vitest": "^4.0.17"
67
71
  },
72
+ "peerDependencies": {
73
+ "@ai-sdk/openai": ">=1.0.0",
74
+ "@ai-sdk/anthropic": ">=1.0.0",
75
+ "@ai-sdk/google": ">=1.0.0"
76
+ },
77
+ "peerDependenciesMeta": {
78
+ "@ai-sdk/openai": {
79
+ "optional": true
80
+ },
81
+ "@ai-sdk/anthropic": {
82
+ "optional": true
83
+ },
84
+ "@ai-sdk/google": {
85
+ "optional": true
86
+ }
87
+ },
68
88
  "engines": {
69
89
  "node": ">=24"
70
90
  },