captcha-url-reader 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,73 @@
1
+ Metadata-Version: 2.4
2
+ Name: captcha-url-reader
3
+ Version: 1.0.0
4
+ Summary: Simple Python package: pass CAPTCHA image URL and get extracted text
5
+ Author: Arif Shah
6
+ License: MIT
7
+ Requires-Python: >=3.10
8
+ Description-Content-Type: text/markdown
9
+ Requires-Dist: easyocr>=1.7.1
10
+ Requires-Dist: opencv-python>=4.9.0
11
+ Requires-Dist: Pillow>=10.2.0
12
+ Requires-Dist: numpy>=1.26.0
13
+ Requires-Dist: requests>=2.31.0
14
+ Provides-Extra: dev
15
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
16
+
17
+ # captcha-url-reader
18
+
19
+ Simple package: user passes captcha image URL, package reads and returns text.
20
+
21
+ ## Install
22
+
23
+ ```bash
24
+ pip install -e .
25
+ ```
26
+
27
+ ## Usage
28
+
29
+ ```python
30
+ from captcha_image_reader import read_captcha_from_url
31
+
32
+ # Default mode (recommended for Amazon-style captchas)
33
+ captcha_text = read_captcha_from_url("https://images-na.ssl-images-amazon.com/captcha/sgkknrsj/Captcha_iwrdailhkf.jpg")
34
+ if captcha_text:
35
+ print(f"CAPTCHA text extracted: {captcha_text}")
36
+ else:
37
+ print("No text extracted from image URL.")
38
+ ```
39
+
40
+ ## Overlap-heavy captcha mode
41
+
42
+ Use forced overlap mode only when text is merged/overlapping and default mode is not accurate.
43
+
44
+ ```python
45
+ from captcha_image_reader import read_captcha_from_url
46
+
47
+ captcha_text = read_captcha_from_url(
48
+ "https://2captcha.com/dist/web/assets/captcha-rn1S3orp.jpg",
49
+ force_overlap_risk=True,
50
+ )
51
+ ```
52
+
53
+ ## When to use which mode
54
+
55
+ - Use default mode for clean or mostly non-overlapping text (for example, most Amazon captchas).
56
+ - Use `force_overlap_risk=True` only when characters are merged and default extraction is wrong.
57
+
58
+ ## Example scripts
59
+
60
+ - Default/Amazon style: `examples/read_amazon_default.py`
61
+ - Overlap-heavy style: `examples/read_overlap_captcha.py`
62
+
63
+ Run them directly:
64
+
65
+ ```bash
66
+ ./.venv/bin/python examples/read_amazon_default.py
67
+ ./.venv/bin/python examples/read_overlap_captcha.py
68
+ ```
69
+
70
+ ## GPU behavior
71
+
72
+ - Uses GPU first by default.
73
+ - If GPU is not available or fails, automatically falls back to CPU.
@@ -0,0 +1,57 @@
1
+ # captcha-url-reader
2
+
3
+ Simple package: user passes captcha image URL, package reads and returns text.
4
+
5
+ ## Install
6
+
7
+ ```bash
8
+ pip install -e .
9
+ ```
10
+
11
+ ## Usage
12
+
13
+ ```python
14
+ from captcha_image_reader import read_captcha_from_url
15
+
16
+ # Default mode (recommended for Amazon-style captchas)
17
+ captcha_text = read_captcha_from_url("https://images-na.ssl-images-amazon.com/captcha/sgkknrsj/Captcha_iwrdailhkf.jpg")
18
+ if captcha_text:
19
+ print(f"CAPTCHA text extracted: {captcha_text}")
20
+ else:
21
+ print("No text extracted from image URL.")
22
+ ```
23
+
24
+ ## Overlap-heavy captcha mode
25
+
26
+ Use forced overlap mode only when text is merged/overlapping and default mode is not accurate.
27
+
28
+ ```python
29
+ from captcha_image_reader import read_captcha_from_url
30
+
31
+ captcha_text = read_captcha_from_url(
32
+ "https://2captcha.com/dist/web/assets/captcha-rn1S3orp.jpg",
33
+ force_overlap_risk=True,
34
+ )
35
+ ```
36
+
37
+ ## When to use which mode
38
+
39
+ - Use default mode for clean or mostly non-overlapping text (for example, most Amazon captchas).
40
+ - Use `force_overlap_risk=True` only when characters are merged and default extraction is wrong.
41
+
42
+ ## Example scripts
43
+
44
+ - Default/Amazon style: `examples/read_amazon_default.py`
45
+ - Overlap-heavy style: `examples/read_overlap_captcha.py`
46
+
47
+ Run them directly:
48
+
49
+ ```bash
50
+ ./.venv/bin/python examples/read_amazon_default.py
51
+ ./.venv/bin/python examples/read_overlap_captcha.py
52
+ ```
53
+
54
+ ## GPU behavior
55
+
56
+ - Uses GPU first by default.
57
+ - If GPU is not available or fails, automatically falls back to CPU.
@@ -0,0 +1,36 @@
1
+ [build-system]
2
+ requires = ["setuptools>=69", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "captcha-url-reader"
7
+ version = "1.0.0"
8
+ description = "Simple Python package: pass CAPTCHA image URL and get extracted text"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ authors = [{ name = "Arif Shah" }]
12
+ license = { text = "MIT" }
13
+ dependencies = [
14
+ "easyocr>=1.7.1",
15
+ "opencv-python>=4.9.0",
16
+ "Pillow>=10.2.0",
17
+ "numpy>=1.26.0",
18
+ "requests>=2.31.0"
19
+ ]
20
+
21
+ [project.optional-dependencies]
22
+ dev = ["pytest>=8.0.0"]
23
+
24
+ [project.scripts]
25
+ captcha-url-reader = "captcha_image_reader.cli:main"
26
+
27
+ [tool.setuptools]
28
+ package-dir = {"" = "src"}
29
+
30
+ [tool.setuptools.packages.find]
31
+ where = ["src"]
32
+
33
+ [tool.pytest.ini_options]
34
+ pythonpath = ["src"]
35
+ testpaths = ["tests"]
36
+ addopts = "-q"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,3 @@
1
+ from .reader import read_captcha_from_url
2
+
3
+ __all__ = ["read_captcha_from_url"]
@@ -0,0 +1,26 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+
5
+ from .reader import read_captcha_from_url
6
+
7
+
8
def build_parser() -> argparse.ArgumentParser:
    """Create the argument parser used by the command-line entry point.

    The parser accepts one positional ``image_url`` and an optional
    ``--cpu-only`` switch (stored as ``cpu_only``, ``False`` by default).
    """
    cli = argparse.ArgumentParser(description="Read CAPTCHA text from an image URL")
    cli.add_argument("image_url", help="Captcha image URL")
    cli.add_argument(
        "--cpu-only",
        action="store_true",
        help="Disable GPU and force CPU OCR",
    )
    return cli
13
+
14
+
15
def main() -> None:
    """Run the CLI: parse arguments, read the CAPTCHA, and print the outcome."""
    options = build_parser().parse_args()
    extracted = read_captcha_from_url(
        options.image_url,
        prefer_gpu=not options.cpu_only,
    )
    # A falsy result (None or an empty string) gets the placeholder message.
    print(extracted if extracted else "No text extracted")


if __name__ == "__main__":
    main()
@@ -0,0 +1,285 @@
1
+ from __future__ import annotations
2
+
3
+ from collections import defaultdict
4
+ from io import BytesIO
5
+ from itertools import product
6
+ from typing import Any
7
+
8
+ import cv2
9
+ import easyocr
10
+ import numpy as np
11
+ import requests
12
+ from PIL import Image
13
+
14
# OCR readers are cached for the lifetime of the process, one per
# (use_gpu, languages) configuration, so they are only constructed once.
_READER_CACHE: dict[tuple[bool, tuple[str, ...]], Any] = {}
# Characters handed to the OCR reader as its allowlist: ASCII letters and digits.
_ALLOWLIST = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
# Upper-case letters mapped to the digit(s) tried as substitutes for them.
_AMBIGUOUS_TO_DIGITS: dict[str, tuple[str, ...]] = {
    "O": ("9", "0"),
    "Q": ("0",),
    "S": ("5",),
}


def _get_reader(prefer_gpu: bool = True, languages: list[str] | None = None) -> Any:
    """Return a cached ``easyocr.Reader``, preferring GPU when requested.

    Args:
        prefer_gpu: Try to build (or reuse) a GPU reader first. If that raises
            for any reason, a CPU reader is used instead.
        languages: Language codes for the reader; defaults to ``["en"]``.

    Returns:
        The reader for the requested configuration. Readers are cached per
        (GPU flag, language list), so requests for different languages never
        share a reader.
    """
    langs = list(languages) if languages else ["en"]
    lang_key = tuple(langs)

    if prefer_gpu:
        gpu_key = (True, lang_key)
        try:
            if gpu_key not in _READER_CACHE:
                _READER_CACHE[gpu_key] = easyocr.Reader(langs, gpu=True)
            return _READER_CACHE[gpu_key]
        except Exception:
            # Best-effort: any GPU initialisation failure falls through to the
            # CPU reader below, but leave a trace for debugging.
            import logging

            logging.getLogger(__name__).debug(
                "GPU OCR reader unavailable; falling back to CPU", exc_info=True
            )

    cpu_key = (False, lang_key)
    if cpu_key not in _READER_CACHE:
        _READER_CACHE[cpu_key] = easyocr.Reader(langs, gpu=False)
    return _READER_CACHE[cpu_key]
38
+
39
+
40
def _build_variants(gray: np.ndarray, threshold: int) -> list[np.ndarray]:
    """Produce the list of preprocessed images that OCR is attempted on.

    Args:
        gray: Single-channel image (the caller passes a PIL "L" conversion).
        threshold: Cut-off used for the fixed binary threshold.

    Returns:
        In order: the input, 2x and 3x cubic upscales, a CLAHE-normalised
        copy; then for each of (input, CLAHE, 2x) the fixed, Otsu and adaptive
        binarisations, each followed by its inverse; then dilated, eroded,
        closed and median-blurred 2x images; and finally a weighted blend of
        two Gaussian blurs of the 3x image.
    """
    # Upscaling is meant to reduce the impact of overlapping glyphs on detection.
    scaled_2x = cv2.resize(gray, None, fx=2.0, fy=2.0, interpolation=cv2.INTER_CUBIC)
    scaled_3x = cv2.resize(gray, None, fx=3.0, fy=3.0, interpolation=cv2.INTER_CUBIC)

    # Contrast normalisation.
    equalized = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(gray)

    out: list[np.ndarray] = [gray, scaled_2x, scaled_3x, equalized]

    # Three binarisations per base image, each paired with its inverse.
    for base in (gray, equalized, scaled_2x):
        fixed = cv2.threshold(base, threshold, 255, cv2.THRESH_BINARY)[1]
        otsu = cv2.threshold(base, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
        adaptive = cv2.adaptiveThreshold(
            base, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 21, 7
        )
        for binary in (fixed, otsu, adaptive):
            out.append(binary)
            out.append(cv2.bitwise_not(binary))

    # Morphological and smoothing variants of the 2x image.
    brush = np.ones((2, 2), np.uint8)
    out += [
        cv2.dilate(scaled_2x, brush, iterations=1),
        cv2.erode(scaled_2x, brush, iterations=1),
        cv2.morphologyEx(scaled_2x, cv2.MORPH_CLOSE, brush),
        cv2.medianBlur(scaled_2x, 3),
    ]

    # 1.3 * blur(sigma=1.2) - 0.3 * blur(sigma=2.4) on the 3x image.
    soft = cv2.GaussianBlur(scaled_3x, (0, 0), 1.2)
    softer = cv2.GaussianBlur(scaled_3x, (0, 0), 2.4)
    blended = cv2.addWeighted(soft, 1.3, softer, -0.3, 0)
    out.append(np.clip(blended, 0, 255).astype(np.uint8))

    return out
84
+
85
+
86
def _detect_overlap_risk(gray: np.ndarray) -> bool:
    """Heuristically flag images whose characters are likely merged.

    The image is blurred, Otsu-binarised (inverted) and morphologically
    closed. The result is ``True`` when at most three connected components
    have an area between ``max(12, 0.1 %)`` and ``65 %`` of the image, and the
    fraction of non-zero mask pixels lies within ``[0.03, 0.45]``.
    """
    smoothed = cv2.GaussianBlur(gray, (3, 3), 0)
    ink_mask = cv2.threshold(
        smoothed, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    )[1]
    closing_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    ink_mask = cv2.morphologyEx(ink_mask, cv2.MORPH_CLOSE, closing_kernel)

    label_count, _, stats, _ = cv2.connectedComponentsWithStats(ink_mask, connectivity=8)
    smallest = max(12, int(gray.size * 0.001))
    largest = int(gray.size * 0.65)
    # Label 0 is skipped (OpenCV reports the background under label 0).
    blobs = sum(
        1
        for label in range(1, label_count)
        if smallest <= int(stats[label, cv2.CC_STAT_AREA]) <= largest
    )

    density = float(np.count_nonzero(ink_mask)) / float(ink_mask.size)
    if blobs > 3:
        return False
    return 0.03 <= density <= 0.45
107
+
108
+
109
+ def _candidate_score(text: str, confidence: float) -> float:
110
+ length_bonus = 0.3 if 4 <= len(text) <= 8 else 0.0
111
+ alpha_num_bonus = 0.1 if text.isalnum() else 0.0
112
+ mixed_bonus = 0.22 if any(ch.isalpha() for ch in text) and any(ch.isdigit() for ch in text) else 0.0
113
+ return confidence + length_bonus + alpha_num_bonus + mixed_bonus
114
+
115
+
116
+ def _readtext_with_fallback(reader: Any, arr: np.ndarray, **kwargs: Any) -> list[Any]:
117
+ try:
118
+ return reader.readtext(arr, **kwargs)
119
+ except TypeError:
120
+ detail = kwargs.get("detail", 1)
121
+ try:
122
+ return reader.readtext(arr, detail)
123
+ except TypeError:
124
+ return reader.readtext(arr)
125
+
126
+
127
+ def _bbox_x(item: Any) -> float:
128
+ try:
129
+ bbox = item[0]
130
+ return float(min(point[0] for point in bbox))
131
+ except Exception:
132
+ return 0.0
133
+
134
+
135
def _extract_text_conf(result: list[Any]) -> tuple[str, float] | None:
    """Collapse one OCR result list into ``(text, confidence)``.

    Entries are ordered by ``_bbox_x``, and their texts are concatenated and
    reduced to alphanumeric characters. The confidence is the mean of the
    confidences that could be parsed, or 0.5 when there are none.

    Returns:
        ``None`` when ``result`` is empty or no alphanumeric text remains.
    """
    if not result:
        return None

    pieces: list[str] = []
    scores: list[float] = []
    for entry in sorted(result, key=_bbox_x):
        if not (isinstance(entry, (tuple, list)) and len(entry) >= 2):
            # Not a (bbox, text, ...) record: use its string form as the text.
            pieces.append(str(entry))
            continue

        pieces.append(str(entry[1]))
        if len(entry) < 3:
            continue
        try:
            scores.append(float(entry[2]))
        except Exception:
            # Unparseable confidence: keep the text, skip the score.
            pass

    joined = "".join(pieces).strip()
    alnum_only = "".join(filter(str.isalnum, joined))
    if not alnum_only:
        return None

    mean_conf = float(np.mean(scores)) if scores else 0.5
    return alnum_only, mean_conf
160
+
161
+
162
def _iter_digit_variants(text: str, max_replacements: int = 2) -> list[tuple[str, int]]:
    """List copies of ``text`` with ambiguous letters swapped for digits.

    Args:
        text: Candidate string to transform.
        max_replacements: Two-position substitutions are generated only when
            this is at least 2.

    Returns:
        ``(variant, replacement_count)`` pairs: every single-position
        substitution first, then every two-position combination. The list is
        empty when ``text`` has no character listed in ``_AMBIGUOUS_TO_DIGITS``.
    """
    slots = [
        (pos, digits)
        for pos, digits in (
            (i, _AMBIGUOUS_TO_DIGITS.get(c.upper())) for i, c in enumerate(text)
        )
        if digits
    ]
    if not slots:
        return []

    def _swap(changes: list[tuple[int, str]]) -> str:
        buf = list(text)
        for pos, digit in changes:
            buf[pos] = digit
        return "".join(buf)

    out: list[tuple[str, int]] = [
        (_swap([(pos, digit)]), 1) for pos, digits in slots for digit in digits
    ]

    if len(slots) >= 2 and max_replacements >= 2:
        # Every pair of distinct positions, earlier position first.
        for first, (pos_a, digits_a) in enumerate(slots):
            for pos_b, digits_b in slots[first + 1:]:
                for digit_a in digits_a:
                    for digit_b in digits_b:
                        out.append((_swap([(pos_a, digit_a), (pos_b, digit_b)]), 2))

    return out
191
+
192
+
193
def _extract_best_text(reader: Any, variants: list[np.ndarray], *, overlap_risk: bool = False) -> str | None:
    """Run OCR over every image variant and return the highest-voted text.

    Each variant is read with both the "greedy" and "beamsearch" decoders, and
    every reading adds its ``_candidate_score`` to that text's vote total.
    With ``overlap_risk`` set, digit-substituted variants also receive votes:
    discounted per replacement when any reading contained a digit, and
    inherited from the winner when the winner itself has no digit.

    Returns:
        The text ranked highest by (votes, summed confidence, length), or
        ``None`` if no variant produced usable text.
    """
    tally: dict[str, dict[str, float]] = defaultdict(
        lambda: {"votes": 0.0, "conf_sum": 0.0}
    )
    readings: list[tuple[str, float, float]] = []

    def _rank(entry: tuple[str, dict[str, float]]) -> tuple[float, float, int]:
        label, stats = entry
        return (stats["votes"], stats["conf_sum"], len(label))

    def _has_digit(value: str) -> bool:
        return any(ch.isdigit() for ch in value)

    for image in variants:
        for decoder in ("greedy", "beamsearch"):
            parsed = _extract_text_conf(
                _readtext_with_fallback(
                    reader,
                    image,
                    detail=1,
                    paragraph=False,
                    allowlist=_ALLOWLIST,
                    decoder=decoder,
                    beamWidth=5,
                )
            )
            if parsed:
                label, conf = parsed
                score = _candidate_score(label, conf)
                tally[label]["votes"] += score
                tally[label]["conf_sum"] += conf
                readings.append((label, conf, score))

    if overlap_risk and any(_has_digit(label) for label, _, _ in readings):
        # Some reading saw a digit: let digit-substituted spellings of every
        # reading collect votes too, discounted per replacement.
        for label, conf, score in readings:
            for variant, swaps in _iter_digit_variants(label):
                tally[variant]["votes"] += score - (0.12 * swaps)
                tally[variant]["conf_sum"] += max(0.0, conf - (0.06 * swaps))

    if not tally:
        return None

    winner = max(tally.items(), key=_rank)[0]
    if not overlap_risk or _has_digit(winner):
        return winner

    # In overlap-heavy captchas, OCR often reads 9/5 as O/S, so digit variants
    # of an all-letter winner inherit a share of its votes.
    base_votes = tally[winner]["votes"]
    base_conf = tally[winner]["conf_sum"]
    for variant, swaps in _iter_digit_variants(winner, max_replacements=2):
        if variant == winner or not _has_digit(variant):
            continue
        vote_factor, conf_factor = (1.08, 0.92) if swaps >= 2 else (0.56, 0.52)
        tally[variant]["votes"] += max(0.0, base_votes * vote_factor)
        tally[variant]["conf_sum"] += max(0.0, base_conf * conf_factor)

    return max(tally.items(), key=_rank)[0]
260
+
261
+
262
def read_captcha_from_url(
    image_url: str,
    *,
    prefer_gpu: bool = True,
    timeout: int = 20,
    threshold: int = 150,
    force_overlap_risk: bool | None = None,
) -> str | None:
    """Download a CAPTCHA image and return the text read from it.

    Args:
        image_url: URL of the image to fetch.
        prefer_gpu: Passed to ``_get_reader``; try a GPU reader first.
        timeout: Timeout handed to ``requests.get``.
        threshold: Fixed binarisation threshold used when building variants.
        force_overlap_risk: ``None`` lets ``_detect_overlap_risk`` decide;
            any other value is coerced to ``bool`` and used as-is.

    Returns:
        The extracted text, or ``None`` when nothing was read or when any
        step (download, decoding, OCR) raised an exception.
    """
    try:
        reply = requests.get(image_url, timeout=timeout)
        reply.raise_for_status()

        picture = Image.open(BytesIO(reply.content))
        gray = np.array(picture.convert("L"))
        variants = _build_variants(gray, threshold=threshold)

        if force_overlap_risk is None:
            risky = _detect_overlap_risk(gray)
        else:
            risky = bool(force_overlap_risk)

        ocr = _get_reader(prefer_gpu=prefer_gpu)
        return _extract_best_text(ocr, variants, overlap_risk=risky)
    except Exception:
        # Every failure is reported to the caller as "no text".
        return None
@@ -0,0 +1,73 @@
1
+ Metadata-Version: 2.4
2
+ Name: captcha-url-reader
3
+ Version: 1.0.0
4
+ Summary: Simple Python package: pass CAPTCHA image URL and get extracted text
5
+ Author: Arif Shah
6
+ License: MIT
7
+ Requires-Python: >=3.10
8
+ Description-Content-Type: text/markdown
9
+ Requires-Dist: easyocr>=1.7.1
10
+ Requires-Dist: opencv-python>=4.9.0
11
+ Requires-Dist: Pillow>=10.2.0
12
+ Requires-Dist: numpy>=1.26.0
13
+ Requires-Dist: requests>=2.31.0
14
+ Provides-Extra: dev
15
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
16
+
17
+ # captcha-url-reader
18
+
19
+ Simple package: user passes captcha image URL, package reads and returns text.
20
+
21
+ ## Install
22
+
23
+ ```bash
24
+ pip install -e .
25
+ ```
26
+
27
+ ## Usage
28
+
29
+ ```python
30
+ from captcha_image_reader import read_captcha_from_url
31
+
32
+ # Default mode (recommended for Amazon-style captchas)
33
+ captcha_text = read_captcha_from_url("https://images-na.ssl-images-amazon.com/captcha/sgkknrsj/Captcha_iwrdailhkf.jpg")
34
+ if captcha_text:
35
+ print(f"CAPTCHA text extracted: {captcha_text}")
36
+ else:
37
+ print("No text extracted from image URL.")
38
+ ```
39
+
40
+ ## Overlap-heavy captcha mode
41
+
42
+ Use forced overlap mode only when text is merged/overlapping and default mode is not accurate.
43
+
44
+ ```python
45
+ from captcha_image_reader import read_captcha_from_url
46
+
47
+ captcha_text = read_captcha_from_url(
48
+ "https://2captcha.com/dist/web/assets/captcha-rn1S3orp.jpg",
49
+ force_overlap_risk=True,
50
+ )
51
+ ```
52
+
53
+ ## When to use which mode
54
+
55
+ - Use default mode for clean or mostly non-overlapping text (for example, most Amazon captchas).
56
+ - Use `force_overlap_risk=True` only when characters are merged and default extraction is wrong.
57
+
58
+ ## Example scripts
59
+
60
+ - Default/Amazon style: `examples/read_amazon_default.py`
61
+ - Overlap-heavy style: `examples/read_overlap_captcha.py`
62
+
63
+ Run them directly:
64
+
65
+ ```bash
66
+ ./.venv/bin/python examples/read_amazon_default.py
67
+ ./.venv/bin/python examples/read_overlap_captcha.py
68
+ ```
69
+
70
+ ## GPU behavior
71
+
72
+ - Uses GPU first by default.
73
+ - If GPU is not available or fails, automatically falls back to CPU.
@@ -0,0 +1,12 @@
1
+ README.md
2
+ pyproject.toml
3
+ src/captcha_image_reader/__init__.py
4
+ src/captcha_image_reader/cli.py
5
+ src/captcha_image_reader/reader.py
6
+ src/captcha_url_reader.egg-info/PKG-INFO
7
+ src/captcha_url_reader.egg-info/SOURCES.txt
8
+ src/captcha_url_reader.egg-info/dependency_links.txt
9
+ src/captcha_url_reader.egg-info/entry_points.txt
10
+ src/captcha_url_reader.egg-info/requires.txt
11
+ src/captcha_url_reader.egg-info/top_level.txt
12
+ tests/test_reader.py
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ captcha-url-reader = captcha_image_reader.cli:main
@@ -0,0 +1,8 @@
1
+ easyocr>=1.7.1
2
+ opencv-python>=4.9.0
3
+ Pillow>=10.2.0
4
+ numpy>=1.26.0
5
+ requests>=2.31.0
6
+
7
+ [dev]
8
+ pytest>=8.0.0
@@ -0,0 +1 @@
1
+ captcha_image_reader
@@ -0,0 +1,99 @@
1
+ from captcha_image_reader import read_captcha_from_url
2
+ from captcha_image_reader import reader as reader_module
3
+ import numpy as np
4
+
5
+
6
class FakeResponse:
    """Minimal HTTP response stand-in: carries a body and never reports an error."""

    def __init__(self, content: bytes):
        # Raw payload, exposed under the attribute name the reader accesses.
        self.content = content

    def raise_for_status(self) -> None:
        """Do nothing, i.e. behave like a successful response."""
        return None
12
+
13
+
14
def test_reader_returns_none_on_invalid_url(monkeypatch):
    """A download that raises is swallowed and reported as ``None``."""

    def exploding_get(url, timeout):
        raise Exception("network")

    monkeypatch.setattr("captcha_image_reader.reader.requests.get", exploding_get)

    outcome = read_captcha_from_url("https://example.com/captcha.jpg")
    assert outcome is None
21
+
22
+
23
def test_import_works():
    """The public entry point is importable and callable."""
    entry_point = read_captcha_from_url
    assert callable(entry_point)
25
+
26
+
27
def test_gpu_first_then_cpu_fallback(monkeypatch):
    """A failing GPU reader construction falls back to a CPU reader."""

    class FakeReader:
        def __init__(self, gpu):
            self.gpu = gpu

        def readtext(self, arr, detail):
            return []

    created = []

    def fake_easyocr_reader(langs, gpu):
        created.append(gpu)
        if gpu:
            raise RuntimeError("gpu unavailable")
        return FakeReader(gpu=False)

    monkeypatch.setattr(reader_module.easyocr, "Reader", fake_easyocr_reader)
    # Swap in a fresh cache instead of clearing the real one: monkeypatch
    # restores the original dict afterwards, so the fake reader cannot leak
    # into other tests through the module-level cache.
    monkeypatch.setattr(reader_module, "_READER_CACHE", {})

    r = reader_module._get_reader(prefer_gpu=True)

    assert created == [True, False]
    assert r.gpu is False
48
+
49
+
50
def test_overlap_ambiguity_prefers_digit_text_when_supported():
    """With overlap risk on, a digit reading backed by substitution votes wins."""
    box = [[10, 0], [20, 0], [20, 10], [10, 10]]

    class FakeReader:
        def __init__(self):
            self.calls = 0

        def readtext(self, arr, detail=1, **kwargs):
            # First call: confident all-letter reading; afterwards: digit reading.
            self.calls += 1
            text, conf = ("WOHSK", 0.92) if self.calls == 1 else ("W9H5K", 0.45)
            return [(box, text, conf)]

    blank = np.zeros((20, 60), dtype=np.uint8)
    best = reader_module._extract_best_text(FakeReader(), [blank], overlap_risk=True)
    assert best == "W9H5K"
64
+
65
+
66
def test_no_digit_evidence_does_not_force_digit_substitution():
    """Without overlap risk, an all-letter reading is returned unchanged."""
    box = [[10, 0], [20, 0], [20, 10], [10, 10]]

    class FakeReader:
        def readtext(self, arr, detail=1, **kwargs):
            return [(box, "WOHSK", 0.9)]

    blank = np.zeros((20, 60), dtype=np.uint8)
    best = reader_module._extract_best_text(FakeReader(), [blank])
    assert best == "WOHSK"
74
+
75
+
76
def test_overlap_risk_can_recover_digits_without_prior_digit_evidence():
    """With overlap risk on, an all-letter winner yields to its two-digit variant."""
    box = [[10, 0], [20, 0], [20, 10], [10, 10]]

    class FakeReader:
        def readtext(self, arr, detail=1, **kwargs):
            return [(box, "WOHSK", 0.9)]

    blank = np.zeros((20, 60), dtype=np.uint8)
    best = reader_module._extract_best_text(FakeReader(), [blank], overlap_risk=True)
    assert best == "W9H5K"
84
+
85
+
86
def test_default_mode_keeps_all_letter_captcha():
    """Without overlap risk, the confident all-letter reading beats a weak digit one."""
    box = [[10, 0], [20, 0], [20, 10], [10, 10]]

    class FakeReader:
        def __init__(self):
            self.calls = 0

        def readtext(self, arr, detail=1, **kwargs):
            # First call: confident letters; afterwards: low-confidence digits.
            self.calls += 1
            text, conf = ("CLBHUF", 0.88) if self.calls == 1 else ("C18HUF", 0.42)
            return [(box, text, conf)]

    blank = np.zeros((20, 60), dtype=np.uint8)
    best = reader_module._extract_best_text(FakeReader(), [blank], overlap_risk=False)
    assert best == "CLBHUF"