captcha-url-reader 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- captcha_image_reader/__init__.py +3 -0
- captcha_image_reader/cli.py +26 -0
- captcha_image_reader/reader.py +285 -0
- captcha_url_reader-1.0.0.dist-info/METADATA +73 -0
- captcha_url_reader-1.0.0.dist-info/RECORD +8 -0
- captcha_url_reader-1.0.0.dist-info/WHEEL +5 -0
- captcha_url_reader-1.0.0.dist-info/entry_points.txt +2 -0
- captcha_url_reader-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
|
|
5
|
+
from .reader import read_captcha_from_url
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the CAPTCHA reader.

    The parser accepts one positional ``image_url`` and an optional
    ``--cpu-only`` switch, exposed on the namespace as the boolean
    ``cpu_only``.
    """
    argument_specs = (
        (("image_url",), {"help": "Captcha image URL"}),
        (
            ("--cpu-only",),
            {"action": "store_true", "help": "Disable GPU and force CPU OCR"},
        ),
    )
    cli = argparse.ArgumentParser(description="Read CAPTCHA text from an image URL")
    for flags, options in argument_specs:
        cli.add_argument(*flags, **options)
    return cli
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def main(argv: list[str] | None = None) -> None:
    """Run the command-line interface.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes
            ``argparse`` read the process arguments, which is what the
            console-script entry point relies on; passing a list allows the
            CLI to be driven programmatically.
    """
    args = build_parser().parse_args(argv)
    text = read_captcha_from_url(args.image_url, prefer_gpu=not args.cpu_only)
    # ``read_captcha_from_url`` returns ``None`` on failure; an empty string
    # is reported the same way.
    print(text if text else "No text extracted")


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
from io import BytesIO
|
|
5
|
+
from itertools import product
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
import cv2
|
|
9
|
+
import easyocr
|
|
10
|
+
import numpy as np
|
|
11
|
+
import requests
|
|
12
|
+
from PIL import Image
|
|
13
|
+
|
|
14
|
+
# Lazily populated cache of OCR reader instances (filled by ``_get_reader``).
_READER_CACHE: dict[bool, Any] = dict()

# Characters the OCR engine may emit: ASCII letters followed by digits.
_ALLOWLIST = (
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789"
)

# Letters that may stand in for misread digits, with the digits to try for
# each (in the order they are tried).
_AMBIGUOUS_TO_DIGITS: dict[str, tuple[str, ...]] = dict(
    O=("9", "0"),
    Q=("0",),
    S=("5",),
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _get_reader(prefer_gpu: bool = True, languages: list[str] | None = None) -> Any:
    """Return a cached ``easyocr.Reader``, trying a GPU-backed one first.

    Args:
        prefer_gpu: When true, first try a reader created with ``gpu=True``;
            if that raises, fall back to a reader created with ``gpu=False``.
        languages: Language codes handed to ``easyocr.Reader``. Defaults to
            ``["en"]`` when omitted or empty.

    Returns:
        The reader instance stored in ``_READER_CACHE`` for the requested
        GPU mode and language set.
    """
    import logging

    langs = list(languages) if languages else ["en"]

    def cached_reader(use_gpu: bool) -> Any:
        # Key on the languages as well as the GPU flag: keying on the flag
        # alone handed back a reader built for whichever languages were
        # requested first.
        key = (use_gpu, tuple(langs))
        if key not in _READER_CACHE:
            _READER_CACHE[key] = easyocr.Reader(langs, gpu=use_gpu)
        return _READER_CACHE[key]

    if prefer_gpu:
        try:
            return cached_reader(True)
        except Exception:
            # Deliberate best-effort: any GPU initialisation failure falls
            # through to the CPU reader, but leave a trace for debugging.
            logging.getLogger(__name__).debug(
                "GPU OCR reader unavailable; falling back to CPU", exc_info=True
            )

    return cached_reader(False)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _build_variants(gray: np.ndarray, threshold: int) -> list[np.ndarray]:
    """Produce the preprocessed versions of ``gray`` that are tried with OCR.

    The returned list contains, in order: the input itself, 2x and 3x cubic
    upscales, a CLAHE-equalised copy, then for each of (input, CLAHE copy,
    2x upscale) a fixed-threshold, Otsu and adaptive-Gaussian binarisation,
    each followed by its inverse; finally dilated, eroded, morphologically
    closed and median-blurred copies of the 2x upscale and a weighted
    difference of two Gaussian blurs of the 3x upscale.

    Args:
        gray: Single-channel image array.
        threshold: Cut-off used for the fixed binary threshold.
    """

    def scaled(factor: float) -> np.ndarray:
        # Upscaling is meant to reduce the impact of overlapping glyphs.
        return cv2.resize(gray, None, fx=factor, fy=factor, interpolation=cv2.INTER_CUBIC)

    double, triple = scaled(2.0), scaled(3.0)
    equalized = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(gray)
    out: list[np.ndarray] = [gray, double, triple, equalized]

    for base in (gray, equalized, double):
        fixed = cv2.threshold(base, threshold, 255, cv2.THRESH_BINARY)[1]
        otsu = cv2.threshold(base, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
        adaptive = cv2.adaptiveThreshold(
            base, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 21, 7
        )
        # Each binarisation is followed directly by its inverted twin.
        for binary in (fixed, otsu, adaptive):
            out += [binary, cv2.bitwise_not(binary)]

    small_kernel = np.ones((2, 2), np.uint8)
    out += [
        cv2.dilate(double, small_kernel, iterations=1),
        cv2.erode(double, small_kernel, iterations=1),
        cv2.morphologyEx(double, cv2.MORPH_CLOSE, small_kernel),
        cv2.medianBlur(double, 3),
    ]

    fine_blur = cv2.GaussianBlur(triple, (0, 0), 1.2)
    coarse_blur = cv2.GaussianBlur(triple, (0, 0), 2.4)
    blend = cv2.addWeighted(fine_blur, 1.3, coarse_blur, -0.3, 0)
    out.append(np.clip(blend, 0, 255).astype(np.uint8))

    return out
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _detect_overlap_risk(gray: np.ndarray) -> bool:
    """Heuristically flag images whose characters are likely merged.

    The image is blurred, binarised with an inverted Otsu threshold and
    morphologically closed. The result is true when at most three connected
    components have an area between ``max(12, 0.1% of pixels)`` and 65% of
    the pixels, and the fraction of non-zero mask pixels lies in
    ``[0.03, 0.45]``.
    """
    smoothed = cv2.GaussianBlur(gray, (3, 3), 0)
    ink_mask = cv2.threshold(
        smoothed, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    )[1]
    closing_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    ink_mask = cv2.morphologyEx(ink_mask, cv2.MORPH_CLOSE, closing_kernel)

    label_count, _, stats, _ = cv2.connectedComponentsWithStats(ink_mask, connectivity=8)
    smallest = max(12, int(gray.size * 0.001))
    largest = int(gray.size * 0.65)
    # Label 0 is skipped; only components within the area window count.
    blobs = sum(
        1
        for label in range(1, label_count)
        if smallest <= int(stats[label, cv2.CC_STAT_AREA]) <= largest
    )

    density = float(np.count_nonzero(ink_mask)) / float(ink_mask.size)
    return blobs <= 3 and 0.03 <= density <= 0.45
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _candidate_score(text: str, confidence: float) -> float:
    """Score an OCR candidate: its confidence plus fixed plausibility bonuses.

    Bonuses: 0.3 for a length of 4 to 8 characters, 0.1 when the text is
    purely alphanumeric, and 0.22 when it contains both a letter and a digit.
    """
    has_letter = has_digit = False
    for ch in text:
        has_letter = has_letter or ch.isalpha()
        has_digit = has_digit or ch.isdigit()

    bonuses = (
        0.3 if 4 <= len(text) <= 8 else 0.0,
        0.1 if text.isalnum() else 0.0,
        0.22 if has_letter and has_digit else 0.0,
    )
    total = confidence
    for bonus in bonuses:
        total += bonus
    return total
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _readtext_with_fallback(reader: Any, arr: np.ndarray, **kwargs: Any) -> list[Any]:
    """Call ``reader.readtext`` and retry with fewer options on ``TypeError``.

    Attempts, in order:

    1. ``reader.readtext(arr, **kwargs)``
    2. ``reader.readtext(arr, detail=<detail from kwargs, default 1>)``
    3. ``reader.readtext(arr)``

    Args:
        reader: Object exposing a ``readtext`` method.
        arr: Image array to recognise.
        **kwargs: Options forwarded on the first attempt.

    Returns:
        Whatever the first successful ``readtext`` call returns.

    Raises:
        TypeError: If the final, image-only call raises it as well.
    """
    try:
        return reader.readtext(arr, **kwargs)
    except TypeError:
        # NOTE(review): a TypeError raised inside readtext for another
        # reason also lands here and silently drops the extra options.
        pass

    try:
        # Pass ``detail`` by keyword: the second positional parameter of
        # EasyOCR's ``readtext`` is ``decoder``, so a positional value would
        # be taken as the decoder instead.
        return reader.readtext(arr, detail=kwargs.get("detail", 1))
    except TypeError:
        return reader.readtext(arr)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _bbox_x(item: Any) -> float:
    """Return the smallest x coordinate of an OCR result's bounding box.

    ``item[0]`` is read as an iterable of points whose first element is x.
    Any failure to read it that way yields ``0.0``.
    """
    try:
        xs = [corner[0] for corner in item[0]]
        return float(min(xs))
    except Exception:
        # Items without a usable box all get the same sort position.
        return 0.0
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def _extract_text_conf(result: list[Any]) -> tuple[str, float] | None:
    """Collapse one OCR result list into ``(text, confidence)``.

    Items are ordered left to right by ``_bbox_x``. For a tuple or list of
    length two or more, element 1 is the text and element 2, if present and
    convertible to float, is a confidence. Any other item is stringified
    whole. The joined text is reduced to its alphanumeric characters.

    Returns:
        ``None`` when ``result`` is empty or no alphanumeric character
        remains; otherwise the cleaned text with the mean of the collected
        confidences (``0.5`` when none were collected).
    """
    if not result:
        return None

    pieces: list[str] = []
    scores: list[float] = []
    for entry in sorted(result, key=_bbox_x):
        if not (isinstance(entry, (tuple, list)) and len(entry) >= 2):
            pieces.append(str(entry))
            continue
        pieces.append(str(entry[1]))
        if len(entry) >= 3:
            try:
                scores.append(float(entry[2]))
            except Exception:
                # A confidence that cannot be converted is simply left out.
                pass

    alnum_only = "".join(filter(str.isalnum, "".join(pieces).strip()))
    if not alnum_only:
        return None

    return alnum_only, (float(np.mean(scores)) if scores else 0.5)
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def _iter_digit_variants(text: str, max_replacements: int = 2) -> list[tuple[str, int]]:
    """List variants of ``text`` with ambiguous letters swapped for digits.

    A character is ambiguous when its upper-case form is a key of
    ``_AMBIGUOUS_TO_DIGITS``. Variants are produced for every choice of 1,
    2, ... up to ``max_replacements`` ambiguous positions and every
    combination of their replacement digits: all single swaps first, then
    all double swaps, and so on.

    Args:
        text: Candidate string to transform.
        max_replacements: Largest number of positions replaced at once.
            Values below 1 yield no variants.

    Returns:
        ``(variant, replacement_count)`` tuples; empty when ``text`` has no
        ambiguous character.
    """
    from itertools import combinations

    slots: list[tuple[int, tuple[str, ...]]] = []
    for idx, ch in enumerate(text):
        digits = _AMBIGUOUS_TO_DIGITS.get(ch.upper())
        if digits:
            slots.append((idx, digits))

    variants: list[tuple[str, int]] = []
    # Never ask for more simultaneous swaps than there are ambiguous slots.
    limit = min(max_replacements, len(slots))
    for count in range(1, limit + 1):
        # ``combinations`` keeps positions in ascending order, matching the
        # former ``idx1 < idx2`` filter over ``product``.
        for chosen in combinations(slots, count):
            positions = [idx for idx, _ in chosen]
            for picked in product(*(options for _, options in chosen)):
                chars = list(text)
                for pos, digit in zip(positions, picked):
                    chars[pos] = digit
                variants.append(("".join(chars), count))

    return variants
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def _extract_best_text(reader: Any, variants: list[np.ndarray], *, overlap_risk: bool = False) -> str | None:
    """Run OCR over every image variant and return the winning candidate.

    Each variant is read with the ``greedy`` and ``beamsearch`` decoders.
    Every cleaned reading adds its ``_candidate_score`` to that text's
    ``votes`` and its confidence to ``conf_sum``. The winner has the largest
    ``(votes, conf_sum, length)``.

    With ``overlap_risk`` set, two extra steps apply:

    * if any reading contained a digit, digit-substituted variants of every
      reading are credited with a per-replacement penalty;
    * if the winner has no digit, its digit-substituted variants inherit a
      share of its totals and the winner is chosen again.

    Returns:
        The best candidate text, or ``None`` when no reading produced text.
    """
    tally: dict[str, dict[str, float]] = defaultdict(
        lambda: {"votes": 0.0, "conf_sum": 0.0}
    )
    readings: list[tuple[str, float, float]] = []

    def has_digit(value: str) -> bool:
        return any(ch.isdigit() for ch in value)

    def credit(candidate: str, votes: float, conf_sum: float) -> None:
        entry = tally[candidate]
        entry["votes"] += votes
        entry["conf_sum"] += conf_sum

    def leader() -> str:
        winner, _ = max(
            tally.items(),
            key=lambda kv: (kv[1]["votes"], kv[1]["conf_sum"], len(kv[0])),
        )
        return winner

    for image, decoder in product(variants, ("greedy", "beamsearch")):
        parsed = _extract_text_conf(
            _readtext_with_fallback(
                reader,
                image,
                detail=1,
                paragraph=False,
                allowlist=_ALLOWLIST,
                decoder=decoder,
                beamWidth=5,
            )
        )
        if not parsed:
            continue
        cleaned, conf = parsed
        score = _candidate_score(cleaned, conf)
        credit(cleaned, score, conf)
        readings.append((cleaned, conf, score))

    if overlap_risk and any(has_digit(text) for text, _, _ in readings):
        for text, conf, score in readings:
            for transformed, swaps in _iter_digit_variants(text):
                credit(
                    transformed,
                    score - (0.12 * swaps),
                    max(0.0, conf - (0.06 * swaps)),
                )

    if not tally:
        return None

    top = leader()
    if not overlap_risk or has_digit(top):
        return top

    # In overlap-heavy captchas, OCR often reads 9/5 as O/S, so a digit-free
    # winner lends part of its totals to its digit-substituted variants.
    base_votes = tally[top]["votes"]
    base_conf = tally[top]["conf_sum"]
    for transformed, swaps in _iter_digit_variants(top, max_replacements=2):
        if transformed == top or not has_digit(transformed):
            continue
        vote_factor, conf_factor = (1.08, 0.92) if swaps >= 2 else (0.56, 0.52)
        credit(
            transformed,
            max(0.0, base_votes * vote_factor),
            max(0.0, base_conf * conf_factor),
        )

    return leader()
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
def read_captcha_from_url(
    image_url: str,
    *,
    prefer_gpu: bool = True,
    timeout: int = 20,
    threshold: int = 150,
    force_overlap_risk: bool | None = None,
) -> str | None:
    """Download a CAPTCHA image and return the text recognised in it.

    Args:
        image_url: URL of the image to fetch.
        prefer_gpu: Forwarded to ``_get_reader``.
        timeout: Timeout passed to ``requests.get``.
        threshold: Fixed binarisation cut-off passed to ``_build_variants``.
        force_overlap_risk: ``None`` lets ``_detect_overlap_risk`` decide;
            any other value is converted to ``bool`` and used as-is.

    Returns:
        The extracted text, or ``None`` when nothing was extracted or any
        step (download, decoding, OCR) raised. The exception is logged at
        debug level rather than propagated.
    """
    import logging

    try:
        response = requests.get(image_url, timeout=timeout)
        response.raise_for_status()

        # Close the decoded source image once the grayscale array exists.
        with Image.open(BytesIO(response.content)) as image:
            gray = np.array(image.convert("L"))

        variants = _build_variants(gray, threshold=threshold)
        if force_overlap_risk is None:
            overlap_risk = _detect_overlap_risk(gray)
        else:
            overlap_risk = bool(force_overlap_risk)

        reader = _get_reader(prefer_gpu=prefer_gpu)
        return _extract_best_text(reader, variants, overlap_risk=overlap_risk)
    except Exception:
        # Best-effort API: callers get ``None``, but the cause stays
        # discoverable through logging instead of vanishing.
        logging.getLogger(__name__).debug(
            "CAPTCHA extraction failed for %s", image_url, exc_info=True
        )
        return None
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: captcha-url-reader
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Simple Python package: pass CAPTCHA image URL and get extracted text
|
|
5
|
+
Author: Arif Shah
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.10
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: easyocr>=1.7.1
|
|
10
|
+
Requires-Dist: opencv-python>=4.9.0
|
|
11
|
+
Requires-Dist: Pillow>=10.2.0
|
|
12
|
+
Requires-Dist: numpy>=1.26.0
|
|
13
|
+
Requires-Dist: requests>=2.31.0
|
|
14
|
+
Provides-Extra: dev
|
|
15
|
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
|
16
|
+
|
|
17
|
+
# captcha-url-reader
|
|
18
|
+
|
|
19
|
+
A simple package: pass a CAPTCHA image URL and get back the text extracted from the image.
|
|
20
|
+
|
|
21
|
+
## Install
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pip install -e .
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Usage
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
from captcha_image_reader import read_captcha_from_url
|
|
31
|
+
|
|
32
|
+
# Default mode (recommended for Amazon-style captchas)
|
|
33
|
+
captcha_text = read_captcha_from_url("https://images-na.ssl-images-amazon.com/captcha/sgkknrsj/Captcha_iwrdailhkf.jpg")
|
|
34
|
+
if captcha_text:
|
|
35
|
+
print(f"CAPTCHA text extracted: {captcha_text}")
|
|
36
|
+
else:
|
|
37
|
+
print("No text extracted from image URL.")
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Overlap-heavy captcha mode
|
|
41
|
+
|
|
42
|
+
Use forced overlap mode only when text is merged/overlapping and default mode is not accurate.
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
from captcha_image_reader import read_captcha_from_url
|
|
46
|
+
|
|
47
|
+
captcha_text = read_captcha_from_url(
|
|
48
|
+
"https://2captcha.com/dist/web/assets/captcha-rn1S3orp.jpg",
|
|
49
|
+
force_overlap_risk=True,
|
|
50
|
+
)
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## When to use which mode
|
|
54
|
+
|
|
55
|
+
- Use default mode for clean or mostly non-overlapping text (for example, most Amazon captchas).
|
|
56
|
+
- Use `force_overlap_risk=True` only when characters are merged and default extraction is wrong.
|
|
57
|
+
|
|
58
|
+
## Example scripts
|
|
59
|
+
|
|
60
|
+
- Default/Amazon style: `examples/read_amazon_default.py`
|
|
61
|
+
- Overlap-heavy style: `examples/read_overlap_captcha.py`
|
|
62
|
+
|
|
63
|
+
Run them directly:
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
./.venv/bin/python examples/read_amazon_default.py
|
|
67
|
+
./.venv/bin/python examples/read_overlap_captcha.py
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## GPU behavior
|
|
71
|
+
|
|
72
|
+
- Uses GPU first by default.
|
|
73
|
+
- If GPU is not available or fails, automatically falls back to CPU.
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
captcha_image_reader/__init__.py,sha256=55imgvNhUXSFyNsvKh_Z1Iiu3qwrIIvZ1j0vXyhotGY,79
|
|
2
|
+
captcha_image_reader/cli.py,sha256=8whVxkDOjc7TDD-3G31zR4DcArcstiNNC-t5sKEm0c0,688
|
|
3
|
+
captcha_image_reader/reader.py,sha256=fxjtWBy7dpHRcvI46Cn8wOmlGpxu1aBGjZggXpQchcA,9600
|
|
4
|
+
captcha_url_reader-1.0.0.dist-info/METADATA,sha256=rc3r1XiSf9YNNBe97Vd5XI7cm-G9C1RoU8LZXaSmHv0,2131
|
|
5
|
+
captcha_url_reader-1.0.0.dist-info/WHEEL,sha256=YCfwYGOYMi5Jhw2fU4yNgwErybb2IX5PEwBKV4ZbdBo,91
|
|
6
|
+
captcha_url_reader-1.0.0.dist-info/entry_points.txt,sha256=NpOle_Ounm2mpcsF5ahUOMsvA3VaYh1IiQpiv8-EILM,69
|
|
7
|
+
captcha_url_reader-1.0.0.dist-info/top_level.txt,sha256=WmV9_fWsM4xvIm19nfgR_HFVVt0k2hK1mnDGAwfMLkM,21
|
|
8
|
+
captcha_url_reader-1.0.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
captcha_image_reader
|