captcha-url-reader 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- captcha_url_reader-1.0.0/PKG-INFO +73 -0
- captcha_url_reader-1.0.0/README.md +57 -0
- captcha_url_reader-1.0.0/pyproject.toml +36 -0
- captcha_url_reader-1.0.0/setup.cfg +4 -0
- captcha_url_reader-1.0.0/src/captcha_image_reader/__init__.py +3 -0
- captcha_url_reader-1.0.0/src/captcha_image_reader/cli.py +26 -0
- captcha_url_reader-1.0.0/src/captcha_image_reader/reader.py +285 -0
- captcha_url_reader-1.0.0/src/captcha_url_reader.egg-info/PKG-INFO +73 -0
- captcha_url_reader-1.0.0/src/captcha_url_reader.egg-info/SOURCES.txt +12 -0
- captcha_url_reader-1.0.0/src/captcha_url_reader.egg-info/dependency_links.txt +1 -0
- captcha_url_reader-1.0.0/src/captcha_url_reader.egg-info/entry_points.txt +2 -0
- captcha_url_reader-1.0.0/src/captcha_url_reader.egg-info/requires.txt +8 -0
- captcha_url_reader-1.0.0/src/captcha_url_reader.egg-info/top_level.txt +1 -0
- captcha_url_reader-1.0.0/tests/test_reader.py +99 -0
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: captcha-url-reader
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Simple Python package: pass CAPTCHA image URL and get extracted text
|
|
5
|
+
Author: Arif Shah
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.10
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: easyocr>=1.7.1
|
|
10
|
+
Requires-Dist: opencv-python>=4.9.0
|
|
11
|
+
Requires-Dist: Pillow>=10.2.0
|
|
12
|
+
Requires-Dist: numpy>=1.26.0
|
|
13
|
+
Requires-Dist: requests>=2.31.0
|
|
14
|
+
Provides-Extra: dev
|
|
15
|
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
|
16
|
+
|
|
17
|
+
# captcha-url-reader
|
|
18
|
+
|
|
19
|
+
Simple package: user passes captcha image URL, package reads and returns text.
|
|
20
|
+
|
|
21
|
+
## Install
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pip install -e .
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Usage
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
from captcha_image_reader import read_captcha_from_url
|
|
31
|
+
|
|
32
|
+
# Default mode (recommended for Amazon-style captchas)
|
|
33
|
+
captcha_text = read_captcha_from_url("https://images-na.ssl-images-amazon.com/captcha/sgkknrsj/Captcha_iwrdailhkf.jpg")
|
|
34
|
+
if captcha_text:
|
|
35
|
+
print(f"CAPTCHA text extracted: {captcha_text}")
|
|
36
|
+
else:
|
|
37
|
+
print("No text extracted from image URL.")
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Overlap-heavy captcha mode
|
|
41
|
+
|
|
42
|
+
Use forced overlap mode only when text is merged/overlapping and default mode is not accurate.
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
from captcha_image_reader import read_captcha_from_url
|
|
46
|
+
|
|
47
|
+
captcha_text = read_captcha_from_url(
|
|
48
|
+
"https://2captcha.com/dist/web/assets/captcha-rn1S3orp.jpg",
|
|
49
|
+
force_overlap_risk=True,
|
|
50
|
+
)
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## When to use which mode
|
|
54
|
+
|
|
55
|
+
- Use default mode for clean or mostly non-overlapping text (for example, most Amazon captchas).
|
|
56
|
+
- Use `force_overlap_risk=True` only when characters are merged and default extraction is wrong.
|
|
57
|
+
|
|
58
|
+
## Example scripts
|
|
59
|
+
|
|
60
|
+
- Default/Amazon style: `/Users/arif.shah/PycharmProjects/captcha-url-reader/examples/read_amazon_default.py`
|
|
61
|
+
- Overlap-heavy style: `/Users/arif.shah/PycharmProjects/captcha-url-reader/examples/read_overlap_captcha.py`
|
|
62
|
+
|
|
63
|
+
Run them directly:
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
./.venv/bin/python /Users/arif.shah/PycharmProjects/captcha-url-reader/examples/read_amazon_default.py
|
|
67
|
+
./.venv/bin/python /Users/arif.shah/PycharmProjects/captcha-url-reader/examples/read_overlap_captcha.py
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## GPU behavior
|
|
71
|
+
|
|
72
|
+
- Uses GPU first by default.
|
|
73
|
+
- If GPU is not available or fails, automatically falls back to CPU.
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
# captcha-url-reader
|
|
2
|
+
|
|
3
|
+
A simple package: pass it a CAPTCHA image URL and it reads the image and returns the text.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install -e .
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Usage
|
|
12
|
+
|
|
13
|
+
```python
|
|
14
|
+
from captcha_image_reader import read_captcha_from_url
|
|
15
|
+
|
|
16
|
+
# Default mode (recommended for Amazon-style captchas)
|
|
17
|
+
captcha_text = read_captcha_from_url("https://images-na.ssl-images-amazon.com/captcha/sgkknrsj/Captcha_iwrdailhkf.jpg")
|
|
18
|
+
if captcha_text:
|
|
19
|
+
print(f"CAPTCHA text extracted: {captcha_text}")
|
|
20
|
+
else:
|
|
21
|
+
print("No text extracted from image URL.")
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Overlap-heavy captcha mode
|
|
25
|
+
|
|
26
|
+
Use forced overlap mode only when text is merged/overlapping and default mode is not accurate.
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
from captcha_image_reader import read_captcha_from_url
|
|
30
|
+
|
|
31
|
+
captcha_text = read_captcha_from_url(
|
|
32
|
+
"https://2captcha.com/dist/web/assets/captcha-rn1S3orp.jpg",
|
|
33
|
+
force_overlap_risk=True,
|
|
34
|
+
)
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## When to use which mode
|
|
38
|
+
|
|
39
|
+
- Use default mode for clean or mostly non-overlapping text (for example, most Amazon captchas).
|
|
40
|
+
- Use `force_overlap_risk=True` only when characters are merged and default extraction is wrong.
|
|
41
|
+
|
|
42
|
+
## Example scripts
|
|
43
|
+
|
|
44
|
+
- Default/Amazon style: `examples/read_amazon_default.py`
|
|
45
|
+
- Overlap-heavy style: `examples/read_overlap_captcha.py`
|
|
46
|
+
|
|
47
|
+
Run them directly:
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
./.venv/bin/python examples/read_amazon_default.py
|
|
51
|
+
./.venv/bin/python examples/read_overlap_captcha.py
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## GPU behavior
|
|
55
|
+
|
|
56
|
+
- Uses GPU first by default.
|
|
57
|
+
- If GPU is not available or fails, automatically falls back to CPU.
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=69", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "captcha-url-reader"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
description = "Simple Python package: pass CAPTCHA image URL and get extracted text"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
authors = [{ name = "Arif Shah" }]
|
|
12
|
+
license = { text = "MIT" }
|
|
13
|
+
dependencies = [
|
|
14
|
+
"easyocr>=1.7.1",
|
|
15
|
+
"opencv-python>=4.9.0",
|
|
16
|
+
"Pillow>=10.2.0",
|
|
17
|
+
"numpy>=1.26.0",
|
|
18
|
+
"requests>=2.31.0"
|
|
19
|
+
]
|
|
20
|
+
|
|
21
|
+
[project.optional-dependencies]
|
|
22
|
+
dev = ["pytest>=8.0.0"]
|
|
23
|
+
|
|
24
|
+
[project.scripts]
|
|
25
|
+
captcha-url-reader = "captcha_image_reader.cli:main"
|
|
26
|
+
|
|
27
|
+
[tool.setuptools]
|
|
28
|
+
package-dir = {"" = "src"}
|
|
29
|
+
|
|
30
|
+
[tool.setuptools.packages.find]
|
|
31
|
+
where = ["src"]
|
|
32
|
+
|
|
33
|
+
[tool.pytest.ini_options]
|
|
34
|
+
pythonpath = ["src"]
|
|
35
|
+
testpaths = ["tests"]
|
|
36
|
+
addopts = "-q"
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
|
|
5
|
+
from .reader import read_captcha_from_url
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def build_parser() -> argparse.ArgumentParser:
    """Create the argument parser used by the command-line entry point.

    Returns:
        A parser that accepts one positional ``image_url`` and an optional
        ``--cpu-only`` switch (exposed as ``cpu_only``; ``False`` unless the
        flag is given).
    """
    cli = argparse.ArgumentParser(
        description="Read CAPTCHA text from an image URL",
    )
    cli.add_argument("image_url", help="Captcha image URL")
    cli.add_argument(
        "--cpu-only",
        action="store_true",
        help="Disable GPU and force CPU OCR",
    )
    return cli
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def main() -> None:
    """Run the CLI: parse ``sys.argv``, read the CAPTCHA, print the outcome.

    Prints the extracted text, or ``No text extracted`` when the reader
    returns a falsy value (``None`` or an empty string).
    """
    options = build_parser().parse_args()
    extracted = read_captcha_from_url(
        options.image_url,
        # --cpu-only turns the GPU preference off.
        prefer_gpu=not options.cpu_only,
    )
    print(extracted if extracted else "No text extracted")


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
from io import BytesIO
|
|
5
|
+
from itertools import product
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
import cv2
|
|
9
|
+
import easyocr
|
|
10
|
+
import numpy as np
|
|
11
|
+
import requests
|
|
12
|
+
from PIL import Image
|
|
13
|
+
|
|
14
|
+
# Process-wide cache of OCR readers. Keyed by (gpu flag, language tuple) so a
# reader built for one language set is never returned for another.
_READER_CACHE: dict[tuple[bool, tuple[str, ...]], Any] = {}
# Characters the OCR engine is allowed to emit: ASCII letters and digits.
_ALLOWLIST = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
# Letters that may be re-read as digits; the tuple order is the order in which
# the alternatives are generated.
_AMBIGUOUS_TO_DIGITS: dict[str, tuple[str, ...]] = {
    "O": ("9", "0"),
    "Q": ("0",),
    "S": ("5",),
}


def _get_reader(prefer_gpu: bool = True, languages: list[str] | None = None) -> Any:
    """Return a cached ``easyocr.Reader``, building it on first use.

    Args:
        prefer_gpu: Try a GPU reader first; if building it raises, fall back
            to a CPU reader.
        languages: Language codes for the reader. ``None`` or an empty list
            means ``["en"]``.

    Returns:
        The reader for the requested languages. Repeated calls with the same
        languages and device return the same object.

    Raises:
        Exception: Whatever ``easyocr.Reader`` raises when the CPU reader
            cannot be built (GPU failures are logged and swallowed).
    """
    langs = list(languages) if languages else ["en"]
    lang_key = tuple(langs)

    if prefer_gpu:
        gpu_key = (True, lang_key)
        try:
            if gpu_key not in _READER_CACHE:
                _READER_CACHE[gpu_key] = easyocr.Reader(langs, gpu=True)
            return _READER_CACHE[gpu_key]
        except Exception:
            # Deliberate best-effort: any GPU failure means "use the CPU".
            # Record it instead of discarding it silently.
            import logging

            logging.getLogger(__name__).debug(
                "GPU reader unavailable; falling back to CPU", exc_info=True
            )

    cpu_key = (False, lang_key)
    if cpu_key not in _READER_CACHE:
        _READER_CACHE[cpu_key] = easyocr.Reader(langs, gpu=False)
    return _READER_CACHE[cpu_key]
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _build_variants(gray: np.ndarray, threshold: int) -> list[np.ndarray]:
    """Produce the preprocessed copies of a grayscale image that are fed to OCR.

    The result order is fixed: the input, its 2x and 3x upscales, a CLAHE
    contrast-normalized copy, then for each of (input, CLAHE, 2x) a fixed,
    an Otsu and an adaptive binarization, each followed by its inverse, and
    finally dilate / erode / close / median-blur of the 2x image plus a
    weighted blend of two Gaussian blurs of the 3x image.

    Args:
        gray: Grayscale image array (the caller in this module passes a
            PIL ``"L"``-mode image converted with ``np.array``).
        threshold: Cut-off used for the fixed binary threshold.

    Returns:
        The 27 image variants in the order described above.
    """

    def _upscaled(factor: float) -> np.ndarray:
        # Upscaling reduces the impact of overlapping glyphs on detection.
        return cv2.resize(
            gray, None, fx=factor, fy=factor, interpolation=cv2.INTER_CUBIC
        )

    def _with_inverse(image: np.ndarray) -> list[np.ndarray]:
        return [image, cv2.bitwise_not(image)]

    double = _upscaled(2.0)
    triple = _upscaled(3.0)
    equalized = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(gray)

    out: list[np.ndarray] = [gray, double, triple, equalized]

    # Three binarizations per base image, each paired with its inverse.
    for base in (gray, equalized, double):
        fixed = cv2.threshold(base, threshold, 255, cv2.THRESH_BINARY)[1]
        otsu = cv2.threshold(base, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
        adaptive = cv2.adaptiveThreshold(
            base, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 21, 7
        )
        for binarized in (fixed, otsu, adaptive):
            out += _with_inverse(binarized)

    box = np.ones((2, 2), np.uint8)
    out += [
        cv2.dilate(double, box, iterations=1),
        cv2.erode(double, box, iterations=1),
        cv2.morphologyEx(double, cv2.MORPH_CLOSE, box),
        cv2.medianBlur(double, 3),
    ]

    soft = cv2.GaussianBlur(triple, (0, 0), 1.2)
    softer = cv2.GaussianBlur(triple, (0, 0), 2.4)
    blended = cv2.addWeighted(soft, 1.3, softer, -0.3, 0)
    out.append(np.clip(blended, 0, 255).astype(np.uint8))

    return out
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _detect_overlap_risk(gray: np.ndarray) -> bool:
    """Guess whether the characters in a grayscale CAPTCHA are merged together.

    The image is blurred, Otsu-binarized with ink as non-zero, and closed
    with a 2x2 rectangle. The result is ``True`` when at most 3 connected
    components have an area between ``max(12, 0.1% of pixels)`` and 65% of
    pixels, and the non-zero ("ink") fraction lies in ``[0.03, 0.45]``.

    Args:
        gray: Grayscale image array.

    Returns:
        ``True`` if the blob count and ink density suggest overlap.
    """
    smoothed = cv2.GaussianBlur(gray, (3, 3), 0)
    ink = cv2.threshold(
        smoothed, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    )[1]
    closing_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    ink = cv2.morphologyEx(ink, cv2.MORPH_CLOSE, closing_kernel)

    label_total, _, stats, _ = cv2.connectedComponentsWithStats(ink, connectivity=8)
    smallest = max(12, int(gray.size * 0.001))
    largest = int(gray.size * 0.65)

    # Label 0 is skipped (OpenCV reports the background component there).
    blobs = sum(
        1
        for label in range(1, label_total)
        if smallest <= int(stats[label, cv2.CC_STAT_AREA]) <= largest
    )

    density = float(np.count_nonzero(ink)) / float(ink.size)
    if blobs > 3:
        return False
    return 0.03 <= density <= 0.45
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _candidate_score(text: str, confidence: float) -> float:
    """Score an OCR reading: its confidence plus fixed plausibility bonuses.

    Bonuses: +0.3 for a length of 4 to 8 characters, +0.1 when the text is
    purely alphanumeric, +0.22 when it contains both a letter and a digit.

    Args:
        text: Candidate CAPTCHA text.
        confidence: OCR confidence for that text.

    Returns:
        ``confidence`` plus every bonus that applies.
    """
    has_letter = any(ch.isalpha() for ch in text)
    has_digit = any(ch.isdigit() for ch in text)

    # Bonuses are added in a fixed order so the float result is reproducible.
    score = confidence
    score += 0.3 if 4 <= len(text) <= 8 else 0.0
    score += 0.1 if text.isalnum() else 0.0
    score += 0.22 if has_letter and has_digit else 0.0
    return score
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _readtext_with_fallback(reader: Any, arr: np.ndarray, **kwargs: Any) -> list[Any]:
    """Call ``reader.readtext`` with as many of the options as it accepts.

    Attempts, in order, each retried only when the previous one raised
    ``TypeError``:

    1. ``readtext(arr, **kwargs)``
    2. ``readtext(arr, detail=<detail>)``
    3. ``readtext(arr, <detail>)`` (positional, for readers whose second
       parameter cannot be passed by the name ``detail``)
    4. ``readtext(arr)``

    ``detail`` is passed by keyword before it is tried positionally because
    EasyOCR's ``readtext`` takes ``decoder`` as its second positional
    parameter; a positional ``detail`` would be read as the decoder.

    Args:
        reader: Object exposing a ``readtext`` method.
        arr: Image to read.
        **kwargs: Options for ``readtext``; ``detail`` defaults to 1 in the
            reduced attempts.

    Returns:
        Whatever the first successful ``readtext`` call returns.

    Raises:
        TypeError: If even the bare ``readtext(arr)`` call is rejected.
    """
    detail = kwargs.get("detail", 1)
    attempts = (
        lambda: reader.readtext(arr, **kwargs),
        lambda: reader.readtext(arr, detail=detail),
        lambda: reader.readtext(arr, detail),
    )
    for attempt in attempts:
        try:
            return attempt()
        except TypeError:
            continue
    # Last resort; a TypeError from this call propagates to the caller.
    return reader.readtext(arr)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _bbox_x(item: Any) -> float:
    """Return the left-most x coordinate of an OCR result's bounding box.

    Args:
        item: An OCR result whose first element is a sequence of points,
            each point starting with its x coordinate.

    Returns:
        The smallest x as a float, or ``0.0`` when ``item`` does not have
        that shape (any exception is treated as "no position").
    """
    try:
        xs = [corner[0] for corner in item[0]]
        return float(min(xs))
    except Exception:
        return 0.0
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def _extract_text_conf(result: list[Any]) -> tuple[str, float] | None:
    """Collapse raw OCR results into one alphanumeric string and a confidence.

    Results are ordered left to right by ``_bbox_x``. For tuple/list entries
    with at least two elements, element 1 is the text and element 2 (when
    present and convertible to float) is its confidence; any other entry is
    used as text via ``str``.

    Args:
        result: Raw OCR output.

    Returns:
        ``(text, confidence)`` where ``text`` keeps only alphanumeric
        characters and ``confidence`` is the mean of the collected
        confidences (``0.5`` if there were none), or ``None`` when
        ``result`` is empty or no alphanumeric character remains.
    """
    if not result:
        return None

    pieces: list[str] = []
    scores: list[float] = []
    for entry in sorted(result, key=_bbox_x):
        structured = isinstance(entry, (tuple, list)) and len(entry) >= 2
        if not structured:
            pieces.append(str(entry))
            continue

        pieces.append(str(entry[1]))
        if len(entry) < 3:
            continue
        try:
            scores.append(float(entry[2]))
        except Exception:
            # An unusable confidence is simply left out of the mean.
            pass

    joined = "".join(pieces).strip()
    cleaned = "".join(filter(str.isalnum, joined))
    if not cleaned:
        return None

    confidence = float(np.mean(scores)) if scores else 0.5
    return cleaned, confidence
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def _iter_digit_variants(text: str, max_replacements: int = 2) -> list[tuple[str, int]]:
    """List copies of ``text`` with ambiguous letters swapped for digits.

    A character is ambiguous when its upper-case form is a key of
    ``_AMBIGUOUS_TO_DIGITS``. All single-position swaps come first (left to
    right), followed by all two-position swaps when at least two positions
    are ambiguous and ``max_replacements >= 2``.

    Args:
        text: The OCR reading to vary.
        max_replacements: Two-position swaps are produced only when this is
            at least 2; single swaps are always produced.

    Returns:
        ``(variant, number_of_swaps)`` pairs; empty when nothing in ``text``
        is ambiguous.
    """
    swappable: list[tuple[int, tuple[str, ...]]] = [
        (pos, _AMBIGUOUS_TO_DIGITS[ch.upper()])
        for pos, ch in enumerate(text)
        if _AMBIGUOUS_TO_DIGITS.get(ch.upper())
    ]
    if not swappable:
        return []

    def _swap(*edits: tuple[int, str]) -> str:
        chars = list(text)
        for pos, digit in edits:
            chars[pos] = digit
        return "".join(chars)

    out: list[tuple[str, int]] = [
        (_swap((pos, digit)), 1)
        for pos, digits in swappable
        for digit in digits
    ]

    if len(swappable) >= 2 and max_replacements >= 2:
        # Every pair of distinct positions, earlier position first.
        for first, (pos_a, digits_a) in enumerate(swappable):
            for pos_b, digits_b in swappable[first + 1:]:
                out.extend(
                    (_swap((pos_a, digit_a), (pos_b, digit_b)), 2)
                    for digit_a, digit_b in product(digits_a, digits_b)
                )

    return out
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def _extract_best_text(reader: Any, variants: list[np.ndarray], *, overlap_risk: bool = False) -> str | None:
    """Run OCR over every image variant and return the best-supported text.

    Each variant is read with the ``greedy`` and ``beamsearch`` decoders.
    Every reading adds its ``_candidate_score`` to that text's ``votes`` and
    its confidence to ``conf_sum``. The winner is the text with the largest
    ``(votes, conf_sum, length)``.

    With ``overlap_risk`` set, digit substitutions from
    ``_iter_digit_variants`` also compete:

    * if any reading contained a digit, each substitution of each reading
      receives that reading's score minus 0.12 per swap;
    * if the winner still has no digit, its digit-containing substitutions
      inherit a share of its votes (x1.08 for two swaps, x0.56 for one) and
      the winner is chosen again.

    Args:
        reader: OCR reader passed to ``_readtext_with_fallback``.
        variants: Images to read.
        overlap_risk: Enable the digit-substitution handling above.

    Returns:
        The winning text, or ``None`` when no variant produced any text.
    """
    tally: dict[str, dict[str, float]] = defaultdict(
        lambda: {"votes": 0.0, "conf_sum": 0.0}
    )
    readings: list[tuple[str, float, float]] = []

    def _has_digit(value: str) -> bool:
        return any(ch.isdigit() for ch in value)

    def _leader() -> str:
        return max(
            tally.items(),
            key=lambda kv: (kv[1]["votes"], kv[1]["conf_sum"], len(kv[0])),
        )[0]

    for image, decoder in product(variants, ("greedy", "beamsearch")):
        raw = _readtext_with_fallback(
            reader,
            image,
            detail=1,
            paragraph=False,
            allowlist=_ALLOWLIST,
            decoder=decoder,
            beamWidth=5,
        )
        parsed = _extract_text_conf(raw)
        if not parsed:
            continue

        cleaned, conf = parsed
        score = _candidate_score(cleaned, conf)
        entry = tally[cleaned]
        entry["votes"] += score
        entry["conf_sum"] += conf
        readings.append((cleaned, conf, score))

    if overlap_risk and any(_has_digit(seen) for seen, _, _ in readings):
        # Some reading saw a digit, so letter-to-digit swaps are plausible.
        for seen, conf, score in readings:
            for swapped, count in _iter_digit_variants(seen):
                entry = tally[swapped]
                entry["votes"] += score - (0.12 * count)
                entry["conf_sum"] += max(0.0, conf - (0.06 * count))

    if not tally:
        return None

    best = _leader()
    if not overlap_risk or _has_digit(best):
        return best

    # In overlap-heavy captchas, OCR often reads 9/5 as O/S.
    base_votes = tally[best]["votes"]
    base_conf = tally[best]["conf_sum"]
    for swapped, count in _iter_digit_variants(best, max_replacements=2):
        if swapped == best or not _has_digit(swapped):
            continue
        vote_factor, conf_factor = (1.08, 0.92) if count >= 2 else (0.56, 0.52)
        tally[swapped]["votes"] += max(0.0, base_votes * vote_factor)
        tally[swapped]["conf_sum"] += max(0.0, base_conf * conf_factor)

    return _leader()
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
def read_captcha_from_url(
    image_url: str,
    *,
    prefer_gpu: bool = True,
    timeout: int = 20,
    threshold: int = 150,
    force_overlap_risk: bool | None = None,
) -> str | None:
    """Download a CAPTCHA image and return the text read from it.

    Args:
        image_url: URL of the image to fetch with ``requests.get``.
        prefer_gpu: Passed to ``_get_reader``; try a GPU reader first.
        timeout: Timeout passed to ``requests.get``.
        threshold: Cut-off for the fixed binarization in ``_build_variants``.
        force_overlap_risk: ``None`` lets ``_detect_overlap_risk`` decide;
            any other value is converted with ``bool`` and used as-is.

    Returns:
        The extracted text, or ``None`` when nothing was read or when any
        step raised (download, decoding and OCR errors are all swallowed).
    """
    try:
        reply = requests.get(image_url, timeout=timeout)
        reply.raise_for_status()

        picture = Image.open(BytesIO(reply.content))
        gray = np.array(picture.convert("L"))
        variants = _build_variants(gray, threshold=threshold)

        if force_overlap_risk is None:
            overlap_risk = _detect_overlap_risk(gray)
        else:
            overlap_risk = bool(force_overlap_risk)

        ocr = _get_reader(prefer_gpu=prefer_gpu)
        return _extract_best_text(ocr, variants, overlap_risk=overlap_risk)
    except Exception:
        return None
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: captcha-url-reader
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Simple Python package: pass CAPTCHA image URL and get extracted text
|
|
5
|
+
Author: Arif Shah
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.10
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: easyocr>=1.7.1
|
|
10
|
+
Requires-Dist: opencv-python>=4.9.0
|
|
11
|
+
Requires-Dist: Pillow>=10.2.0
|
|
12
|
+
Requires-Dist: numpy>=1.26.0
|
|
13
|
+
Requires-Dist: requests>=2.31.0
|
|
14
|
+
Provides-Extra: dev
|
|
15
|
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
|
16
|
+
|
|
17
|
+
# captcha-url-reader
|
|
18
|
+
|
|
19
|
+
Simple package: user passes captcha image URL, package reads and returns text.
|
|
20
|
+
|
|
21
|
+
## Install
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pip install -e .
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Usage
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
from captcha_image_reader import read_captcha_from_url
|
|
31
|
+
|
|
32
|
+
# Default mode (recommended for Amazon-style captchas)
|
|
33
|
+
captcha_text = read_captcha_from_url("https://images-na.ssl-images-amazon.com/captcha/sgkknrsj/Captcha_iwrdailhkf.jpg")
|
|
34
|
+
if captcha_text:
|
|
35
|
+
print(f"CAPTCHA text extracted: {captcha_text}")
|
|
36
|
+
else:
|
|
37
|
+
print("No text extracted from image URL.")
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Overlap-heavy captcha mode
|
|
41
|
+
|
|
42
|
+
Use forced overlap mode only when text is merged/overlapping and default mode is not accurate.
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
from captcha_image_reader import read_captcha_from_url
|
|
46
|
+
|
|
47
|
+
captcha_text = read_captcha_from_url(
|
|
48
|
+
"https://2captcha.com/dist/web/assets/captcha-rn1S3orp.jpg",
|
|
49
|
+
force_overlap_risk=True,
|
|
50
|
+
)
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## When to use which mode
|
|
54
|
+
|
|
55
|
+
- Use default mode for clean or mostly non-overlapping text (for example, most Amazon captchas).
|
|
56
|
+
- Use `force_overlap_risk=True` only when characters are merged and default extraction is wrong.
|
|
57
|
+
|
|
58
|
+
## Example scripts
|
|
59
|
+
|
|
60
|
+
- Default/Amazon style: `/Users/arif.shah/PycharmProjects/captcha-url-reader/examples/read_amazon_default.py`
|
|
61
|
+
- Overlap-heavy style: `/Users/arif.shah/PycharmProjects/captcha-url-reader/examples/read_overlap_captcha.py`
|
|
62
|
+
|
|
63
|
+
Run them directly:
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
./.venv/bin/python /Users/arif.shah/PycharmProjects/captcha-url-reader/examples/read_amazon_default.py
|
|
67
|
+
./.venv/bin/python /Users/arif.shah/PycharmProjects/captcha-url-reader/examples/read_overlap_captcha.py
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## GPU behavior
|
|
71
|
+
|
|
72
|
+
- Uses GPU first by default.
|
|
73
|
+
- If GPU is not available or fails, automatically falls back to CPU.
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
src/captcha_image_reader/__init__.py
|
|
4
|
+
src/captcha_image_reader/cli.py
|
|
5
|
+
src/captcha_image_reader/reader.py
|
|
6
|
+
src/captcha_url_reader.egg-info/PKG-INFO
|
|
7
|
+
src/captcha_url_reader.egg-info/SOURCES.txt
|
|
8
|
+
src/captcha_url_reader.egg-info/dependency_links.txt
|
|
9
|
+
src/captcha_url_reader.egg-info/entry_points.txt
|
|
10
|
+
src/captcha_url_reader.egg-info/requires.txt
|
|
11
|
+
src/captcha_url_reader.egg-info/top_level.txt
|
|
12
|
+
tests/test_reader.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
captcha_image_reader
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
from captcha_image_reader import read_captcha_from_url
|
|
2
|
+
from captcha_image_reader import reader as reader_module
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class FakeResponse:
    """Minimal stand-in for an HTTP response that only carries raw bytes."""

    def __init__(self, content: bytes):
        # Stored under the attribute name the reader accesses on a response.
        self.content = content

    def raise_for_status(self) -> None:
        """Do nothing: this fake response never signals an HTTP error."""
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def test_reader_returns_none_on_invalid_url(monkeypatch):
    """A download that raises makes ``read_captcha_from_url`` return ``None``."""

    def exploding_get(url, timeout):
        raise Exception("network")

    monkeypatch.setattr("captcha_image_reader.reader.requests.get", exploding_get)

    outcome = read_captcha_from_url("https://example.com/captcha.jpg")
    assert outcome is None
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def test_import_works():
    """The public entry point is importable and callable."""
    entry_point = read_captcha_from_url
    assert callable(entry_point)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def test_gpu_first_then_cpu_fallback(monkeypatch):
    """``_get_reader`` tries the GPU first and falls back to CPU on failure."""

    class FakeReader:
        def __init__(self, gpu):
            self.gpu = gpu

        def readtext(self, arr, detail):
            return []

    created = []

    def fake_easyocr_reader(langs, gpu):
        # Record each construction attempt; the GPU attempt always fails.
        created.append(gpu)
        if gpu:
            raise RuntimeError("gpu unavailable")
        return FakeReader(gpu=False)

    monkeypatch.setattr(reader_module.easyocr, "Reader", fake_easyocr_reader)
    # Swap in a fresh cache through monkeypatch so the fake reader created
    # here is not left in the module-level cache for later tests.
    monkeypatch.setattr(reader_module, "_READER_CACHE", {})

    r = reader_module._get_reader(prefer_gpu=True)

    assert created == [True, False]
    assert r.gpu is False
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def test_overlap_ambiguity_prefers_digit_text_when_supported():
    """With overlap handling on, a digit reading beats a stronger letter one."""
    box = [[10, 0], [20, 0], [20, 10], [10, 10]]

    class FakeReader:
        def __init__(self):
            self.calls = 0

        def readtext(self, arr, detail=1, **kwargs):
            # First call: confident all-letter text; afterwards: digit text.
            self.calls += 1
            reading = ("WOHSK", 0.92) if self.calls == 1 else ("W9H5K", 0.45)
            return [(box, *reading)]

    images = [np.zeros((20, 60), dtype=np.uint8)]
    winner = reader_module._extract_best_text(FakeReader(), images, overlap_risk=True)
    assert winner == "W9H5K"
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def test_no_digit_evidence_does_not_force_digit_substitution():
    """Without overlap handling, an all-letter reading is returned unchanged."""
    box = [[10, 0], [20, 0], [20, 10], [10, 10]]

    class FakeReader:
        def readtext(self, arr, detail=1, **kwargs):
            return [(box, "WOHSK", 0.9)]

    images = [np.zeros((20, 60), dtype=np.uint8)]
    winner = reader_module._extract_best_text(FakeReader(), images)
    assert winner == "WOHSK"
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def test_overlap_risk_can_recover_digits_without_prior_digit_evidence():
    """Overlap handling swaps in digits even when no reading contained one."""
    box = [[10, 0], [20, 0], [20, 10], [10, 10]]

    class FakeReader:
        def readtext(self, arr, detail=1, **kwargs):
            return [(box, "WOHSK", 0.9)]

    images = [np.zeros((20, 60), dtype=np.uint8)]
    winner = reader_module._extract_best_text(FakeReader(), images, overlap_risk=True)
    assert winner == "W9H5K"
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def test_default_mode_keeps_all_letter_captcha():
    """With overlap handling off, the confident all-letter reading wins."""
    box = [[10, 0], [20, 0], [20, 10], [10, 10]]

    class FakeReader:
        def __init__(self):
            self.calls = 0

        def readtext(self, arr, detail=1, **kwargs):
            # First call: confident letters; afterwards: a weaker digit mix.
            self.calls += 1
            reading = ("CLBHUF", 0.88) if self.calls == 1 else ("C18HUF", 0.42)
            return [(box, *reading)]

    images = [np.zeros((20, 60), dtype=np.uint8)]
    winner = reader_module._extract_best_text(FakeReader(), images, overlap_risk=False)
    assert winner == "CLBHUF"
|