wrapper-mcp 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wrapper_mcp/__init__.py +3 -0
- wrapper_mcp/http.py +108 -0
- wrapper_mcp/korektor.py +65 -0
- wrapper_mcp/langdetect.py +508 -0
- wrapper_mcp/maskit.py +452 -0
- wrapper_mcp/maskit_audit.py +342 -0
- wrapper_mcp/maskit_constants.py +141 -0
- wrapper_mcp/maskit_normalize.py +172 -0
- wrapper_mcp/maskit_parsing.py +147 -0
- wrapper_mcp/maskit_patterns.py +1502 -0
- wrapper_mcp/maskit_placeholders.py +259 -0
- wrapper_mcp/maskit_postprocess.py +858 -0
- wrapper_mcp/maskit_stoplist.py +75 -0
- wrapper_mcp/maskit_strict.py +130 -0
- wrapper_mcp/nametag.py +386 -0
- wrapper_mcp/nametag_labels.py +76 -0
- wrapper_mcp/ponk.py +245 -0
- wrapper_mcp/server.py +380 -0
- wrapper_mcp/translator.py +182 -0
- wrapper_mcp/udpipe.py +225 -0
- wrapper_mcp/validation.py +125 -0
- wrapper_mcp-0.8.0.dist-info/METADATA +214 -0
- wrapper_mcp-0.8.0.dist-info/RECORD +26 -0
- wrapper_mcp-0.8.0.dist-info/WHEEL +4 -0
- wrapper_mcp-0.8.0.dist-info/entry_points.txt +2 -0
- wrapper_mcp-0.8.0.dist-info/licenses/LICENSE +30 -0
wrapper_mcp/__init__.py
ADDED
wrapper_mcp/http.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
"""HTTP klient pro LINDAT REST API — s retry, logging, exponential backoff."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import logging
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
import httpx
|
|
10
|
+
|
|
11
|
+
# Endpoints of the remote REST services this HTTP client talks to.
MASKIT_URL = "https://quest.ms.mff.cuni.cz/maskit/api/process"
NAMETAG_URL = "https://lindat.mff.cuni.cz/services/nametag/api/recognize"
PONK_URL = "https://quest.ms.mff.cuni.cz/ponk/api/process"
UDPIPE_URL = "https://lindat.mff.cuni.cz/services/udpipe/api/process"

# Request timeouts in seconds.
# The default was raised from 60 s because the MasKIT API occasionally needs
# 60-90 s to answer for official Slovak texts.
HTTP_TIMEOUT = 120.0
# Longer budget for the translator's document mode and other large inputs.
HTTP_TIMEOUT_LONG = 240.0

# Retry configuration: exponential backoff for transient failures.
MAX_RETRIES = 3
INITIAL_BACKOFF_S = 1.0
# With the values above the waits are 1 s, then 2 s, then 4 s.
BACKOFF_MULTIPLIER = 2.0

# Statuses treated as transient and therefore retried:
# 429 Too Many Requests and the 502/503/504 server errors.
_RETRYABLE_STATUSES = frozenset((429, 502, 503, 504))

logger = logging.getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
async def _post_with_retry(
    url: str,
    data: dict[str, str],
    timeout: float,
) -> httpx.Response:
    """POST form data to ``url``, retrying transient failures with backoff.

    Retry rules:
    - ``httpx.TimeoutException``, ``httpx.ConnectError`` and
      ``httpx.RemoteProtocolError`` are retried.
    - HTTP 429/502/503/504 responses are retried.
    - Every other error status (the remaining 4xx *and* 5xx codes) fails
      immediately.
    - At most ``MAX_RETRIES + 1`` attempts are made. The wait starts at
      ``INITIAL_BACKOFF_S`` seconds and is multiplied by
      ``BACKOFF_MULTIPLIER`` after each failed attempt.

    Args:
        url: Endpoint to POST to.
        data: Form fields, passed to httpx as ``data=``.
        timeout: Timeout in seconds handed to ``httpx.AsyncClient``.

    Returns:
        The first response that passes ``raise_for_status()``.

    Raises:
        httpx.HTTPStatusError: On a non-retryable error status, or on a
            retryable status returned by the last attempt.
        httpx.TimeoutException, httpx.ConnectError, httpx.RemoteProtocolError:
            When the last attempt fails at transport level. The exception is
            re-created with the same class, a guaranteed non-empty message
            and the original request attached, chained to the original.
        RuntimeError: Only if no attempt was made at all, i.e. ``MAX_RETRIES``
            is negative.
    """
    backoff = INITIAL_BACKOFF_S

    for attempt in range(MAX_RETRIES + 1):
        try:
            async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
                response = await client.post(url, data=data)

            # Transient status with attempts left: wait and try again.
            if response.status_code in _RETRYABLE_STATUSES and attempt < MAX_RETRIES:
                logger.warning(
                    "HTTP %s na %s (pokus %d/%d), retry za %.1fs",
                    response.status_code, url, attempt + 1, MAX_RETRIES + 1, backoff,
                )
                await asyncio.sleep(backoff)
                backoff *= BACKOFF_MULTIPLIER
                continue

            # On the last attempt a retryable status falls through to here and
            # surfaces as HTTPStatusError like any other error status.
            response.raise_for_status()
            if attempt > 0:
                logger.info("Retry úspěšný na %s po %d pokusech", url, attempt + 1)
            return response

        except (httpx.TimeoutException, httpx.ConnectError, httpx.RemoteProtocolError) as e:
            if attempt < MAX_RETRIES:
                logger.warning(
                    "%s na %s (pokus %d/%d), retry za %.1fs",
                    type(e).__name__, url, attempt + 1, MAX_RETRIES + 1, backoff,
                )
                await asyncio.sleep(backoff)
                backoff *= BACKOFF_MULTIPLIER
                continue
            logger.error("Vše %d pokusů selhalo na %s: %s", MAX_RETRIES + 1, url, e)
            # Build an exception with an explicit message: httpx.ReadTimeout
            # has an empty str(), which makes the failure hard to debug.
            err_type = type(e).__name__
            msg = str(e) or f"{err_type} po {timeout}s na {url} (server pravděpodobně přetížený)"
            # Carry the original request over so ``.request`` keeps working on
            # the re-created exception; the property raises RuntimeError when
            # no request was ever attached.
            try:
                request = e.request
            except RuntimeError:
                request = None
            raise type(e)(msg, request=request) from e
        except httpx.HTTPStatusError as e:
            # Non-retryable status (or retries exhausted): fail immediately,
            # there is no point in repeating the request.
            logger.error("HTTP %d na %s: %s", e.response.status_code, url, e)
            raise

    # The final iteration always returns or raises, so this is reached only
    # when the loop body never ran.
    raise RuntimeError(f"Unexpected: vyčerpáno {MAX_RETRIES + 1} pokusů na {url} bez exception")
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
async def post_form(url: str, data: dict[str, str]) -> dict[str, Any]:
    """POST ``data`` as a form to ``url`` and return the decoded JSON body.

    The request goes through ``_post_with_retry`` with the default
    ``HTTP_TIMEOUT``, so it is retried and logged like every other call.
    """
    return (await _post_with_retry(url, data, HTTP_TIMEOUT)).json()
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
async def post_form_text(
    url: str,
    data: dict[str, str],
    timeout: float = HTTP_TIMEOUT_LONG,
) -> str:
    """POST ``data`` as a form to ``url`` and return the body as plain text.

    Used for the Charles Translator, which returns the translated text
    directly rather than JSON. The timeout defaults to the longer
    ``HTTP_TIMEOUT_LONG`` because document mode and large inputs are slow.
    The request goes through ``_post_with_retry`` (retry + logging).
    """
    return (await _post_with_retry(url, data, timeout)).text
|
wrapper_mcp/korektor.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""Korektor — český spell checker + auto-doplnění diakritiky (LINDAT).
|
|
2
|
+
|
|
3
|
+
Wrapper kolem `https://lindat.mff.cuni.cz/services/korektor/api/correct`.
|
|
4
|
+
Dostupné modely:
|
|
5
|
+
- ``czech-spellchecker-130202`` (default) — opravy pravopisu
|
|
6
|
+
- ``czech-spellchecker_2edits-130202`` — agresivnější (až 2 edits/word)
|
|
7
|
+
- ``czech-diacritics_generator-130202`` — doplnění diakritiky do textu
|
|
8
|
+
- ``strip_diacritics-130202`` — odstranění diakritiky
|
|
9
|
+
|
|
10
|
+
Use cases pro legal-tech:
|
|
11
|
+
- Před odesláním podání na soud — checkuje pravopis
|
|
12
|
+
- OCR/email texty bez diakritiky — auto-doplnění (`Jiri` → `Jiří`)
|
|
13
|
+
- Občanské porady — text bez diakritiky z mobilní klávesnice → korektně formátovaný
|
|
14
|
+
|
|
15
|
+
Pozor: Korektor je CZ-only, modely jsou z roku 2013. Pro vlastní jména
|
|
16
|
+
(příjmení Pluhařík, slovenská jména…) může mít omezenou přesnost.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
from typing import Any, Literal
|
|
22
|
+
|
|
23
|
+
from .http import post_form
|
|
24
|
+
|
|
25
|
+
# Endpoint of the Korektor correction service.
KOREKTOR_URL = "https://lindat.mff.cuni.cz/services/korektor/api/correct"

# Short mode names accepted by ``correct`` mapped to Korektor model names.
_MODEL_ALIASES: dict[str, str] = dict(
    spellcheck="czech-spellchecker-130202",
    spellcheck_strict="czech-spellchecker_2edits-130202",
    diacritics="czech-diacritics_generator-130202",
    strip="strip_diacritics-130202",
)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
async def correct(
    text: str,
    mode: Literal["spellcheck", "spellcheck_strict", "diacritics", "strip"] = "spellcheck",
) -> dict[str, Any]:
    """Run ``text`` through the Korektor model selected by ``mode``.

    Args:
        text: Input Czech text.
        mode: ``spellcheck`` (default), ``spellcheck_strict``, ``diacritics``
            or ``strip``. A value that is not a known alias is sent to the
            service unchanged as the model name.

    Returns:
        A dict with ``corrected`` (the response's ``result`` field, or the
        input when that field is missing), ``model`` (the response's
        ``model`` field), ``mode`` (echoed back) and ``changed`` (whether
        ``corrected`` differs from the input). Blank input short-circuits
        without any request and yields an empty ``corrected``.
    """
    stripped = text.strip()
    if not stripped:
        return dict(corrected="", model=None, mode=mode, changed=False)

    resolved_model = _MODEL_ALIASES.get(mode, mode)
    # Only send the "model" field when there is a model name to send.
    form: dict[str, str] = (
        {"data": text, "model": resolved_model} if resolved_model else {"data": text}
    )

    reply = await post_form(KOREKTOR_URL, form)
    fixed = reply.get("result", text)
    was_changed = fixed != text
    return {
        "corrected": fixed,
        "model": reply.get("model"),
        "mode": mode,
        "changed": was_changed,
    }
|