ufal-mcp 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ufal_mcp-0.2.0/.gitignore +12 -0
- ufal_mcp-0.2.0/LICENSE +30 -0
- ufal_mcp-0.2.0/PKG-INFO +147 -0
- ufal_mcp-0.2.0/README.md +117 -0
- ufal_mcp-0.2.0/pyproject.toml +56 -0
- ufal_mcp-0.2.0/src/ufal_mcp/__init__.py +3 -0
- ufal_mcp-0.2.0/src/ufal_mcp/server.py +399 -0
ufal_mcp-0.2.0/LICENSE
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Michal Bürgermeister
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
NOTE: This MIT license covers the wrapper code only.
|
|
26
|
+
The underlying ÚFAL MFF UK NLP models accessed via the REST APIs
|
|
27
|
+
(NameTag 3, MasKIT, UDPipe, PONK) are licensed under CC BY-NC-SA, which means
|
|
28
|
+
their results are limited to NON-COMMERCIAL use unless you obtain
|
|
29
|
+
explicit written permission from the model authors.
|
|
30
|
+
See https://lindat.mff.cuni.cz/en/terms-of-use for details.
|
ufal_mcp-0.2.0/PKG-INFO
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ufal-mcp
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: MCP server pro ÚFAL MFF UK NLP nástroje (NameTag, MasKIT, UDPipe, PONK) — česká právní AI
|
|
5
|
+
Project-URL: Homepage, https://github.com/Buggy1111/ufal-mcp
|
|
6
|
+
Project-URL: Repository, https://github.com/Buggy1111/ufal-mcp
|
|
7
|
+
Project-URL: Issues, https://github.com/Buggy1111/ufal-mcp/issues
|
|
8
|
+
Project-URL: ÚFAL MFF UK, https://ufal.mff.cuni.cz/
|
|
9
|
+
Author-email: Michal Bürgermeister <michalbugy12@gmail.com>
|
|
10
|
+
License: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: anonymization,anthropic,claude,czech,legal-tech,maskit,mcp,model-context-protocol,nametag,ner,nlp,ponk,udpipe,ufal
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Intended Audience :: Legal Industry
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Natural Language :: Czech
|
|
18
|
+
Classifier: Operating System :: OS Independent
|
|
19
|
+
Classifier: Programming Language :: Python :: 3
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
24
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
25
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
26
|
+
Requires-Python: >=3.10
|
|
27
|
+
Requires-Dist: httpx>=0.27.0
|
|
28
|
+
Requires-Dist: mcp>=1.2.0
|
|
29
|
+
Description-Content-Type: text/markdown
|
|
30
|
+
|
|
31
|
+
# ufal-mcp
|
|
32
|
+
|
|
33
|
+
[](https://github.com/Buggy1111/ufal-mcp/actions/workflows/ci.yml)
|
|
34
|
+
[](https://pypi.org/project/ufal-mcp/)
|
|
35
|
+
[](https://pypi.org/project/ufal-mcp/)
|
|
36
|
+
[](LICENSE)
|
|
37
|
+
|
|
38
|
+
MCP server obalující NLP nástroje [ÚFAL MFF UK](https://ufal.mff.cuni.cz/) pro zpracování **českých právních textů**.
|
|
39
|
+
|
|
40
|
+
## Co umí
|
|
41
|
+
|
|
42
|
+
| Tool | Backend | K čemu |
|
|
43
|
+
|------|---------|--------|
|
|
44
|
+
| `anonymize` | [MasKIT](https://ufal.mff.cuni.cz/maskit) | Pseudonymizace osobních údajů (jména, IČO, telefony, adresy, č.j., rodná čísla, data narození…) |
|
|
45
|
+
| `extract_entities` | [NameTag 3](https://ufal.mff.cuni.cz/nametag/3) | Named Entity Recognition — osoby, instituce, firmy, geo, data |
|
|
46
|
+
| `analyze_morphology` | [UDPipe](https://ufal.mff.cuni.cz/udpipe) | Tokenizace, lemmatizace, POS tagging, závislostní parse |
|
|
47
|
+
| `check_readability` | [PONK](https://ufal.mff.cuni.cz/ponk) | Analýza čitelnosti právních textů (ARI, Verb Distance, Activity, Lexical diversity) |
|
|
48
|
+
|
|
49
|
+
## Instalace
|
|
50
|
+
|
|
51
|
+
Z PyPI (doporučeno):
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pip install ufal-mcp
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
Nebo ze source:
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
git clone https://github.com/Buggy1111/ufal-mcp.git
|
|
61
|
+
cd ufal-mcp
|
|
62
|
+
pip install -e .
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Registrace v Claude Code
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
claude mcp add ufal -s user -- ufal-mcp
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Pokud máš binárku v jiném venv:
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
claude mcp add ufal -s user -- /cesta/k/.venv/bin/ufal-mcp
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
Poté Claude Code restartuj — nástroje budou dostupné jako:
|
|
78
|
+
|
|
79
|
+
- `mcp__ufal__anonymize`
|
|
80
|
+
- `mcp__ufal__extract_entities`
|
|
81
|
+
- `mcp__ufal__analyze_morphology`
|
|
82
|
+
- `mcp__ufal__check_readability`
|
|
83
|
+
|
|
84
|
+
## Použití
|
|
85
|
+
|
|
86
|
+
V Claude Code stačí napsat například:
|
|
87
|
+
|
|
88
|
+
> Anonymizuj text z `PRICHOZI_POSTA/2026-03-02_odpoved_na_stiznost.md` a vrať mi čistou verzi pro veřejný demo.
|
|
89
|
+
|
|
90
|
+
> Vytáhni z dokumentu všechny osoby, soudy a č.j. — chystám matter intake pro `/litigation-legal:matter-intake`.
|
|
91
|
+
|
|
92
|
+
> Zlemmatizuj tenhle text a vyhoď mi všechny tvary slova "soud" — potřebuju fulltextové vyhledávání.
|
|
93
|
+
|
|
94
|
+
> Projeď moje podání přes PONK — kolik vět má příliš dlouhých?
|
|
95
|
+
|
|
96
|
+
## Licence
|
|
97
|
+
|
|
98
|
+
- **Kód**: MIT
|
|
99
|
+
- **Modely (přes API)**: CC BY-NC-SA — **NEKOMERČNÍ použití**. Pro placené nasazení potřebuješ explicitní písemné svolení autorů (Jana Straková, Milan Straka).
|
|
100
|
+
|
|
101
|
+
## Bezpečnost
|
|
102
|
+
|
|
103
|
+
- **Vše posíláš na externí server ÚFAL** (`quest.ms.mff.cuni.cz`, `lindat.mff.cuni.cz`)
|
|
104
|
+
- ÚFAL loguje: čas, velikost dat, konfigurace serveru, IP. **Obsah neloguje** (přes POST).
|
|
105
|
+
- Pro plně privátní variantu lze rozšířit o lokální self-host (UDPipe + NameTag mají modely ke stažení).
|
|
106
|
+
|
|
107
|
+
## Použité API
|
|
108
|
+
|
|
109
|
+
- `POST https://lindat.mff.cuni.cz/services/nametag/api/recognize`
|
|
110
|
+
- `POST https://lindat.mff.cuni.cz/services/udpipe/api/process`
|
|
111
|
+
- `POST https://quest.ms.mff.cuni.cz/maskit/api/process`
|
|
112
|
+
- `POST https://quest.ms.mff.cuni.cz/ponk/api/process`
|
|
113
|
+
|
|
114
|
+
## Vývoj
|
|
115
|
+
|
|
116
|
+
```bash
|
|
117
|
+
python3 -m venv .venv
|
|
118
|
+
source .venv/bin/activate
|
|
119
|
+
pip install -e .
|
|
120
|
+
|
|
121
|
+
# Smoke test (volá živé ÚFAL API)
|
|
122
|
+
python test_live.py
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
## Release proces
|
|
126
|
+
|
|
127
|
+
PyPI publish je automatický přes [Trusted Publisher (OIDC)](https://docs.pypi.org/trusted-publishers/).
|
|
128
|
+
|
|
129
|
+
**Jednorázové nastavení (PyPI strana):**
|
|
130
|
+
1. Vytvořit balíček na https://pypi.org (nebo nechat workflow, ať ho vytvoří první run)
|
|
131
|
+
2. PyPI → Account settings → Publishing → Add pending publisher:
|
|
132
|
+
- PyPI Project Name: `ufal-mcp`
|
|
133
|
+
- Owner: `Buggy1111`
|
|
134
|
+
- Repository: `ufal-mcp`
|
|
135
|
+
- Workflow: `release.yml`
|
|
136
|
+
- Environment: `pypi`
|
|
137
|
+
|
|
138
|
+
**Release nového releasu:**
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
# Bump version v pyproject.toml a src/ufal_mcp/__init__.py
|
|
142
|
+
git commit -am "release: v0.X.0"
|
|
143
|
+
git tag v0.X.0
|
|
144
|
+
git push origin main --tags
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
GHA workflow `release.yml` automaticky postaví distribution, publishne na PyPI a vytvoří GitHub Release s artefakty.
|
ufal_mcp-0.2.0/README.md
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
# ufal-mcp
|
|
2
|
+
|
|
3
|
+
[![CI](https://github.com/Buggy1111/ufal-mcp/actions/workflows/ci.yml/badge.svg)](https://github.com/Buggy1111/ufal-mcp/actions/workflows/ci.yml)
|
|
4
|
+
[![PyPI version](https://img.shields.io/pypi/v/ufal-mcp.svg)](https://pypi.org/project/ufal-mcp/)
|
|
5
|
+
[![Python versions](https://img.shields.io/pypi/pyversions/ufal-mcp.svg)](https://pypi.org/project/ufal-mcp/)
|
|
6
|
+
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE)
|
|
7
|
+
|
|
8
|
+
MCP server obalující NLP nástroje [ÚFAL MFF UK](https://ufal.mff.cuni.cz/) pro zpracování **českých právních textů**.
|
|
9
|
+
|
|
10
|
+
## Co umí
|
|
11
|
+
|
|
12
|
+
| Tool | Backend | K čemu |
|
|
13
|
+
|------|---------|--------|
|
|
14
|
+
| `anonymize` | [MasKIT](https://ufal.mff.cuni.cz/maskit) | Pseudonymizace osobních údajů (jména, IČO, telefony, adresy, č.j., rodná čísla, data narození…) |
|
|
15
|
+
| `extract_entities` | [NameTag 3](https://ufal.mff.cuni.cz/nametag/3) | Named Entity Recognition — osoby, instituce, firmy, geo, data |
|
|
16
|
+
| `analyze_morphology` | [UDPipe](https://ufal.mff.cuni.cz/udpipe) | Tokenizace, lemmatizace, POS tagging, závislostní parse |
|
|
17
|
+
| `check_readability` | [PONK](https://ufal.mff.cuni.cz/ponk) | Analýza čitelnosti právních textů (ARI, Verb Distance, Activity, Lexical diversity) |
|
|
18
|
+
|
|
19
|
+
## Instalace
|
|
20
|
+
|
|
21
|
+
Z PyPI (doporučeno):
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pip install ufal-mcp
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
Nebo ze source:
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
git clone https://github.com/Buggy1111/ufal-mcp.git
|
|
31
|
+
cd ufal-mcp
|
|
32
|
+
pip install -e .
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## Registrace v Claude Code
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
claude mcp add ufal -s user -- ufal-mcp
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
Pokud máš binárku v jiném venv:
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
claude mcp add ufal -s user -- /cesta/k/.venv/bin/ufal-mcp
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Poté Claude Code restartuj — nástroje budou dostupné jako:
|
|
48
|
+
|
|
49
|
+
- `mcp__ufal__anonymize`
|
|
50
|
+
- `mcp__ufal__extract_entities`
|
|
51
|
+
- `mcp__ufal__analyze_morphology`
|
|
52
|
+
- `mcp__ufal__check_readability`
|
|
53
|
+
|
|
54
|
+
## Použití
|
|
55
|
+
|
|
56
|
+
V Claude Code stačí napsat například:
|
|
57
|
+
|
|
58
|
+
> Anonymizuj text z `PRICHOZI_POSTA/2026-03-02_odpoved_na_stiznost.md` a vrať mi čistou verzi pro veřejné demo.
|
|
59
|
+
|
|
60
|
+
> Vytáhni z dokumentu všechny osoby, soudy a č.j. — chystám matter intake pro `/litigation-legal:matter-intake`.
|
|
61
|
+
|
|
62
|
+
> Zlemmatizuj tenhle text a vyhoď mi všechny tvary slova "soud" — potřebuju fulltextové vyhledávání.
|
|
63
|
+
|
|
64
|
+
> Projeď moje podání přes PONK — kolik vět má příliš dlouhých?
|
|
65
|
+
|
|
66
|
+
## Licence
|
|
67
|
+
|
|
68
|
+
- **Kód**: MIT
|
|
69
|
+
- **Modely (přes API)**: CC BY-NC-SA — **NEKOMERČNÍ použití**. Pro placené nasazení potřebuješ explicitní písemné svolení autorů (Jana Straková, Milan Straka).
|
|
70
|
+
|
|
71
|
+
## Bezpečnost
|
|
72
|
+
|
|
73
|
+
- **Vše posíláš na externí server ÚFAL** (`quest.ms.mff.cuni.cz`, `lindat.mff.cuni.cz`)
|
|
74
|
+
- ÚFAL loguje: čas, velikost dat, konfigurace serveru, IP. **Obsah neloguje** (přes POST).
|
|
75
|
+
- Pro plně privátní variantu lze rozšířit o lokální self-host (UDPipe + NameTag mají modely ke stažení).
|
|
76
|
+
|
|
77
|
+
## Použité API
|
|
78
|
+
|
|
79
|
+
- `POST https://lindat.mff.cuni.cz/services/nametag/api/recognize`
|
|
80
|
+
- `POST https://lindat.mff.cuni.cz/services/udpipe/api/process`
|
|
81
|
+
- `POST https://quest.ms.mff.cuni.cz/maskit/api/process`
|
|
82
|
+
- `POST https://quest.ms.mff.cuni.cz/ponk/api/process`
|
|
83
|
+
|
|
84
|
+
## Vývoj
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
python3 -m venv .venv
|
|
88
|
+
source .venv/bin/activate
|
|
89
|
+
pip install -e .
|
|
90
|
+
|
|
91
|
+
# Smoke test (volá živé ÚFAL API)
|
|
92
|
+
python test_live.py
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
## Release proces
|
|
96
|
+
|
|
97
|
+
PyPI publish je automatický přes [Trusted Publisher (OIDC)](https://docs.pypi.org/trusted-publishers/).
|
|
98
|
+
|
|
99
|
+
**Jednorázové nastavení (PyPI strana):**
|
|
100
|
+
1. Vytvořit balíček na https://pypi.org (nebo nechat workflow, ať ho vytvoří první run)
|
|
101
|
+
2. PyPI → Account settings → Publishing → Add pending publisher:
|
|
102
|
+
- PyPI Project Name: `ufal-mcp`
|
|
103
|
+
- Owner: `Buggy1111`
|
|
104
|
+
- Repository: `ufal-mcp`
|
|
105
|
+
- Workflow: `release.yml`
|
|
106
|
+
- Environment: `pypi`
|
|
107
|
+
|
|
108
|
+
**Vydání nové verze:**
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
# Bump version v pyproject.toml a src/ufal_mcp/__init__.py
|
|
112
|
+
git commit -am "release: v0.X.0"
|
|
113
|
+
git tag v0.X.0
|
|
114
|
+
git push origin main --tags
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
GHA workflow `release.yml` automaticky postaví distribution, publishne na PyPI a vytvoří GitHub Release s artefakty.
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "ufal-mcp"
|
|
3
|
+
version = "0.2.0"
|
|
4
|
+
description = "MCP server pro ÚFAL MFF UK NLP nástroje (NameTag, MasKIT, UDPipe, PONK) — česká právní AI"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.10"
|
|
7
|
+
authors = [{ name = "Michal Bürgermeister", email = "michalbugy12@gmail.com" }]
|
|
8
|
+
license = { text = "MIT" }
|
|
9
|
+
keywords = [
|
|
10
|
+
"mcp", "model-context-protocol", "claude", "anthropic",
|
|
11
|
+
"czech", "nlp", "ner", "anonymization", "legal-tech", "ufal",
|
|
12
|
+
"nametag", "udpipe", "maskit", "ponk",
|
|
13
|
+
]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 4 - Beta",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"Intended Audience :: Legal Industry",
|
|
18
|
+
"License :: OSI Approved :: MIT License",
|
|
19
|
+
"Natural Language :: Czech",
|
|
20
|
+
"Operating System :: OS Independent",
|
|
21
|
+
"Programming Language :: Python :: 3",
|
|
22
|
+
"Programming Language :: Python :: 3.10",
|
|
23
|
+
"Programming Language :: Python :: 3.11",
|
|
24
|
+
"Programming Language :: Python :: 3.12",
|
|
25
|
+
"Programming Language :: Python :: 3.13",
|
|
26
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
27
|
+
"Topic :: Text Processing :: Linguistic",
|
|
28
|
+
]
|
|
29
|
+
dependencies = [
|
|
30
|
+
"mcp>=1.2.0",
|
|
31
|
+
"httpx>=0.27.0",
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
[project.urls]
|
|
35
|
+
Homepage = "https://github.com/Buggy1111/ufal-mcp"
|
|
36
|
+
Repository = "https://github.com/Buggy1111/ufal-mcp"
|
|
37
|
+
Issues = "https://github.com/Buggy1111/ufal-mcp/issues"
|
|
38
|
+
"ÚFAL MFF UK" = "https://ufal.mff.cuni.cz/"
|
|
39
|
+
|
|
40
|
+
[project.scripts]
|
|
41
|
+
ufal-mcp = "ufal_mcp.server:main"
|
|
42
|
+
|
|
43
|
+
[build-system]
|
|
44
|
+
requires = ["hatchling"]
|
|
45
|
+
build-backend = "hatchling.build"
|
|
46
|
+
|
|
47
|
+
[tool.hatch.build.targets.wheel]
|
|
48
|
+
packages = ["src/ufal_mcp"]
|
|
49
|
+
|
|
50
|
+
[tool.hatch.build.targets.sdist]
|
|
51
|
+
include = [
|
|
52
|
+
"src/",
|
|
53
|
+
"README.md",
|
|
54
|
+
"LICENSE",
|
|
55
|
+
"pyproject.toml",
|
|
56
|
+
]
|
|
@@ -0,0 +1,399 @@
|
|
|
1
|
+
"""ÚFAL MCP server — anonymizace, NER, morfologie a čitelnost českých právních textů.
|
|
2
|
+
|
|
3
|
+
Wrappuje 4 REST API:
|
|
4
|
+
- MasKIT — pseudonymizace osobních údajů
|
|
5
|
+
- NameTag — Czech NER
|
|
6
|
+
- UDPipe — tokenizace, lemmatizace, POS tagging, dependency parse
|
|
7
|
+
- PONK — analýza čitelnosti
|
|
8
|
+
|
|
9
|
+
Modely jsou pod CC BY-NC-SA, takže výsledky **nesmí být použity komerčně**
|
|
10
|
+
bez explicitního písemného svolení autorů.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import re
|
|
16
|
+
from typing import Any, Literal
|
|
17
|
+
|
|
18
|
+
import httpx
|
|
19
|
+
from mcp.server.fastmcp import FastMCP
|
|
20
|
+
|
|
21
|
+
# REST endpoints of the four wrapped ÚFAL services (one per tool below).
MASKIT_URL = "https://quest.ms.mff.cuni.cz/maskit/api/process"
NAMETAG_URL = "https://lindat.mff.cuni.cz/services/nametag/api/recognize"
PONK_URL = "https://quest.ms.mff.cuni.cz/ponk/api/process"
UDPIPE_URL = "https://lindat.mff.cuni.cz/services/udpipe/api/process"

# Timeout handed to every httpx.AsyncClient created by _post_form.
HTTP_TIMEOUT = 60.0

# FastMCP server instance named "ufal"; tools register on it via @mcp.tool().
mcp = FastMCP("ufal")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
async def _post_form(url: str, data: dict[str, str]) -> dict[str, Any]:
    """Send ``data`` as a form POST to ``url`` and return the decoded JSON body.

    A fresh ``httpx.AsyncClient`` configured with ``HTTP_TIMEOUT`` is opened for
    the call and closed before returning.

    Args:
        url: Endpoint to post to.
        data: Form fields to send.

    Returns:
        Whatever ``response.json()`` yields for the reply.

    Raises:
        httpx.HTTPStatusError: Raised by ``raise_for_status()`` on an error status.
    """
    async with httpx.AsyncClient(timeout=HTTP_TIMEOUT) as session:
        reply = await session.post(url, data=data)
        # Fail loudly on error statuses instead of trying to parse an error page.
        reply.raise_for_status()
        payload = reply.json()
    return payload
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# ---------- NameTag ---------------------------------------------------------
|
|
39
|
+
|
|
40
|
+
# CNEC 2.0 entity type → human label (česky)
|
|
41
|
+
NAMETAG_LABELS: dict[str, str] = {
|
|
42
|
+
"P": "osoba",
|
|
43
|
+
"pf": "křestní jméno",
|
|
44
|
+
"ps": "příjmení",
|
|
45
|
+
"T": "datum/čas",
|
|
46
|
+
"td": "den",
|
|
47
|
+
"tm": "měsíc",
|
|
48
|
+
"ty": "rok",
|
|
49
|
+
"th": "hodina",
|
|
50
|
+
"A": "číslo",
|
|
51
|
+
"ah": "hodnota",
|
|
52
|
+
"at": "telefon",
|
|
53
|
+
"az": "PSČ",
|
|
54
|
+
"C": "bibliografie",
|
|
55
|
+
"G": "geografická entita",
|
|
56
|
+
"gu": "město/obec",
|
|
57
|
+
"gs": "ulice/náměstí",
|
|
58
|
+
"gc": "stát/země",
|
|
59
|
+
"gr": "region",
|
|
60
|
+
"I": "instituce",
|
|
61
|
+
"io": "úřad/instituce",
|
|
62
|
+
"if": "firma/společnost",
|
|
63
|
+
"ic": "kulturní/vědecká instituce",
|
|
64
|
+
"M": "média",
|
|
65
|
+
"O": "objekt",
|
|
66
|
+
"om": "měna",
|
|
67
|
+
"or": "produkt",
|
|
68
|
+
"N": "číselný výraz",
|
|
69
|
+
"no": "pořadí",
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _parse_conll(conll: str) -> list[dict[str, Any]]:
|
|
74
|
+
"""Zploští CoNLL výstup NameTag do seznamu entit."""
|
|
75
|
+
entities: list[dict[str, Any]] = []
|
|
76
|
+
current: dict[str, Any] | None = None
|
|
77
|
+
for raw_line in conll.splitlines():
|
|
78
|
+
line = raw_line.strip()
|
|
79
|
+
if not line:
|
|
80
|
+
if current:
|
|
81
|
+
entities.append(current)
|
|
82
|
+
current = None
|
|
83
|
+
continue
|
|
84
|
+
parts = line.split("\t")
|
|
85
|
+
if len(parts) != 2:
|
|
86
|
+
continue
|
|
87
|
+
token, tags = parts
|
|
88
|
+
# tags can be like "B-P|B-pf" nebo "I-P|I-pf" nebo "O"
|
|
89
|
+
if tags == "O":
|
|
90
|
+
if current:
|
|
91
|
+
entities.append(current)
|
|
92
|
+
current = None
|
|
93
|
+
continue
|
|
94
|
+
labels = [t for t in tags.split("|") if t]
|
|
95
|
+
starts = [lab.split("-", 1)[1] for lab in labels if lab.startswith("B-")]
|
|
96
|
+
if starts:
|
|
97
|
+
if current:
|
|
98
|
+
entities.append(current)
|
|
99
|
+
primary = starts[0]
|
|
100
|
+
current = {
|
|
101
|
+
"type": primary,
|
|
102
|
+
"label": NAMETAG_LABELS.get(primary, primary),
|
|
103
|
+
"tokens": [token],
|
|
104
|
+
"nested": [t for t in starts[1:]],
|
|
105
|
+
}
|
|
106
|
+
else:
|
|
107
|
+
if current is None:
|
|
108
|
+
# I- bez předchozího B- — vytvoř best-effort entitu
|
|
109
|
+
inside = [lab.split("-", 1)[1] for lab in labels if lab.startswith("I-")]
|
|
110
|
+
primary = inside[0] if inside else "?"
|
|
111
|
+
current = {
|
|
112
|
+
"type": primary,
|
|
113
|
+
"label": NAMETAG_LABELS.get(primary, primary),
|
|
114
|
+
"tokens": [token],
|
|
115
|
+
"nested": [],
|
|
116
|
+
}
|
|
117
|
+
else:
|
|
118
|
+
current["tokens"].append(token)
|
|
119
|
+
if current:
|
|
120
|
+
entities.append(current)
|
|
121
|
+
for ent in entities:
|
|
122
|
+
ent["text"] = " ".join(ent["tokens"])
|
|
123
|
+
return entities
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
@mcp.tool()
async def extract_entities(text: str) -> dict[str, Any]:
    """Rozpozná pojmenované entity v českém (nebo jiném podporovaném) textu pomocí NameTag 3.

    Vrací strukturovaný seznam entit s typem (osoba, instituce, datum, firma, geo…)
    a původním textem. Hodí se pro matter intake — kdo, kdy, kde, jaké instituce.

    Args:
        text: Vstupní text k analýze (UTF-8). Optimalizováno pro češtinu.

    Returns:
        Slovník s ``entities`` (list) a ``model`` (verze).
    """
    # NOTE(review): the docstring above is kept verbatim in Czech; FastMCP
    # presumably publishes it to clients as the tool description, which would
    # make it runtime text rather than a comment.
    #
    # Summary: posts ``text`` to the NameTag endpoint asking for CoNLL output,
    # flattens it with _parse_conll and returns ``entities``, ``model`` and
    # ``count``. Errors from _post_form propagate to the caller.
    if not text.strip():
        # Blank input: skip the network call but keep the same keys as the
        # normal result (including ``count``) so callers need no special case.
        return {"entities": [], "model": None, "count": 0, "note": "empty input"}

    data = await _post_form(NAMETAG_URL, {"data": text, "output": "conll"})
    entities = _parse_conll(data.get("result", ""))
    return {
        "entities": entities,
        "model": data.get("model"),
        "count": len(entities),
    }
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
# ---------- MasKIT ----------------------------------------------------------
|
|
151
|
+
|
|
152
|
+
_MASKIT_PLACEHOLDER = re.compile(r"([^\s_\[\]]+)_\[([^\]]+)\]")
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def _parse_maskit(result: str) -> tuple[str, list[dict[str, str]]]:
|
|
156
|
+
"""Z MasKIT výstupu vytáhne čistý anonymizovaný text + mapping originál→placeholder."""
|
|
157
|
+
replacements: list[dict[str, str]] = []
|
|
158
|
+
anonymized_parts: list[str] = []
|
|
159
|
+
last_end = 0
|
|
160
|
+
for match in _MASKIT_PLACEHOLDER.finditer(result):
|
|
161
|
+
anonymized_parts.append(result[last_end : match.start()])
|
|
162
|
+
placeholder, original = match.group(1), match.group(2)
|
|
163
|
+
anonymized_parts.append(placeholder)
|
|
164
|
+
replacements.append({"original": original, "placeholder": placeholder})
|
|
165
|
+
last_end = match.end()
|
|
166
|
+
anonymized_parts.append(result[last_end:])
|
|
167
|
+
return "".join(anonymized_parts), replacements
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
@mcp.tool()
async def anonymize(
    text: str,
    output: Literal["txt", "html", "conllu"] = "txt",
    keep_mapping: bool = True,
) -> dict[str, Any]:
    """Pseudonymizuje osobní údaje v českém právním textu pomocí MasKIT.

    MasKIT detekuje a nahrazuje fiktivními daty: jména, příjmení, telefony, e-maily,
    URL, ulice, města, PSČ, firmy, instituce, IČO, DIČ, rodná čísla, data narození,
    čísla jednací, SPZ. Soudce **neanonymizuje** (whitelist).

    Args:
        text: Vstupní text (čeština).
        output: Formát výstupu — ``txt`` (default), ``html``, ``conllu``.
        keep_mapping: Když True, vrátí mapping originál → placeholder. **POZOR**:
            pokud má text dál opustit důvěrné prostředí, mapping vypni!

    Returns:
        ``anonymized`` (čistý text bez placeholderů),
        ``raw`` (raw MasKIT výstup s `placeholder_[original]`),
        ``replacements`` (list mappings, jen když ``keep_mapping=True``).
    """
    # NOTE(review): the docstring above is kept verbatim in Czech; FastMCP
    # presumably publishes it to clients as the tool description.
    #
    # Result keys: ``anonymized``, ``raw``, ``count`` (number of substitutions
    # found) and, only when ``keep_mapping`` is true, ``replacements``.
    if not text.strip():
        # Blank input: no network call, same key set as the normal result.
        empty: dict[str, Any] = {"anonymized": "", "raw": ""}
        if keep_mapping:
            empty["replacements"] = []
        empty["count"] = 0
        return empty

    data = await _post_form(
        MASKIT_URL,
        {"text": text, "input": "txt", "output": output},
    )
    raw = data.get("result", "")
    if output == "txt":
        anonymized, replacements = _parse_maskit(raw)
    else:
        # html / conllu are passed through unparsed.
        anonymized, replacements = raw, []

    if not keep_mapping and output == "txt":
        # ``raw`` embeds the originals as ``placeholder_[original]``; returning
        # it would leak exactly what ``keep_mapping=False`` is meant to
        # withhold, so hand back the cleaned text under ``raw`` instead.
        raw = anonymized
    # NOTE(review): for html/conllu the payload is not inspected here; confirm
    # against the MasKIT API whether those formats can carry original values.

    out: dict[str, Any] = {"anonymized": anonymized, "raw": raw}
    if keep_mapping:
        out["replacements"] = replacements
    out["count"] = len(replacements)
    return out
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
# ---------- PONK ------------------------------------------------------------
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
_PONK_METRIC_RE = re.compile(
|
|
215
|
+
r'<span[^>]*data-tooltip="([^"]+)"[^>]*>\s*-\s*([^:<]+):\s*([^<]+)</span>',
|
|
216
|
+
re.IGNORECASE | re.DOTALL,
|
|
217
|
+
)
|
|
218
|
+
_PONK_COUNTS_RE = re.compile(
|
|
219
|
+
r"number of sentences:\s*(\d+),\s*tokens:\s*(\d+)",
|
|
220
|
+
re.IGNORECASE,
|
|
221
|
+
)
|
|
222
|
+
_PONK_VERSION_RE = re.compile(r"PONK\s*<span[^>]*>([^<]+)</span>", re.IGNORECASE)
|
|
223
|
+
_PONK_TIME_RE = re.compile(r"Processing time:\s*([\d.]+)\s*s", re.IGNORECASE)
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def _clean(value: str) -> str:
|
|
227
|
+
return re.sub(r"\s+", " ", value).strip()
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def _parse_ponk_stats(stats_html: str) -> dict[str, Any]:
|
|
231
|
+
"""Z PONK stats HTML vytáhne metriky čitelnosti, counts a verzi.
|
|
232
|
+
|
|
233
|
+
Vrací: ``metrics`` (label → {value, tooltip}), ``counts`` (sentences, tokens),
|
|
234
|
+
``processing_time_s``, ``version``.
|
|
235
|
+
"""
|
|
236
|
+
metrics: dict[str, dict[str, str]] = {}
|
|
237
|
+
for tooltip, label, value in _PONK_METRIC_RE.findall(stats_html):
|
|
238
|
+
metrics[_clean(label)] = {
|
|
239
|
+
"value": _clean(value),
|
|
240
|
+
"tooltip": _clean(tooltip.replace("<br>", " ").replace("<br/>", " ")),
|
|
241
|
+
}
|
|
242
|
+
counts: dict[str, int] = {}
|
|
243
|
+
if (m := _PONK_COUNTS_RE.search(stats_html)):
|
|
244
|
+
counts = {"sentences": int(m.group(1)), "tokens": int(m.group(2))}
|
|
245
|
+
version = None
|
|
246
|
+
if (m := _PONK_VERSION_RE.search(stats_html)):
|
|
247
|
+
version = _clean(m.group(1))
|
|
248
|
+
processing_time_s = None
|
|
249
|
+
if (m := _PONK_TIME_RE.search(stats_html)):
|
|
250
|
+
processing_time_s = float(m.group(1))
|
|
251
|
+
return {
|
|
252
|
+
"metrics": metrics,
|
|
253
|
+
"counts": counts,
|
|
254
|
+
"processing_time_s": processing_time_s,
|
|
255
|
+
"version": version,
|
|
256
|
+
}
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
@mcp.tool()
async def check_readability(
    text: str,
    input_format: Literal["txt", "md", "docx"] = "txt",
) -> dict[str, Any]:
    """Analyzuje čitelnost českého textu pomocí PONK.

    PONK byl navržen pro úřední komunikaci s občany — najde dlouhé věty,
    pasivum a právnické fráze, které ztěžují porozumění. Užitečné pro kontrolu
    vlastních podání před odesláním na soud.

    Args:
        text: Vstupní text.
        input_format: ``txt`` (default), ``md``, ``docx`` (jako base64? viz API).

    Returns:
        ``highlighted_html`` (text s vyznačenými problémy),
        ``stats`` (slovník metrik),
        ``version``.
    """
    # NOTE(review): the docstring above is kept verbatim in Czech; FastMCP
    # presumably publishes it to clients as the tool description. Its Returns
    # section is out of date: the actual keys are ``highlighted_html``,
    # ``metrics``, ``counts``, ``processing_time_s`` and ``version``.
    if not text.strip():
        # Blank input: skip the network call but return every key of the
        # normal result so callers reading e.g. ``metrics`` do not fail.
        # ``stats`` is retained because earlier versions returned it here.
        return {
            "highlighted_html": "",
            "stats": {},
            "metrics": {},
            "counts": {},
            "processing_time_s": None,
            "version": None,
        }

    data = await _post_form(
        PONK_URL,
        {"text": text, "input": input_format, "output": "html"},
    )
    # The service reply carries the highlighted text under ``result`` and a
    # separate HTML stats fragment under ``stats``.
    parsed = _parse_ponk_stats(data.get("stats", ""))
    return {
        "highlighted_html": data.get("result", ""),
        "metrics": parsed["metrics"],
        "counts": parsed["counts"],
        "processing_time_s": parsed["processing_time_s"],
        "version": parsed["version"],
    }
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
# ---------- UDPipe ----------------------------------------------------------
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
def _parse_conllu(conllu: str) -> list[list[dict[str, Any]]]:
|
|
299
|
+
"""Parsuje CoNLL-U výstup UDPipe na seznam vět, kde každá věta = list tokenů."""
|
|
300
|
+
sentences: list[list[dict[str, Any]]] = []
|
|
301
|
+
current: list[dict[str, Any]] = []
|
|
302
|
+
for raw in conllu.splitlines():
|
|
303
|
+
line = raw.rstrip()
|
|
304
|
+
if not line:
|
|
305
|
+
if current:
|
|
306
|
+
sentences.append(current)
|
|
307
|
+
current = []
|
|
308
|
+
continue
|
|
309
|
+
if line.startswith("#"):
|
|
310
|
+
continue
|
|
311
|
+
parts = line.split("\t")
|
|
312
|
+
if len(parts) < 10:
|
|
313
|
+
continue
|
|
314
|
+
# přeskoč multi-word tokens (1-2) a empty nodes (1.1)
|
|
315
|
+
if "-" in parts[0] or "." in parts[0]:
|
|
316
|
+
continue
|
|
317
|
+
feats = {}
|
|
318
|
+
if parts[5] != "_":
|
|
319
|
+
for kv in parts[5].split("|"):
|
|
320
|
+
if "=" in kv:
|
|
321
|
+
k, v = kv.split("=", 1)
|
|
322
|
+
feats[k] = v
|
|
323
|
+
current.append({
|
|
324
|
+
"id": int(parts[0]),
|
|
325
|
+
"form": parts[1],
|
|
326
|
+
"lemma": parts[2],
|
|
327
|
+
"upos": parts[3],
|
|
328
|
+
"xpos": parts[4] if parts[4] != "_" else None,
|
|
329
|
+
"feats": feats,
|
|
330
|
+
"head": int(parts[6]) if parts[6] != "_" else None,
|
|
331
|
+
"deprel": parts[7] if parts[7] != "_" else None,
|
|
332
|
+
})
|
|
333
|
+
if current:
|
|
334
|
+
sentences.append(current)
|
|
335
|
+
return sentences
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
@mcp.tool()
async def analyze_morphology(
    text: str,
    model: str = "czech",
    include_parse: bool = False,
) -> dict[str, Any]:
    """Tokenizuje, lemmatizuje a označuje slovní druhy pomocí UDPipe 2.

    Pro každý token vrací **lemma** (základní tvar), **UPOS** (universal POS tag),
    **morphological features** (pád, rod, číslo, čas...) a volitelně závislostní
    parse (head + deprel).

    Hodí se pro:
    - Fulltextové vyhledávání v právních textech (lemma "soud" matchuje "soudu/soudem/soudy")
    - Filtrování podle slovních druhů (jen substantiva, jen verba)
    - Detekce pasivních konstrukcí (Voice=Pass)

    Args:
        text: Vstupní text. Optimalizováno pro češtinu.
        model: UDPipe model alias (default ``czech``) — viz lindat.mff.cuni.cz/services/udpipe.
        include_parse: True = vrátí závislostní parse (head, deprel) pro každý token.

    Returns:
        ``sentences`` (list vět = list tokenů), ``model`` (skutečně použitý model),
        ``token_count``, ``sentence_count``.
    """
    # NOTE(review): the docstring above is kept verbatim in Czech; FastMCP
    # presumably publishes it to clients as the tool description.
    if not text.strip():
        # Blank input: answer locally with an empty result of the usual shape.
        return {"sentences": [], "model": None, "token_count": 0, "sentence_count": 0}

    form_fields = {
        "data": text,
        "model": model,
        "tokenizer": "",
        "tagger": "",
        # An empty value requests parsing; "none" is sent when it is not wanted.
        "parser": "" if include_parse else "none",
        "output": "conllu",
    }
    data = await _post_form(UDPIPE_URL, form_fields)
    sentences = _parse_conllu(data.get("result", ""))

    if not include_parse:
        # Drop the dependency fields so tokens carry only tagging information.
        dependency_keys = ("head", "deprel")
        sentences = [
            [
                {key: val for key, val in tok.items() if key not in dependency_keys}
                for tok in sent
            ]
            for sent in sentences
        ]

    total_tokens = sum(len(sent) for sent in sentences)
    return {
        "sentences": sentences,
        "model": data.get("model"),
        "sentence_count": len(sentences),
        "token_count": total_tokens,
    }
|
|
389
|
+
|
|
390
|
+
|
|
391
|
+
# ---------- Entry point -----------------------------------------------------
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
def main() -> None:
    """Start the MCP server by calling ``mcp.run()`` on the module-level instance."""
    mcp.run()
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
if __name__ == "__main__":
|
|
399
|
+
main()
|