udsearch 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- udsearch-0.1.0/LICENSE +21 -0
- udsearch-0.1.0/PKG-INFO +140 -0
- udsearch-0.1.0/README.md +116 -0
- udsearch-0.1.0/pyproject.toml +40 -0
- udsearch-0.1.0/setup.cfg +4 -0
- udsearch-0.1.0/tests/test_compat.py +364 -0
- udsearch-0.1.0/tests/test_conllu.py +113 -0
- udsearch-0.1.0/tests/test_match.py +80 -0
- udsearch-0.1.0/tests/test_pattern.py +55 -0
- udsearch-0.1.0/tests/test_rewrite.py +108 -0
- udsearch-0.1.0/tests/test_search.py +62 -0
- udsearch-0.1.0/tests/test_structural.py +298 -0
- udsearch-0.1.0/udsearch/__init__.py +40 -0
- udsearch-0.1.0/udsearch/_compat.py +225 -0
- udsearch-0.1.0/udsearch/cli.py +410 -0
- udsearch-0.1.0/udsearch/conllu.py +165 -0
- udsearch-0.1.0/udsearch/match.py +68 -0
- udsearch-0.1.0/udsearch/pattern.py +41 -0
- udsearch-0.1.0/udsearch/rewrite.py +118 -0
- udsearch-0.1.0/udsearch/search.py +158 -0
- udsearch-0.1.0/udsearch/structural.py +371 -0
- udsearch-0.1.0/udsearch/treebank.py +174 -0
- udsearch-0.1.0/udsearch.egg-info/PKG-INFO +140 -0
- udsearch-0.1.0/udsearch.egg-info/SOURCES.txt +25 -0
- udsearch-0.1.0/udsearch.egg-info/dependency_links.txt +1 -0
- udsearch-0.1.0/udsearch.egg-info/entry_points.txt +2 -0
- udsearch-0.1.0/udsearch.egg-info/top_level.txt +1 -0
udsearch-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024-2026 Furkan Akkurt
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
udsearch-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: udsearch
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Search, match, and batch-edit Universal Dependencies treebanks
|
|
5
|
+
Author-email: Furkan Akkurt <furkan.akkurt@bogazici.edu.tr>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://gitlab.com/furkan4829/tools/ud-tools
|
|
8
|
+
Project-URL: Issues, https://gitlab.com/furkan4829/tools/ud-tools/-/issues
|
|
9
|
+
Keywords: universal-dependencies,treebank,conllu,nlp,linguistics,search
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering
|
|
19
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Dynamic: license-file
|
|
24
|
+
|
|
25
|
+
# udsearch
|
|
26
|
+
|
|
27
|
+
Search, match, and batch-edit [Universal Dependencies](https://universaldependencies.org/) treebanks from the command line or as a Python library.
|
|
28
|
+
|
|
29
|
+
Pure Python, no external dependencies.
|
|
30
|
+
|
|
31
|
+
## Install
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install udsearch
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## CLI
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
# Search for tokens
|
|
41
|
+
udsearch "UPOS=NOUN & Case=Dat" -t Turkish-BOUN
|
|
42
|
+
udsearch "UPOS=NOUN|PROPN & deprel=nsubj" -f corpus.conllu
|
|
43
|
+
|
|
44
|
+
# Structural patterns (multi-node)
|
|
45
|
+
udsearch $'v: [UPOS=VERB]\ns: [UPOS=NOUN] -nsubj-> v' -f corpus.conllu
|
|
46
|
+
udsearch --pattern-file query.txt -f corpus.conllu
|
|
47
|
+
|
|
48
|
+
# Clustering (Grew-match style)
|
|
49
|
+
udsearch "UPOS=NOUN" -t Turkish-BOUN --cluster
|
|
50
|
+
udsearch "deprel=obl" -t Turkish-BOUN --cluster Case Number
|
|
51
|
+
|
|
52
|
+
# Batch rewrite (dry-run by default)
|
|
53
|
+
udsearch "UPOS=NOUN & lemma=yok" --set "UPOS=ADJ" --set "Polarity=Neg" -f tb.conllu
|
|
54
|
+
udsearch "UPOS=NOUN & lemma=yok" --set "UPOS=ADJ" -f tb.conllu --apply
|
|
55
|
+
|
|
56
|
+
# Structural rewrite (target specific nodes)
|
|
57
|
+
udsearch $'v: [UPOS=VERB]\ns: [] -nsubj-> v' --set "s.Case=Nom" -f tb.conllu --apply
|
|
58
|
+
|
|
59
|
+
# Treebank management
|
|
60
|
+
udsearch --list tr # list Turkish treebanks on GitHub
|
|
61
|
+
udsearch --list-cached # show downloaded treebanks
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Pattern syntax
|
|
65
|
+
|
|
66
|
+
### Single-node
|
|
67
|
+
|
|
68
|
+
```
|
|
69
|
+
UPOS=NOUN exact match
|
|
70
|
+
UPOS=NOUN|PROPN alternatives
|
|
71
|
+
lemma=/^yap/ regex
|
|
72
|
+
!PronType=Prs negation
|
|
73
|
+
PronType feature exists
|
|
74
|
+
UPOS=NOUN & Case=Dat conjunction
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Structural (multi-node)
|
|
78
|
+
|
|
79
|
+
```
|
|
80
|
+
v: [UPOS=VERB] named node
|
|
81
|
+
s: [UPOS=PRON & Case=Nom] -nsubj-> v dependency relation
|
|
82
|
+
!a: [UPOS=AUX] -aux-> v negated (must NOT exist)
|
|
83
|
+
d: [] -nsubj|obj-> v deprel alternatives
|
|
84
|
+
d: [] -/^nsubj/-> v deprel regex
|
|
85
|
+
d: [] -> v any relation
|
|
86
|
+
s << v linear precedence
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
### Rewrite operations
|
|
90
|
+
|
|
91
|
+
```
|
|
92
|
+
Polarity=Neg add/set feature
|
|
93
|
+
UPOS=ADJ change column field
|
|
94
|
+
-Case remove feature
|
|
95
|
+
MISC.Lang=en set MISC field
|
|
96
|
+
s.Case=Nom target node in structural pattern
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## Library usage
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
from udsearch import parse_conllu, parse_pattern, search_treebank
|
|
103
|
+
from udsearch import parse_structural, match_structural, search_structural
|
|
104
|
+
|
|
105
|
+
# Parse CoNLL-U
|
|
106
|
+
sentences = parse_conllu(open("corpus.conllu").read())
|
|
107
|
+
|
|
108
|
+
# Single-node search
|
|
109
|
+
pattern = parse_pattern("UPOS=VERB & Tense=Past")
|
|
110
|
+
for sent, tokens in search_treebank(sentences, pattern):
|
|
111
|
+
print(sent.sent_id, [t.form for t in tokens])
|
|
112
|
+
|
|
113
|
+
# Structural search
|
|
114
|
+
sp = parse_structural("v: [UPOS=VERB]\ns: [UPOS=NOUN] -nsubj-> v")
|
|
115
|
+
for sent, bindings in search_structural(sentences, sp):
|
|
116
|
+
for b in bindings:
|
|
117
|
+
print(f"{b['v'].form} <- {b['s'].form}")
|
|
118
|
+
|
|
119
|
+
# Batch rewrite
|
|
120
|
+
from udsearch import apply_operations, parse_set_operations
|
|
121
|
+
ops = parse_set_operations(["Polarity=Neg"])
|
|
122
|
+
for sent, tokens in search_treebank(sentences, pattern):
|
|
123
|
+
for token in tokens:
|
|
124
|
+
changes = apply_operations(token, ops)
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
### Dict-based API (for web apps)
|
|
128
|
+
|
|
129
|
+
```python
|
|
130
|
+
from udsearch import match_structural_dicts, apply_operations_to_dicts
|
|
131
|
+
|
|
132
|
+
# Works with dict-based wordlines (e.g., from a database)
|
|
133
|
+
wordlines = [{"id_f": "1", "form": "cat", "upos": "NOUN", ...}, ...]
|
|
134
|
+
matches = match_structural_dicts("UPOS=NOUN", wordlines)
|
|
135
|
+
modified, changes = apply_operations_to_dicts(wordlines, "UPOS=NOUN", ["Case=Acc"])
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
## License
|
|
139
|
+
|
|
140
|
+
MIT
|
udsearch-0.1.0/README.md
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
# udsearch
|
|
2
|
+
|
|
3
|
+
Search, match, and batch-edit [Universal Dependencies](https://universaldependencies.org/) treebanks from the command line or as a Python library.
|
|
4
|
+
|
|
5
|
+
Pure Python, no external dependencies.
|
|
6
|
+
|
|
7
|
+
## Install
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install udsearch
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
## CLI
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
# Search for tokens
|
|
17
|
+
udsearch "UPOS=NOUN & Case=Dat" -t Turkish-BOUN
|
|
18
|
+
udsearch "UPOS=NOUN|PROPN & deprel=nsubj" -f corpus.conllu
|
|
19
|
+
|
|
20
|
+
# Structural patterns (multi-node)
|
|
21
|
+
udsearch $'v: [UPOS=VERB]\ns: [UPOS=NOUN] -nsubj-> v' -f corpus.conllu
|
|
22
|
+
udsearch --pattern-file query.txt -f corpus.conllu
|
|
23
|
+
|
|
24
|
+
# Clustering (Grew-match style)
|
|
25
|
+
udsearch "UPOS=NOUN" -t Turkish-BOUN --cluster
|
|
26
|
+
udsearch "deprel=obl" -t Turkish-BOUN --cluster Case Number
|
|
27
|
+
|
|
28
|
+
# Batch rewrite (dry-run by default)
|
|
29
|
+
udsearch "UPOS=NOUN & lemma=yok" --set "UPOS=ADJ" --set "Polarity=Neg" -f tb.conllu
|
|
30
|
+
udsearch "UPOS=NOUN & lemma=yok" --set "UPOS=ADJ" -f tb.conllu --apply
|
|
31
|
+
|
|
32
|
+
# Structural rewrite (target specific nodes)
|
|
33
|
+
udsearch $'v: [UPOS=VERB]\ns: [] -nsubj-> v' --set "s.Case=Nom" -f tb.conllu --apply
|
|
34
|
+
|
|
35
|
+
# Treebank management
|
|
36
|
+
udsearch --list tr # list Turkish treebanks on GitHub
|
|
37
|
+
udsearch --list-cached # show downloaded treebanks
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Pattern syntax
|
|
41
|
+
|
|
42
|
+
### Single-node
|
|
43
|
+
|
|
44
|
+
```
|
|
45
|
+
UPOS=NOUN exact match
|
|
46
|
+
UPOS=NOUN|PROPN alternatives
|
|
47
|
+
lemma=/^yap/ regex
|
|
48
|
+
!PronType=Prs negation
|
|
49
|
+
PronType feature exists
|
|
50
|
+
UPOS=NOUN & Case=Dat conjunction
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
### Structural (multi-node)
|
|
54
|
+
|
|
55
|
+
```
|
|
56
|
+
v: [UPOS=VERB] named node
|
|
57
|
+
s: [UPOS=PRON & Case=Nom] -nsubj-> v dependency relation
|
|
58
|
+
!a: [UPOS=AUX] -aux-> v negated (must NOT exist)
|
|
59
|
+
d: [] -nsubj|obj-> v deprel alternatives
|
|
60
|
+
d: [] -/^nsubj/-> v deprel regex
|
|
61
|
+
d: [] -> v any relation
|
|
62
|
+
s << v linear precedence
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### Rewrite operations
|
|
66
|
+
|
|
67
|
+
```
|
|
68
|
+
Polarity=Neg add/set feature
|
|
69
|
+
UPOS=ADJ change column field
|
|
70
|
+
-Case remove feature
|
|
71
|
+
MISC.Lang=en set MISC field
|
|
72
|
+
s.Case=Nom target node in structural pattern
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## Library usage
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
from udsearch import parse_conllu, parse_pattern, search_treebank
|
|
79
|
+
from udsearch import parse_structural, match_structural, search_structural
|
|
80
|
+
|
|
81
|
+
# Parse CoNLL-U
|
|
82
|
+
sentences = parse_conllu(open("corpus.conllu").read())
|
|
83
|
+
|
|
84
|
+
# Single-node search
|
|
85
|
+
pattern = parse_pattern("UPOS=VERB & Tense=Past")
|
|
86
|
+
for sent, tokens in search_treebank(sentences, pattern):
|
|
87
|
+
print(sent.sent_id, [t.form for t in tokens])
|
|
88
|
+
|
|
89
|
+
# Structural search
|
|
90
|
+
sp = parse_structural("v: [UPOS=VERB]\ns: [UPOS=NOUN] -nsubj-> v")
|
|
91
|
+
for sent, bindings in search_structural(sentences, sp):
|
|
92
|
+
for b in bindings:
|
|
93
|
+
print(f"{b['v'].form} <- {b['s'].form}")
|
|
94
|
+
|
|
95
|
+
# Batch rewrite
|
|
96
|
+
from udsearch import apply_operations, parse_set_operations
|
|
97
|
+
ops = parse_set_operations(["Polarity=Neg"])
|
|
98
|
+
for sent, tokens in search_treebank(sentences, pattern):
|
|
99
|
+
for token in tokens:
|
|
100
|
+
changes = apply_operations(token, ops)
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### Dict-based API (for web apps)
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
from udsearch import match_structural_dicts, apply_operations_to_dicts
|
|
107
|
+
|
|
108
|
+
# Works with dict-based wordlines (e.g., from a database)
|
|
109
|
+
wordlines = [{"id_f": "1", "form": "cat", "upos": "NOUN", ...}, ...]
|
|
110
|
+
matches = match_structural_dicts("UPOS=NOUN", wordlines)
|
|
111
|
+
modified, changes = apply_operations_to_dicts(wordlines, "UPOS=NOUN", ["Case=Acc"])
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
## License
|
|
115
|
+
|
|
116
|
+
MIT
|
udsearch-0.1.0/pyproject.toml
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "udsearch"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Search, match, and batch-edit Universal Dependencies treebanks"
|
|
9
|
+
requires-python = ">=3.10"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
authors = [
|
|
12
|
+
{name = "Furkan Akkurt", email = "furkan.akkurt@bogazici.edu.tr"},
|
|
13
|
+
]
|
|
14
|
+
readme = "README.md"
|
|
15
|
+
keywords = ["universal-dependencies", "treebank", "conllu", "nlp", "linguistics", "search"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 4 - Beta",
|
|
18
|
+
"Intended Audience :: Science/Research",
|
|
19
|
+
"License :: OSI Approved :: MIT License",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Programming Language :: Python :: 3.10",
|
|
22
|
+
"Programming Language :: Python :: 3.11",
|
|
23
|
+
"Programming Language :: Python :: 3.12",
|
|
24
|
+
"Programming Language :: Python :: 3.13",
|
|
25
|
+
"Topic :: Scientific/Engineering",
|
|
26
|
+
"Topic :: Text Processing :: Linguistic",
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
[project.urls]
|
|
30
|
+
Homepage = "https://gitlab.com/furkan4829/tools/ud-tools"
|
|
31
|
+
Issues = "https://gitlab.com/furkan4829/tools/ud-tools/-/issues"
|
|
32
|
+
|
|
33
|
+
[project.scripts]
|
|
34
|
+
udsearch = "udsearch.cli:main_search"
|
|
35
|
+
|
|
36
|
+
[tool.setuptools]
|
|
37
|
+
packages = ["udsearch"]
|
|
38
|
+
|
|
39
|
+
[tool.ruff]
|
|
40
|
+
line-length = 120
|
udsearch-0.1.0/setup.cfg
ADDED
|
udsearch-0.1.0/tests/test_compat.py
ADDED
|
@@ -0,0 +1,364 @@
|
|
|
1
|
+
"""Tests for dict-based compatibility layer (BoAT integration)."""
|
|
2
|
+
|
|
3
|
+
from udsearch._compat import (
|
|
4
|
+
apply_operations_to_dicts,
|
|
5
|
+
dict_from_token,
|
|
6
|
+
match_structural_dicts,
|
|
7
|
+
sentence_from_wordlines,
|
|
8
|
+
token_from_dict,
|
|
9
|
+
)
|
|
10
|
+
from udsearch.conllu import Token
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# Sample wordline dicts in BoAT format
|
|
14
|
+
def _sample_wordlines():
    """Return wordline dicts for the four-token sentence 'The cat sat.'

    Each dict carries the ten string-valued keys ``id_f``, ``form``,
    ``lemma``, ``upos``, ``xpos``, ``feats``, ``head``, ``deprel``, ``deps``
    and ``misc``. A new list of new dicts is built on every call, so a test
    may mutate its copy without affecting other tests.
    """
    columns = ("id_f", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc")
    rows = (
        ("1", "The", "the", "DET", "_", "Definite=Def|PronType=Art", "2", "det", "_", "_"),
        ("2", "cat", "cat", "NOUN", "_", "Case=Nom|Number=Sing", "3", "nsubj", "_", "_"),
        ("3", "sat", "sit", "VERB", "_", "Mood=Ind|Tense=Past|VerbForm=Fin", "0", "root", "_", "_"),
        ("4", ".", ".", "PUNCT", "_", "_", "3", "punct", "_", "SpaceAfter=No"),
    )
    # zip pairs each value with its column name, keeping the key order above.
    return [dict(zip(columns, row)) for row in rows]
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _two_sentence_wordlines():
    """Return a ``(first, second)`` pair of wordline lists for multi-sentence tests.

    ``first`` is the result of ``_sample_wordlines()``; ``second`` is a
    three-token sentence (forms "Dogs", "run", ".") built here with the same
    ten keys per wordline.
    """
    first = _sample_wordlines()
    columns = ("id_f", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc")
    rows = (
        ("1", "Dogs", "dog", "NOUN", "_", "Case=Nom|Number=Plur", "2", "nsubj", "_", "_"),
        ("2", "run", "run", "VERB", "_", "Mood=Ind|Tense=Pres|VerbForm=Fin", "0", "root", "_", "SpaceAfter=No"),
        ("3", ".", ".", "PUNCT", "_", "_", "2", "punct", "_", "_"),
    )
    second = [dict(zip(columns, row)) for row in rows]
    return first, second
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
# ---------------------------------------------------------------------------
|
|
113
|
+
# token_from_dict / dict_from_token
|
|
114
|
+
# ---------------------------------------------------------------------------
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
class TestTokenConversion:
    """Checks for ``token_from_dict`` and ``dict_from_token``."""

    def test_roundtrip(self):
        """A wordline converts to a Token with matching fields and back to an equal dict."""
        cat = _sample_wordlines()[1]  # second wordline: "cat"
        tok = token_from_dict(cat)
        assert isinstance(tok, Token)

        expected = {
            "id": "2",
            "form": "cat",
            "upos": "NOUN",
            "feats": {"Case": "Nom", "Number": "Sing"},
            "head": "3",
            "deprel": "nsubj",
        }
        for attr, value in expected.items():
            assert getattr(tok, attr) == value

        # Converting back must reproduce the input wordline exactly.
        assert dict_from_token(tok) == cat

    def test_underscore_fields(self):
        """An ``xpos`` of "_" is kept as-is; a ``misc`` of "_" becomes {} and serializes back to "_"."""
        the = token_from_dict(_sample_wordlines()[0])  # "The": xpos=_, misc=_
        assert the.xpos == "_"
        assert the.misc == {}
        assert dict_from_token(the)["misc"] == "_"

    def test_misc_field(self):
        """A ``misc`` of "SpaceAfter=No" becomes a one-entry dict and serializes back unchanged."""
        period = token_from_dict(_sample_wordlines()[3])  # ".": misc=SpaceAfter=No
        assert period.misc == {"SpaceAfter": "No"}
        assert dict_from_token(period)["misc"] == "SpaceAfter=No"
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
# ---------------------------------------------------------------------------
|
|
150
|
+
# sentence_from_wordlines
|
|
151
|
+
# ---------------------------------------------------------------------------
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
class TestSentenceFromWordlines:
    """Checks for ``sentence_from_wordlines``."""

    def test_basic(self):
        """``sent_id``, ``text`` and the four tokens are carried onto the sentence."""
        sent = sentence_from_wordlines(_sample_wordlines(), sent_id="test-1", text="The cat sat.")
        assert sent.sent_id == "test-1"
        assert sent.text == "The cat sat."
        assert len(sent.tokens) == 4
        first, second = sent.tokens[0], sent.tokens[1]
        assert first.form == "The"
        assert second.upos == "NOUN"

    def test_mwt_preserved(self):
        """A multiword-token range line (id "1-2") is left out of ``sent.tokens``.

        NOTE(review): the original description also says range lines are
        preserved in ``_all_lines``; nothing here asserts that -- confirm
        against the implementation.
        """
        columns = ("id_f", "form", "lemma", "upos", "xpos", "feats", "head", "deprel", "deps", "misc")
        rows = (
            ("1-2", "Gidiyorum", "_", "_", "_", "_", "_", "_", "_", "SpaceAfter=No"),
            ("1", "Gidiyor", "git", "VERB", "_", "Aspect=Prog", "0", "root", "_", "_"),
            ("2", "um", "ben", "PRON", "_", "Case=Nom", "1", "nsubj", "_", "_"),
        )
        wordlines = [dict(zip(columns, row)) for row in rows]
        sent = sentence_from_wordlines(wordlines, sent_id="mwt", text="Gidiyorum.")
        # Only the two word-level lines remain searchable.
        assert len(sent.tokens) == 2
        assert sent.tokens[0].form == "Gidiyor"
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
# ---------------------------------------------------------------------------
|
|
211
|
+
# match_structural_dicts
|
|
212
|
+
# ---------------------------------------------------------------------------
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
class TestMatchStructuralDicts:
    """Checks for ``match_structural_dicts`` on the 'The cat sat.' wordlines."""

    def test_basic_search(self):
        """A single-node pattern yields one binding, stored under the key "target"."""
        hits = match_structural_dicts("UPOS=NOUN", _sample_wordlines())
        assert len(hits) == 1
        assert hits[0]["target"]["form"] == "cat"

    def test_structural_pattern_string(self):
        """A multi-node pattern given as a string binds every named node."""
        query = "v: [UPOS=VERB]\ns: [UPOS=NOUN] -nsubj-> v"
        hits = match_structural_dicts(query, _sample_wordlines())
        assert len(hits) == 1
        binding = hits[0]
        assert binding["v"]["form"] == "sat"
        assert binding["s"]["form"] == "cat"

    def test_negated_node(self):
        """A root VERB with no advmod dependent satisfies the negated-node pattern."""
        query = "v: [UPOS=VERB & deprel=root]\n!a: [] -advmod-> v"
        hits = match_structural_dicts(query, _sample_wordlines())
        assert len(hits) == 1
        assert hits[0]["v"]["form"] == "sat"

    def test_no_match(self):
        """A pattern matching nothing returns an empty list."""
        assert match_structural_dicts("UPOS=NUM", _sample_wordlines()) == []

    def test_result_is_dict(self):
        """Bound nodes come back as wordline dicts, not Token objects."""
        target = match_structural_dicts("UPOS=NOUN", _sample_wordlines())[0]["target"]
        assert isinstance(target, dict)
        for key in ("id_f", "form"):
            assert key in target

    def test_with_parsed_structural_pattern(self):
        """An already-parsed structural pattern is accepted in place of a string."""
        from udsearch.structural import parse_structural

        parsed = parse_structural("v: [UPOS=VERB]\ns: [] -nsubj-> v")
        hits = match_structural_dicts(parsed, _sample_wordlines())
        assert len(hits) == 1
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
# ---------------------------------------------------------------------------
|
|
266
|
+
# apply_operations_to_dicts
|
|
267
|
+
# ---------------------------------------------------------------------------
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
class TestApplyOperationsToDicts:
    """Checks for ``apply_operations_to_dicts`` on the 'The cat sat.' wordlines."""

    @staticmethod
    def _wordline(wordlines, id_f):
        """Return the first wordline in *wordlines* whose ``id_f`` equals *id_f*."""
        return next(wl for wl in wordlines if wl["id_f"] == id_f)

    def test_simple_rewrite(self):
        """A single-node rewrite changes only the matched token and reports it."""
        modified, changes = apply_operations_to_dicts(
            _sample_wordlines(),
            pattern="UPOS=NOUN",
            operations=["Number=Plur"],
        )
        # Only "cat" is a NOUN, so exactly one change record is expected.
        assert len(changes) == 1
        record = changes[0]
        assert record["token_id"] == "2"
        assert record["form"] == "cat"
        assert "Number: Sing → Plur" in record["descriptions"][0]

        # The rewritten wordline carries the new value and not the old one.
        cat_feats = self._wordline(modified, "2")["feats"]
        assert "Number=Plur" in cat_feats
        assert "Number=Sing" not in cat_feats

        # A token that did not match is returned unchanged.
        assert self._wordline(modified, "1") == _sample_wordlines()[0]

    def test_structural_rewrite_node_targeted(self):
        """An operation prefixed with a node name ("s.") is applied to that node."""
        modified, changes = apply_operations_to_dicts(
            _sample_wordlines(),
            pattern="v: [UPOS=VERB]\ns: [UPOS=NOUN] -nsubj-> v",
            operations=["s.Case=Acc"],  # addressed to the nsubj node
        )
        assert len(changes) == 1
        record = changes[0]
        assert record["node_name"] == "s"
        assert record["token_id"] == "2"
        assert "Case: Nom → Acc" in record["descriptions"][0]

        assert "Case=Acc" in self._wordline(modified, "2")["feats"]

    def test_structural_rewrite_default_ops(self):
        """An operation without a node prefix is applied to the anchor node (here ``v``)."""
        modified, changes = apply_operations_to_dicts(
            _sample_wordlines(),
            pattern="v: [UPOS=VERB]\ns: [UPOS=NOUN] -nsubj-> v",
            operations=["Polarity=Neg"],  # unprefixed, so it lands on v
        )
        assert len(changes) == 1
        record = changes[0]
        assert record["node_name"] == "v"
        assert record["token_id"] == "3"

        assert "Polarity=Neg" in self._wordline(modified, "3")["feats"]

    def test_multiple_operations(self):
        """Several operations on one matched token produce a single change record."""
        modified, changes = apply_operations_to_dicts(
            _sample_wordlines(),
            pattern="UPOS=NOUN",
            operations=["UPOS=PROPN", "Number=Plur"],
        )
        assert len(changes) == 1
        cat = self._wordline(modified, "2")
        assert cat["upos"] == "PROPN"
        assert "Number=Plur" in cat["feats"]

    def test_no_match_no_changes(self):
        """With no matching token there are no changes and the wordlines are unchanged."""
        modified, changes = apply_operations_to_dicts(
            _sample_wordlines(),
            pattern="UPOS=NUM",
            operations=["Number=Plur"],
        )
        assert changes == []
        assert modified == _sample_wordlines()

    def test_remove_feature(self):
        """A "-Feature" operation removes that feature and keeps the others."""
        modified, changes = apply_operations_to_dicts(
            _sample_wordlines(),
            pattern="UPOS=NOUN",
            operations=["-Case"],
        )
        assert len(changes) == 1
        cat_feats = self._wordline(modified, "2")["feats"]
        assert "Case" not in cat_feats
        assert "Number=Sing" in cat_feats

    def test_preserves_wordline_count(self):
        """The returned list has as many wordlines as the input."""
        source = _sample_wordlines()
        modified, _ = apply_operations_to_dicts(
            source,
            pattern="UPOS=NOUN",
            operations=["Number=Plur"],
        )
        assert len(modified) == len(source)
|