udsearch 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
udsearch-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024-2026 Furkan Akkurt
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,140 @@
1
+ Metadata-Version: 2.4
2
+ Name: udsearch
3
+ Version: 0.1.0
4
+ Summary: Search, match, and batch-edit Universal Dependencies treebanks
5
+ Author-email: Furkan Akkurt <furkan.akkurt@bogazici.edu.tr>
6
+ License: MIT
7
+ Project-URL: Homepage, https://gitlab.com/furkan4829/tools/ud-tools
8
+ Project-URL: Issues, https://gitlab.com/furkan4829/tools/ud-tools/-/issues
9
+ Keywords: universal-dependencies,treebank,conllu,nlp,linguistics,search
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Topic :: Scientific/Engineering
19
+ Classifier: Topic :: Text Processing :: Linguistic
20
+ Requires-Python: >=3.10
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Dynamic: license-file
24
+
25
+ # udsearch
26
+
27
+ Search, match, and batch-edit [Universal Dependencies](https://universaldependencies.org/) treebanks from the command line or as a Python library.
28
+
29
+ Pure Python, no external dependencies.
30
+
31
+ ## Install
32
+
33
+ ```bash
34
+ pip install udsearch
35
+ ```
36
+
37
+ ## CLI
38
+
39
+ ```bash
40
+ # Search for tokens
41
+ udsearch "UPOS=NOUN & Case=Dat" -t Turkish-BOUN
42
+ udsearch "UPOS=NOUN|PROPN & deprel=nsubj" -f corpus.conllu
43
+
44
+ # Structural patterns (multi-node)
45
+ udsearch $'v: [UPOS=VERB]\ns: [UPOS=NOUN] -nsubj-> v' -f corpus.conllu
46
+ udsearch --pattern-file query.txt -f corpus.conllu
47
+
48
+ # Clustering (Grew-match style)
49
+ udsearch "UPOS=NOUN" -t Turkish-BOUN --cluster
50
+ udsearch "deprel=obl" -t Turkish-BOUN --cluster Case Number
51
+
52
+ # Batch rewrite (dry-run by default)
53
+ udsearch "UPOS=NOUN & lemma=yok" --set "UPOS=ADJ" --set "Polarity=Neg" -f tb.conllu
54
+ udsearch "UPOS=NOUN & lemma=yok" --set "UPOS=ADJ" -f tb.conllu --apply
55
+
56
+ # Structural rewrite (target specific nodes)
57
+ udsearch $'v: [UPOS=VERB]\ns: [] -nsubj-> v' --set "s.Case=Nom" -f tb.conllu --apply
58
+
59
+ # Treebank management
60
+ udsearch --list tr # list Turkish treebanks on GitHub
61
+ udsearch --list-cached # show downloaded treebanks
62
+ ```
63
+
64
+ ## Pattern syntax
65
+
66
+ ### Single-node
67
+
68
+ ```
69
+ UPOS=NOUN exact match
70
+ UPOS=NOUN|PROPN alternatives
71
+ lemma=/^yap/ regex
72
+ !PronType=Prs negation
73
+ PronType feature exists
74
+ UPOS=NOUN & Case=Dat conjunction
75
+ ```
76
+
77
+ ### Structural (multi-node)
78
+
79
+ ```
80
+ v: [UPOS=VERB] named node
81
+ s: [UPOS=PRON & Case=Nom] -nsubj-> v dependency relation
82
+ !a: [UPOS=AUX] -aux-> v negated (must NOT exist)
83
+ d: [] -nsubj|obj-> v deprel alternatives
84
+ d: [] -/^nsubj/-> v deprel regex
85
+ d: [] -> v any relation
86
+ s << v linear precedence
87
+ ```
88
+
89
+ ### Rewrite operations
90
+
91
+ ```
92
+ Polarity=Neg add/set feature
93
+ UPOS=ADJ change column field
94
+ -Case remove feature
95
+ MISC.Lang=en set MISC field
96
+ s.Case=Nom target node in structural pattern
97
+ ```
98
+
99
+ ## Library usage
100
+
101
+ ```python
102
+ from udsearch import parse_conllu, parse_pattern, search_treebank
103
+ from udsearch import parse_structural, match_structural
104
+
105
+ # Parse CoNLL-U
106
+ sentences = parse_conllu(open("corpus.conllu").read())
107
+
108
+ # Single-node search
109
+ pattern = parse_pattern("UPOS=VERB & Tense=Past")
110
+ for sent, tokens in search_treebank(sentences, pattern):
111
+ print(sent.sent_id, [t.form for t in tokens])
112
+
113
+ # Structural search
114
+ sp = parse_structural("v: [UPOS=VERB]\ns: [UPOS=NOUN] -nsubj-> v")
115
+ for sent, bindings in match_structural(sentences, sp):
116
+ for b in bindings:
117
+ print(f"{b['v'].form} <- {b['s'].form}")
118
+
119
+ # Batch rewrite
120
+ from udsearch import apply_operations, parse_set_operations
121
+ ops = parse_set_operations(["Polarity=Neg"])
122
+ for sent, tokens in search_treebank(sentences, pattern):
123
+ for token in tokens:
124
+ changes = apply_operations(token, ops)
125
+ ```
126
+
127
+ ### Dict-based API (for web apps)
128
+
129
+ ```python
130
+ from udsearch import match_structural_dicts, apply_operations_to_dicts
131
+
132
+ # Works with dict-based wordlines (e.g., from a database)
133
+ wordlines = [{"id_f": "1", "form": "cat", "upos": "NOUN", ...}, ...]
134
+ matches = match_structural_dicts("UPOS=NOUN", wordlines)
135
+ modified, changes = apply_operations_to_dicts(wordlines, "UPOS=NOUN", ["Case=Acc"])
136
+ ```
137
+
138
+ ## License
139
+
140
+ MIT
@@ -0,0 +1,116 @@
1
+ # udsearch
2
+
3
+ Search, match, and batch-edit [Universal Dependencies](https://universaldependencies.org/) treebanks from the command line or as a Python library.
4
+
5
+ Pure Python, no external dependencies.
6
+
7
+ ## Install
8
+
9
+ ```bash
10
+ pip install udsearch
11
+ ```
12
+
13
+ ## CLI
14
+
15
+ ```bash
16
+ # Search for tokens
17
+ udsearch "UPOS=NOUN & Case=Dat" -t Turkish-BOUN
18
+ udsearch "UPOS=NOUN|PROPN & deprel=nsubj" -f corpus.conllu
19
+
20
+ # Structural patterns (multi-node)
21
+ udsearch $'v: [UPOS=VERB]\ns: [UPOS=NOUN] -nsubj-> v' -f corpus.conllu
22
+ udsearch --pattern-file query.txt -f corpus.conllu
23
+
24
+ # Clustering (Grew-match style)
25
+ udsearch "UPOS=NOUN" -t Turkish-BOUN --cluster
26
+ udsearch "deprel=obl" -t Turkish-BOUN --cluster Case Number
27
+
28
+ # Batch rewrite (dry-run by default)
29
+ udsearch "UPOS=NOUN & lemma=yok" --set "UPOS=ADJ" --set "Polarity=Neg" -f tb.conllu
30
+ udsearch "UPOS=NOUN & lemma=yok" --set "UPOS=ADJ" -f tb.conllu --apply
31
+
32
+ # Structural rewrite (target specific nodes)
33
+ udsearch $'v: [UPOS=VERB]\ns: [] -nsubj-> v' --set "s.Case=Nom" -f tb.conllu --apply
34
+
35
+ # Treebank management
36
+ udsearch --list tr # list Turkish treebanks on GitHub
37
+ udsearch --list-cached # show downloaded treebanks
38
+ ```
39
+
40
+ ## Pattern syntax
41
+
42
+ ### Single-node
43
+
44
+ ```
45
+ UPOS=NOUN exact match
46
+ UPOS=NOUN|PROPN alternatives
47
+ lemma=/^yap/ regex
48
+ !PronType=Prs negation
49
+ PronType feature exists
50
+ UPOS=NOUN & Case=Dat conjunction
51
+ ```
52
+
53
+ ### Structural (multi-node)
54
+
55
+ ```
56
+ v: [UPOS=VERB] named node
57
+ s: [UPOS=PRON & Case=Nom] -nsubj-> v dependency relation
58
+ !a: [UPOS=AUX] -aux-> v negated (must NOT exist)
59
+ d: [] -nsubj|obj-> v deprel alternatives
60
+ d: [] -/^nsubj/-> v deprel regex
61
+ d: [] -> v any relation
62
+ s << v linear precedence
63
+ ```
64
+
65
+ ### Rewrite operations
66
+
67
+ ```
68
+ Polarity=Neg add/set feature
69
+ UPOS=ADJ change column field
70
+ -Case remove feature
71
+ MISC.Lang=en set MISC field
72
+ s.Case=Nom target node in structural pattern
73
+ ```
74
+
75
+ ## Library usage
76
+
77
+ ```python
78
+ from udsearch import parse_conllu, parse_pattern, search_treebank
79
+ from udsearch import parse_structural, match_structural
80
+
81
+ # Parse CoNLL-U
82
+ sentences = parse_conllu(open("corpus.conllu").read())
83
+
84
+ # Single-node search
85
+ pattern = parse_pattern("UPOS=VERB & Tense=Past")
86
+ for sent, tokens in search_treebank(sentences, pattern):
87
+ print(sent.sent_id, [t.form for t in tokens])
88
+
89
+ # Structural search
90
+ sp = parse_structural("v: [UPOS=VERB]\ns: [UPOS=NOUN] -nsubj-> v")
91
+ for sent, bindings in match_structural(sentences, sp):
92
+ for b in bindings:
93
+ print(f"{b['v'].form} <- {b['s'].form}")
94
+
95
+ # Batch rewrite
96
+ from udsearch import apply_operations, parse_set_operations
97
+ ops = parse_set_operations(["Polarity=Neg"])
98
+ for sent, tokens in search_treebank(sentences, pattern):
99
+ for token in tokens:
100
+ changes = apply_operations(token, ops)
101
+ ```
102
+
103
+ ### Dict-based API (for web apps)
104
+
105
+ ```python
106
+ from udsearch import match_structural_dicts, apply_operations_to_dicts
107
+
108
+ # Works with dict-based wordlines (e.g., from a database)
109
+ wordlines = [{"id_f": "1", "form": "cat", "upos": "NOUN", ...}, ...]
110
+ matches = match_structural_dicts("UPOS=NOUN", wordlines)
111
+ modified, changes = apply_operations_to_dicts(wordlines, "UPOS=NOUN", ["Case=Acc"])
112
+ ```
113
+
114
+ ## License
115
+
116
+ MIT
@@ -0,0 +1,40 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "udsearch"
7
+ version = "0.1.0"
8
+ description = "Search, match, and batch-edit Universal Dependencies treebanks"
9
+ requires-python = ">=3.10"
10
+ license = {text = "MIT"}
11
+ authors = [
12
+ {name = "Furkan Akkurt", email = "furkan.akkurt@bogazici.edu.tr"},
13
+ ]
14
+ readme = "README.md"
15
+ keywords = ["universal-dependencies", "treebank", "conllu", "nlp", "linguistics", "search"]
16
+ classifiers = [
17
+ "Development Status :: 4 - Beta",
18
+ "Intended Audience :: Science/Research",
19
+ "License :: OSI Approved :: MIT License",
20
+ "Programming Language :: Python :: 3",
21
+ "Programming Language :: Python :: 3.10",
22
+ "Programming Language :: Python :: 3.11",
23
+ "Programming Language :: Python :: 3.12",
24
+ "Programming Language :: Python :: 3.13",
25
+ "Topic :: Scientific/Engineering",
26
+ "Topic :: Text Processing :: Linguistic",
27
+ ]
28
+
29
+ [project.urls]
30
+ Homepage = "https://gitlab.com/furkan4829/tools/ud-tools"
31
+ Issues = "https://gitlab.com/furkan4829/tools/ud-tools/-/issues"
32
+
33
+ [project.scripts]
34
+ udsearch = "udsearch.cli:main_search"
35
+
36
+ [tool.setuptools]
37
+ packages = ["udsearch"]
38
+
39
+ [tool.ruff]
40
+ line-length = 120
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,364 @@
1
+ """Tests for dict-based compatibility layer (BoAT integration)."""
2
+
3
+ from udsearch._compat import (
4
+ apply_operations_to_dicts,
5
+ dict_from_token,
6
+ match_structural_dicts,
7
+ sentence_from_wordlines,
8
+ token_from_dict,
9
+ )
10
+ from udsearch.conllu import Token
11
+
12
+
13
+ # Sample wordline dicts in BoAT format
14
+ def _sample_wordlines():
15
+ """A simple sentence: 'The cat sat.'"""
16
+ return [
17
+ {
18
+ "id_f": "1",
19
+ "form": "The",
20
+ "lemma": "the",
21
+ "upos": "DET",
22
+ "xpos": "_",
23
+ "feats": "Definite=Def|PronType=Art",
24
+ "head": "2",
25
+ "deprel": "det",
26
+ "deps": "_",
27
+ "misc": "_",
28
+ },
29
+ {
30
+ "id_f": "2",
31
+ "form": "cat",
32
+ "lemma": "cat",
33
+ "upos": "NOUN",
34
+ "xpos": "_",
35
+ "feats": "Case=Nom|Number=Sing",
36
+ "head": "3",
37
+ "deprel": "nsubj",
38
+ "deps": "_",
39
+ "misc": "_",
40
+ },
41
+ {
42
+ "id_f": "3",
43
+ "form": "sat",
44
+ "lemma": "sit",
45
+ "upos": "VERB",
46
+ "xpos": "_",
47
+ "feats": "Mood=Ind|Tense=Past|VerbForm=Fin",
48
+ "head": "0",
49
+ "deprel": "root",
50
+ "deps": "_",
51
+ "misc": "_",
52
+ },
53
+ {
54
+ "id_f": "4",
55
+ "form": ".",
56
+ "lemma": ".",
57
+ "upos": "PUNCT",
58
+ "xpos": "_",
59
+ "feats": "_",
60
+ "head": "3",
61
+ "deprel": "punct",
62
+ "deps": "_",
63
+ "misc": "SpaceAfter=No",
64
+ },
65
+ ]
66
+
67
+
68
def _two_sentence_wordlines():
    """Two sentences for multi-sentence testing."""
    keys = ("id_f", "form", "lemma", "upos", "xpos",
            "feats", "head", "deprel", "deps", "misc")
    # Second sentence: "Dogs run."
    second = [
        dict(zip(keys, row))
        for row in (
            ("1", "Dogs", "dog", "NOUN", "_",
             "Case=Nom|Number=Plur", "2", "nsubj", "_", "_"),
            ("2", "run", "run", "VERB", "_",
             "Mood=Ind|Tense=Pres|VerbForm=Fin", "0", "root", "_", "SpaceAfter=No"),
            ("3", ".", ".", "PUNCT", "_",
             "_", "2", "punct", "_", "_"),
        )
    ]
    # First sentence is the shared "The cat sat." fixture.
    return _sample_wordlines(), second
110
+
111
+
112
+ # ---------------------------------------------------------------------------
113
+ # token_from_dict / dict_from_token
114
+ # ---------------------------------------------------------------------------
115
+
116
+
117
class TestTokenConversion:
    """Round-trip conversion between BoAT wordline dicts and Token objects."""

    def test_roundtrip(self):
        # "cat" carries features, a head, and a deprel — a fully populated wordline.
        wordline = _sample_wordlines()[1]
        tok = token_from_dict(wordline)
        assert isinstance(tok, Token)
        assert tok.id == "2"
        assert tok.form == "cat"
        assert tok.upos == "NOUN"
        assert tok.feats == {"Case": "Nom", "Number": "Sing"}
        assert tok.head == "3"
        assert tok.deprel == "nsubj"
        # Converting back must reproduce the original dict exactly.
        assert dict_from_token(tok) == wordline

    def test_underscore_fields(self):
        # "The" has xpos and misc set to the CoNLL-U placeholder "_":
        # xpos stays a literal "_", while an empty misc becomes {}.
        tok = token_from_dict(_sample_wordlines()[0])
        assert tok.xpos == "_"
        assert tok.misc == {}
        assert dict_from_token(tok)["misc"] == "_"

    def test_misc_field(self):
        # The period carries a real MISC entry (SpaceAfter=No); it must
        # round-trip through the dict form of the field.
        tok = token_from_dict(_sample_wordlines()[3])
        assert tok.misc == {"SpaceAfter": "No"}
        assert dict_from_token(tok)["misc"] == "SpaceAfter=No"
147
+
148
+
149
+ # ---------------------------------------------------------------------------
150
+ # sentence_from_wordlines
151
+ # ---------------------------------------------------------------------------
152
+
153
+
154
class TestSentenceFromWordlines:
    """Building Sentence objects from lists of wordline dicts."""

    def test_basic(self):
        sent = sentence_from_wordlines(
            _sample_wordlines(), sent_id="test-1", text="The cat sat."
        )
        assert sent.sent_id == "test-1"
        assert sent.text == "The cat sat."
        assert len(sent.tokens) == 4
        assert sent.tokens[0].form == "The"
        assert sent.tokens[1].upos == "NOUN"

    def test_mwt_preserved(self):
        """MWT range tokens are preserved in _all_lines but excluded from tokens."""
        keys = ("id_f", "form", "lemma", "upos", "xpos",
                "feats", "head", "deprel", "deps", "misc")
        rows = (
            # "1-2" is a multiword-token range line (Turkish "Gidiyorum").
            ("1-2", "Gidiyorum", "_", "_", "_",
             "_", "_", "_", "_", "SpaceAfter=No"),
            ("1", "Gidiyor", "git", "VERB", "_",
             "Aspect=Prog", "0", "root", "_", "_"),
            ("2", "um", "ben", "PRON", "_",
             "Case=Nom", "1", "nsubj", "_", "_"),
        )
        wordlines = [dict(zip(keys, row)) for row in rows]
        sent = sentence_from_wordlines(wordlines, sent_id="mwt", text="Gidiyorum.")
        # Only the two syntactic words are searchable; the range line is not.
        assert len(sent.tokens) == 2
        assert sent.tokens[0].form == "Gidiyor"
208
+
209
+
210
+ # ---------------------------------------------------------------------------
211
+ # match_structural_dicts
212
+ # ---------------------------------------------------------------------------
213
+
214
+
215
class TestMatchStructuralDicts:
    """Pattern matching over wordline dicts via match_structural_dicts."""

    def test_basic_search(self):
        # A single-node pattern binds the match under the "target" key.
        hits = match_structural_dicts("UPOS=NOUN", _sample_wordlines())
        assert len(hits) == 1
        assert hits[0]["target"]["form"] == "cat"

    def test_structural_pattern_string(self):
        # Multi-node pattern: each named node appears in the binding.
        hits = match_structural_dicts(
            "v: [UPOS=VERB]\ns: [UPOS=NOUN] -nsubj-> v",
            _sample_wordlines(),
        )
        assert len(hits) == 1
        assert hits[0]["v"]["form"] == "sat"
        assert hits[0]["s"]["form"] == "cat"

    def test_negated_node(self):
        # The root VERB has no advmod dependent, so the negated node succeeds.
        hits = match_structural_dicts(
            "v: [UPOS=VERB & deprel=root]\n!a: [] -advmod-> v",
            _sample_wordlines(),
        )
        assert len(hits) == 1
        assert hits[0]["v"]["form"] == "sat"

    def test_no_match(self):
        assert match_structural_dicts("UPOS=NUM", _sample_wordlines()) == []

    def test_result_is_dict(self):
        """Results should be dicts, not Token objects."""
        binding = match_structural_dicts("UPOS=NOUN", _sample_wordlines())[0]
        assert isinstance(binding["target"], dict)
        assert "id_f" in binding["target"]
        assert "form" in binding["target"]

    def test_with_parsed_structural_pattern(self):
        # A pre-parsed pattern object is accepted in place of a string.
        from udsearch.structural import parse_structural

        compiled = parse_structural("v: [UPOS=VERB]\ns: [] -nsubj-> v")
        assert len(match_structural_dicts(compiled, _sample_wordlines())) == 1
263
+
264
+
265
+ # ---------------------------------------------------------------------------
266
+ # apply_operations_to_dicts
267
+ # ---------------------------------------------------------------------------
268
+
269
+
270
class TestApplyOperationsToDicts:
    """Batch rewriting of wordline dicts via apply_operations_to_dicts.

    Each test checks the returned pair ``(modified, changes)``: ``modified``
    is the full wordline list with edits applied, ``changes`` is a change
    log with per-token entries (``token_id``, ``form``, ``descriptions``,
    and — for structural patterns — ``node_name``).
    """

    def test_simple_rewrite(self):
        wls = _sample_wordlines()
        modified, changes = apply_operations_to_dicts(
            wls,
            pattern="UPOS=NOUN",
            operations=["Number=Plur"],
        )
        # Only cat (NOUN) should be modified
        assert len(changes) == 1
        assert changes[0]["token_id"] == "2"
        assert changes[0]["form"] == "cat"
        assert "Number: Sing → Plur" in changes[0]["descriptions"][0]

        # Check modified wordlines
        cat_wl = next(wl for wl in modified if wl["id_f"] == "2")
        assert "Number=Plur" in cat_wl["feats"]
        assert "Number=Sing" not in cat_wl["feats"]

        # Other tokens unchanged (compared against a fresh fixture copy)
        the_wl = next(wl for wl in modified if wl["id_f"] == "1")
        assert the_wl == _sample_wordlines()[0]

    def test_structural_rewrite_node_targeted(self):
        # An "s."-prefixed operation edits the node bound to s, not the anchor.
        wls = _sample_wordlines()
        modified, changes = apply_operations_to_dicts(
            wls,
            pattern="v: [UPOS=VERB]\ns: [UPOS=NOUN] -nsubj-> v",
            operations=["s.Case=Acc"],  # target the nsubj node
        )
        assert len(changes) == 1
        assert changes[0]["node_name"] == "s"
        assert changes[0]["token_id"] == "2"
        assert "Case: Nom → Acc" in changes[0]["descriptions"][0]

        cat_wl = next(wl for wl in modified if wl["id_f"] == "2")
        assert "Case=Acc" in cat_wl["feats"]

    def test_structural_rewrite_default_ops(self):
        """Default ops (no node prefix) apply to anchor node."""
        wls = _sample_wordlines()
        modified, changes = apply_operations_to_dicts(
            wls,
            pattern="v: [UPOS=VERB]\ns: [UPOS=NOUN] -nsubj-> v",
            operations=["Polarity=Neg"],  # no prefix -> anchor (v)
        )
        assert len(changes) == 1
        assert changes[0]["node_name"] == "v"
        assert changes[0]["token_id"] == "3"

        verb_wl = next(wl for wl in modified if wl["id_f"] == "3")
        assert "Polarity=Neg" in verb_wl["feats"]

    def test_multiple_operations(self):
        # Several operations on one matched token land in a single change entry.
        wls = _sample_wordlines()
        modified, changes = apply_operations_to_dicts(
            wls,
            pattern="UPOS=NOUN",
            operations=["UPOS=PROPN", "Number=Plur"],
        )
        assert len(changes) == 1
        cat_wl = next(wl for wl in modified if wl["id_f"] == "2")
        assert cat_wl["upos"] == "PROPN"
        assert "Number=Plur" in cat_wl["feats"]

    def test_no_match_no_changes(self):
        # No matching token: empty change log and wordlines left untouched.
        wls = _sample_wordlines()
        modified, changes = apply_operations_to_dicts(
            wls,
            pattern="UPOS=NUM",
            operations=["Number=Plur"],
        )
        assert changes == []
        assert modified == _sample_wordlines()

    def test_remove_feature(self):
        # "-Case" deletes only that feature; sibling features survive.
        wls = _sample_wordlines()
        modified, changes = apply_operations_to_dicts(
            wls,
            pattern="UPOS=NOUN",
            operations=["-Case"],
        )
        assert len(changes) == 1
        cat_wl = next(wl for wl in modified if wl["id_f"] == "2")
        assert "Case" not in cat_wl["feats"]
        assert "Number=Sing" in cat_wl["feats"]

    def test_preserves_wordline_count(self):
        # Rewriting edits tokens in place logically — it never adds or drops lines.
        wls = _sample_wordlines()
        modified, _ = apply_operations_to_dicts(
            wls,
            pattern="UPOS=NOUN",
            operations=["Number=Plur"],
        )
        assert len(modified) == len(wls)