hyperbase-parser-ab 0.2.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hyperbase_parser_ab-0.2.0 → hyperbase_parser_ab-0.3.0}/CHANGELOG.md +14 -0
- {hyperbase_parser_ab-0.2.0 → hyperbase_parser_ab-0.3.0}/PKG-INFO +3 -3
- hyperbase_parser_ab-0.3.0/VERSION +1 -0
- {hyperbase_parser_ab-0.2.0 → hyperbase_parser_ab-0.3.0}/pyproject.toml +2 -2
- {hyperbase_parser_ab-0.2.0 → hyperbase_parser_ab-0.3.0}/src/hyperbase_parser_ab/parser.py +134 -40
- hyperbase_parser_ab-0.3.0/src/hyperbase_parser_ab/repl.py +82 -0
- {hyperbase_parser_ab-0.2.0 → hyperbase_parser_ab-0.3.0}/src/hyperbase_parser_ab/rules.py +2 -1
- {hyperbase_parser_ab-0.2.0 → hyperbase_parser_ab-0.3.0}/tests/test_parser.py +85 -8
- {hyperbase_parser_ab-0.2.0 → hyperbase_parser_ab-0.3.0}/tests/test_parser_helpers.py +2 -1
- {hyperbase_parser_ab-0.2.0 → hyperbase_parser_ab-0.3.0}/tests/test_rules.py +1 -1
- hyperbase_parser_ab-0.2.0/VERSION +0 -1
- {hyperbase_parser_ab-0.2.0 → hyperbase_parser_ab-0.3.0}/.github/workflows/publish.yml +0 -0
- {hyperbase_parser_ab-0.2.0 → hyperbase_parser_ab-0.3.0}/.gitignore +0 -0
- {hyperbase_parser_ab-0.2.0 → hyperbase_parser_ab-0.3.0}/.pre-commit-config.yaml +0 -0
- {hyperbase_parser_ab-0.2.0 → hyperbase_parser_ab-0.3.0}/LICENSE +0 -0
- {hyperbase_parser_ab-0.2.0 → hyperbase_parser_ab-0.3.0}/README.md +0 -0
- {hyperbase_parser_ab-0.2.0 → hyperbase_parser_ab-0.3.0}/scripts/generate_alpha_training_data.py +0 -0
- {hyperbase_parser_ab-0.2.0 → hyperbase_parser_ab-0.3.0}/scripts/train_atomizer.py +0 -0
- {hyperbase_parser_ab-0.2.0 → hyperbase_parser_ab-0.3.0}/src/hyperbase_parser_ab/__init__.py +0 -0
- {hyperbase_parser_ab-0.2.0 → hyperbase_parser_ab-0.3.0}/src/hyperbase_parser_ab/alpha.py +0 -0
- {hyperbase_parser_ab-0.2.0 → hyperbase_parser_ab-0.3.0}/src/hyperbase_parser_ab/atomizer.py +0 -0
- {hyperbase_parser_ab-0.2.0 → hyperbase_parser_ab-0.3.0}/src/hyperbase_parser_ab/lang_models.py +0 -0
- {hyperbase_parser_ab-0.2.0 → hyperbase_parser_ab-0.3.0}/src/hyperbase_parser_ab/sentensizer.py +0 -0
- {hyperbase_parser_ab-0.2.0 → hyperbase_parser_ab-0.3.0}/tests/__init__.py +0 -0
|
@@ -1,5 +1,19 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [0.3.0] - 11-04-2026
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
|
|
7
|
+
- Maximum depth protection.
|
|
8
|
+
- Conjunction flattening.
|
|
9
|
+
- Show dependency parse tree on REPL.
|
|
10
|
+
- lang_namespace parameter, defaults to False (no language namespaces in atoms).
|
|
11
|
+
|
|
12
|
+
### Changed
|
|
13
|
+
|
|
14
|
+
- Adopted new hyperbase API (0.10.0).
|
|
15
|
+
- Adopted REPL API.
|
|
16
|
+
|
|
3
17
|
## [0.2.0] - 05-04-2026
|
|
4
18
|
|
|
5
19
|
### Changed
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hyperbase-parser-ab
|
|
3
|
-
Version: 0.
|
|
4
|
-
Summary: Semantic Hypergraph
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: Semantic Hypergraph Alpha-Beta Parser
|
|
5
5
|
Project-URL: Homepage, https://hyperquest.ai/hyperbase
|
|
6
6
|
Author-email: "Telmo Menezes et al." <telmo@telmomenezes.net>
|
|
7
7
|
License-Expression: MIT
|
|
@@ -15,7 +15,7 @@ Classifier: Programming Language :: Python :: 3
|
|
|
15
15
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
16
16
|
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
17
17
|
Requires-Python: >=3.10
|
|
18
|
-
Requires-Dist: hyperbase>=0.
|
|
18
|
+
Requires-Dist: hyperbase>=0.10.0
|
|
19
19
|
Requires-Dist: pip
|
|
20
20
|
Requires-Dist: scikit-learn>=1.3.0
|
|
21
21
|
Requires-Dist: spacy>=3.8.0
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.3.0
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "hyperbase-parser-ab"
|
|
3
3
|
dynamic = ["version"]
|
|
4
|
-
description = "Semantic Hypergraph
|
|
4
|
+
description = "Semantic Hypergraph Alpha-Beta Parser"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
license = "MIT"
|
|
7
7
|
requires-python = ">=3.10"
|
|
@@ -26,7 +26,7 @@ classifiers = [
|
|
|
26
26
|
"Topic :: Scientific/Engineering :: Information Analysis",
|
|
27
27
|
]
|
|
28
28
|
dependencies = [
|
|
29
|
-
"hyperbase>=0.
|
|
29
|
+
"hyperbase>=0.10.0",
|
|
30
30
|
"scikit-learn>=1.3.0",
|
|
31
31
|
"spacy>=3.8.0",
|
|
32
32
|
"torch>=2.0.0",
|
|
@@ -4,16 +4,16 @@ from typing import Any, cast
|
|
|
4
4
|
|
|
5
5
|
import hyperbase.constants as const
|
|
6
6
|
import spacy
|
|
7
|
+
from hyperbase.builders import build_atom, hedge
|
|
7
8
|
from hyperbase.hyperedge import (
|
|
8
9
|
Atom,
|
|
9
10
|
Hyperedge,
|
|
10
11
|
UniqueAtom,
|
|
11
|
-
build_atom,
|
|
12
|
-
hedge,
|
|
13
12
|
non_unique,
|
|
14
13
|
unique,
|
|
15
14
|
)
|
|
16
15
|
from hyperbase.parsers import Parser, ParseResult
|
|
16
|
+
from hyperbase.parsers.utils import edge_depth_exceeds
|
|
17
17
|
from spacy.language import Language
|
|
18
18
|
from spacy.tokens import Doc, Span, Token
|
|
19
19
|
|
|
@@ -165,22 +165,67 @@ def _generate_tok_pos(atom2word: dict[Atom, tuple[str, int]], edge: Hyperedge) -
|
|
|
165
165
|
|
|
166
166
|
|
|
167
167
|
class AlphaBetaParser(Parser):
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
168
|
+
@classmethod
|
|
169
|
+
def accepted_params(cls) -> dict[str, dict[str, Any]]:
|
|
170
|
+
return {
|
|
171
|
+
**super().accepted_params(),
|
|
172
|
+
"lang": {
|
|
173
|
+
"type": str,
|
|
174
|
+
"default": None,
|
|
175
|
+
"description": "Language code (e.g. 'de', 'en', 'fr').",
|
|
176
|
+
"required": True,
|
|
177
|
+
},
|
|
178
|
+
"beta": {
|
|
179
|
+
"type": str,
|
|
180
|
+
"default": "repair",
|
|
181
|
+
"description": "Beta stage rules: 'strict' or 'repair'.",
|
|
182
|
+
"required": False,
|
|
183
|
+
},
|
|
184
|
+
"normalise": {
|
|
185
|
+
"type": bool,
|
|
186
|
+
"default": True,
|
|
187
|
+
"description": "Enable normalization of parsed edges.",
|
|
188
|
+
"required": False,
|
|
189
|
+
},
|
|
190
|
+
"post_process": {
|
|
191
|
+
"type": bool,
|
|
192
|
+
"default": True,
|
|
193
|
+
"description": "Enable post-processing of edges.",
|
|
194
|
+
"required": False,
|
|
195
|
+
},
|
|
196
|
+
"debug": {
|
|
197
|
+
"type": bool,
|
|
198
|
+
"default": False,
|
|
199
|
+
"description": "Enable debug message output.",
|
|
200
|
+
"required": False,
|
|
201
|
+
},
|
|
202
|
+
"lang_namespace": {
|
|
203
|
+
"type": bool,
|
|
204
|
+
"default": False,
|
|
205
|
+
"description": (
|
|
206
|
+
"Include the language code as a namespace in atoms "
|
|
207
|
+
"(e.g. 'apple/Cc/en' instead of 'apple/Cc')."
|
|
208
|
+
),
|
|
209
|
+
"required": False,
|
|
210
|
+
},
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
def __init__(self, params: dict[str, Any] | None = None) -> None:
|
|
214
|
+
super().__init__(params)
|
|
215
|
+
|
|
216
|
+
self.lang: str = self.params["lang"]
|
|
217
|
+
|
|
218
|
+
if self.lang not in SPACY_MODELS:
|
|
219
|
+
raise RuntimeError(f"Language code '{self.lang}' is not recognized.")
|
|
220
|
+
|
|
221
|
+
beta: str = self.params.get("beta", "repair")
|
|
222
|
+
normalise: bool = self.params.get("normalise", True)
|
|
223
|
+
post_process: bool = self.params.get("post_process", True)
|
|
224
|
+
debug: bool = self.params.get("debug", False)
|
|
225
|
+
lang_namespace: bool = self.params.get("lang_namespace", False)
|
|
226
|
+
self.atom_lang: str = self.lang if lang_namespace else ""
|
|
227
|
+
|
|
228
|
+
models: list[str] = SPACY_MODELS[self.lang]
|
|
184
229
|
|
|
185
230
|
self.nlp: Language | None = None
|
|
186
231
|
for model in models:
|
|
@@ -191,8 +236,8 @@ class AlphaBetaParser(Parser):
|
|
|
191
236
|
if self.nlp is None:
|
|
192
237
|
models_list: str = ", ".join(models)
|
|
193
238
|
raise RuntimeError(
|
|
194
|
-
f"Language '{lang}' requires one of the following
|
|
195
|
-
f"{models_list}."
|
|
239
|
+
f"Language '{self.lang}' requires one of the following "
|
|
240
|
+
f"language models:\n{models_list}."
|
|
196
241
|
)
|
|
197
242
|
|
|
198
243
|
self.alpha: Alpha = Alpha(use_atomizer=True)
|
|
@@ -202,7 +247,7 @@ class AlphaBetaParser(Parser):
|
|
|
202
247
|
elif beta == "repair":
|
|
203
248
|
self.rules = repair_rules
|
|
204
249
|
else:
|
|
205
|
-
raise RuntimeError(f"
|
|
250
|
+
raise RuntimeError(f"unknown beta stage: {beta}")
|
|
206
251
|
self.normalise: bool = normalise
|
|
207
252
|
self.post_process: bool = post_process
|
|
208
253
|
self.debug: bool = debug
|
|
@@ -224,6 +269,11 @@ class AlphaBetaParser(Parser):
|
|
|
224
269
|
if self.debug:
|
|
225
270
|
print(msg)
|
|
226
271
|
|
|
272
|
+
def install_repl(self, session: object) -> None:
|
|
273
|
+
from hyperbase_parser_ab.repl import install
|
|
274
|
+
|
|
275
|
+
install(self, session)
|
|
276
|
+
|
|
227
277
|
def parse_sentence(self, sentence: str) -> list[ParseResult]:
|
|
228
278
|
# This runs spacy own sentensizer anyway...
|
|
229
279
|
|
|
@@ -264,6 +314,16 @@ class AlphaBetaParser(Parser):
|
|
|
264
314
|
if result and len(result) == 1:
|
|
265
315
|
edge = non_unique(result[0])
|
|
266
316
|
|
|
317
|
+
# Reject pathologically deep parses before they reach the
|
|
318
|
+
# recursive transforms below (which would otherwise blow the
|
|
319
|
+
# Python stack on inputs with extreme nesting).
|
|
320
|
+
if edge is not None and edge_depth_exceeds(edge, self.max_depth):
|
|
321
|
+
self.debug_msg(
|
|
322
|
+
f"Rejecting parse: edge depth exceeds max_depth="
|
|
323
|
+
f"{self.max_depth} for sentence: {sent!s}"
|
|
324
|
+
)
|
|
325
|
+
return None
|
|
326
|
+
|
|
267
327
|
atom2word: dict[Atom, tuple[str, int]] = {}
|
|
268
328
|
if edge:
|
|
269
329
|
edge = self._apply_arg_roles(edge)
|
|
@@ -340,12 +400,12 @@ class AlphaBetaParser(Parser):
|
|
|
340
400
|
# subject
|
|
341
401
|
if dep in {"nsubj", "sb"}:
|
|
342
402
|
return "s"
|
|
343
|
-
# passive subject
|
|
403
|
+
# passive subject (becomes object)
|
|
344
404
|
elif dep in {"nsubjpass", "nsubj:pass"}:
|
|
345
|
-
return "
|
|
346
|
-
# agent
|
|
405
|
+
return "o"
|
|
406
|
+
# agent (becomes subject)
|
|
347
407
|
elif dep == "agent":
|
|
348
|
-
return "
|
|
408
|
+
return "s"
|
|
349
409
|
# object
|
|
350
410
|
elif dep in {
|
|
351
411
|
"obj",
|
|
@@ -361,17 +421,18 @@ class AlphaBetaParser(Parser):
|
|
|
361
421
|
}:
|
|
362
422
|
return "o"
|
|
363
423
|
# indirect object
|
|
364
|
-
elif dep in {"iobj", "dative", "obl:arg", "da"}
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
424
|
+
elif dep in {"iobj", "dative", "obl:arg", "da"} or dep in {
|
|
425
|
+
"advcl",
|
|
426
|
+
"prep",
|
|
427
|
+
"npadvmod",
|
|
428
|
+
"advmod",
|
|
429
|
+
"mo",
|
|
430
|
+
"mnr",
|
|
431
|
+
}:
|
|
368
432
|
return "x"
|
|
369
433
|
# parataxis
|
|
370
|
-
elif dep in {"parataxis", "par"}:
|
|
371
|
-
return "
|
|
372
|
-
# interjection
|
|
373
|
-
elif dep in {"intj", "ng", "dm"}:
|
|
374
|
-
return "j"
|
|
434
|
+
elif dep in {"parataxis", "par"} or dep in {"intj", "ng", "dm"}:
|
|
435
|
+
return "?"
|
|
375
436
|
# clausal complement
|
|
376
437
|
elif dep in {"xcomp", "ccomp", "oc"}:
|
|
377
438
|
return "r"
|
|
@@ -451,7 +512,7 @@ class AlphaBetaParser(Parser):
|
|
|
451
512
|
elif ent_type[0] == "M":
|
|
452
513
|
atom = self._build_atom_modifier(token)
|
|
453
514
|
else:
|
|
454
|
-
atom = build_atom(text, et, self.
|
|
515
|
+
atom = build_atom(text, et, self.atom_lang)
|
|
455
516
|
return atom
|
|
456
517
|
|
|
457
518
|
def _build_atom_predicate(
|
|
@@ -474,17 +535,25 @@ class AlphaBetaParser(Parser):
|
|
|
474
535
|
else:
|
|
475
536
|
ent_type = "Pd"
|
|
476
537
|
|
|
477
|
-
return build_atom(text, ent_type, self.
|
|
538
|
+
return build_atom(text, ent_type, self.atom_lang)
|
|
478
539
|
|
|
479
540
|
def _build_atom_trigger(self, token: Token, ent_type: str) -> Atom:
|
|
480
541
|
text: str = token.text.lower()
|
|
481
|
-
|
|
482
|
-
|
|
542
|
+
|
|
543
|
+
# indirect object
|
|
544
|
+
if token.dep_ in {"iobj", "dative", "obl:arg", "da"}:
|
|
545
|
+
et = "Ti"
|
|
546
|
+
elif _is_verb(token):
|
|
547
|
+
et = "Tv"
|
|
548
|
+
else:
|
|
549
|
+
et = ent_type
|
|
550
|
+
|
|
551
|
+
return build_atom(text, et, self.atom_lang)
|
|
483
552
|
|
|
484
553
|
def _build_atom_modifier(self, token: Token) -> Atom:
|
|
485
554
|
text: str = token.text.lower()
|
|
486
555
|
et: str = "Mv" if _is_verb(token) else _modifier_type_and_subtype(token)
|
|
487
|
-
return build_atom(text, et, self.
|
|
556
|
+
return build_atom(text, et, self.atom_lang)
|
|
488
557
|
|
|
489
558
|
def _repair(self, edge: Hyperedge) -> Hyperedge:
|
|
490
559
|
if edge.not_atom:
|
|
@@ -768,7 +837,7 @@ class AlphaBetaParser(Parser):
|
|
|
768
837
|
if len(sequence) < 2:
|
|
769
838
|
return sequence, False
|
|
770
839
|
|
|
771
|
-
def
|
|
840
|
+
def get_sentences(self, text: str) -> list[str]:
|
|
772
841
|
if self.nlp:
|
|
773
842
|
doc: Doc = self.nlp(text.strip())
|
|
774
843
|
return [str(sent).strip() for sent in doc.sents]
|
|
@@ -893,9 +962,34 @@ class AlphaBetaParser(Parser):
|
|
|
893
962
|
return self._replace_argroles(edge, _ars)
|
|
894
963
|
return edge
|
|
895
964
|
|
|
965
|
+
def _flatten_conjunctions(self, edge: Hyperedge) -> Hyperedge:
|
|
966
|
+
if edge.atom:
|
|
967
|
+
return edge
|
|
968
|
+
new_edge: Hyperedge = hedge(
|
|
969
|
+
[self._flatten_conjunctions(subedge) for subedge in edge]
|
|
970
|
+
)
|
|
971
|
+
if new_edge is None:
|
|
972
|
+
return edge
|
|
973
|
+
edge = new_edge
|
|
974
|
+
if edge[0].mt != "J":
|
|
975
|
+
return edge
|
|
976
|
+
connector: Hyperedge = edge[0]
|
|
977
|
+
flattened: list[Hyperedge] = [connector]
|
|
978
|
+
changed: bool = False
|
|
979
|
+
for subedge in edge[1:]:
|
|
980
|
+
if subedge.not_atom and len(subedge) >= 2 and subedge[0] == connector:
|
|
981
|
+
flattened.extend(list(subedge[1:]))
|
|
982
|
+
changed = True
|
|
983
|
+
else:
|
|
984
|
+
flattened.append(subedge)
|
|
985
|
+
if changed:
|
|
986
|
+
return hedge(flattened)
|
|
987
|
+
return edge
|
|
988
|
+
|
|
896
989
|
def _post_process(self, edge: Hyperedge | None) -> Hyperedge | None:
|
|
897
990
|
if edge is None:
|
|
898
991
|
return None
|
|
899
992
|
_edge: Hyperedge = self._fix_argroles(edge)
|
|
900
993
|
_edge = self._process_colon_conjunctions(_edge)
|
|
994
|
+
_edge = self._flatten_conjunctions(_edge)
|
|
901
995
|
return _edge
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
"""REPL integration for the AlphaBeta parser.
|
|
2
|
+
|
|
3
|
+
Adds a pre-result hook to the Hyperbase REPL that prints the spaCy
|
|
4
|
+
dependency parse tree for the current sentence. Imported lazily from
|
|
5
|
+
:meth:`AlphaBetaParser.install_repl` so that this module's only purpose
|
|
6
|
+
is keeping REPL-rendering code out of the parser core.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from typing import TYPE_CHECKING
|
|
12
|
+
|
|
13
|
+
from hyperbase.parsers.repl_api import PreResultHook, ReplContext
|
|
14
|
+
from rich import box
|
|
15
|
+
from rich.console import Console
|
|
16
|
+
from rich.panel import Panel
|
|
17
|
+
from rich.text import Text
|
|
18
|
+
from rich.tree import Tree
|
|
19
|
+
from spacy.tokens import Token
|
|
20
|
+
|
|
21
|
+
if TYPE_CHECKING:
|
|
22
|
+
from hyperbase_parser_ab.parser import AlphaBetaParser
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _build_dependency_tree(
|
|
26
|
+
token: Token,
|
|
27
|
+
visited: set[Token] | None = None,
|
|
28
|
+
) -> Tree | None:
|
|
29
|
+
"""Build a Rich tree representation of a spaCy dependency parse."""
|
|
30
|
+
if visited is None:
|
|
31
|
+
visited = set()
|
|
32
|
+
|
|
33
|
+
if token in visited:
|
|
34
|
+
return None
|
|
35
|
+
visited.add(token)
|
|
36
|
+
|
|
37
|
+
label = Text()
|
|
38
|
+
label.append(token.text, style="bold white")
|
|
39
|
+
label.append(" [", style="dim")
|
|
40
|
+
label.append(f"dep_={token.dep_}", style="cyan")
|
|
41
|
+
label.append(", ", style="dim")
|
|
42
|
+
label.append(f"tag_={token.pos_}", style="yellow")
|
|
43
|
+
label.append("]", style="dim")
|
|
44
|
+
|
|
45
|
+
tree = Tree(label)
|
|
46
|
+
|
|
47
|
+
for child in token.children:
|
|
48
|
+
child_tree = _build_dependency_tree(child, visited)
|
|
49
|
+
if child_tree:
|
|
50
|
+
tree.add(child_tree)
|
|
51
|
+
|
|
52
|
+
return tree
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _make_pre_result_hook(parser: AlphaBetaParser) -> PreResultHook:
|
|
56
|
+
"""Return a pre-result hook bound to *parser*'s spaCy doc."""
|
|
57
|
+
|
|
58
|
+
def hook(ctx: ReplContext) -> None:
|
|
59
|
+
doc = getattr(parser, "doc", None)
|
|
60
|
+
if doc is None:
|
|
61
|
+
return
|
|
62
|
+
console: Console = ctx.session.console
|
|
63
|
+
for sent in doc.sents:
|
|
64
|
+
dep_tree = _build_dependency_tree(sent.root)
|
|
65
|
+
if dep_tree is None:
|
|
66
|
+
continue
|
|
67
|
+
console.print()
|
|
68
|
+
console.print(
|
|
69
|
+
Panel(
|
|
70
|
+
dep_tree,
|
|
71
|
+
title="[bold cyan]Dependency Parse Tree[/bold cyan]",
|
|
72
|
+
border_style="cyan",
|
|
73
|
+
box=box.ROUNDED,
|
|
74
|
+
)
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
return hook
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def install(parser: AlphaBetaParser, session: object) -> None:
|
|
81
|
+
"""Register AlphaBeta-specific REPL behavior on *session*."""
|
|
82
|
+
session.register_pre_result_hook(_make_pre_result_hook(parser)) # type: ignore[attr-defined]
|
|
@@ -3,7 +3,8 @@
|
|
|
3
3
|
from unittest.mock import MagicMock, patch
|
|
4
4
|
|
|
5
5
|
import pytest
|
|
6
|
-
from hyperbase
|
|
6
|
+
from hyperbase import hedge
|
|
7
|
+
from hyperbase.hyperedge import UniqueAtom
|
|
7
8
|
|
|
8
9
|
from hyperbase_parser_ab.parser import AlphaBetaParser
|
|
9
10
|
|
|
@@ -11,7 +12,7 @@ from hyperbase_parser_ab.parser import AlphaBetaParser
|
|
|
11
12
|
class TestParserInitErrors:
|
|
12
13
|
def test_unsupported_language_raises(self):
|
|
13
14
|
with pytest.raises(RuntimeError, match="not recognized"):
|
|
14
|
-
AlphaBetaParser("xx")
|
|
15
|
+
AlphaBetaParser({"lang": "xx"})
|
|
15
16
|
|
|
16
17
|
def test_unknown_beta_stage_raises(self):
|
|
17
18
|
with (
|
|
@@ -21,9 +22,9 @@ class TestParserInitErrors:
|
|
|
21
22
|
patch("spacy.util.is_package", return_value=True),
|
|
22
23
|
patch("spacy.load", return_value=MagicMock()),
|
|
23
24
|
patch("hyperbase_parser_ab.parser.Alpha"),
|
|
24
|
-
pytest.raises(RuntimeError, match="
|
|
25
|
+
pytest.raises(RuntimeError, match="unknown beta stage"),
|
|
25
26
|
):
|
|
26
|
-
AlphaBetaParser("en", beta
|
|
27
|
+
AlphaBetaParser({"lang": "en", "beta": "invalid"})
|
|
27
28
|
|
|
28
29
|
def test_no_spacy_model_installed_raises(self):
|
|
29
30
|
with (
|
|
@@ -34,7 +35,7 @@ class TestParserInitErrors:
|
|
|
34
35
|
patch("hyperbase_parser_ab.parser.Alpha"),
|
|
35
36
|
pytest.raises(RuntimeError, match="requires one of the following"),
|
|
36
37
|
):
|
|
37
|
-
AlphaBetaParser("en")
|
|
38
|
+
AlphaBetaParser({"lang": "en"})
|
|
38
39
|
|
|
39
40
|
|
|
40
41
|
def _make_parser(beta="repair"):
|
|
@@ -46,7 +47,13 @@ def _make_parser(beta="repair"):
|
|
|
46
47
|
patch("hyperbase_parser_ab.parser.Alpha"),
|
|
47
48
|
):
|
|
48
49
|
parser = AlphaBetaParser(
|
|
49
|
-
|
|
50
|
+
{
|
|
51
|
+
"lang": "en",
|
|
52
|
+
"beta": beta,
|
|
53
|
+
"normalise": True,
|
|
54
|
+
"post_process": True,
|
|
55
|
+
"debug": False,
|
|
56
|
+
}
|
|
50
57
|
)
|
|
51
58
|
return parser
|
|
52
59
|
|
|
@@ -144,7 +151,7 @@ class TestParserRelationArgRole:
|
|
|
144
151
|
parser.atom2token = {uatom: token}
|
|
145
152
|
parser.orig_atom = {uatom: uatom}
|
|
146
153
|
parser.depths = {uatom: 1}
|
|
147
|
-
assert parser._relation_arg_role(edge) == "
|
|
154
|
+
assert parser._relation_arg_role(edge) == "o"
|
|
148
155
|
|
|
149
156
|
def test_indirect_object(self):
|
|
150
157
|
parser = _make_parser()
|
|
@@ -155,7 +162,7 @@ class TestParserRelationArgRole:
|
|
|
155
162
|
parser.atom2token = {uatom: token}
|
|
156
163
|
parser.orig_atom = {uatom: uatom}
|
|
157
164
|
parser.depths = {uatom: 1}
|
|
158
|
-
assert parser._relation_arg_role(edge) == "
|
|
165
|
+
assert parser._relation_arg_role(edge) == "x"
|
|
159
166
|
|
|
160
167
|
def test_specifier(self):
|
|
161
168
|
parser = _make_parser()
|
|
@@ -244,6 +251,76 @@ class TestParserDebug:
|
|
|
244
251
|
assert capsys.readouterr().out == ""
|
|
245
252
|
|
|
246
253
|
|
|
254
|
+
class TestParserFlattenConjunctions:
|
|
255
|
+
def test_flatten_atom_unchanged(self):
|
|
256
|
+
parser = _make_parser()
|
|
257
|
+
atom = hedge("red/Ca/en")
|
|
258
|
+
assert parser._flatten_conjunctions(atom) == atom
|
|
259
|
+
|
|
260
|
+
def test_flatten_no_conjunction_unchanged(self):
|
|
261
|
+
parser = _make_parser()
|
|
262
|
+
edge = hedge("(runs/Pd/en cat/Cc/en dog/Cc/en)")
|
|
263
|
+
assert parser._flatten_conjunctions(edge) == edge
|
|
264
|
+
|
|
265
|
+
def test_flatten_simple_conjunction_unchanged(self):
|
|
266
|
+
"""A flat conjunction with no nested conjunctions stays the same."""
|
|
267
|
+
parser = _make_parser()
|
|
268
|
+
edge = hedge("(,/J red/Ca/en green/Ca/en blue/Ca/en)")
|
|
269
|
+
assert parser._flatten_conjunctions(edge) == edge
|
|
270
|
+
|
|
271
|
+
def test_flatten_nested_same_connector(self):
|
|
272
|
+
"""(,/J red (,/J green blue)) → (,/J red green blue)"""
|
|
273
|
+
parser = _make_parser()
|
|
274
|
+
edge = hedge("(,/J red/Ca/en (,/J green/Ca/en blue/Ca/en))")
|
|
275
|
+
expected = hedge("(,/J red/Ca/en green/Ca/en blue/Ca/en)")
|
|
276
|
+
assert parser._flatten_conjunctions(edge) == expected
|
|
277
|
+
|
|
278
|
+
def test_flatten_nested_different_connector_unchanged(self):
|
|
279
|
+
"""Nested conjunction with a different connector should NOT be flattened."""
|
|
280
|
+
parser = _make_parser()
|
|
281
|
+
edge = hedge("(,/J red/Ca/en (and/J/en green/Ca/en blue/Ca/en))")
|
|
282
|
+
assert parser._flatten_conjunctions(edge) == edge
|
|
283
|
+
|
|
284
|
+
def test_flatten_recursive_bottom_up(self):
|
|
285
|
+
"""Multiple levels of nesting should all collapse."""
|
|
286
|
+
parser = _make_parser()
|
|
287
|
+
edge = hedge("(,/J red/Ca/en (,/J green/Ca/en (,/J blue/Ca/en yellow/Ca/en)))")
|
|
288
|
+
expected = hedge("(,/J red/Ca/en green/Ca/en blue/Ca/en yellow/Ca/en)")
|
|
289
|
+
assert parser._flatten_conjunctions(edge) == expected
|
|
290
|
+
|
|
291
|
+
def test_flatten_multiple_nested_conjunctions(self):
|
|
292
|
+
"""(,/J (,/J a b) (,/J c d)) → (,/J a b c d)"""
|
|
293
|
+
parser = _make_parser()
|
|
294
|
+
edge = hedge("(,/J (,/J a/Ca/en b/Ca/en) (,/J c/Ca/en d/Ca/en))")
|
|
295
|
+
expected = hedge("(,/J a/Ca/en b/Ca/en c/Ca/en d/Ca/en)")
|
|
296
|
+
assert parser._flatten_conjunctions(edge) == expected
|
|
297
|
+
|
|
298
|
+
def test_flatten_inside_outer_edge(self):
|
|
299
|
+
"""A nested conjunction inside a non-conjunction outer edge is still
|
|
300
|
+
flattened bottom-up."""
|
|
301
|
+
parser = _make_parser()
|
|
302
|
+
edge = hedge(
|
|
303
|
+
"(runs/Pd/en cat/Cc/en (,/J red/Ca/en (,/J green/Ca/en blue/Ca/en)))"
|
|
304
|
+
)
|
|
305
|
+
expected = hedge(
|
|
306
|
+
"(runs/Pd/en cat/Cc/en (,/J red/Ca/en green/Ca/en blue/Ca/en))"
|
|
307
|
+
)
|
|
308
|
+
assert parser._flatten_conjunctions(edge) == expected
|
|
309
|
+
|
|
310
|
+
def test_flatten_mixed_connectors_partial(self):
|
|
311
|
+
"""Only the matching nested conjunctions should be flattened."""
|
|
312
|
+
parser = _make_parser()
|
|
313
|
+
edge = hedge(
|
|
314
|
+
"(,/J red/Ca/en (,/J green/Ca/en blue/Ca/en)"
|
|
315
|
+
" (and/J/en yellow/Ca/en purple/Ca/en))"
|
|
316
|
+
)
|
|
317
|
+
expected = hedge(
|
|
318
|
+
"(,/J red/Ca/en green/Ca/en blue/Ca/en"
|
|
319
|
+
" (and/J/en yellow/Ca/en purple/Ca/en))"
|
|
320
|
+
)
|
|
321
|
+
assert parser._flatten_conjunctions(edge) == expected
|
|
322
|
+
|
|
323
|
+
|
|
247
324
|
class TestParserReset:
|
|
248
325
|
def test_reset_clears_state(self):
|
|
249
326
|
parser = _make_parser()
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
0.2.0
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{hyperbase_parser_ab-0.2.0 → hyperbase_parser_ab-0.3.0}/scripts/generate_alpha_training_data.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{hyperbase_parser_ab-0.2.0 → hyperbase_parser_ab-0.3.0}/src/hyperbase_parser_ab/lang_models.py
RENAMED
|
File without changes
|
{hyperbase_parser_ab-0.2.0 → hyperbase_parser_ab-0.3.0}/src/hyperbase_parser_ab/sentensizer.py
RENAMED
|
File without changes
|
|
File without changes
|