hyperbase-parser-ab 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
+ from hyperbase_parser_ab.parser import AlphaBetaParser
2
+
3
+ __all__ = ["AlphaBetaParser"]
@@ -0,0 +1,69 @@
1
+ import numpy as np
2
+ from numpy.typing import NDArray
3
+ from scipy.sparse import spmatrix
4
+ from sklearn.ensemble import RandomForestClassifier
5
+ from sklearn.preprocessing import OneHotEncoder
6
+ from spacy.tokens import Span
7
+
8
+ from hyperbase_parser_ab.atomizer import Atomizer
9
+
10
+
11
+ class Alpha(object):
12
+ def __init__(self, cases_str: str | None = None, use_atomizer: bool = False) -> None:
13
+ if use_atomizer:
14
+ self.atomizer: Atomizer | None = Atomizer()
15
+ elif cases_str:
16
+ self.atomizer = None
17
+
18
+ X: list[tuple[str, str, str, str, str]] = []
19
+ y: list[list[str]] = []
20
+
21
+ for line in cases_str.strip().split('\n'):
22
+ sline: str = line.strip()
23
+ if len(sline) > 0:
24
+ row: list[str] = sline.strip().split('\t')
25
+ true_value: str = row[0]
26
+ tag: str = row[3]
27
+ dep: str = row[4]
28
+ hpos: str = row[6]
29
+ hdep: str = row[8]
30
+ pos_after: str = row[19]
31
+
32
+ y.append([true_value])
33
+ X.append((tag, dep, hpos, hdep, pos_after))
34
+
35
+ if len(y) > 0:
36
+ self.empty: bool = False
37
+
38
+ self.encX: OneHotEncoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
39
+ self.encX.fit(np.array(X))
40
+ self.ency: OneHotEncoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
41
+ self.ency.fit(np.array(y))
42
+
43
+ X_: NDArray | spmatrix = self.encX.transform(np.array(X))
44
+ y_: NDArray | spmatrix = self.ency.transform(np.array(y))
45
+
46
+ self.clf: RandomForestClassifier = RandomForestClassifier(random_state=777)
47
+ self.clf.fit(X_, y_)
48
+ else:
49
+ self.empty = True
50
+
51
+ def predict(self, sentence: Span, features: list[tuple[str, str, str, str, str]]) -> tuple[str, ...] | list[str]:
52
+ if self.atomizer:
53
+ preds: list[tuple[str, str]] = self.atomizer.atomize(
54
+ sentence=str(sentence),
55
+ tokens=[str(token) for token in sentence])
56
+ atom_types: list[str] = [pred[1] for pred in preds]
57
+
58
+ # force known cases
59
+ for i in range(len(atom_types)):
60
+ if sentence[i].pos_ == 'VERB':
61
+ atom_types[i] = 'P'
62
+ return atom_types
63
+ else:
64
+ # an empty classifier always predicts 'C'
65
+ if self.empty:
66
+ return tuple('C' for _ in range(len(features)))
67
+ _features: NDArray | spmatrix = self.encX.transform(np.array(features))
68
+ preds_arr: NDArray | spmatrix = self.ency.inverse_transform(self.clf.predict(_features))
69
+ return tuple(pred[0] if pred else 'C' for pred in preds_arr)
@@ -0,0 +1,142 @@
1
+ from collections import Counter
2
+
3
+ import torch
4
+ from transformers import AutoTokenizer, AutoModelForTokenClassification, PreTrainedTokenizerBase, PreTrainedModel
5
+
6
+
7
# Default Hugging Face repository of the pretrained atom-type token classifier.
HF_REPO: str = "hyperquest/atom-classifier"
8
+
9
+
10
class Atomizer:
    """Word-level atom-type tagger backed by a Hugging Face
    token-classification model (default repo: ``HF_REPO``).

    ``atomize()`` returns one ``(word_text, label)`` pair per word,
    optionally aligned to a caller-supplied tokenization.
    """

    def __init__(self, model_path: str | None = None) -> None:
        # model_path may be a local path or a hub id; fall back to HF_REPO
        model_id: str = model_path or HF_REPO
        self.model_path: str = model_id
        # use_fast=True is required: word_ids()/offset mappings used below
        # are only available on fast tokenizers
        self.tokenizer: PreTrainedTokenizerBase = AutoTokenizer.from_pretrained(model_id, use_fast=True)
        self.model: PreTrainedModel = AutoModelForTokenClassification.from_pretrained(model_id)
        assert self.model.config.id2label
        # class-id -> label string (the atom types)
        self.id2label: dict[int, str] = self.model.config.id2label

    def atomize(self,
                sentence: str,
                tokens: list[str] | None = None
                ) -> list[tuple[str, str]]:
        """Label every word of *sentence* with a predicted atom type.

        Args:
            sentence: raw sentence text.
            tokens: optional external tokenization to align predictions to;
                when given, exactly one pair per entry is returned.

        Returns:
            List of (word_text, label) pairs.
        """
        # Tokenize the raw sentence and request offsets
        encoded = self.tokenizer(
            sentence,
            return_tensors="pt",
            truncation=True,
            return_offsets_mapping=True
        )

        offset_mapping = encoded.pop("offset_mapping")  # remove so model doesn't see it
        word_ids: list[int | None] = encoded.word_ids(0)

        with torch.no_grad():
            outputs = self.model(**encoded)

        # per-subword argmax label ids for the single sentence (batch idx 0)
        pred_ids: list[int] = outputs.logits.argmax(-1)[0].tolist()
        offset_mapping = offset_mapping[0].tolist()

        if tokens is not None:
            # Map provided tokens to model predictions based on character offsets
            return self._map_tokens_to_predictions(sentence, tokens, word_ids, pred_ids, offset_mapping)

        # No external tokenization: merge subwords into words; each word is
        # labelled with the label of its first subword.
        predicted_labels: list[tuple[str, str]] = []
        current_word_id: int | None = None
        current_start: int | None = None
        current_end: int = -1
        current_label: str | None = None

        for idx, word_id in enumerate(word_ids):
            if word_id is None:
                continue  # skip CLS, SEP, etc.

            start: int
            end: int
            start, end = offset_mapping[idx]
            label_id: int = pred_ids[idx]
            label: str = self.id2label[label_id]

            if word_id != current_word_id:
                # flush previous word
                if current_label is not None:
                    word_text: str = sentence[current_start:current_end]
                    predicted_labels.append((word_text, current_label))

                # start new word
                current_word_id = word_id
                current_start = start
                current_end = end
                current_label = label
            else:
                # same word, extend its span
                current_end = max(current_end, end)

        # flush last word
        if current_label is not None:
            word_text = sentence[current_start:current_end]
            predicted_labels.append((word_text, current_label))

        return predicted_labels

    def _map_tokens_to_predictions(self,
                                   sentence: str,
                                   tokens: list[str],
                                   word_ids: list[int | None],
                                   pred_ids: list[int],
                                   offset_mapping: list[list[int]]
                                   ) -> list[tuple[str, str]]:
        """
        Maps provided tokens to model predictions by finding character offsets
        and assigning the most appropriate label based on overlapping model tokens.

        Tokens that cannot be located in *sentence*, or that overlap no model
        subword, get the default label 'C'.
        """
        # Find character positions of each provided token in the sentence
        token_positions: list[tuple[int, int] | None] = []
        search_start: int = 0

        for token in tokens:
            # left-to-right search keeps repeated tokens aligned in order
            pos: int = sentence.find(token, search_start)
            if pos == -1:
                # Token not found - skip or use fallback
                token_positions.append(None)
            else:
                token_positions.append((pos, pos + len(token)))
                search_start = pos + len(token)

        # For each provided token, collect overlapping model predictions
        result: list[tuple[str, str]] = []
        for token, positions in zip(tokens, token_positions):
            if positions is None:
                # Token not found in sentence - assign default label
                result.append((token, 'C'))
                continue

            token_start: int
            token_end: int
            token_start, token_end = positions

            # Collect all labels from model tokens that overlap with this token
            overlapping_labels: list[str] = []
            for idx, word_id in enumerate(word_ids):
                if word_id is None:
                    continue

                model_start: int
                model_end: int
                model_start, model_end = offset_mapping[idx]

                # Check if model token overlaps with provided token
                if model_start < token_end and model_end > token_start:
                    label: str = self.id2label[pred_ids[idx]]
                    overlapping_labels.append(label)

            # Assign the most common label, or first label if tie
            if overlapping_labels:
                # Use most common label
                most_common_label: str = Counter(overlapping_labels).most_common(1)[0][0]
                result.append((token, most_common_label))
            else:
                # No overlap found - use default
                result.append((token, 'C'))

        return result
@@ -0,0 +1,50 @@
1
def get_spacy_models(lang: str) -> list[str]:
    """Return candidate spaCy pipeline names for *lang*, best first
    (transformer, then large/medium/small).

    Args:
        lang: two-letter language code ('de', 'en', 'es', 'fr', 'it',
            'pt', 'zh').

    Returns:
        Ordered list of model package names, or an empty list for
        unsupported language codes.
    """
    models: dict[str, list[str]] = {
        'de': ['de_dep_news_trf', 'de_core_news_lg', 'de_core_news_md', 'de_core_news_sm'],
        'en': ['en_core_web_trf', 'en_core_web_lg', 'en_core_web_md', 'en_core_web_sm'],
        'es': ['es_dep_news_trf', 'es_core_news_lg', 'es_core_news_md', 'es_core_news_sm'],
        'fr': ['fr_dep_news_trf', 'fr_core_news_lg', 'fr_core_news_md', 'fr_core_news_sm'],
        'it': ['it_core_news_lg', 'it_core_news_md', 'it_core_news_sm'],
        'pt': ['pt_core_news_lg', 'pt_core_news_md', 'pt_core_news_sm'],
        # spaCy's Chinese pipelines are named zh_core_web_* (OntoNotes
        # "web" corpus), not zh_core_news_*; the original names could
        # never be found by spacy.util.is_package.
        'zh': ['zh_core_web_trf', 'zh_core_web_lg', 'zh_core_web_md', 'zh_core_web_sm'],
    }
    # return a fresh list so callers may mutate the result safely
    return list(models.get(lang, []))
@@ -0,0 +1,835 @@
1
+ import re
2
+ import traceback
3
+ from typing import Any, cast
4
+
5
+ import spacy
6
+ from spacy.language import Language
7
+ from spacy.tokens import Doc, Span, Token
8
+
9
+ import hyperbase.constants as const
10
+ from hyperbase.hyperedge import Atom, Hyperedge, build_atom, hedge, non_unique, unique, UniqueAtom
11
+ from hyperbase.parsers import Parser
12
+
13
+ from hyperbase_parser_ab.alpha import Alpha
14
+ from hyperbase_parser_ab.lang_models import get_spacy_models
15
+ from hyperbase_parser_ab.rules import apply_rule, Rule, strict_rules, repair_rules
16
+
17
+
18
def _edge2txt_parts(edge: Hyperedge, parse: dict[str, Any]) -> list[tuple[str, str, int]]:
    """Collect a (text, text, token-index) triple for every atom of *edge*
    that is linked to a spaCy token in the parse state."""
    atom2token = parse['atom2token']
    parts: list[tuple[str, str, int]] = []
    for atom in edge.all_atoms():
        uatom = UniqueAtom(atom)
        if uatom in atom2token:
            token = atom2token[uatom]
            # the text appears twice to keep the (txt, display_txt, pos) shape
            parts.append((token.text, token.text, token.i))
    return parts
24
+
25
+
26
def _edge2text(edge: Hyperedge, parse: dict[str, Any]) -> str:
    """Reconstruct the surface text covered by *edge*, recovering the
    original separators between consecutive tokens from the sentence."""
    # possessive constructions get special rendering
    if edge.not_atom and str(edge[0]) == const.possessive_builder:
        return _poss2text(edge, parse)

    parts = sorted(_edge2txt_parts(edge, parse), key=lambda part: part[2])

    sentence: str = str(parse['spacy_sentence'])
    pieces: list[str] = []
    prev_txt: str | None = None
    for txt, _txt, _ in parts:
        if prev_txt is not None:
            # shortest stretch of the sentence between the two token texts
            match: re.Match[str] | None = re.search(
                r'{}(.*?){}'.format(re.escape(prev_txt), re.escape(txt)), sentence)
            sep: str = match.group(1) if match else ' '
            # a separator containing letters/digits means the match spanned
            # unrelated words; fall back to a single space
            if any(ch.isalnum() for ch in sep):
                sep = ' '
            pieces.append(sep)
        pieces.append(_txt)
        prev_txt = txt
    return ''.join(pieces)
49
+
50
+
51
def _concept_type_and_subtype(token: Token) -> str:
    """Concept ('C') subtype derived from a token's dependency and POS."""
    # nominal modifiers win regardless of POS
    if token.dep_ == 'nmod':
        return 'Cm'
    subtype_by_pos: dict[str, str] = {
        'ADJ': 'Ca',
        'NOUN': 'Cc',
        'PROPN': 'Cp',
        'NUM': 'C#',
        'DET': 'Cd',
        'PRON': 'Ci',
    }
    return subtype_by_pos.get(token.pos_, 'C')
70
+
71
+
72
def _modifier_type_and_subtype(token: Token) -> str:
    """Modifier ('M') subtype from dependency and POS.

    The checks are ordered by priority: dependency-based subtypes first,
    then POS-based ones, with the particle check wedged between AUX and
    PART exactly as before.
    """
    dep: str = token.dep_
    pos: str = token.pos_
    # (condition, subtype) pairs, evaluated in priority order
    rules: list[tuple[bool, str]] = [
        (dep in {'neg', 'nk'}, 'Mn'),         # negation
        (dep in {'poss', 'pg', 'ag'}, 'Mp'),  # possessive
        (dep == 'prep', 'Mt'),                # preposition
        (dep == 'preconj', 'Mj'),             # conjunctional
        (pos == 'ADJ', 'Ma'),
        (pos == 'DET', 'Md'),
        (pos == 'NUM', 'M#'),
        (pos == 'AUX', 'Mm'),                 # modal
        (dep == 'prt', 'Ml'),                 # particle
        (pos == 'PART', 'Mi'),                # infinitive
    ]
    for matched, subtype in rules:
        if matched:
            return subtype
    # adverbs and anything else: quintessential modifier, no subtype needed
    return 'M'
99
+
100
+
101
def _builder_type_and_subtype(token: Token) -> str:
    """Builder ('B') subtype from dependency and POS."""
    if token.dep_ in {'case', 'pg', 'ag'}:
        return 'Bp'  # possessive
    if token.pos_ == 'ADP':
        return 'Br'  # relational (preposition)
    if token.pos_ == 'DET':
        return 'Bd'
    return 'B'
113
+
114
+
115
def _predicate_type_and_subtype(token: Token) -> str:
    """Predicate ('P') subtype: 'Pd' (declarative) for clause-level
    dependencies and verbs, plain 'P' for embedded/relative clauses and
    everything else."""
    declarative_deps = {'advcl', 'csubj', 'csubjpass', 'parataxis'}
    embedded_deps = {'relcl', 'ccomp', 'acl', 'pcomp', 'xcomp', 'rc'}
    dep: str = token.dep_
    if dep in declarative_deps:
        return 'Pd'
    if dep in embedded_deps:
        return 'P'
    return 'Pd' if _is_verb(token) else 'P'
125
+
126
+
127
def _predicate_post_type_and_subtype(edge: Hyperedge, subparts: list[str], args_string: str) -> str:
    """Keep the predicate's existing type/subtype unchanged.

    *edge* and *args_string* are accepted for future use (e.g. imperative
    detection) but are currently ignored.
    """
    head_subpart: str = subparts[0]
    return head_subpart
129
+
130
+
131
def _is_verb(token: Token) -> bool:
    """True when the spaCy token is POS-tagged as a verb."""
    pos: str = token.pos_
    return pos == 'VERB'
133
+
134
+
135
def _poss2text(edge: Hyperedge, parse: dict[str, Any]) -> str:
    """Render a possessive edge as ``<owner>'s <owned>`` (just an
    apostrophe when the owner text already ends in 's').

    Args:
        edge: a possessive-builder edge; edge[1] is the owner, edge[2]
            the owned part.
        parse: parse-state dict passed through to _edge2text.
    """
    part1: str = _edge2text(edge[1], parse).strip()
    part2: str = _edge2text(edge[2], parse)
    # endswith() also handles an empty owner text, on which the original
    # part1[-1] indexing raised IndexError.
    poss: str = "'" if part1.endswith('s') else "'s"
    return f'{part1}{poss} {part2}'
143
+
144
+
145
def _generate_tok_pos(atom2word: dict[Atom, tuple[str, int]], edge: Hyperedge) -> str:
    """Serialize the token position(s) backing *edge*: a single index for
    an atom ('-1' when unknown), or a parenthesized, space-separated list
    for a compound edge (recursively)."""
    if not edge.atom:
        inner = ' '.join(_generate_tok_pos(atom2word, subedge) for subedge in edge)
        return '({})'.format(inner)
    uatom: Atom | None = cast(Atom, unique(edge))
    if uatom is None or uatom not in atom2word:
        return '-1'
    return str(atom2word[uatom][1])
154
+
155
+
156
+ class AlphaBetaParser(Parser):
157
+ def __init__(self, lang: str, beta: str = 'repair', normalize: bool = True,
158
+ post_process: bool = True, debug: bool = False) -> None:
159
+ super().__init__()
160
+
161
+ self.lang: str = lang
162
+
163
+ models: list[str] = get_spacy_models(lang)
164
+
165
+ if len(models) == 0:
166
+ raise RuntimeError(f"Language code '{lang}' is not recognized / language is nor supported.")
167
+
168
+ self.nlp: Language | None = None
169
+ for model in models:
170
+ if spacy.util.is_package(model):
171
+ self.nlp = spacy.load(model)
172
+ print('Using language model: {}'.format(model))
173
+ break
174
+ if self.nlp is None:
175
+ models_list: str = ", ".join(models)
176
+ raise RuntimeError(f"Language '{lang}' requires one of the following language models to be installed:\n"
177
+ f"{models_list}.")
178
+
179
+ self.alpha: Alpha = Alpha(use_atomizer=True)
180
+
181
+ if beta == 'strict':
182
+ self.rules: list[Rule] = strict_rules
183
+ elif beta == 'repair':
184
+ self.rules = repair_rules
185
+ else:
186
+ raise RuntimeError('unkown beta stage: {}'.format(beta))
187
+ self.normalize: bool = normalize
188
+ self.post_process: bool = post_process
189
+ self.debug: bool = debug
190
+
191
+ self.atom2token: dict[Atom, Token] = {}
192
+ self.temp_atoms: set[Atom] = set()
193
+ self.orig_atom: dict[Atom, UniqueAtom] = {}
194
+ self.token2atom: dict[Token, Atom] = {}
195
+ self.depths: dict[Atom, int] = {}
196
+ self.connections: set[tuple[Atom, Atom]] = set()
197
+ self.edge2text: dict[Hyperedge, str] = {}
198
+ self.edge2toks: dict[Hyperedge, tuple[Token, ...]] = {}
199
+ self.toks2edge: dict[tuple[Token, ...], Hyperedge] = {}
200
+ self.cur_text: str | None = None
201
+ self.doc: Doc | None = None
202
+ self.beta: str = beta
203
+
204
+ def debug_msg(self, msg: str) -> None:
205
+ if self.debug:
206
+ print(msg)
207
+
208
+ def parse_sentence(self, sentence: str) -> list[dict[str, object]]:
209
+ # This runs spacy own sentensizer anyway...
210
+
211
+ sentence = re.sub(r'\s+', ' ', sentence).strip()
212
+
213
+ if self.nlp:
214
+ self.reset(sentence)
215
+ parses: list[dict[str, object]] = []
216
+ try:
217
+ self.doc = self.nlp(sentence)
218
+ offset: int = 0
219
+ for sent in self.doc.sents:
220
+ parse: dict[str, object] | None = self.parse_spacy_sentence(sent, offset=offset)
221
+ if parse:
222
+ parses.append(parse)
223
+ offset += len(sent)
224
+ except RuntimeError as error:
225
+ print(error)
226
+ return parses
227
+ else:
228
+ raise RuntimeError("spaCy model failed to initialize.")
229
+
230
    def parse_spacy_sentence(self, sent: Span, atom_sequence: list[Atom] | None = None,
                             offset: int = 0) -> dict[str, object] | None:
        """Parse one spaCy sentence into a hyperedge.

        Args:
            sent: the spaCy sentence span.
            atom_sequence: optional precomputed atom sequence (e.g. from
                manual_atom_sequence); built by the alpha stage when None.
            offset: token-index offset of *sent* within the full document.

        Returns:
            Dict with 'edge', 'failed', 'text', 'tokens' and 'tok_pos'
            keys, or None when parsing failed (any exception is caught,
            reported and swallowed).
        """
        try:
            if atom_sequence is None:
                atom_sequence = self._build_atom_sequence(sent)

            # dependency depths/adjacency feed the rule-scoring functions
            self._compute_depths_and_connections(sent.root)

            edge: Hyperedge | None = None
            result: list[Hyperedge] | None
            failed: bool
            result, failed = self._parse_atom_sequence(atom_sequence)
            # success only when the whole sentence reduced to a single edge
            if result and len(result) == 1:
                edge = non_unique(result[0])

            atom2word: dict[Atom, tuple[str, int]] = {}
            if edge:
                # beta / cleanup pipeline, in fixed order
                edge = self._apply_arg_roles(edge)
                if self.beta == 'repair':
                    edge = self._repair(edge)
                if self.normalize:
                    edge = self._normalize(edge)
                if self.post_process:
                    edge = self._post_process(edge)
                if edge is not None:
                    atom2word = self._generate_atom2word(edge, offset=offset)

            if edge is None:
                return None

            return {
                'edge': edge,
                'failed': failed,
                'text': str(sent).strip(),
                'tokens': [str(token) for token in sent],
                'tok_pos': _generate_tok_pos(atom2word, edge)
            }
        except Exception as e:
            print('Caught exception: {} while parsing: "{}"'.format(str(e), str(sent)))
            traceback.print_exc()
            return None
271
+
272
+ def manual_atom_sequence(self, sentence: Span, token2atom: dict[Token, Atom]) -> list[Atom]:
273
+ self.token2atom = {}
274
+
275
+ atomseq: list[Atom] = []
276
+ for token in sentence:
277
+ if token in token2atom:
278
+ atom: Atom | None = token2atom[token]
279
+ else:
280
+ atom = None
281
+ if atom:
282
+ uatom: Atom = UniqueAtom(atom)
283
+ self.dep_[uatom] = token
284
+ self.token2atom[token] = uatom
285
+ self.orig_atom[uatom] = uatom
286
+ atomseq.append(uatom)
287
+ return atomseq
288
+
289
+ def reset(self, text: str) -> None:
290
+ self.dep_: dict[Atom, Token] = {}
291
+ self.temp_atoms = set()
292
+ self.orig_atom = {}
293
+ self.edge2toks = {}
294
+ self.toks2edge = {}
295
+ self.edge2coref: dict[Hyperedge, object] = {}
296
+ self.resolved_corefs: set[object] = set()
297
+ self.cur_text = text
298
+
299
+ def _builder_arg_roles(self, edge: Hyperedge) -> str:
300
+ depth1: int = self._dep_depth(edge[1])
301
+ depth2: int = self._dep_depth(edge[2])
302
+ if depth1 < depth2:
303
+ return 'ma'
304
+ elif depth1 > depth2:
305
+ return 'am'
306
+ else:
307
+ return 'mm'
308
+
309
+
310
+ def _relation_arg_role(self, edge: Hyperedge) -> str:
311
+ head_token: Token | None = self._head_token(edge)
312
+ if not head_token:
313
+ return '?'
314
+ dep: str = head_token.dep_
315
+
316
+ # subject
317
+ if dep in {'nsubj', 'sb'}:
318
+ return 's'
319
+ # passive subject
320
+ elif dep in {'nsubjpass', 'nsubj:pass'}:
321
+ return 'p'
322
+ # agent
323
+ elif dep == 'agent':
324
+ return 'a'
325
+ # object
326
+ elif dep in {'obj', 'dobj', 'pobj', 'prt', 'oprd', 'acomp', 'attr', 'ROOT', 'oa', 'pd'}:
327
+ return 'o'
328
+ # indirect object
329
+ elif dep in {'iobj', 'dative', 'obl:arg', 'da'}:
330
+ return 'i'
331
+ # specifier
332
+ elif dep in {'advcl', 'prep', 'npadvmod', 'advmod', 'mo', 'mnr'}:
333
+ return 'x'
334
+ # parataxis
335
+ elif dep in {'parataxis', 'par'}:
336
+ return 't'
337
+ # interjection
338
+ elif dep in {'intj', 'ng', 'dm'}:
339
+ return 'j'
340
+ # clausal complement
341
+ elif dep in {'xcomp', 'ccomp', 'oc'}:
342
+ return 'r'
343
+ else:
344
+ return '?'
345
+
346
+
347
    def _adjust_score(self, edges: list[Hyperedge]) -> int:
        """Score adjustment for a candidate rule window.

        Penalizes (-99) all-concept windows in which an apposition
        ('appos') element is not among the shallowest ones — joining such
        concepts too early tends to produce wrong structures. Returns 0
        otherwise.
        """
        min_depth: int = 9999999
        appos: bool = False
        min_appos_depth: int = 9999999

        # only applies when every edge in the window is a concept
        if all([edge.mtype() == 'C' for edge in edges]):
            for edge in edges:
                token: Token | None = self._head_token(edge)
                if token is None:
                    continue
                depth: int = self.depths[self.token2atom[token]]
                if depth < min_depth:
                    min_depth = depth
                if token.dep_ == 'appos':
                    appos = True
                    if depth < min_appos_depth:
                        min_appos_depth = depth

        if appos and min_appos_depth > min_depth:
            return -99
        else:
            return 0
369
+
370
    def _head_token(self, edge: Hyperedge) -> Token | None:
        """Return the spaCy token of the dependency-shallowest atom of
        *edge*, or None when no atom of the edge is linked to a token."""
        # atoms of the edge that have an associated token
        atoms: list[Atom] = [
            cast(Atom, uatom)
            for atom in edge.all_atoms()
            if (uatom := unique(atom)) is not None and uatom in self.atom2token
        ]
        # pick the atom whose original atom sits highest in the dependency tree
        min_depth: int = 9999999
        main_atom: Atom | None = None
        for atom in atoms:
            if atom in self.orig_atom:
                oatom: Atom = self.orig_atom[atom]
                if oatom in self.depths:
                    depth: int = self.depths[oatom]
                    if depth < min_depth:
                        min_depth = depth
                        main_atom = atom
        if main_atom:
            return self.atom2token[main_atom]
        else:
            return None
390
+
391
+ def _dep_depth(self, edge: Hyperedge) -> int:
392
+ atoms: list[Atom] = [
393
+ cast(Atom, uatom)
394
+ for atom in edge.all_atoms()
395
+ if (uatom := unique(atom)) is not None and uatom in self.atom2token
396
+ ]
397
+ mdepth: int = 99999999
398
+ for atom in atoms:
399
+ if atom in self.orig_atom:
400
+ oatom: Atom = self.orig_atom[atom]
401
+ if oatom in self.depths:
402
+ depth: int = self.depths[oatom]
403
+ if depth < mdepth:
404
+ mdepth = depth
405
+ return mdepth
406
+
407
+ def _build_atom(self, token: Token, ent_type: str, last_token: Token | None) -> Atom:
408
+ text: str = token.text.lower()
409
+ et: str = ent_type
410
+
411
+ if ent_type[0] == 'P':
412
+ atom: Atom = self._build_atom_predicate(token, ent_type, last_token)
413
+ elif ent_type[0] == 'T':
414
+ atom = self._build_atom_trigger(token, ent_type)
415
+ elif ent_type[0] == 'M':
416
+ atom = self._build_atom_modifier(token)
417
+ else:
418
+ atom = build_atom(text, et, self.lang)
419
+ return atom
420
+
421
+ def _build_atom_predicate(self, token: Token, ent_type: str, last_token: Token | None) -> Atom:
422
+ text: str = token.text.lower()
423
+ et: str = ent_type
424
+
425
+ # first naive assignment of predicate subtype
426
+ # (can be revised at post-processing stage)
427
+ if ent_type == 'Pd':
428
+ # interrogative cases
429
+ if (last_token and
430
+ last_token.tag_ == '.' and
431
+ last_token.dep_ == 'punct' and
432
+ last_token.lemma_.strip() == '?'):
433
+ ent_type = 'P?'
434
+ # declarative (by default)
435
+ else:
436
+ ent_type = 'Pd'
437
+
438
+ return build_atom(text, ent_type, self.lang)
439
+
440
+ def _build_atom_trigger(self, token: Token, ent_type: str) -> Atom:
441
+ text: str = token.text.lower()
442
+ et: str = 'Tv' if _is_verb(token) else ent_type
443
+ return build_atom(text, et, self.lang)
444
+
445
+ def _build_atom_modifier(self, token: Token) -> Atom:
446
+ text: str = token.text.lower()
447
+ et: str = 'Mv' if _is_verb(token) else _modifier_type_and_subtype(token)
448
+ return build_atom(text, et, self.lang)
449
+
450
    def _repair(self, edge: Hyperedge) -> Hyperedge:
        """Recursively repair '+/B.'-built edges: when one of the two
        arguments is itself a 2-element conjunction ('J') edge, hoist the
        conjunction connector outward over the builder's arguments.

        NOTE(review): relies on Hyperedge.cmt / indexing semantics from
        the hyperbase package; applied bottom-up.
        """
        if edge.not_atom:
            # repair children first (bottom-up)
            new_edge: Hyperedge | None = hedge([self._repair(subedge) for subedge in edge])
            if new_edge is None:
                return edge
            edge = new_edge

        # (+/B. x y) with a 2-part 'J' argument: move the conjunction out
        if len(edge) == 3 and str(edge[0])[:4] == '+/B.':
            if len(edge[1]) == 2 and edge[1].cmt == 'J':
                return cast(Hyperedge, hedge([edge[1][0], edge[1][1], edge[2]]))
            elif len(edge[2]) == 2 and edge[2].cmt == 'J':
                return cast(Hyperedge, hedge([edge[2][0], edge[1], edge[2][1]]))

        return edge
464
+
465
+ def _normalize(self, edge: Hyperedge) -> Hyperedge:
466
+ if edge.not_atom:
467
+ new_edge: Hyperedge | None = hedge([self._normalize(subedge) for subedge in edge])
468
+ if new_edge is None:
469
+ return edge
470
+ edge = new_edge
471
+
472
+ # Move modifier to internal connector if it is applied to
473
+ # relations, specifiers or conjunctions
474
+ if edge.cmt == 'M' and not edge[1].atom:
475
+ innner_conn: str | None = edge[1].cmt
476
+ if innner_conn in {'P', 'T', 'J'}:
477
+ return cast(Hyperedge, hedge(((edge[0], edge[1][0]),) + edge[1][1:]))
478
+
479
+ return edge
480
+
481
+ def _update_atom(self, old: Atom, new: Atom) -> None:
482
+ uold: Atom = UniqueAtom(old)
483
+ unew: Atom = UniqueAtom(new)
484
+ if uold in self.atom2token:
485
+ self.atom2token[unew] = self.atom2token[uold]
486
+ self.temp_atoms.add(uold)
487
+ self.orig_atom[unew] = uold
488
+
489
+ def _replace_atom(self, edge: Hyperedge, old: Atom, new: Atom) -> Hyperedge:
490
+ self._update_atom(old, new)
491
+ return edge.replace_atom(old, new)
492
+
493
+ def _insert_edge_with_argrole(self, edge: Hyperedge, arg: Hyperedge, argrole: str, pos: int) -> Hyperedge:
494
+ new_edge: Hyperedge = edge.insert_edge_with_argrole(arg, argrole, pos)
495
+ old_pred: Atom = edge[0].inner_atom()
496
+ new_pred: Atom = new_edge[0].inner_atom()
497
+ self._update_atom(old_pred, new_pred)
498
+ return new_edge
499
+
500
+ def _replace_argroles(self, edge: Hyperedge, argroles: str) -> Hyperedge:
501
+ new_edge: Hyperedge = edge.replace_argroles(argroles)
502
+ old_pred: Atom = edge[0].inner_atom()
503
+ new_pred: Atom = new_edge[0].inner_atom()
504
+ self._update_atom(old_pred, new_pred)
505
+ return new_edge
506
+
507
    def _apply_arg_roles(self, edge: Hyperedge) -> Hyperedge:
        """Recursively annotate connector atoms with argument-role strings.

        Predicates get one role letter per argument (from
        _relation_arg_role); builders get 'ma'/'am'/'mm' ordering roles.
        Token bookkeeping is migrated from the old connector atoms to the
        rebuilt ones.
        """
        if edge.atom:
            return edge

        new_entity: Hyperedge = edge

        # Extend predicate connectors with argument types
        if edge.connector_mtype() == 'P':
            pred: Atom | None = edge.atom_with_type('P')
            assert pred is not None
            subparts: list[str] = pred.parts()[1].split('.')
            args: list[str] = [self._relation_arg_role(param) for param in edge[1:]]
            args_string: str = ''.join(args)
            # TODO: this is done to detect imperative, to refactor
            pt: str = _predicate_post_type_and_subtype(edge, subparts, args_string)
            if len(subparts) > 2:
                new_part: str = '{}.{}.{}'.format(pt, args_string, subparts[2])
            else:
                new_part = '{}.{}'.format(pt, args_string)
            new_pred: Atom = pred.replace_atom_part(1, new_part)
            # migrate token bookkeeping to the rebuilt predicate atom
            unew_pred: Atom = UniqueAtom(new_pred)
            upred: Atom = UniqueAtom(pred)
            self.atom2token[unew_pred] = self.atom2token[upred]
            self.temp_atoms.add(upred)
            self.orig_atom[unew_pred] = upred
            new_entity = edge.replace_atom(pred, new_pred, unique=True)

        # Extend builder connectors with argument types
        elif edge.connector_mtype() == 'B':
            builder: Atom | None = edge.atom_with_type('B')
            assert builder is not None
            subparts = builder.parts()[1].split('.')
            arg_roles: str = self._builder_arg_roles(edge)
            if len(arg_roles) > 0:
                if len(subparts) > 1:
                    subparts[1] = arg_roles
                else:
                    subparts.append(arg_roles)
            new_part = '.'.join(subparts)
            new_builder: Atom = builder.replace_atom_part(1, new_part)
            ubuilder: Atom = UniqueAtom(builder)
            unew_builder: Atom = UniqueAtom(new_builder)
            # the builder may lack a token (unlike the predicate above),
            # so the transfer is guarded here
            if ubuilder in self.atom2token:
                self.atom2token[unew_builder] = self.atom2token[ubuilder]
            self.temp_atoms.add(ubuilder)
            self.orig_atom[unew_builder] = ubuilder
            new_entity = edge.replace_atom(builder, new_builder, unique=True)

        # recurse into the arguments
        new_args: list[Hyperedge] = [self._apply_arg_roles(subentity) for subentity in new_entity[1:]]
        new_entity = cast(Hyperedge, hedge([new_entity[0]] + new_args))

        return new_entity
559
+
560
+ def _generate_atom2word(self, edge: Hyperedge, offset: int = 0) -> dict[Atom, tuple[str, int]]:
561
+ atom2word: dict[Atom, tuple[str, int]] = {}
562
+ atoms: list[Atom] = edge.all_atoms()
563
+ for atom in atoms:
564
+ uatom: Atom = UniqueAtom(atom)
565
+ if uatom in self.atom2token:
566
+ token: Token = self.atom2token[uatom]
567
+ word: tuple[str, int] = (token.text, token.i - offset)
568
+ atom2word[uatom] = word
569
+ return atom2word
570
+
571
+ def _parse_token(self, token: Token, atom_type: str) -> Atom | None:
572
+ if atom_type == 'X':
573
+ return None
574
+ elif atom_type == 'C':
575
+ atom_type = _concept_type_and_subtype(token)
576
+ elif atom_type == 'M':
577
+ atom_type = _modifier_type_and_subtype(token)
578
+ elif atom_type == 'B':
579
+ atom_type = _builder_type_and_subtype(token)
580
+ elif atom_type == 'P':
581
+ atom_type = _predicate_type_and_subtype(token)
582
+
583
+ # last token is useful to determine predicate subtype
584
+ tokens: list[Token] = list(token.lefts) + list(token.rights)
585
+ last_token: Token | None = tokens[-1] if len(tokens) > 0 else None
586
+
587
+ atom: Atom = self._build_atom(token, atom_type, last_token)
588
+ self.debug_msg('ATOM: {}'.format(atom))
589
+
590
+ return atom
591
+
592
    def _build_atom_sequence(self, sentence: Span) -> list[Atom]:
        """Alpha stage: classify every token and build the atom sequence.

        Builds per-token feature tuples, asks the Alpha classifier for
        atom types, converts each token into an atom ('X' tokens are
        dropped) and registers all bookkeeping maps.
        """
        features: list[tuple[str, str, str, str, str]] = []
        for pos, token in enumerate(sentence):
            head: Token = token.head
            tag: str = token.tag_
            dep: str = token.dep_
            hpos: str = head.pos_ if head else ''
            hdep: str = head.dep_ if head else ''
            # POS of the following token ('' at end of sentence)
            if pos + 1 < len(sentence):
                pos_after: str = sentence[pos + 1].pos_
            else:
                pos_after = ''
            features.append((tag, dep, hpos, hdep, pos_after))

        assert self.alpha is not None, "Alpha classifier must be initialized before parsing"
        atom_types: tuple[str, ...] | list[str] = self.alpha.predict(sentence, features)

        self.token2atom = {}

        atomseq: list[Atom] = []
        for token, atom_type in zip(sentence, atom_types):
            atom: Atom | None = self._parse_token(token, atom_type)
            if atom:
                uatom: Atom = UniqueAtom(atom)
                # register the atom <-> token links used by all later passes
                self.atom2token[uatom] = token
                self.token2atom[token] = uatom
                self.orig_atom[uatom] = uatom
                atomseq.append(uatom)
        return atomseq
621
+
622
    def _compute_depths_and_connections(self, root: Token, depth: int = 0) -> None:
        """Record the dependency-tree depth of every atom and the set of
        (parent, child) atom pairs, walking the tree from *root*.

        State is cleared only on the initial (depth == 0) call.
        """
        if depth == 0:
            self.depths = {}
            self.connections = set()

        if root in self.token2atom:
            parent_atom: Atom | None = self.token2atom[root]
            self.depths[parent_atom] = depth
        else:
            parent_atom = None

        for child in root.children:
            if parent_atom and child in self.token2atom:
                child_atom: Atom = self.token2atom[child]
                # connections are stored symmetrically
                self.connections.add((parent_atom, child_atom))
                self.connections.add((child_atom, parent_atom))
            self._compute_depths_and_connections(child, depth + 1)
639
+
640
+ def _is_pair_connected(self, atoms1: list[Atom], atoms2: list[Atom]) -> bool:
641
+ for atom1 in atoms1:
642
+ for atom2 in atoms2:
643
+ if atom1 in self.orig_atom and atom2 in self.orig_atom:
644
+ pair: tuple[Atom, Atom] = (self.orig_atom[atom1], self.orig_atom[atom2])
645
+ if pair in self.connections:
646
+ return True
647
+ return False
648
+
649
+ def _are_connected(self, atom_sets: list[list[Atom]], connector_pos: int) -> bool:
650
+ conn: bool = True
651
+ for pos, arg in enumerate(atom_sets):
652
+ if pos != connector_pos:
653
+ if not self._is_pair_connected(atom_sets[connector_pos], arg):
654
+ conn = False
655
+ break
656
+ return conn
657
+
658
    def _score(self, edges: list[Hyperedge]) -> int:
        """Heuristic score for applying a rule to the window *edges*:
        strongly rewards dependency-connected windows, prefers deeper
        material, and applies the _adjust_score apposition penalty.
        """
        atom_sets: list[list[Atom]] = [edge.all_atoms() for edge in edges]

        # is some element connected to all others?
        conn: bool = False
        for pos in range(len(edges)):
            if self._are_connected(atom_sets, pos):
                conn = True
                break

        # minimum dependency depth over all atoms in the window
        mdepth: int = 99999999
        for atom_set in atom_sets:
            for atom in atom_set:
                if atom in self.orig_atom:
                    oatom: Atom = self.orig_atom[atom]
                    if oatom in self.depths:
                        depth: int = self.depths[oatom]
                        if depth < mdepth:
                            mdepth = depth

        return (10000000 if conn else 0) + (mdepth * 100) + self._adjust_score(edges)
678
+
679
+ def _parse_atom_sequence(self, atom_sequence: list[Atom]) -> tuple[list[Hyperedge] | None, bool]:
680
+ sequence: list[Hyperedge] = list(atom_sequence)
681
+ while True:
682
+ action: tuple[Rule, int, Hyperedge, int, int] | None = None
683
+ best_score: int = -999999999
684
+ for rule_number, rule in enumerate(self.rules):
685
+ window_start: int = rule.size - 1
686
+ for pos in range(window_start, len(sequence)):
687
+ new_edge: Hyperedge | None = apply_rule(rule, sequence, pos)
688
+ if new_edge:
689
+ score: int = self._score(sequence[pos - window_start:pos + 1])
690
+ score -= rule_number
691
+ if score > best_score:
692
+ action = (rule, score, new_edge, window_start, pos)
693
+ best_score = score
694
+
695
+ # parse failed, make best effort to return something
696
+ if action is None:
697
+ # if all else fails...
698
+ if len(sequence) > 0:
699
+ fallback: Hyperedge | None = hedge([':/J/.'] + sequence[:2])
700
+ new_sequence: list[Hyperedge] = ([fallback] if fallback else []) + sequence[2:]
701
+ else:
702
+ return None, True
703
+ else:
704
+ rule, s, new_edge, window_start, pos = action
705
+ new_sequence = (sequence[:pos - window_start] + [new_edge] + sequence[pos + 1:])
706
+
707
+ self.debug_msg('rule: {}'.format(rule))
708
+ self.debug_msg('score: {}'.format(score))
709
+ self.debug_msg('new_edge: {}'.format(new_edge))
710
+ self.debug_msg('new_sequence: {}'.format(new_sequence))
711
+
712
+ sequence = new_sequence
713
+ if len(sequence) < 2:
714
+ return sequence, False
715
+
716
+ def sentensize(self, text: str) -> list[str]:
717
+ if self.nlp:
718
+ doc: Doc = self.nlp(text.strip())
719
+ return [str(sent).strip() for sent in doc.sents]
720
+ else:
721
+ raise RuntimeError("spaCy model failed to initialize.")
722
+
723
+ def _edge2toks(self, edge: Hyperedge) -> None:
724
+ uatoms: list[Hyperedge | None] = [unique(atom) for atom in edge.all_atoms()]
725
+ toks: tuple[Token, ...] = tuple(sorted(
726
+ [self.atom2token[uatom] for uatom in uatoms if uatom is not None and uatom in self.atom2token]
727
+ ))
728
+ self.edge2toks[edge] = toks
729
+ self.toks2edge[toks] = edge
730
+ if edge.not_atom:
731
+ for subedge in edge:
732
+ self._edge2toks(subedge)
733
+
734
+ # ===============
735
+ # Post-processing
736
+ # ===============
737
    def _insert_arg_in_tail(self, edge: Hyperedge, arg: Hyperedge) -> Hyperedge:
        """Insert *arg* into the deepest suitable predicate inside *edge*.

        Recurses down the tail (last subedge) chain; at each predicate level,
        tries to give *arg* a missing argument role, and as a last resort
        attaches it as a specification ('x').  Returns a (possibly new) edge;
        when nothing can absorb *arg*, returns *edge* unchanged.
        """
        # atoms cannot take arguments
        if edge.atom:
            return edge

        # NOTE(review): cmt appears to denote the connector's main type
        # ('P' = predicate) -- confirm against the Hyperedge API.
        if edge.cmt == 'P':
            ars: str = edge.argroles()
            ar: str | None = None
            # choose the first missing role for the new argument
            # (role letters presumably: a=active subject, p=passive subject,
            # s=subject, o=object -- TODO confirm against SH notation docs)
            if 'p' in ars:
                if 'a' not in ars:
                    ar = 'a'
            elif 'a' in ars:
                ar = 'p'
            elif 's' not in ars:
                ar = 's'
            elif 'o' not in ars:
                ar = 'o'
            if ar:
                # append arg at the end of this predicate edge with the role
                return self._insert_edge_with_argrole(edge, arg, ar, len(edge))

        # try deeper: recurse into the tail; if it changed, rebuild around it
        new_tail: Hyperedge = self._insert_arg_in_tail(edge[-1], arg)
        if new_tail != edge[-1]:
            return cast(Hyperedge, hedge(list(edge[:-1]) + [new_tail]))
        # nothing below absorbed arg; only a predicate edge with known
        # argroles can still take it here, as a specification ('x')
        if edge.cmt != 'P':
            return edge
        ars = edge.argroles()
        if ars == '':
            return edge
        return self._insert_edge_with_argrole(edge, arg, 'x', len(edge))
765
+
766
    def _insert_spec_rightmost_relation(self, edge: Hyperedge, arg: Hyperedge) -> Hyperedge:
        """Attach *arg* as a specification ('x') to the rightmost relation in *edge*.

        Searches right-to-left for a subedge containing a predicate atom
        (mt == 'P') and recurses into it; when *edge* itself is headed by a
        predicate, *arg* is appended with the 'x' role.  Returns *edge*
        unchanged when no predicate is found anywhere.
        """
        if edge.atom:
            return edge
        # tail contains a predicate: recurse into the tail and rebuild
        if 'P' in [atom.mt for atom in edge[-1].atoms()]:
            return cast(Hyperedge, hedge(list(edge[:-1]) + [self._insert_spec_rightmost_relation(edge[-1], arg)]))
        # edge itself is a relation (predicate-headed): attach spec here
        if edge[0].mt == 'P':
            return self._insert_edge_with_argrole(edge, arg, 'x', len(edge))
        # otherwise scan subedges right-to-left for one containing a predicate
        for pos, subedge in reversed(list(enumerate(edge))):
            if 'P' in [atom.mt for atom in subedge.atoms()]:
                new_edge_list: list[Hyperedge] = list(edge)
                new_edge_list[pos] = self._insert_spec_rightmost_relation(subedge, arg)
                return cast(Hyperedge, hedge(new_edge_list))
        return edge
779
+
780
    def _process_colon_conjunctions(self, edge: Hyperedge) -> Hyperedge:
        """Merge ':/J/.' conjunction edges into a single relation where possible.

        Processes bottom-up (subedges first), then pattern-matches on the main
        types of the two joined members (R=relation, C=concept,
        S=specification) and folds one member into the other.  Unmatched
        patterns are returned unchanged.

        NOTE(review): the branches index edge[1] and edge[2]; this assumes a
        ':/J/.' edge always has at least two members -- confirm upstream.
        """
        if edge.atom:
            return edge
        # process subedges first (bottom-up)
        new_edge: Hyperedge | None = hedge([self._process_colon_conjunctions(subedge) for subedge in edge])
        if new_edge is None:
            return edge
        edge = new_edge
        if str(edge[0]) == ':/J/.' and any(subedge.mt == 'R' for subedge in edge):
            if edge[1].mt == 'R':
                # RR
                if edge[2].mt == 'S':
                    # second is specification
                    return self._insert_edge_with_argrole(edge[1], edge[2], 'x', len(edge[1]))
                # RC: fold the concept into the relation's tail
                elif edge[2].mt == 'C':
                    return self._insert_arg_in_tail(edge[1], edge[2])
            # CR
            elif edge[1].mt == 'C':
                if edge[2].mt == 'R':
                    if 's' not in edge[2].argroles():
                        # concept is subject
                        return self._insert_edge_with_argrole(edge[2], edge[1], 's', 0)
            # SR
            elif edge[1].mt == 'S':
                if edge[2].mt == 'R':
                    # first is specification
                    return self._insert_edge_with_argrole(edge[2], edge[1], 'x', len(edge[2]))
        return edge
808
+
809
+ def _fix_argroles(self, edge: Hyperedge) -> Hyperedge:
810
+ if edge.atom:
811
+ return edge
812
+ new_edge: Hyperedge | None = hedge([self._fix_argroles(subedge) for subedge in edge])
813
+ if new_edge is None:
814
+ return edge
815
+ edge = new_edge
816
+ ars: str = edge.argroles()
817
+ if ars != '' and edge.mt == 'R':
818
+ _ars: str = ''
819
+ for ar, subedge in zip(ars, edge[1:]):
820
+ _ar: str = ar
821
+ if ar == '?':
822
+ if subedge.mt == 'R':
823
+ _ar = 'r'
824
+ elif subedge.mt == 'S':
825
+ _ar = 'x'
826
+ _ars += _ar
827
+ return self._replace_argroles(edge, _ars)
828
+ return edge
829
+
830
+ def _post_process(self, edge: Hyperedge | None) -> Hyperedge | None:
831
+ if edge is None:
832
+ return None
833
+ _edge: Hyperedge = self._fix_argroles(edge)
834
+ _edge = self._process_colon_conjunctions(_edge)
835
+ return _edge
@@ -0,0 +1,67 @@
1
+ from hyperbase.hyperedge import Hyperedge, hedge
2
+
3
+
4
+ class Rule:
5
+ def __init__(self, first_type: str, arg_types: set[str], size: int, connector: str | None = None) -> None:
6
+ self.first_type: str = first_type
7
+ self.arg_types: set[str] = arg_types
8
+ self.size: int = size
9
+ self.connector: str | None = connector
10
+ self._branches: int = 0
11
+
12
+
13
# Rules for the first, strict parsing pass.
# NOTE: list order matters -- the parser subtracts each rule's index from its
# score, so earlier rules win ties.  `size` includes the pivot edge itself.
strict_rules: list[Rule] = [
    # compound concepts: two concepts joined under the '+/B/.' builder
    Rule('C', {'C'}, 2, '+/B/.'),
    # modifier applied to a single edge of any main type
    Rule('M', {'C', 'R', 'M', 'S', 'T', 'P', 'B', 'J'}, 2),
    # builder combining two concepts
    Rule('B', {'C'}, 3),
    # trigger taking one concept or relation
    Rule('T', {'C', 'R'}, 2),
    # predicates with 5 down to 1 arguments (larger windows tried first)
    Rule('P', {'C', 'R', 'S'}, 6),
    Rule('P', {'C', 'R', 'S'}, 5),
    Rule('P', {'C', 'R', 'S'}, 4),
    Rule('P', {'C', 'R', 'S'}, 3),
    Rule('P', {'C', 'R', 'S'}, 2),
    # conjunction joining two edges of any main type
    Rule('J', {'C', 'R', 'M', 'S', 'T', 'P', 'B', 'J'}, 3)]
24
+
25
+
26
# Looser rules for a second, best-effort pass: compared to strict_rules,
# builders may also take relations, and a unary conjunction (size 2) is
# allowed.  Same ordering semantics as strict_rules (earlier rules win ties).
repair_rules: list[Rule] = [
    # compound concepts: two concepts joined under the '+/B/.' builder
    Rule('C', {'C'}, 2, '+/B/.'),
    # modifier applied to a single edge of any main type
    Rule('M', {'C', 'R', 'M', 'S', 'T', 'P', 'B', 'J'}, 2),
    # builder combining concepts and/or relations (looser than strict)
    Rule('B', {'C', 'R'}, 3),
    # trigger taking one concept or relation
    Rule('T', {'C', 'R'}, 2),
    # predicates with 5 down to 1 arguments (larger windows tried first)
    Rule('P', {'C', 'R', 'S'}, 6),
    Rule('P', {'C', 'R', 'S'}, 5),
    Rule('P', {'C', 'R', 'S'}, 4),
    Rule('P', {'C', 'R', 'S'}, 3),
    Rule('P', {'C', 'R', 'S'}, 2),
    # conjunctions joining two edges, or wrapping a single one (repair only)
    Rule('J', {'C', 'R', 'M', 'S', 'T', 'P', 'B', 'J'}, 3),
    Rule('J', {'C', 'R', 'M', 'S', 'T', 'P', 'B', 'J'}, 2)]
38
+
39
+
40
def apply_rule(rule: Rule, sentence: list[Hyperedge], pos: int) -> Hyperedge | None:
    """Try to apply *rule* to the window of ``rule.size`` edges ending at *pos*.

    Every window position is tried as the pivot (the edge that must have
    ``rule.first_type``); the remaining edges must all have types in
    ``rule.arg_types``.  For a connector rule, the connector atom heads the
    new edge and the pivot joins the arguments in sentence order; otherwise
    the pivot heads the new edge.

    Returns the new hyperedge for the first matching pivot position, or None
    when the rule does not match.
    """
    window = sentence[pos - rule.size + 1:pos + 1]
    for pivot_pos in range(rule.size):
        pivot: Hyperedge | None = None
        args: list[Hyperedge] = []
        for i, edge in enumerate(window):
            if i == pivot_pos:
                if edge.mtype() != rule.first_type:
                    break
                if rule.connector:
                    # connector rules keep the pivot among the arguments
                    args.append(edge)
                else:
                    pivot = edge
            elif edge.mtype() in rule.arg_types:
                args.append(edge)
            else:
                break
        else:
            # every edge in the window matched
            if rule.connector:
                return hedge([rule.connector] + args)
            return hedge([pivot] + args)
    return None
@@ -0,0 +1,9 @@
1
+ from wtpsplit import SaT
2
+
3
+
4
class Sentensizer:
    """Sentence splitter backed by wtpsplit's SaT segmentation model."""

    def __init__(self) -> None:
        # loads the 'sat-3l' model at construction time
        self.sat: SaT = SaT('sat-3l')

    def sentensize(self, text: str) -> list[str]:
        """Split *text* into a list of sentence strings."""
        segments = self.sat.split(text)
        return list(segments)
@@ -0,0 +1,62 @@
1
+ Metadata-Version: 2.4
2
+ Name: hyperbase-parser-ab
3
+ Version: 0.1.0
4
+ Summary: Semantic Hypergraph AlphaBeta Parser
5
+ Project-URL: Homepage, https://hyperquest.ai/hyperbase
6
+ Author-email: "Telmo Menezes et al." <telmo@telmomenezes.net>
7
+ License-Expression: MIT
8
+ License-File: LICENSE
9
+ Keywords: AI,Knowledge Representation,NLP,Natural Language Understanding,Parsing,Semantic Hypergraphs
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Environment :: Console
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
17
+ Requires-Python: >=3.10
18
+ Requires-Dist: hyperbase>=0.8.0
19
+ Requires-Dist: pip
20
+ Requires-Dist: scikit-learn>=1.3.0
21
+ Requires-Dist: spacy>=3.8.0
22
+ Requires-Dist: torch>=2.0.0
23
+ Requires-Dist: transformers>=4.46.0
24
+ Provides-Extra: dev
25
+ Requires-Dist: coverage>=7.4.3; extra == 'dev'
26
+ Requires-Dist: datasets>=4.0.0; extra == 'dev'
27
+ Requires-Dist: mypy>=1.8.0; extra == 'dev'
28
+ Requires-Dist: pre-commit>=3.6.2; extra == 'dev'
29
+ Requires-Dist: pytest>=9.0.0; extra == 'dev'
30
+ Requires-Dist: ruff>=0.2.2; extra == 'dev'
31
+ Requires-Dist: types-passlib>=1.7.7.20240106; extra == 'dev'
32
+ Description-Content-Type: text/markdown
33
+
34
+ # Hyperbase Alpha-Beta Parser
35
+
36
+ ## A semantic hypergraph parser for natural language
37
+
38
+ The Alpha-Beta parser is a [Hyperbase](https://hyperquest.ai/hyperbase) plugin that converts natural language text into *Semantic Hypergraphs (SH)*. It works in two stages:
39
+
40
+ - **Alpha stage**: A multilingual neural token classifier (based on DistilBERT) assigns one of 39 semantic atom types to each token in a sentence -- for example, concepts, predicates, modifiers, builders, triggers and conjunctions.
41
+ - **Beta stage**: A rule-based engine combines classified atoms into ordered, recursive hyperedges using syntactic and semantic composition rules, producing structured representations that can be manipulated with Hyperbase.
42
+
43
+ ## Supported languages
44
+
45
+ The parser supports any language with a [spaCy](https://spacy.io) model available, including English, French, German, Italian, Portuguese and Spanish.
46
+
47
+ While the parser is theoretically language-agnostic and could in principle support languages such as Mandarin, which differ substantially in morphological and syntactic structure, the authors' linguistic competence is limited to Germanic and Romance languages. We welcome the help of native speakers or domain experts in validating/improving support for other language families.
48
+
49
+ ## Installation and manual
50
+
51
+ Installation instructions, the manual and more information can be found here: <https://hyperquest.ai/hyperbase>
52
+
53
+ ## Contributing
54
+
55
+ Pull requests are welcome. For major changes, please open an issue first
56
+ to discuss what you would like to change.
57
+
58
+ Please make sure to update tests as appropriate.
59
+
60
+ ## License
61
+
62
+ [MIT](https://choosealicense.com/licenses/mit/)
@@ -0,0 +1,12 @@
1
+ hyperbase_parser_ab/__init__.py,sha256=Cf3ZbUHvRSSfXOGAkfY6OMqy7QwZtKiQi-rdGqrPRv4,86
2
+ hyperbase_parser_ab/alpha.py,sha256=4zfa31GI-OeTKzvKnibu3lsU9NEzQm-BB41muRa2bt4,2831
3
+ hyperbase_parser_ab/atomizer.py,sha256=ewHhTWRZGP7ouheFLp-CnOFItnNLu4v3kwPOGgeyPo0,5527
4
+ hyperbase_parser_ab/lang_models.py,sha256=qTAcdZeM_NXTnl5RYzdDq4SH4dZ_oj2mFBUI7fE8PUI,1219
5
+ hyperbase_parser_ab/parser.py,sha256=g5-zxUZKYbOQE0bhojTHsQ9g78MuzCV1rr-4q_WqjFA,31528
6
+ hyperbase_parser_ab/rules.py,sha256=tr8UGTN_BlinjhtmAjm9Gevv-RVGENPFJbnIn9TKqPU,2271
7
+ hyperbase_parser_ab/sentensizer.py,sha256=tO4Qf5X8JmthiYDX_ObBXdvGqzgXC3eGsKieqOEmk2k,209
8
+ hyperbase_parser_ab-0.1.0.dist-info/METADATA,sha256=dLJ0Icaj0b9cIkbzzMP1jHzUuxfqstlt0RSMVx_j6LU,2990
9
+ hyperbase_parser_ab-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
10
+ hyperbase_parser_ab-0.1.0.dist-info/entry_points.txt,sha256=rqN89onaLcCtRnbJPFoXC1RCFxRBj2wGo321TDSnqKU,68
11
+ hyperbase_parser_ab-0.1.0.dist-info/licenses/LICENSE,sha256=6p_7YBrzvSBO3phQeZm2sRg0JmEKiyBJwzZVwseDaGE,1118
12
+ hyperbase_parser_ab-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [hyperbase.parsers]
2
+ alphabeta = hyperbase_parser_ab:AlphaBetaParser
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (C) 2026 CNRS - Centre national de la recherche scientifique
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.