hyperbase-parser-ab 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hyperbase_parser_ab/__init__.py +3 -0
- hyperbase_parser_ab/alpha.py +69 -0
- hyperbase_parser_ab/atomizer.py +142 -0
- hyperbase_parser_ab/lang_models.py +50 -0
- hyperbase_parser_ab/parser.py +835 -0
- hyperbase_parser_ab/rules.py +67 -0
- hyperbase_parser_ab/sentensizer.py +9 -0
- hyperbase_parser_ab-0.1.0.dist-info/METADATA +62 -0
- hyperbase_parser_ab-0.1.0.dist-info/RECORD +12 -0
- hyperbase_parser_ab-0.1.0.dist-info/WHEEL +4 -0
- hyperbase_parser_ab-0.1.0.dist-info/entry_points.txt +2 -0
- hyperbase_parser_ab-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from numpy.typing import NDArray
|
|
3
|
+
from scipy.sparse import spmatrix
|
|
4
|
+
from sklearn.ensemble import RandomForestClassifier
|
|
5
|
+
from sklearn.preprocessing import OneHotEncoder
|
|
6
|
+
from spacy.tokens import Span
|
|
7
|
+
|
|
8
|
+
from hyperbase_parser_ab.atomizer import Atomizer
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class Alpha(object):
    """Atom-type classifier.

    Either wraps a transformer-based :class:`Atomizer` (``use_atomizer=True``)
    or trains a random-forest on tab-separated training cases supplied as
    ``cases_str`` (one case per line; columns 0/3/4/6/8/19 are used).
    """

    def __init__(self, cases_str: str | None = None, use_atomizer: bool = False) -> None:
        # Bug fix: always initialize both attributes up front. Previously,
        # constructing Alpha() with neither use_atomizer nor cases_str left
        # self.atomizer (and self.empty) unset, so predict() raised
        # AttributeError instead of falling back to the all-'C' prediction.
        self.atomizer: Atomizer | None = None
        self.empty: bool = True

        if use_atomizer:
            self.atomizer = Atomizer()
        elif cases_str:
            X: list[tuple[str, str, str, str, str]] = []
            y: list[list[str]] = []

            for line in cases_str.strip().split('\n'):
                sline: str = line.strip()
                if len(sline) > 0:
                    row: list[str] = sline.strip().split('\t')
                    true_value: str = row[0]
                    tag: str = row[3]
                    dep: str = row[4]
                    hpos: str = row[6]
                    hdep: str = row[8]
                    pos_after: str = row[19]

                    y.append([true_value])
                    X.append((tag, dep, hpos, hdep, pos_after))

            if len(y) > 0:
                self.empty = False

                # One-hot encode both features and labels; unknown categories
                # at predict time are ignored rather than raising.
                self.encX: OneHotEncoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
                self.encX.fit(np.array(X))
                self.ency: OneHotEncoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
                self.ency.fit(np.array(y))

                X_: NDArray | spmatrix = self.encX.transform(np.array(X))
                y_: NDArray | spmatrix = self.ency.transform(np.array(y))

                # Fixed seed for reproducible training.
                self.clf: RandomForestClassifier = RandomForestClassifier(random_state=777)
                self.clf.fit(X_, y_)

    def predict(self, sentence: Span, features: list[tuple[str, str, str, str, str]]) -> tuple[str, ...] | list[str]:
        """Predict one atom-type label per token of ``sentence``.

        Uses the Atomizer when available; otherwise the trained classifier
        on ``features``; an untrained (empty) classifier yields all 'C'.
        """
        if self.atomizer:
            preds: list[tuple[str, str]] = self.atomizer.atomize(
                sentence=str(sentence),
                tokens=[str(token) for token in sentence])
            atom_types: list[str] = [pred[1] for pred in preds]

            # force known cases: verbs are always predicates
            for i in range(len(atom_types)):
                if sentence[i].pos_ == 'VERB':
                    atom_types[i] = 'P'
            return atom_types
        else:
            # an empty classifier always predicts 'C'
            if self.empty:
                return tuple('C' for _ in range(len(features)))
            _features: NDArray | spmatrix = self.encX.transform(np.array(features))
            preds_arr: NDArray | spmatrix = self.ency.inverse_transform(self.clf.predict(_features))
            # inverse_transform may yield None rows for unseen encodings;
            # fall back to 'C' in that case.
            return tuple(pred[0] if pred else 'C' for pred in preds_arr)
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
from collections import Counter
|
|
2
|
+
|
|
3
|
+
import torch
|
|
4
|
+
from transformers import AutoTokenizer, AutoModelForTokenClassification, PreTrainedTokenizerBase, PreTrainedModel
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# Default HuggingFace Hub repository holding the pretrained token-classification
# checkpoint used by Atomizer when no local model path is supplied.
HF_REPO: str = "hyperquest/atom-classifier"
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class Atomizer:
    """Token-level atom-type tagger backed by a HuggingFace
    token-classification model.

    The model predicts a label per subword; this class merges subwords back
    into words (or maps them onto caller-provided tokens) via character
    offsets.
    """

    def __init__(self, model_path: str | None = None) -> None:
        """Load tokenizer and model from `model_path`, or from HF_REPO by default."""
        model_id: str = model_path or HF_REPO
        self.model_path: str = model_id
        # Fast tokenizer required: word_ids()/offset mapping depend on it.
        self.tokenizer: PreTrainedTokenizerBase = AutoTokenizer.from_pretrained(model_id, use_fast=True)
        self.model: PreTrainedModel = AutoModelForTokenClassification.from_pretrained(model_id)
        assert self.model.config.id2label
        # Maps model output indices to atom-type label strings.
        self.id2label: dict[int, str] = self.model.config.id2label

    def atomize(self,
                sentence: str,
                tokens: list[str] | None = None
                ) -> list[tuple[str, str]]:
        """Return (word_text, label) pairs for `sentence`.

        When `tokens` is given, predictions are projected onto those tokens
        via character offsets; otherwise the model's own word segmentation
        is used.
        """
        # Tokenize the raw sentence and request offsets
        encoded = self.tokenizer(
            sentence,
            return_tensors="pt",
            truncation=True,
            return_offsets_mapping=True
        )

        offset_mapping = encoded.pop("offset_mapping")  # remove so model doesn't see it
        word_ids: list[int | None] = encoded.word_ids(0)

        with torch.no_grad():
            outputs = self.model(**encoded)

        # Greedy per-subword label choice (argmax over the label dimension).
        pred_ids: list[int] = outputs.logits.argmax(-1)[0].tolist()
        offset_mapping = offset_mapping[0].tolist()

        if tokens is not None:
            # Map provided tokens to model predictions based on character offsets
            return self._map_tokens_to_predictions(sentence, tokens, word_ids, pred_ids, offset_mapping)

        # Merge subwords into words: a word's label is the label of its
        # FIRST subword; its text span is extended over all its subwords.
        predicted_labels: list[tuple[str, str]] = []
        current_word_id: int | None = None
        current_start: int | None = None
        current_end: int = -1
        current_label: str | None = None

        for idx, word_id in enumerate(word_ids):
            if word_id is None:
                continue  # skip CLS, SEP, etc.

            start: int
            end: int
            start, end = offset_mapping[idx]
            label_id: int = pred_ids[idx]
            label: str = self.id2label[label_id]

            if word_id != current_word_id:
                # flush previous word
                if current_label is not None:
                    word_text: str = sentence[current_start:current_end]
                    predicted_labels.append((word_text, current_label))

                # start new word
                current_word_id = word_id
                current_start = start
                current_end = end
                current_label = label
            else:
                # same word, extend its span
                current_end = max(current_end, end)

        # flush last word
        if current_label is not None:
            word_text = sentence[current_start:current_end]
            predicted_labels.append((word_text, current_label))

        return predicted_labels

    def _map_tokens_to_predictions(self,
                                   sentence: str,
                                   tokens: list[str],
                                   word_ids: list[int | None],
                                   pred_ids: list[int],
                                   offset_mapping: list[list[int]]
                                   ) -> list[tuple[str, str]]:
        """
        Maps provided tokens to model predictions by finding character offsets
        and assigning the most appropriate label based on overlapping model tokens.
        """
        # Find character positions of each provided token in the sentence
        token_positions: list[tuple[int, int] | None] = []
        search_start: int = 0

        for token in tokens:
            # Left-to-right search keeps repeated tokens aligned in order.
            pos: int = sentence.find(token, search_start)
            if pos == -1:
                # Token not found - skip or use fallback
                token_positions.append(None)
            else:
                token_positions.append((pos, pos + len(token)))
                search_start = pos + len(token)

        # For each provided token, collect overlapping model predictions
        result: list[tuple[str, str]] = []
        for token, positions in zip(tokens, token_positions):
            if positions is None:
                # Token not found in sentence - assign default label
                result.append((token, 'C'))
                continue

            token_start: int
            token_end: int
            token_start, token_end = positions

            # Collect all labels from model tokens that overlap with this token
            overlapping_labels: list[str] = []
            for idx, word_id in enumerate(word_ids):
                if word_id is None:
                    continue

                model_start: int
                model_end: int
                model_start, model_end = offset_mapping[idx]

                # Check if model token overlaps with provided token
                # (half-open interval intersection test)
                if model_start < token_end and model_end > token_start:
                    label: str = self.id2label[pred_ids[idx]]
                    overlapping_labels.append(label)

            # Assign the most common label, or first label if tie
            if overlapping_labels:
                # Use most common label
                most_common_label: str = Counter(overlapping_labels).most_common(1)[0][0]
                result.append((token, most_common_label))
            else:
                # No overlap found - use default
                result.append((token, 'C'))

        return result
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# Per-language spaCy pipeline names, ordered from most to least accurate
# (the caller picks the first one that is installed).
_SPACY_MODELS: dict[str, list[str]] = {
    'de': ['de_dep_news_trf', 'de_core_news_lg', 'de_core_news_md', 'de_core_news_sm'],
    'en': ['en_core_web_trf', 'en_core_web_lg', 'en_core_web_md', 'en_core_web_sm'],
    'es': ['es_dep_news_trf', 'es_core_news_lg', 'es_core_news_md', 'es_core_news_sm'],
    'fr': ['fr_dep_news_trf', 'fr_core_news_lg', 'fr_core_news_md', 'fr_core_news_sm'],
    'it': ['it_core_news_lg', 'it_core_news_md', 'it_core_news_sm'],
    'pt': ['pt_core_news_lg', 'pt_core_news_md', 'pt_core_news_sm'],
    # Bug fix: spaCy's Chinese pipelines are published as zh_core_web_*;
    # there is no zh_core_news_* family, so the old names could never load.
    'zh': ['zh_core_web_trf', 'zh_core_web_lg', 'zh_core_web_md', 'zh_core_web_sm'],
}


def get_spacy_models(lang: str) -> list[str]:
    """Return candidate spaCy model names for the ISO language code `lang`.

    The list is ordered from most to least accurate; an empty list is
    returned for unsupported languages.
    """
    # Copy so callers can't mutate the shared table.
    return list(_SPACY_MODELS.get(lang, []))
|
|
@@ -0,0 +1,835 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import traceback
|
|
3
|
+
from typing import Any, cast
|
|
4
|
+
|
|
5
|
+
import spacy
|
|
6
|
+
from spacy.language import Language
|
|
7
|
+
from spacy.tokens import Doc, Span, Token
|
|
8
|
+
|
|
9
|
+
import hyperbase.constants as const
|
|
10
|
+
from hyperbase.hyperedge import Atom, Hyperedge, build_atom, hedge, non_unique, unique, UniqueAtom
|
|
11
|
+
from hyperbase.parsers import Parser
|
|
12
|
+
|
|
13
|
+
from hyperbase_parser_ab.alpha import Alpha
|
|
14
|
+
from hyperbase_parser_ab.lang_models import get_spacy_models
|
|
15
|
+
from hyperbase_parser_ab.rules import apply_rule, Rule, strict_rules, repair_rules
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _edge2txt_parts(edge: Hyperedge, parse: dict[str, Any]) -> list[tuple[str, str, int]]:
    """Collect (text, text, token-index) triples for each atom of `edge`
    that maps back to a spaCy token in `parse['atom2token']`."""
    atom2token = parse['atom2token']
    mapped: list[Token] = [
        atom2token[ua]
        for ua in (UniqueAtom(a) for a in edge.all_atoms())
        if ua in atom2token
    ]
    return [(tok.text, tok.text, tok.i) for tok in mapped]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _edge2text(edge: Hyperedge, parse: dict[str, Any]) -> str:
    """Reconstruct the surface text covered by `edge`.

    Words are emitted in sentence order; the separator between consecutive
    words is recovered from the original sentence with a regex, falling back
    to a single space when no match is found or the recovered separator
    contains alphanumerics (i.e. it swallowed other words).
    """
    # Possessive constructions are rendered specially ("owner's owned").
    if edge.not_atom and str(edge[0]) == const.possessive_builder:
        return _poss2text(edge, parse)

    parts: list[tuple[str, str, int]] = _edge2txt_parts(edge, parse)
    # restore original sentence order via the token index
    parts = sorted(parts, key=lambda x: x[2])

    prev_txt: str | None = None
    txt_parts: list[str] = []
    sentence: str = str(parse['spacy_sentence'])
    for txt, _txt, _ in parts:
        if prev_txt is not None:
            # literal text between previous and current word in the sentence
            res: re.Match[str] | None = re.search(r'{}(.*?){}'.format(re.escape(prev_txt), re.escape(txt)), sentence)
            if res:
                sep: str = res.group(1)
            else:
                sep = ' '
            if any(letter.isalnum() for letter in sep):
                # separator contains word characters -> not a pure separator
                sep = ' '
            txt_parts.append(sep)
        txt_parts.append(_txt)
        prev_txt = txt
    return ''.join(txt_parts)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _concept_type_and_subtype(token: Token) -> str:
    """Concept type code for a token: 'Cm' for nominal modifiers, otherwise
    a subtype chosen by part-of-speech ('C' when nothing matches)."""
    if token.dep_ == 'nmod':
        return 'Cm'
    subtype_by_pos = {
        'ADJ': 'Ca',
        'NOUN': 'Cc',
        'PROPN': 'Cp',
        'NUM': 'C#',
        'DET': 'Cd',
        'PRON': 'Ci',
    }
    return subtype_by_pos.get(token.pos_, 'C')
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _modifier_type_and_subtype(token: Token) -> str:
    """Modifier type code, chosen first by dependency label, then by POS.

    The check order matters: e.g. 'prt' is only consulted after the AUX
    part-of-speech check, matching the original decision cascade.
    """
    subtype_by_dep = {
        'neg': 'Mn', 'nk': 'Mn',                    # negation
        'poss': 'Mp', 'pg': 'Mp', 'ag': 'Mp',       # possessive
        'prep': 'Mt',                               # preposition
        'preconj': 'Mj',                            # conjunctional
    }
    if token.dep_ in subtype_by_dep:
        return subtype_by_dep[token.dep_]

    subtype_by_pos = {'ADJ': 'Ma', 'DET': 'Md', 'NUM': 'M#', 'AUX': 'Mm'}
    if token.pos_ in subtype_by_pos:
        return subtype_by_pos[token.pos_]

    if token.dep_ == 'prt':
        return 'Ml'  # particle
    if token.pos_ == 'PART':
        return 'Mi'  # infinitive
    # adverbs and everything else: quintessential modifier, no subtype
    return 'M'
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _builder_type_and_subtype(token: Token) -> str:
    """Builder type code: possessive by dependency label, otherwise by POS."""
    if token.dep_ in {'case', 'pg', 'ag'}:
        return 'Bp'  # possessive builder
    # 'Br' = relational (preposition), 'Bd' = determiner builder
    return {'ADP': 'Br', 'DET': 'Bd'}.get(token.pos_, 'B')
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _predicate_type_and_subtype(token: Token) -> str:
    """Predicate type code: declarative ('Pd') for clause-heading deps and
    verbs, plain 'P' for complement/relative-clause deps and everything else."""
    dep = token.dep_
    if dep in {'advcl', 'csubj', 'csubjpass', 'parataxis'}:
        return 'Pd'
    if dep in {'relcl', 'ccomp', 'acl', 'pcomp', 'xcomp', 'rc'}:
        return 'P'
    return 'Pd' if token.pos_ == 'VERB' else 'P'
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _predicate_post_type_and_subtype(edge: Hyperedge, subparts: list[str], args_string: str) -> str:
    """Post-process a predicate's type code.

    Currently a pass-through that keeps the existing type (the first
    subpart); `edge` and `args_string` are reserved for future use
    (e.g. imperative detection).
    """
    head, *_ = subparts
    return head
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _is_verb(token: Token) -> bool:
    """True iff spaCy tagged the token with the coarse POS 'VERB'."""
    return 'VERB' == token.pos_
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def _poss2text(edge: Hyperedge, parse: dict[str, Any]) -> str:
    """Render a possessive edge as "<owner>'s <owned>".

    Owners already ending in 's' get a bare apostrophe ("James' book").
    """
    part1: str = _edge2text(edge[1], parse).strip()
    part2: str = _edge2text(edge[2], parse)
    # Bug fix: part1[-1] raised IndexError when the owner text was empty;
    # str.endswith is safe for the empty string (and picks "'s" then).
    poss: str = "'" if part1.endswith('s') else "'s"
    return f'{part1}{poss} {part2}'
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _generate_tok_pos(atom2word: dict[Atom, tuple[str, int]], edge: Hyperedge) -> str:
    """Render `edge` as a parenthesized tree of token indices ('-1' when an
    atom has no word mapping)."""
    if not edge.atom:
        # non-atomic: recurse over sub-edges and join with spaces
        inner = ' '.join(_generate_tok_pos(atom2word, subedge) for subedge in edge)
        return '({})'.format(inner)
    uatom: Atom | None = cast(Atom, unique(edge))
    if uatom is not None and uatom in atom2word:
        return str(atom2word[uatom][1])
    return '-1'
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
class AlphaBetaParser(Parser):
|
|
157
|
+
def __init__(self, lang: str, beta: str = 'repair', normalize: bool = True,
|
|
158
|
+
post_process: bool = True, debug: bool = False) -> None:
|
|
159
|
+
super().__init__()
|
|
160
|
+
|
|
161
|
+
self.lang: str = lang
|
|
162
|
+
|
|
163
|
+
models: list[str] = get_spacy_models(lang)
|
|
164
|
+
|
|
165
|
+
if len(models) == 0:
|
|
166
|
+
raise RuntimeError(f"Language code '{lang}' is not recognized / language is nor supported.")
|
|
167
|
+
|
|
168
|
+
self.nlp: Language | None = None
|
|
169
|
+
for model in models:
|
|
170
|
+
if spacy.util.is_package(model):
|
|
171
|
+
self.nlp = spacy.load(model)
|
|
172
|
+
print('Using language model: {}'.format(model))
|
|
173
|
+
break
|
|
174
|
+
if self.nlp is None:
|
|
175
|
+
models_list: str = ", ".join(models)
|
|
176
|
+
raise RuntimeError(f"Language '{lang}' requires one of the following language models to be installed:\n"
|
|
177
|
+
f"{models_list}.")
|
|
178
|
+
|
|
179
|
+
self.alpha: Alpha = Alpha(use_atomizer=True)
|
|
180
|
+
|
|
181
|
+
if beta == 'strict':
|
|
182
|
+
self.rules: list[Rule] = strict_rules
|
|
183
|
+
elif beta == 'repair':
|
|
184
|
+
self.rules = repair_rules
|
|
185
|
+
else:
|
|
186
|
+
raise RuntimeError('unkown beta stage: {}'.format(beta))
|
|
187
|
+
self.normalize: bool = normalize
|
|
188
|
+
self.post_process: bool = post_process
|
|
189
|
+
self.debug: bool = debug
|
|
190
|
+
|
|
191
|
+
self.atom2token: dict[Atom, Token] = {}
|
|
192
|
+
self.temp_atoms: set[Atom] = set()
|
|
193
|
+
self.orig_atom: dict[Atom, UniqueAtom] = {}
|
|
194
|
+
self.token2atom: dict[Token, Atom] = {}
|
|
195
|
+
self.depths: dict[Atom, int] = {}
|
|
196
|
+
self.connections: set[tuple[Atom, Atom]] = set()
|
|
197
|
+
self.edge2text: dict[Hyperedge, str] = {}
|
|
198
|
+
self.edge2toks: dict[Hyperedge, tuple[Token, ...]] = {}
|
|
199
|
+
self.toks2edge: dict[tuple[Token, ...], Hyperedge] = {}
|
|
200
|
+
self.cur_text: str | None = None
|
|
201
|
+
self.doc: Doc | None = None
|
|
202
|
+
self.beta: str = beta
|
|
203
|
+
|
|
204
|
+
def debug_msg(self, msg: str) -> None:
|
|
205
|
+
if self.debug:
|
|
206
|
+
print(msg)
|
|
207
|
+
|
|
208
|
+
def parse_sentence(self, sentence: str) -> list[dict[str, object]]:
|
|
209
|
+
# This runs spacy own sentensizer anyway...
|
|
210
|
+
|
|
211
|
+
sentence = re.sub(r'\s+', ' ', sentence).strip()
|
|
212
|
+
|
|
213
|
+
if self.nlp:
|
|
214
|
+
self.reset(sentence)
|
|
215
|
+
parses: list[dict[str, object]] = []
|
|
216
|
+
try:
|
|
217
|
+
self.doc = self.nlp(sentence)
|
|
218
|
+
offset: int = 0
|
|
219
|
+
for sent in self.doc.sents:
|
|
220
|
+
parse: dict[str, object] | None = self.parse_spacy_sentence(sent, offset=offset)
|
|
221
|
+
if parse:
|
|
222
|
+
parses.append(parse)
|
|
223
|
+
offset += len(sent)
|
|
224
|
+
except RuntimeError as error:
|
|
225
|
+
print(error)
|
|
226
|
+
return parses
|
|
227
|
+
else:
|
|
228
|
+
raise RuntimeError("spaCy model failed to initialize.")
|
|
229
|
+
|
|
230
|
+
    def parse_spacy_sentence(self, sent: Span, atom_sequence: list[Atom] | None = None,
                             offset: int = 0) -> dict[str, object] | None:
        """Parse one spaCy sentence into a hyperedge record.

        Pipeline: build atoms -> parse atom sequence -> apply argument roles
        -> optional repair / normalize / post-process -> map atoms to words.
        Returns a dict with 'edge', 'failed', 'text', 'tokens' and 'tok_pos',
        or None when any stage fails. `offset` shifts token indices for
        multi-sentence documents.
        """
        try:
            if atom_sequence is None:
                atom_sequence = self._build_atom_sequence(sent)

            self._compute_depths_and_connections(sent.root)

            edge: Hyperedge | None = None
            result: list[Hyperedge] | None
            failed: bool
            result, failed = self._parse_atom_sequence(atom_sequence)
            # only a single remaining edge counts as a complete parse
            if result and len(result) == 1:
                edge = non_unique(result[0])

            atom2word: dict[Atom, tuple[str, int]] = {}
            if edge:
                # stage order matters: roles first, then repair/normalize
                edge = self._apply_arg_roles(edge)
                if self.beta == 'repair':
                    edge = self._repair(edge)
                if self.normalize:
                    edge = self._normalize(edge)
                if self.post_process:
                    edge = self._post_process(edge)
                if edge is not None:
                    atom2word = self._generate_atom2word(edge, offset=offset)

            if edge is None:
                return None

            return {
                'edge': edge,
                'failed': failed,
                'text': str(sent).strip(),
                'tokens': [str(token) for token in sent],
                'tok_pos': _generate_tok_pos(atom2word, edge)
            }
        except Exception as e:
            # a failing sentence must not abort the whole document parse
            print('Caught exception: {} while parsing: "{}"'.format(str(e), str(sent)))
            traceback.print_exc()
            return None
|
|
271
|
+
|
|
272
|
+
def manual_atom_sequence(self, sentence: Span, token2atom: dict[Token, Atom]) -> list[Atom]:
|
|
273
|
+
self.token2atom = {}
|
|
274
|
+
|
|
275
|
+
atomseq: list[Atom] = []
|
|
276
|
+
for token in sentence:
|
|
277
|
+
if token in token2atom:
|
|
278
|
+
atom: Atom | None = token2atom[token]
|
|
279
|
+
else:
|
|
280
|
+
atom = None
|
|
281
|
+
if atom:
|
|
282
|
+
uatom: Atom = UniqueAtom(atom)
|
|
283
|
+
self.dep_[uatom] = token
|
|
284
|
+
self.token2atom[token] = uatom
|
|
285
|
+
self.orig_atom[uatom] = uatom
|
|
286
|
+
atomseq.append(uatom)
|
|
287
|
+
return atomseq
|
|
288
|
+
|
|
289
|
+
    def reset(self, text: str) -> None:
        """Clear per-document parsing state before parsing `text`."""
        # atom -> originating spaCy token
        self.dep_: dict[Atom, Token] = {}
        self.temp_atoms = set()
        self.orig_atom = {}
        self.edge2toks = {}
        self.toks2edge = {}
        # coreference bookkeeping — NOTE(review): populated elsewhere,
        # not visible in this module; confirm against coref resolution code.
        self.edge2coref: dict[Hyperedge, object] = {}
        self.resolved_corefs: set[object] = set()
        self.cur_text = text
|
|
298
|
+
|
|
299
|
+
def _builder_arg_roles(self, edge: Hyperedge) -> str:
|
|
300
|
+
depth1: int = self._dep_depth(edge[1])
|
|
301
|
+
depth2: int = self._dep_depth(edge[2])
|
|
302
|
+
if depth1 < depth2:
|
|
303
|
+
return 'ma'
|
|
304
|
+
elif depth1 > depth2:
|
|
305
|
+
return 'am'
|
|
306
|
+
else:
|
|
307
|
+
return 'mm'
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
def _relation_arg_role(self, edge: Hyperedge) -> str:
|
|
311
|
+
head_token: Token | None = self._head_token(edge)
|
|
312
|
+
if not head_token:
|
|
313
|
+
return '?'
|
|
314
|
+
dep: str = head_token.dep_
|
|
315
|
+
|
|
316
|
+
# subject
|
|
317
|
+
if dep in {'nsubj', 'sb'}:
|
|
318
|
+
return 's'
|
|
319
|
+
# passive subject
|
|
320
|
+
elif dep in {'nsubjpass', 'nsubj:pass'}:
|
|
321
|
+
return 'p'
|
|
322
|
+
# agent
|
|
323
|
+
elif dep == 'agent':
|
|
324
|
+
return 'a'
|
|
325
|
+
# object
|
|
326
|
+
elif dep in {'obj', 'dobj', 'pobj', 'prt', 'oprd', 'acomp', 'attr', 'ROOT', 'oa', 'pd'}:
|
|
327
|
+
return 'o'
|
|
328
|
+
# indirect object
|
|
329
|
+
elif dep in {'iobj', 'dative', 'obl:arg', 'da'}:
|
|
330
|
+
return 'i'
|
|
331
|
+
# specifier
|
|
332
|
+
elif dep in {'advcl', 'prep', 'npadvmod', 'advmod', 'mo', 'mnr'}:
|
|
333
|
+
return 'x'
|
|
334
|
+
# parataxis
|
|
335
|
+
elif dep in {'parataxis', 'par'}:
|
|
336
|
+
return 't'
|
|
337
|
+
# interjection
|
|
338
|
+
elif dep in {'intj', 'ng', 'dm'}:
|
|
339
|
+
return 'j'
|
|
340
|
+
# clausal complement
|
|
341
|
+
elif dep in {'xcomp', 'ccomp', 'oc'}:
|
|
342
|
+
return 'r'
|
|
343
|
+
else:
|
|
344
|
+
return '?'
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
def _adjust_score(self, edges: list[Hyperedge]) -> int:
|
|
348
|
+
min_depth: int = 9999999
|
|
349
|
+
appos: bool = False
|
|
350
|
+
min_appos_depth: int = 9999999
|
|
351
|
+
|
|
352
|
+
if all([edge.mtype() == 'C' for edge in edges]):
|
|
353
|
+
for edge in edges:
|
|
354
|
+
token: Token | None = self._head_token(edge)
|
|
355
|
+
if token is None:
|
|
356
|
+
continue
|
|
357
|
+
depth: int = self.depths[self.token2atom[token]]
|
|
358
|
+
if depth < min_depth:
|
|
359
|
+
min_depth = depth
|
|
360
|
+
if token.dep_ == 'appos':
|
|
361
|
+
appos = True
|
|
362
|
+
if depth < min_appos_depth:
|
|
363
|
+
min_appos_depth = depth
|
|
364
|
+
|
|
365
|
+
if appos and min_appos_depth > min_depth:
|
|
366
|
+
return -99
|
|
367
|
+
else:
|
|
368
|
+
return 0
|
|
369
|
+
|
|
370
|
+
def _head_token(self, edge: Hyperedge) -> Token | None:
|
|
371
|
+
atoms: list[Atom] = [
|
|
372
|
+
cast(Atom, uatom)
|
|
373
|
+
for atom in edge.all_atoms()
|
|
374
|
+
if (uatom := unique(atom)) is not None and uatom in self.atom2token
|
|
375
|
+
]
|
|
376
|
+
min_depth: int = 9999999
|
|
377
|
+
main_atom: Atom | None = None
|
|
378
|
+
for atom in atoms:
|
|
379
|
+
if atom in self.orig_atom:
|
|
380
|
+
oatom: Atom = self.orig_atom[atom]
|
|
381
|
+
if oatom in self.depths:
|
|
382
|
+
depth: int = self.depths[oatom]
|
|
383
|
+
if depth < min_depth:
|
|
384
|
+
min_depth = depth
|
|
385
|
+
main_atom = atom
|
|
386
|
+
if main_atom:
|
|
387
|
+
return self.atom2token[main_atom]
|
|
388
|
+
else:
|
|
389
|
+
return None
|
|
390
|
+
|
|
391
|
+
def _dep_depth(self, edge: Hyperedge) -> int:
|
|
392
|
+
atoms: list[Atom] = [
|
|
393
|
+
cast(Atom, uatom)
|
|
394
|
+
for atom in edge.all_atoms()
|
|
395
|
+
if (uatom := unique(atom)) is not None and uatom in self.atom2token
|
|
396
|
+
]
|
|
397
|
+
mdepth: int = 99999999
|
|
398
|
+
for atom in atoms:
|
|
399
|
+
if atom in self.orig_atom:
|
|
400
|
+
oatom: Atom = self.orig_atom[atom]
|
|
401
|
+
if oatom in self.depths:
|
|
402
|
+
depth: int = self.depths[oatom]
|
|
403
|
+
if depth < mdepth:
|
|
404
|
+
mdepth = depth
|
|
405
|
+
return mdepth
|
|
406
|
+
|
|
407
|
+
def _build_atom(self, token: Token, ent_type: str, last_token: Token | None) -> Atom:
|
|
408
|
+
text: str = token.text.lower()
|
|
409
|
+
et: str = ent_type
|
|
410
|
+
|
|
411
|
+
if ent_type[0] == 'P':
|
|
412
|
+
atom: Atom = self._build_atom_predicate(token, ent_type, last_token)
|
|
413
|
+
elif ent_type[0] == 'T':
|
|
414
|
+
atom = self._build_atom_trigger(token, ent_type)
|
|
415
|
+
elif ent_type[0] == 'M':
|
|
416
|
+
atom = self._build_atom_modifier(token)
|
|
417
|
+
else:
|
|
418
|
+
atom = build_atom(text, et, self.lang)
|
|
419
|
+
return atom
|
|
420
|
+
|
|
421
|
+
def _build_atom_predicate(self, token: Token, ent_type: str, last_token: Token | None) -> Atom:
|
|
422
|
+
text: str = token.text.lower()
|
|
423
|
+
et: str = ent_type
|
|
424
|
+
|
|
425
|
+
# first naive assignment of predicate subtype
|
|
426
|
+
# (can be revised at post-processing stage)
|
|
427
|
+
if ent_type == 'Pd':
|
|
428
|
+
# interrogative cases
|
|
429
|
+
if (last_token and
|
|
430
|
+
last_token.tag_ == '.' and
|
|
431
|
+
last_token.dep_ == 'punct' and
|
|
432
|
+
last_token.lemma_.strip() == '?'):
|
|
433
|
+
ent_type = 'P?'
|
|
434
|
+
# declarative (by default)
|
|
435
|
+
else:
|
|
436
|
+
ent_type = 'Pd'
|
|
437
|
+
|
|
438
|
+
return build_atom(text, ent_type, self.lang)
|
|
439
|
+
|
|
440
|
+
def _build_atom_trigger(self, token: Token, ent_type: str) -> Atom:
|
|
441
|
+
text: str = token.text.lower()
|
|
442
|
+
et: str = 'Tv' if _is_verb(token) else ent_type
|
|
443
|
+
return build_atom(text, et, self.lang)
|
|
444
|
+
|
|
445
|
+
def _build_atom_modifier(self, token: Token) -> Atom:
|
|
446
|
+
text: str = token.text.lower()
|
|
447
|
+
et: str = 'Mv' if _is_verb(token) else _modifier_type_and_subtype(token)
|
|
448
|
+
return build_atom(text, et, self.lang)
|
|
449
|
+
|
|
450
|
+
    def _repair(self, edge: Hyperedge) -> Hyperedge:
        """Bottom-up repair pass.

        For binary '+/B.' builder edges, lifts a two-element conjunction
        ('J' connector) out of either argument so the conjunction becomes
        the edge's connector. NOTE(review): assumes `cmt` is the connector
        main-type — confirm against the hyperbase Hyperedge API.
        """
        if edge.not_atom:
            # repair children first, then this edge
            new_edge: Hyperedge | None = hedge([self._repair(subedge) for subedge in edge])
            if new_edge is None:
                return edge
            edge = new_edge

        if len(edge) == 3 and str(edge[0])[:4] == '+/B.':
            if len(edge[1]) == 2 and edge[1].cmt == 'J':
                return cast(Hyperedge, hedge([edge[1][0], edge[1][1], edge[2]]))
            elif len(edge[2]) == 2 and edge[2].cmt == 'J':
                return cast(Hyperedge, hedge([edge[2][0], edge[1], edge[2][1]]))

        return edge
|
|
464
|
+
|
|
465
|
+
    def _normalize(self, edge: Hyperedge) -> Hyperedge:
        """Bottom-up normalization pass.

        When a modifier ('M' connector) is applied to a non-atomic edge
        whose own connector is a predicate ('P'), trigger ('T') or
        conjunction ('J'), the modifier is moved onto that inner connector.
        """
        if edge.not_atom:
            # normalize children first, then this edge
            new_edge: Hyperedge | None = hedge([self._normalize(subedge) for subedge in edge])
            if new_edge is None:
                return edge
            edge = new_edge

        # Move modifier to internal connector if it is applied to
        # relations, specifiers or conjunctions
        if edge.cmt == 'M' and not edge[1].atom:
            innner_conn: str | None = edge[1].cmt
            if innner_conn in {'P', 'T', 'J'}:
                return cast(Hyperedge, hedge(((edge[0], edge[1][0]),) + edge[1][1:]))

        return edge
|
|
480
|
+
|
|
481
|
+
def _update_atom(self, old: Atom, new: Atom) -> None:
|
|
482
|
+
uold: Atom = UniqueAtom(old)
|
|
483
|
+
unew: Atom = UniqueAtom(new)
|
|
484
|
+
if uold in self.atom2token:
|
|
485
|
+
self.atom2token[unew] = self.atom2token[uold]
|
|
486
|
+
self.temp_atoms.add(uold)
|
|
487
|
+
self.orig_atom[unew] = uold
|
|
488
|
+
|
|
489
|
+
def _replace_atom(self, edge: Hyperedge, old: Atom, new: Atom) -> Hyperedge:
|
|
490
|
+
self._update_atom(old, new)
|
|
491
|
+
return edge.replace_atom(old, new)
|
|
492
|
+
|
|
493
|
+
def _insert_edge_with_argrole(self, edge: Hyperedge, arg: Hyperedge, argrole: str, pos: int) -> Hyperedge:
|
|
494
|
+
new_edge: Hyperedge = edge.insert_edge_with_argrole(arg, argrole, pos)
|
|
495
|
+
old_pred: Atom = edge[0].inner_atom()
|
|
496
|
+
new_pred: Atom = new_edge[0].inner_atom()
|
|
497
|
+
self._update_atom(old_pred, new_pred)
|
|
498
|
+
return new_edge
|
|
499
|
+
|
|
500
|
+
def _replace_argroles(self, edge: Hyperedge, argroles: str) -> Hyperedge:
|
|
501
|
+
new_edge: Hyperedge = edge.replace_argroles(argroles)
|
|
502
|
+
old_pred: Atom = edge[0].inner_atom()
|
|
503
|
+
new_pred: Atom = new_edge[0].inner_atom()
|
|
504
|
+
self._update_atom(old_pred, new_pred)
|
|
505
|
+
return new_edge
|
|
506
|
+
|
|
507
|
+
    def _apply_arg_roles(self, edge: Hyperedge) -> Hyperedge:
        """Recursively annotate connectors with argument-role strings.

        Predicate connectors get one role character per argument (via
        `_relation_arg_role`); builder connectors get 'm'/'a' roles (via
        `_builder_arg_roles`). Replaced connector atoms keep their token
        mapping through `atom2token`/`orig_atom` bookkeeping.
        """
        if edge.atom:
            return edge

        new_entity: Hyperedge = edge

        # Extend predicate connectors with argument types
        if edge.connector_mtype() == 'P':
            pred: Atom | None = edge.atom_with_type('P')
            assert pred is not None
            subparts: list[str] = pred.parts()[1].split('.')
            args: list[str] = [self._relation_arg_role(param) for param in edge[1:]]
            args_string: str = ''.join(args)
            # TODO: this is done to detect imperative, to refactor
            pt: str = _predicate_post_type_and_subtype(edge, subparts, args_string)
            if len(subparts) > 2:
                # keep the third subpart (e.g. verb features) intact
                new_part: str = '{}.{}.{}'.format(pt, args_string, subparts[2])
            else:
                new_part = '{}.{}'.format(pt, args_string)
            new_pred: Atom = pred.replace_atom_part(1, new_part)
            # rebind the token mapping onto the rewritten predicate atom
            unew_pred: Atom = UniqueAtom(new_pred)
            upred: Atom = UniqueAtom(pred)
            self.atom2token[unew_pred] = self.atom2token[upred]
            self.temp_atoms.add(upred)
            self.orig_atom[unew_pred] = upred
            new_entity = edge.replace_atom(pred, new_pred, unique=True)

        # Extend builder connectors with argument types
        elif edge.connector_mtype() == 'B':
            builder: Atom | None = edge.atom_with_type('B')
            assert builder is not None
            subparts = builder.parts()[1].split('.')
            arg_roles: str = self._builder_arg_roles(edge)
            if len(arg_roles) > 0:
                if len(subparts) > 1:
                    subparts[1] = arg_roles
                else:
                    subparts.append(arg_roles)
                new_part = '.'.join(subparts)
                new_builder: Atom = builder.replace_atom_part(1, new_part)
                ubuilder: Atom = UniqueAtom(builder)
                unew_builder: Atom = UniqueAtom(new_builder)
                # builders may have no token mapping (e.g. implicit '+/B')
                if ubuilder in self.atom2token:
                    self.atom2token[unew_builder] = self.atom2token[ubuilder]
                    self.temp_atoms.add(ubuilder)
                    self.orig_atom[unew_builder] = ubuilder
                new_entity = edge.replace_atom(builder, new_builder, unique=True)

        # recurse into the arguments
        new_args: list[Hyperedge] = [self._apply_arg_roles(subentity) for subentity in new_entity[1:]]
        new_entity = cast(Hyperedge, hedge([new_entity[0]] + new_args))

        return new_entity
|
|
559
|
+
|
|
560
|
+
def _generate_atom2word(self, edge: Hyperedge, offset: int = 0) -> dict[Atom, tuple[str, int]]:
    """Map each atom of *edge* that has a known source token to (text, index).

    The index is the token's position in the document minus *offset*.
    Atoms without an entry in self.atom2token are skipped.
    """
    mapping: dict[Atom, tuple[str, int]] = {}
    for current in edge.all_atoms():
        key: Atom = UniqueAtom(current)
        if key in self.atom2token:
            tok: Token = self.atom2token[key]
            mapping[key] = (tok.text, tok.i - offset)
    return mapping
|
|
570
|
+
|
|
571
|
+
def _parse_token(self, token: Token, atom_type: str) -> Atom | None:
    """Build an atom for *token*, refining the coarse *atom_type*.

    Type 'X' means the token produces no atom. For C/M/B/P types the
    full type-and-subtype string is derived from the token itself.
    """
    if atom_type == 'X':
        return None

    refiners = {
        'C': _concept_type_and_subtype,
        'M': _modifier_type_and_subtype,
        'B': _builder_type_and_subtype,
        'P': _predicate_type_and_subtype,
    }
    refine = refiners.get(atom_type)
    if refine is not None:
        atom_type = refine(token)

    # the last syntactic child is useful to determine the predicate subtype
    children: list[Token] = list(token.lefts) + list(token.rights)
    tail: Token | None = children[-1] if children else None

    atom: Atom = self._build_atom(token, atom_type, tail)
    self.debug_msg('ATOM: {}'.format(atom))

    return atom
|
|
591
|
+
|
|
592
|
+
def _build_atom_sequence(self, sentence: Span) -> list[Atom]:
    """Classify every token of *sentence* and return the resulting atoms.

    Features (tag, dep, head pos, head dep, next token pos) are fed to the
    alpha classifier; each classified token is turned into a unique atom and
    registered in atom2token / token2atom / orig_atom.
    """
    n: int = len(sentence)
    features: list[tuple[str, str, str, str, str]] = []
    for idx, tok in enumerate(sentence):
        parent: Token = tok.head
        next_pos: str = sentence[idx + 1].pos_ if idx + 1 < n else ''
        features.append((
            tok.tag_,
            tok.dep_,
            parent.pos_ if parent else '',
            parent.dep_ if parent else '',
            next_pos,
        ))

    assert self.alpha is not None, "Alpha classifier must be initialized before parsing"
    atom_types: tuple[str, ...] | list[str] = self.alpha.predict(sentence, features)

    self.token2atom = {}

    atomseq: list[Atom] = []
    for tok, atype in zip(sentence, atom_types):
        parsed: Atom | None = self._parse_token(tok, atype)
        if parsed:
            unique_atom: Atom = UniqueAtom(parsed)
            self.atom2token[unique_atom] = tok
            self.token2atom[tok] = unique_atom
            self.orig_atom[unique_atom] = unique_atom
            atomseq.append(unique_atom)
    return atomseq
|
|
621
|
+
|
|
622
|
+
def _compute_depths_and_connections(self, root: Token, depth: int = 0) -> None:
    """Walk the dependency tree under *root*, recording atom depths and
    parent/child atom adjacency.

    On the top-level call (depth == 0) both self.depths and
    self.connections are reset. Connections are stored symmetrically,
    i.e. both (parent, child) and (child, parent) are added.
    """
    if depth == 0:
        self.depths = {}
        self.connections = set()

    if root in self.token2atom:
        parent_atom: Atom | None = self.token2atom[root]
        self.depths[parent_atom] = depth
    else:
        parent_atom = None

    for child in root.children:
        if parent_atom and child in self.token2atom:
            child_atom: Atom = self.token2atom[child]
            # symmetric adjacency so lookups work in either direction
            self.connections.add((parent_atom, child_atom))
            self.connections.add((child_atom, parent_atom))
        self._compute_depths_and_connections(child, depth + 1)
|
|
639
|
+
|
|
640
|
+
def _is_pair_connected(self, atoms1: list[Atom], atoms2: list[Atom]) -> bool:
    """True if any cross pair of (original) atoms appears in self.connections."""
    omap = self.orig_atom
    return any(
        (omap[a1], omap[a2]) in self.connections
        for a1 in atoms1
        for a2 in atoms2
        if a1 in omap and a2 in omap
    )
|
|
648
|
+
|
|
649
|
+
def _are_connected(self, atom_sets: list[list[Atom]], connector_pos: int) -> bool:
    """True if the atom set at *connector_pos* is pair-connected to every
    other atom set in *atom_sets*."""
    connector_atoms = atom_sets[connector_pos]
    return all(
        self._is_pair_connected(connector_atoms, other)
        for idx, other in enumerate(atom_sets)
        if idx != connector_pos
    )
|
|
657
|
+
|
|
658
|
+
def _score(self, edges: list[Hyperedge]) -> int:
    """Heuristic score for combining *edges* into one hyperedge.

    Dominated by syntactic connectivity (+10,000,000 when some edge is
    connected to all others), then by 100x the minimum dependency depth
    among the involved atoms, plus a rule-specific adjustment.
    """
    atom_sets: list[list[Atom]] = [e.all_atoms() for e in edges]

    conn: bool = any(self._are_connected(atom_sets, p) for p in range(len(edges)))

    # minimum dependency-tree depth over all atoms with a known depth
    mdepth: int = 99999999
    for atoms in atom_sets:
        for atom in atoms:
            if atom in self.orig_atom:
                orig: Atom = self.orig_atom[atom]
                if orig in self.depths:
                    mdepth = min(mdepth, self.depths[orig])

    return (10000000 if conn else 0) + (mdepth * 100) + self._adjust_score(edges)
|
|
678
|
+
|
|
679
|
+
def _parse_atom_sequence(self, atom_sequence: list[Atom]) -> tuple[list[Hyperedge] | None, bool]:
    """Greedily reduce an atom sequence into hyperedges using self.rules.

    On each pass, every rule is tried at every window position; the
    highest-scoring applicable reduction is applied (earlier rules win
    ties via the rule_number penalty). When no rule applies, a best-effort
    ':/J/.' fallback joins the first two elements.

    Returns (sequence, failed): failed is True only when nothing could be
    parsed at all (empty sequence with no applicable rule).
    """
    sequence: list[Hyperedge] = list(atom_sequence)
    while True:
        action: tuple[Rule, int, Hyperedge, int, int] | None = None
        best_score: int = -999999999
        for rule_number, rule in enumerate(self.rules):
            window_start: int = rule.size - 1
            for pos in range(window_start, len(sequence)):
                new_edge: Hyperedge | None = apply_rule(rule, sequence, pos)
                if new_edge:
                    score: int = self._score(sequence[pos - window_start:pos + 1])
                    # earlier rules win score ties
                    score -= rule_number
                    if score > best_score:
                        action = (rule, score, new_edge, window_start, pos)
                        best_score = score

        # parse failed, make best effort to return something
        if action is None:
            # if all else fails...
            if len(sequence) > 0:
                fallback: Hyperedge | None = hedge([':/J/.'] + sequence[:2])
                new_sequence: list[Hyperedge] = ([fallback] if fallback else []) + sequence[2:]
                self.debug_msg('fallback new_sequence: {}'.format(new_sequence))
            else:
                return None, True
        else:
            rule, action_score, new_edge, window_start, pos = action
            new_sequence = (sequence[:pos - window_start] + [new_edge] + sequence[pos + 1:])

            # BUG FIX: debug output previously referenced the loop variables
            # `rule`/`score`/`new_edge` after the if/else, which were either
            # unbound (NameError on the first fallback iteration) or stale
            # (last-tried score instead of the chosen action's score). Log
            # the unpacked action values inside this branch instead.
            self.debug_msg('rule: {}'.format(rule))
            self.debug_msg('score: {}'.format(action_score))
            self.debug_msg('new_edge: {}'.format(new_edge))
            self.debug_msg('new_sequence: {}'.format(new_sequence))

        sequence = new_sequence
        if len(sequence) < 2:
            return sequence, False
|
|
715
|
+
|
|
716
|
+
def sentensize(self, text: str) -> list[str]:
    """Split *text* into stripped sentence strings via the spaCy pipeline.

    Raises RuntimeError when the spaCy model is not available.
    """
    if not self.nlp:
        raise RuntimeError("spaCy model failed to initialize.")
    doc: Doc = self.nlp(text.strip())
    return [str(sent).strip() for sent in doc.sents]
|
|
722
|
+
|
|
723
|
+
def _edge2toks(self, edge: Hyperedge) -> None:
    """Recursively record the sorted token tuple covered by *edge*.

    Populates both self.edge2toks (edge -> tokens) and self.toks2edge
    (tokens -> edge), then recurses into subedges of non-atomic edges.
    """
    covered: list[Token] = []
    for atom in edge.all_atoms():
        uatom = unique(atom)
        if uatom is not None and uatom in self.atom2token:
            covered.append(self.atom2token[uatom])
    toks: tuple[Token, ...] = tuple(sorted(covered))
    self.edge2toks[edge] = toks
    self.toks2edge[toks] = edge
    if edge.not_atom:
        for subedge in edge:
            self._edge2toks(subedge)
|
|
733
|
+
|
|
734
|
+
# ===============
|
|
735
|
+
# Post-processing
|
|
736
|
+
# ===============
|
|
737
|
+
def _insert_arg_in_tail(self, edge: Hyperedge, arg: Hyperedge) -> Hyperedge:
    """Insert *arg* as an argument of the deepest predicate-led part of *edge*.

    If *edge* is predicate-led, a free argument role is picked (the letters
    are presumably SH argrole codes: p/a/s/o -- TODO confirm against the
    argrole notation) and *arg* is appended with it. Otherwise the method
    recurses into the last subedge; failing that, *arg* is attached with
    the 'x' role when *edge* is predicate-led and already has argroles.
    Returns *edge* unchanged when no insertion point is found.
    """
    if edge.atom:
        # atoms cannot take arguments
        return edge

    if edge.cmt == 'P':
        ars: str = edge.argroles()
        ar: str | None = None
        # pick the first role not already present, checked in this order
        if 'p' in ars:
            if 'a' not in ars:
                ar = 'a'
        elif 'a' in ars:
            ar = 'p'
        elif 's' not in ars:
            ar = 's'
        elif 'o' not in ars:
            ar = 'o'
        if ar:
            return self._insert_edge_with_argrole(edge, arg, ar, len(edge))

    # try inserting into the last subedge first; if it changed, rebuild
    new_tail: Hyperedge = self._insert_arg_in_tail(edge[-1], arg)
    if new_tail != edge[-1]:
        return cast(Hyperedge, hedge(list(edge[:-1]) + [new_tail]))
    if edge.cmt != 'P':
        return edge
    ars = edge.argroles()
    if ars == '':
        return edge
    # fall back: attach as an extra 'x' (specifier) argument
    return self._insert_edge_with_argrole(edge, arg, 'x', len(edge))
|
|
765
|
+
|
|
766
|
+
def _insert_spec_rightmost_relation(self, edge: Hyperedge, arg: Hyperedge) -> Hyperedge:
    """Insert *arg* with the 'x' role into the rightmost predicate-bearing
    relation found inside *edge*; return *edge* unchanged if none exists.

    Search order: descend into the last subedge when it contains a 'P'
    atom; else attach directly when *edge* itself is predicate-led; else
    scan the subedges right-to-left for one containing a predicate.
    """
    if edge.atom:
        return edge
    if 'P' in [atom.mt for atom in edge[-1].atoms()]:
        # the last subedge contains a predicate: recurse into it
        return cast(Hyperedge, hedge(list(edge[:-1]) + [self._insert_spec_rightmost_relation(edge[-1], arg)]))
    if edge[0].mt == 'P':
        # edge itself is predicate-led: attach here
        return self._insert_edge_with_argrole(edge, arg, 'x', len(edge))
    # otherwise look for the rightmost subedge containing a predicate
    for pos, subedge in reversed(list(enumerate(edge))):
        if 'P' in [atom.mt for atom in subedge.atoms()]:
            new_edge_list: list[Hyperedge] = list(edge)
            new_edge_list[pos] = self._insert_spec_rightmost_relation(subedge, arg)
            return cast(Hyperedge, hedge(new_edge_list))
    return edge
|
|
779
|
+
|
|
780
|
+
def _process_colon_conjunctions(self, edge: Hyperedge) -> Hyperedge:
    """Resolve ':/J/.' (colon) conjunctions joining a relation with another
    edge, applied bottom-up.

    When a colon conjunction contains a relation, its two members are
    merged according to their type combination (see the branch labels
    below; the labels are taken from the original code -- NOTE(review):
    the '# RR' label guards an R+S check, so it may be a leftover; verify).
    """
    if edge.atom:
        return edge
    # recurse first so inner conjunctions are resolved bottom-up
    new_edge: Hyperedge | None = hedge([self._process_colon_conjunctions(subedge) for subedge in edge])
    if new_edge is None:
        return edge
    edge = new_edge
    if str(edge[0]) == ':/J/.' and any(subedge.mt == 'R' for subedge in edge):
        if edge[1].mt == 'R':
            # RR
            if edge[2].mt == 'S':
                # second is specification
                return self._insert_edge_with_argrole(edge[1], edge[2], 'x', len(edge[1]))
            # RC
            elif edge[2].mt == 'C':
                return self._insert_arg_in_tail(edge[1], edge[2])
        # CR
        elif edge[1].mt == 'C':
            if edge[2].mt == 'R':
                if 's' not in edge[2].argroles():
                    # concept is subject
                    return self._insert_edge_with_argrole(edge[2], edge[1], 's', 0)
        # SR
        elif edge[1].mt == 'S':
            if edge[2].mt == 'R':
                # first is specification
                return self._insert_edge_with_argrole(edge[2], edge[1], 'x', len(edge[2]))
    return edge
|
|
808
|
+
|
|
809
|
+
def _fix_argroles(self, edge: Hyperedge) -> Hyperedge:
    """Resolve '?' argument roles bottom-up.

    A '?' role becomes 'r' when the corresponding argument is a relation
    and 'x' when it is a specifier; otherwise it is kept as-is.
    """
    if edge.atom:
        return edge
    rebuilt: Hyperedge | None = hedge([self._fix_argroles(sub) for sub in edge])
    if rebuilt is None:
        return edge
    edge = rebuilt
    roles: str = edge.argroles()
    if roles != '' and edge.mt == 'R':
        fixed: list[str] = []
        for role, sub in zip(roles, edge[1:]):
            resolved: str = role
            if role == '?':
                if sub.mt == 'R':
                    resolved = 'r'
                elif sub.mt == 'S':
                    resolved = 'x'
            fixed.append(resolved)
        return self._replace_argroles(edge, ''.join(fixed))
    return edge
|
|
829
|
+
|
|
830
|
+
def _post_process(self, edge: Hyperedge | None) -> Hyperedge | None:
    """Run the post-processing passes on *edge*; None passes through.

    Order matters: argroles are fixed first, then colon conjunctions
    are resolved.
    """
    if edge is None:
        return None
    result: Hyperedge = self._fix_argroles(edge)
    return self._process_colon_conjunctions(result)
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
from hyperbase.hyperedge import Hyperedge, hedge
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class Rule:
|
|
5
|
+
def __init__(self, first_type: str, arg_types: set[str], size: int, connector: str | None = None) -> None:
|
|
6
|
+
self.first_type: str = first_type
|
|
7
|
+
self.arg_types: set[str] = arg_types
|
|
8
|
+
self.size: int = size
|
|
9
|
+
self.connector: str | None = connector
|
|
10
|
+
self._branches: int = 0
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# Rule set for normal parsing, in priority order (the parser subtracts the
# rule index from the score, so earlier rules win ties). Predicate rules
# are listed from the largest window (6) down to the smallest (2).
strict_rules: list[Rule] = [
    Rule('C', {'C'}, 2, '+/B/.'),
    Rule('M', {'C', 'R', 'M', 'S', 'T', 'P', 'B', 'J'}, 2),
    Rule('B', {'C'}, 3),
    Rule('T', {'C', 'R'}, 2),
    Rule('P', {'C', 'R', 'S'}, 6),
    Rule('P', {'C', 'R', 'S'}, 5),
    Rule('P', {'C', 'R', 'S'}, 4),
    Rule('P', {'C', 'R', 'S'}, 3),
    Rule('P', {'C', 'R', 'S'}, 2),
    Rule('J', {'C', 'R', 'M', 'S', 'T', 'P', 'B', 'J'}, 3)]


# More permissive variant of strict_rules (builders also accept relations,
# and a two-element conjunction rule is added) -- presumably used to repair
# parses that fail under the strict set; confirm against the caller.
repair_rules: list[Rule] = [
    Rule('C', {'C'}, 2, '+/B/.'),
    Rule('M', {'C', 'R', 'M', 'S', 'T', 'P', 'B', 'J'}, 2),
    Rule('B', {'C', 'R'}, 3),
    Rule('T', {'C', 'R'}, 2),
    Rule('P', {'C', 'R', 'S'}, 6),
    Rule('P', {'C', 'R', 'S'}, 5),
    Rule('P', {'C', 'R', 'S'}, 4),
    Rule('P', {'C', 'R', 'S'}, 3),
    Rule('P', {'C', 'R', 'S'}, 2),
    Rule('J', {'C', 'R', 'M', 'S', 'T', 'P', 'B', 'J'}, 3),
    Rule('J', {'C', 'R', 'M', 'S', 'T', 'P', 'B', 'J'}, 2)]
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def apply_rule(rule: Rule, sentence: list[Hyperedge], pos: int) -> Hyperedge | None:
    """Try to apply *rule* to the window of rule.size edges ending at *pos*.

    Every position in the window is tried as the pivot, which must match
    rule.first_type; all remaining edges must match rule.arg_types. For
    the first valid pivot, the resulting edge is returned: connector rules
    keep the pivot among the arguments (window order preserved) headed by
    the connector, otherwise the pivot heads the remaining edges. Returns
    None when no pivot position yields a valid match.
    """
    base: int = pos - rule.size + 1
    window: list[Hyperedge] = [sentence[base + i] for i in range(rule.size)]
    for pivot_pos, pivot in enumerate(window):
        if pivot.mtype() != rule.first_type:
            continue
        rest: list[Hyperedge] = window[:pivot_pos] + window[pivot_pos + 1:]
        if not all(e.mtype() in rule.arg_types for e in rest):
            continue
        if rule.connector:
            return hedge([rule.connector] + window)
        return hedge([pivot] + rest)
    return None
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: hyperbase-parser-ab
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Semantic Hypergraph AlphaBeta Parser
|
|
5
|
+
Project-URL: Homepage, https://hyperquest.ai/hyperbase
|
|
6
|
+
Author-email: "Telmo Menezes et al." <telmo@telmomenezes.net>
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Keywords: AI,Knowledge Representation,NLP,Natural Language Understanding,Parsing,Semantic Hypergraphs
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Environment :: Console
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
17
|
+
Requires-Python: >=3.10
|
|
18
|
+
Requires-Dist: hyperbase>=0.8.0
|
|
19
|
+
Requires-Dist: pip
|
|
20
|
+
Requires-Dist: scikit-learn>=1.3.0
|
|
21
|
+
Requires-Dist: spacy>=3.8.0
|
|
22
|
+
Requires-Dist: torch>=2.0.0
|
|
23
|
+
Requires-Dist: transformers>=4.46.0
|
|
24
|
+
Provides-Extra: dev
|
|
25
|
+
Requires-Dist: coverage>=7.4.3; extra == 'dev'
|
|
26
|
+
Requires-Dist: datasets>=4.0.0; extra == 'dev'
|
|
27
|
+
Requires-Dist: mypy>=1.8.0; extra == 'dev'
|
|
28
|
+
Requires-Dist: pre-commit>=3.6.2; extra == 'dev'
|
|
29
|
+
Requires-Dist: pytest>=9.0.0; extra == 'dev'
|
|
30
|
+
Requires-Dist: ruff>=0.2.2; extra == 'dev'
|
|
31
|
+
Requires-Dist: types-passlib>=1.7.7.20240106; extra == 'dev'
|
|
32
|
+
Description-Content-Type: text/markdown
|
|
33
|
+
|
|
34
|
+
# Hyperbase Alpha-Beta Parser
|
|
35
|
+
|
|
36
|
+
## A semantic hypergraph parser for natural language
|
|
37
|
+
|
|
38
|
+
The Alpha-Beta parser is a [Hyperbase](https://hyperquest.ai/hyperbase) plugin that converts natural language text into *Semantic Hypergraphs (SH)*. It works in two stages:
|
|
39
|
+
|
|
40
|
+
- **Alpha stage**: A multilingual neural token classifier (based on DistilBERT) assigns one of 39 semantic atom types to each token in a sentence -- for example, concepts, predicates, modifiers, builders, triggers and conjunctions.
|
|
41
|
+
- **Beta stage**: A rule-based engine combines classified atoms into ordered, recursive hyperedges using syntactic and semantic composition rules, producing structured representations that can be manipulated with Hyperbase.
|
|
42
|
+
|
|
43
|
+
## Supported languages
|
|
44
|
+
|
|
45
|
+
The parser supports any language with a [spaCy](https://spacy.io) model available, including English, French, German, Italian, Portuguese and Spanish.
|
|
46
|
+
|
|
47
|
+
While the parser is theoretically language-agnostic and could in principle support languages such as Mandarin, which differ substantially in morphological and syntactic structure, the authors' linguistic competence is limited to Germanic and Romance languages. We welcome the help of native speakers or domain experts in validating/improving support for other language families.
|
|
48
|
+
|
|
49
|
+
## Installation and manual
|
|
50
|
+
|
|
51
|
+
Installation instructions, the manual and more information can be found here: <https://hyperquest.ai/hyperbase>
|
|
52
|
+
|
|
53
|
+
## Contributing
|
|
54
|
+
|
|
55
|
+
Pull requests are welcome. For major changes, please open an issue first
|
|
56
|
+
to discuss what you would like to change.
|
|
57
|
+
|
|
58
|
+
Please make sure to update tests as appropriate.
|
|
59
|
+
|
|
60
|
+
## License
|
|
61
|
+
|
|
62
|
+
[MIT](https://choosealicense.com/licenses/mit/)
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
hyperbase_parser_ab/__init__.py,sha256=Cf3ZbUHvRSSfXOGAkfY6OMqy7QwZtKiQi-rdGqrPRv4,86
|
|
2
|
+
hyperbase_parser_ab/alpha.py,sha256=4zfa31GI-OeTKzvKnibu3lsU9NEzQm-BB41muRa2bt4,2831
|
|
3
|
+
hyperbase_parser_ab/atomizer.py,sha256=ewHhTWRZGP7ouheFLp-CnOFItnNLu4v3kwPOGgeyPo0,5527
|
|
4
|
+
hyperbase_parser_ab/lang_models.py,sha256=qTAcdZeM_NXTnl5RYzdDq4SH4dZ_oj2mFBUI7fE8PUI,1219
|
|
5
|
+
hyperbase_parser_ab/parser.py,sha256=g5-zxUZKYbOQE0bhojTHsQ9g78MuzCV1rr-4q_WqjFA,31528
|
|
6
|
+
hyperbase_parser_ab/rules.py,sha256=tr8UGTN_BlinjhtmAjm9Gevv-RVGENPFJbnIn9TKqPU,2271
|
|
7
|
+
hyperbase_parser_ab/sentensizer.py,sha256=tO4Qf5X8JmthiYDX_ObBXdvGqzgXC3eGsKieqOEmk2k,209
|
|
8
|
+
hyperbase_parser_ab-0.1.0.dist-info/METADATA,sha256=dLJ0Icaj0b9cIkbzzMP1jHzUuxfqstlt0RSMVx_j6LU,2990
|
|
9
|
+
hyperbase_parser_ab-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
10
|
+
hyperbase_parser_ab-0.1.0.dist-info/entry_points.txt,sha256=rqN89onaLcCtRnbJPFoXC1RCFxRBj2wGo321TDSnqKU,68
|
|
11
|
+
hyperbase_parser_ab-0.1.0.dist-info/licenses/LICENSE,sha256=6p_7YBrzvSBO3phQeZm2sRg0JmEKiyBJwzZVwseDaGE,1118
|
|
12
|
+
hyperbase_parser_ab-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
|
|
3
|
+
Copyright (C) 2026 CNRS - Centre national de la recherche scientifique
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
|
13
|
+
all copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
21
|
+
THE SOFTWARE.
|