hyperbase-parser-ab 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,29 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ jobs:
8
+ publish:
9
+ runs-on: ubuntu-latest
10
+ environment: pypi
11
+ permissions:
12
+ id-token: write
13
+ steps:
14
+ - name: Checkout
15
+ uses: actions/checkout@v4
16
+
17
+ - name: Set up Python
18
+ uses: actions/setup-python@v5
19
+ with:
20
+ python-version: '3.12'
21
+
22
+ - name: Install build dependencies
23
+ run: pip install build
24
+
25
+ - name: Build package
26
+ run: python -m build
27
+
28
+ - name: Publish to PyPI
29
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,69 @@
1
+ # Compiled source #
2
+ ###################
3
+ *.com
4
+ *.class
5
+ *.dll
6
+ *.exe
7
+ *.o
8
+ *.so
9
+ build/
10
+
11
+ # Packages #
12
+ ############
13
+ # it's better to unpack these files and commit the raw source
14
+ # git has its own built in compression methods
15
+ *.7z
16
+ *.dmg
17
+ *.gz
18
+ *.iso
19
+ *.jar
20
+ *.rar
21
+ *.tar
22
+ *.zip
23
+
24
+ # Logs and databases #
25
+ ######################
26
+ *.log
27
+ *.sql
28
+ *.sqlite
29
+
30
+ # OS generated files #
31
+ ######################
32
+ .DS_Store*
33
+ ehthumbs.db
34
+ Icon?
35
+ Thumbs.db
36
+
37
+ # IDE stuff #
38
+ #############
39
+ .idea
40
+ .vscode
41
+
42
+
43
+ # Python version/venv #
44
+ #######################
45
+ .python-version
46
+
47
+ # Python bytecode #
48
+ ###################
49
+ __pycache__
50
+ *.pyc
51
+ *.pyo
52
+
53
+ # Python package stuff #
54
+ ########################
55
+ *.egg-info
56
+
57
+ # Jupyter notebooks #
58
+ #####################
59
+ .ipynb_checkpoints
60
+
61
+ # db files #
62
+ #####################
63
+ *.db
64
+
65
+ /venv
66
+ /dist
67
+ /site
68
+
69
+ /models
@@ -0,0 +1,12 @@
1
+ # Changelog
2
+
3
+ ## [0.1.0] - 02-04-2026 - extracted from graphbrain
4
+
5
+ ### Added
6
+
7
+ - Atomizer, a multilingual classifier for atom types.
8
+ - Can now parse all languages supported by spaCy.
9
+
10
+ ### Changed
11
+
12
+ - Original alpha-beta parser from Graphbrain was extracted to create this plugin.
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (C) 2026 CNRS - Centre national de la recherche scientifique
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,62 @@
1
+ Metadata-Version: 2.4
2
+ Name: hyperbase-parser-ab
3
+ Version: 0.1.0
4
+ Summary: Semantic Hypergraph AlphaBeta Parser
5
+ Project-URL: Homepage, https://hyperquest.ai/hyperbase
6
+ Author-email: "Telmo Menezes et al." <telmo@telmomenezes.net>
7
+ License-Expression: MIT
8
+ License-File: LICENSE
9
+ Keywords: AI,Knowledge Representation,NLP,Natural Language Understanding,Parsing,Semantic Hypergraphs
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Environment :: Console
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
17
+ Requires-Python: >=3.10
18
+ Requires-Dist: hyperbase>=0.8.0
19
+ Requires-Dist: pip
20
+ Requires-Dist: scikit-learn>=1.3.0
21
+ Requires-Dist: spacy>=3.8.0
22
+ Requires-Dist: torch>=2.0.0
23
+ Requires-Dist: transformers>=4.46.0
24
+ Provides-Extra: dev
25
+ Requires-Dist: coverage>=7.4.3; extra == 'dev'
26
+ Requires-Dist: datasets>=4.0.0; extra == 'dev'
27
+ Requires-Dist: mypy>=1.8.0; extra == 'dev'
28
+ Requires-Dist: pre-commit>=3.6.2; extra == 'dev'
29
+ Requires-Dist: pytest>=9.0.0; extra == 'dev'
30
+ Requires-Dist: ruff>=0.2.2; extra == 'dev'
31
+ Requires-Dist: types-passlib>=1.7.7.20240106; extra == 'dev'
32
+ Description-Content-Type: text/markdown
33
+
34
+ # Hyperbase Alpha-Beta Parser
35
+
36
+ ## A semantic hypergraph parser for natural language
37
+
38
+ The Alpha-Beta parser is a [Hyperbase](https://hyperquest.ai/hyperbase) plugin that converts natural language text into *Semantic Hypergraphs (SH)*. It works in two stages:
39
+
40
+ - **Alpha stage**: A multilingual neural token classifier (based on DistilBERT) assigns one of 39 semantic atom types to each token in a sentence -- for example, concepts, predicates, modifiers, builders, triggers and conjunctions.
41
+ - **Beta stage**: A rule-based engine combines classified atoms into ordered, recursive hyperedges using syntactic and semantic composition rules, producing structured representations that can be manipulated with Hyperbase.
42
+
43
+ ## Supported languages
44
+
45
+ The parser supports any language with a [spaCy](https://spacy.io) model available, including English, French, German, Italian, Portuguese and Spanish.
46
+
47
+ While the parser is theoretically language-agnostic and could in principle support languages such as Mandarin, which differ substantially in morphological and syntactic structure, the authors' linguistic competence is limited to Germanic and Romance languages. We welcome the help of native speakers or domain experts in validating/improving support for other language families.
48
+
49
+ ## Installation and manual
50
+
51
+ Installation instructions, the manual and more information can be found here: <https://hyperquest.ai/hyperbase>
52
+
53
+ ## Contributing
54
+
55
+ Pull requests are welcome. For major changes, please open an issue first
56
+ to discuss what you would like to change.
57
+
58
+ Please make sure to update tests as appropriate.
59
+
60
+ ## License
61
+
62
+ [MIT](https://choosealicense.com/licenses/mit/)
@@ -0,0 +1,29 @@
1
+ # Hyperbase Alpha-Beta Parser
2
+
3
+ ## A semantic hypergraph parser for natural language
4
+
5
+ The Alpha-Beta parser is a [Hyperbase](https://hyperquest.ai/hyperbase) plugin that converts natural language text into *Semantic Hypergraphs (SH)*. It works in two stages:
6
+
7
+ - **Alpha stage**: A multilingual neural token classifier (based on DistilBERT) assigns one of 39 semantic atom types to each token in a sentence -- for example, concepts, predicates, modifiers, builders, triggers and conjunctions.
8
+ - **Beta stage**: A rule-based engine combines classified atoms into ordered, recursive hyperedges using syntactic and semantic composition rules, producing structured representations that can be manipulated with Hyperbase.
9
+
10
+ ## Supported languages
11
+
12
+ The parser supports any language with a [spaCy](https://spacy.io) model available, including English, French, German, Italian, Portuguese and Spanish.
13
+
14
+ While the parser is theoretically language-agnostic and could in principle support languages such as Mandarin, which differ substantially in morphological and syntactic structure, the authors' linguistic competence is limited to Germanic and Romance languages. We welcome the help of native speakers or domain experts in validating/improving support for other language families.
15
+
16
+ ## Installation and manual
17
+
18
+ Installation instructions, the manual and more information can be found here: <https://hyperquest.ai/hyperbase>
19
+
20
+ ## Contributing
21
+
22
+ Pull requests are welcome. For major changes, please open an issue first
23
+ to discuss what you would like to change.
24
+
25
+ Please make sure to update tests as appropriate.
26
+
27
+ ## License
28
+
29
+ [MIT](https://choosealicense.com/licenses/mit/)
@@ -0,0 +1 @@
1
+ 0.1.0
@@ -0,0 +1,72 @@
1
+ [project]
2
+ name = "hyperbase-parser-ab"
3
+ dynamic = ["version"]
4
+ description = "Semantic Hypergraph AlphaBeta Parser"
5
+ readme = "README.md"
6
+ license = "MIT"
7
+ requires-python = ">=3.10"
8
+ authors = [
9
+ { name = "Telmo Menezes et al.", email = "telmo@telmomenezes.net" },
10
+ ]
11
+ keywords = [
12
+ "NLP",
13
+ "AI",
14
+ "Knowledge Representation",
15
+ "Natural Language Understanding",
16
+ "Parsing",
17
+ "Semantic Hypergraphs",
18
+ ]
19
+ classifiers = [
20
+ "Development Status :: 4 - Beta",
21
+ "Programming Language :: Python :: 3",
22
+ "Operating System :: OS Independent",
23
+ "Environment :: Console",
24
+ "Intended Audience :: Science/Research",
25
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
26
+ "Topic :: Scientific/Engineering :: Information Analysis",
27
+ ]
28
+ dependencies = [
29
+ "hyperbase>=0.8.0",
30
+ "scikit-learn>=1.3.0",
31
+ "spacy>=3.8.0",
32
+ "torch>=2.0.0",
33
+ "transformers>=4.46.0",
34
+ "pip", # so that spaCy models can be easily installed with uv
35
+ ]
36
+
37
+ [tool.uv.sources]
38
+ hyperbase = { workspace = true }
39
+
40
+ [project.optional-dependencies]
41
+ dev = [
42
+ "mypy>=1.8.0",
43
+ "ruff>=0.2.2",
44
+ "pre-commit>=3.6.2",
45
+ "types-passlib>=1.7.7.20240106",
46
+ "coverage>=7.4.3",
47
+ "datasets>=4.0.0",
48
+ "pytest>=9.0.0",
49
+ ]
50
+
51
+ [project.urls]
52
+ Homepage = "https://hyperquest.ai/hyperbase"
53
+
54
+ [project.entry-points."hyperbase.parsers"]
55
+ alphabeta = "hyperbase_parser_ab:AlphaBetaParser"
56
+
57
+ [build-system]
58
+ requires = ["hatchling"]
59
+ build-backend = "hatchling.build"
60
+
61
+ [tool.hatch.version]
62
+ path = "VERSION"
63
+ pattern = "(?P<version>.+)"
64
+
65
+ [tool.hatch.build.targets.wheel]
66
+ packages = ["src/hyperbase_parser_ab"]
67
+
68
+ [tool.mypy]
69
+ strict = true
70
+
71
+ [tool.ruff]
72
+ target-version = "py310"
@@ -0,0 +1,107 @@
1
import argparse
import json

from hyperbase import hedge
from hyperbase_parser_ab import AlphaBetaParser


if __name__ == '__main__':
    # Read (sentence, atoms) cases from a JSONL file, re-parse each sentence
    # with the alpha-beta parser, and emit one TSV training row per aligned
    # (atom, spaCy token) pair.
    arg_parser = argparse.ArgumentParser(
        description='Generate alpha training data.')
    arg_parser.add_argument('infile', type=str, help='input jsonl file')
    arg_parser.add_argument('outfile', type=str, help='output tsv file')
    arg_parser.add_argument(
        '--lang', type=str, default='en', help='language (default: en)')
    args = arg_parser.parse_args()

    total_sentences = 0
    ignored_sentences = 0
    failed_parses = 0
    total_atoms = 0

    parser = AlphaBetaParser(lang=args.lang)

    with open(args.infile, 'r') as infile, open(args.outfile, 'w') as outfile:
        # Stream the input line by line instead of materializing it with
        # readlines().
        for line in infile:
            case = json.loads(line)
            sentence = case['sentence']
            atoms = case['atoms']
            parses = parser.parse_sentence(sentence)
            spacy_sentence = list(parser.doc.sents)[0] if parser.doc else None
            if not spacy_sentence or not parses:
                failed_parses += 1
            elif case['ignore']:
                ignored_sentences += 1
            elif len(atoms) == len(spacy_sentence):
                # Atoms and tokens align 1:1 (guaranteed by the length check).
                total_sentences += 1
                total_atoms += len(atoms)

                for i in range(len(atoms)):
                    atom = atoms[i]
                    token = spacy_sentence[i]
                    atom_edge = hedge(atom)
                    if atom_edge is None:
                        continue

                    # Context features for the neighboring tokens; empty
                    # strings / False at the sentence boundaries.
                    word_before = ''
                    word_after = ''
                    pos_before = ''
                    pos_after = ''
                    tag_before = ''
                    tag_after = ''
                    dep_before = ''
                    dep_after = ''
                    punct_before = False
                    punct_after = False
                    if i > 0:
                        prev_token = spacy_sentence[i - 1]
                        word_before = str(prev_token)
                        pos_before = prev_token.pos_
                        tag_before = prev_token.tag_
                        dep_before = prev_token.dep_
                        punct_before = prev_token.pos_ == 'PUNCT'
                    if i < len(atoms) - 1:
                        next_token = spacy_sentence[i + 1]
                        word_after = str(next_token)
                        pos_after = next_token.pos_
                        tag_after = next_token.tag_
                        dep_after = next_token.dep_
                        punct_after = next_token.pos_ == 'PUNCT'

                    head = token.head
                    # BUG FIX: in spaCy a root token's head is the token
                    # itself -- Token.head is never None -- so the original
                    # `head is None` test made is_root always False.
                    is_root = head == token
                    has_lefts = token.n_lefts > 0
                    has_rights = token.n_rights > 0
                    outfile.write(('{}' + '\t{}' * 25 + '\n').format(
                        atom_edge.mtype(),
                        str(token),
                        token.pos_,
                        token.tag_,
                        token.dep_,
                        str(head) if head else '',
                        head.pos_ if head else '',
                        head.tag_ if head else '',
                        head.dep_ if head else '',
                        is_root,
                        has_lefts,
                        has_rights,
                        token.ent_type_,
                        token.shape_[:2],
                        word_before,
                        word_after,
                        punct_before,
                        punct_after,
                        pos_before,
                        pos_after,
                        tag_before,
                        tag_after,
                        dep_before,
                        dep_after,
                        case['correct'],
                        case['source']))
            else:
                failed_parses += 1
    print('sentences: {}; ignored: {}; failed: {}; atoms: {}'.format(
        total_sentences, ignored_sentences, failed_parses,
        total_atoms))
    print('done.')
@@ -0,0 +1,159 @@
1
+ import json
2
+
3
+ import numpy as np
4
+ from numpy.typing import NDArray
5
+ from datasets import Dataset
6
+ from transformers import (
7
+ AutoTokenizer,
8
+ AutoModelForTokenClassification,
9
+ TrainingArguments,
10
+ Trainer
11
+ )
12
+
13
+
14
def tokenize_and_align_labels(examples: dict[str, list]) -> dict[str, list]:
    """Tokenize each sample and align the original token labels
    to the new subword (tokenized) structure.

    Relies on the module-level ``tokenizer`` and ``label_to_id`` globals.
    Special tokens ([CLS], [SEP], padding) get label -100 so the loss
    function ignores them; every subword of a word repeats that word's label.
    """
    tokenized_outputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,  # input is pre-tokenized word lists
        return_offsets_mapping=True,
        padding="max_length",  # or "longest" / "do_not_pad"
        max_length=200,  # adjust as needed
    )

    labels_aligned: list[list[int]] = []
    for i, labels in enumerate(examples["labels"]):
        # word_ids maps each subword position back to its source word index
        # (or None for special/padding tokens).
        word_ids: list[int | None] = tokenized_outputs.word_ids(batch_index=i)
        label_ids: list[int] = [
            -100 if word_idx is None else label_to_id[labels[word_idx]]
            for word_idx in word_ids
        ]
        labels_aligned.append(label_ids)

    # The offset mapping is not needed during training; blank it out so the
    # dataset keeps a uniform schema without carrying the offsets.
    tokenized_outputs["offset_mapping"] = [None for _ in examples["tokens"]]

    tokenized_outputs["labels"] = labels_aligned
    return tokenized_outputs
51
+
52
+
53
def compute_metrics(eval_pred: tuple[NDArray, NDArray]) -> dict[str, float]:
    """Token-level accuracy for the Trainer's evaluation loop.

    Positions labelled -100 (special tokens / padding) are excluded.
    Uses the module-level ``accuracy_metric`` global. Other metrics
    (F1, precision, recall) could be added the same way.
    """
    logits, labels = eval_pred
    predictions: NDArray = np.argmax(logits, axis=-1)

    # Collect prediction/label pairs, skipping the -100 sentinel positions.
    true_predictions: list[int] = []
    true_labels: list[int] = []
    for pred_row, label_row in zip(predictions, labels):
        for pred_id, label_id in zip(pred_row, label_row):
            if label_id == -100:
                continue
            true_predictions.append(pred_id)
            true_labels.append(label_id)

    results: dict[str, float] = accuracy_metric.compute(
        references=true_labels,
        predictions=true_predictions,
    )
    return {"accuracy": results["accuracy"]}
76
+
77
+
78
if __name__ == '__main__':
    # BUG FIX: `evaluate` (HuggingFace) is used below but was never imported,
    # so this script crashed with a NameError at the accuracy_metric line.
    # Imported locally so the module's functions stay importable without it.
    import evaluate

    # One JSON object per line, each with "words" and "types" lists.
    with open("sentences.jsonl", "rt") as f:
        sentences: list[dict] = [json.loads(line) for line in f]

    dataset_dict: dict[str, list] = {
        "tokens": [sentence["words"] for sentence in sentences],
        "labels": [sentence["types"] for sentence in sentences]
    }

    full_dataset: Dataset = Dataset.from_dict(dataset_dict)

    # Collect the label inventory. Sorting makes the label <-> id mapping
    # deterministic across runs (set iteration order is not stable).
    labels: set[str] = set()
    for sentence in sentences:
        labels |= set(sentence["types"])
    print(labels)
    label_to_id: dict[str, int] = {
        label: i for i, label in enumerate(sorted(labels))}
    id_to_label: dict[int, str] = {i: label for label, i in label_to_id.items()}

    dataset = full_dataset.train_test_split(test_size=0.25, seed=42)
    train_dataset = dataset["train"]
    test_dataset = dataset["test"]

    print("Num train samples:", len(train_dataset))
    print("Num test samples: ", len(test_dataset))

    model_checkpoint: str = "distilbert-base-multilingual-cased"
    tokenizer = AutoTokenizer.from_pretrained(
        model_checkpoint, use_fast=True, add_prefix_space=True)

    # Tokenize and align labels for both splits.
    train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
    test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

    # Set format for PyTorch.
    train_dataset.set_format("torch")
    test_dataset.set_format("torch")

    model = AutoModelForTokenClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(labels),
        id2label=id_to_label,
        label2id=label_to_id
    )

    accuracy_metric = evaluate.load("accuracy")

    training_args: TrainingArguments = TrainingArguments(
        # NOTE(review): directory name says "roberta" but the checkpoint is
        # DistilBERT -- kept as-is to avoid changing on-disk paths.
        output_dir="./test-roberta-token-classifier",
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=5e-5,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
        report_to="none"  # Set to "tensorboard" if you want logs
    )

    trainer: Trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        processing_class=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()

    results: dict[str, float] = trainer.evaluate(test_dataset)  # type: ignore[arg-type]
    print("Test set results:", results)

    trainer.save_model("./token-classifier")
@@ -0,0 +1,3 @@
1
"""Hyperbase alpha-beta parser plugin package.

Re-exports :class:`AlphaBetaParser`, the object registered under the
``hyperbase.parsers`` entry point in pyproject.toml.
"""

from hyperbase_parser_ab.parser import AlphaBetaParser

__all__ = ["AlphaBetaParser"]
@@ -0,0 +1,69 @@
1
+ import numpy as np
2
+ from numpy.typing import NDArray
3
+ from scipy.sparse import spmatrix
4
+ from sklearn.ensemble import RandomForestClassifier
5
+ from sklearn.preprocessing import OneHotEncoder
6
+ from spacy.tokens import Span
7
+
8
+ from hyperbase_parser_ab.atomizer import Atomizer
9
+
10
+
11
+ class Alpha(object):
12
+ def __init__(self, cases_str: str | None = None, use_atomizer: bool = False) -> None:
13
+ if use_atomizer:
14
+ self.atomizer: Atomizer | None = Atomizer()
15
+ elif cases_str:
16
+ self.atomizer = None
17
+
18
+ X: list[tuple[str, str, str, str, str]] = []
19
+ y: list[list[str]] = []
20
+
21
+ for line in cases_str.strip().split('\n'):
22
+ sline: str = line.strip()
23
+ if len(sline) > 0:
24
+ row: list[str] = sline.strip().split('\t')
25
+ true_value: str = row[0]
26
+ tag: str = row[3]
27
+ dep: str = row[4]
28
+ hpos: str = row[6]
29
+ hdep: str = row[8]
30
+ pos_after: str = row[19]
31
+
32
+ y.append([true_value])
33
+ X.append((tag, dep, hpos, hdep, pos_after))
34
+
35
+ if len(y) > 0:
36
+ self.empty: bool = False
37
+
38
+ self.encX: OneHotEncoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
39
+ self.encX.fit(np.array(X))
40
+ self.ency: OneHotEncoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
41
+ self.ency.fit(np.array(y))
42
+
43
+ X_: NDArray | spmatrix = self.encX.transform(np.array(X))
44
+ y_: NDArray | spmatrix = self.ency.transform(np.array(y))
45
+
46
+ self.clf: RandomForestClassifier = RandomForestClassifier(random_state=777)
47
+ self.clf.fit(X_, y_)
48
+ else:
49
+ self.empty = True
50
+
51
+ def predict(self, sentence: Span, features: list[tuple[str, str, str, str, str]]) -> tuple[str, ...] | list[str]:
52
+ if self.atomizer:
53
+ preds: list[tuple[str, str]] = self.atomizer.atomize(
54
+ sentence=str(sentence),
55
+ tokens=[str(token) for token in sentence])
56
+ atom_types: list[str] = [pred[1] for pred in preds]
57
+
58
+ # force known cases
59
+ for i in range(len(atom_types)):
60
+ if sentence[i].pos_ == 'VERB':
61
+ atom_types[i] = 'P'
62
+ return atom_types
63
+ else:
64
+ # an empty classifier always predicts 'C'
65
+ if self.empty:
66
+ return tuple('C' for _ in range(len(features)))
67
+ _features: NDArray | spmatrix = self.encX.transform(np.array(features))
68
+ preds_arr: NDArray | spmatrix = self.ency.inverse_transform(self.clf.predict(_features))
69
+ return tuple(pred[0] if pred else 'C' for pred in preds_arr)