hyperbase-parser-ab 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hyperbase_parser_ab-0.1.0/.github/workflows/publish.yml +29 -0
- hyperbase_parser_ab-0.1.0/.gitignore +69 -0
- hyperbase_parser_ab-0.1.0/CHANGELOG.md +12 -0
- hyperbase_parser_ab-0.1.0/LICENSE +21 -0
- hyperbase_parser_ab-0.1.0/PKG-INFO +62 -0
- hyperbase_parser_ab-0.1.0/README.md +29 -0
- hyperbase_parser_ab-0.1.0/VERSION +1 -0
- hyperbase_parser_ab-0.1.0/pyproject.toml +72 -0
- hyperbase_parser_ab-0.1.0/scripts/generate_alpha_training_data.py +107 -0
- hyperbase_parser_ab-0.1.0/scripts/train_atomizer.py +159 -0
- hyperbase_parser_ab-0.1.0/src/hyperbase_parser_ab/__init__.py +3 -0
- hyperbase_parser_ab-0.1.0/src/hyperbase_parser_ab/alpha.py +69 -0
- hyperbase_parser_ab-0.1.0/src/hyperbase_parser_ab/atomizer.py +142 -0
- hyperbase_parser_ab-0.1.0/src/hyperbase_parser_ab/lang_models.py +50 -0
- hyperbase_parser_ab-0.1.0/src/hyperbase_parser_ab/parser.py +835 -0
- hyperbase_parser_ab-0.1.0/src/hyperbase_parser_ab/rules.py +67 -0
- hyperbase_parser_ab-0.1.0/src/hyperbase_parser_ab/sentensizer.py +9 -0
- hyperbase_parser_ab-0.1.0/tests/__init__.py +0 -0
- hyperbase_parser_ab-0.1.0/tests/test_parser.py +243 -0
- hyperbase_parser_ab-0.1.0/tests/test_parser_helpers.py +250 -0
- hyperbase_parser_ab-0.1.0/tests/test_rules.py +113 -0
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
publish:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
environment: pypi
|
|
11
|
+
permissions:
|
|
12
|
+
id-token: write
|
|
13
|
+
steps:
|
|
14
|
+
- name: Checkout
|
|
15
|
+
uses: actions/checkout@v4
|
|
16
|
+
|
|
17
|
+
- name: Set up Python
|
|
18
|
+
uses: actions/setup-python@v5
|
|
19
|
+
with:
|
|
20
|
+
python-version: '3.12'
|
|
21
|
+
|
|
22
|
+
- name: Install build dependencies
|
|
23
|
+
run: pip install build
|
|
24
|
+
|
|
25
|
+
- name: Build package
|
|
26
|
+
run: python -m build
|
|
27
|
+
|
|
28
|
+
- name: Publish to PyPI
|
|
29
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# Compiled source #
|
|
2
|
+
###################
|
|
3
|
+
*.com
|
|
4
|
+
*.class
|
|
5
|
+
*.dll
|
|
6
|
+
*.exe
|
|
7
|
+
*.o
|
|
8
|
+
*.so
|
|
9
|
+
build/
|
|
10
|
+
|
|
11
|
+
# Packages #
|
|
12
|
+
############
|
|
13
|
+
# it's better to unpack these files and commit the raw source
|
|
14
|
+
# git has its own built in compression methods
|
|
15
|
+
*.7z
|
|
16
|
+
*.dmg
|
|
17
|
+
*.gz
|
|
18
|
+
*.iso
|
|
19
|
+
*.jar
|
|
20
|
+
*.rar
|
|
21
|
+
*.tar
|
|
22
|
+
*.zip
|
|
23
|
+
|
|
24
|
+
# Logs and databases #
|
|
25
|
+
######################
|
|
26
|
+
*.log
|
|
27
|
+
*.sql
|
|
28
|
+
*.sqlite
|
|
29
|
+
|
|
30
|
+
# OS generated files #
|
|
31
|
+
######################
|
|
32
|
+
.DS_Store*
|
|
33
|
+
ehthumbs.db
|
|
34
|
+
Icon?
|
|
35
|
+
Thumbs.db
|
|
36
|
+
|
|
37
|
+
# IDE stuff #
|
|
38
|
+
#############
|
|
39
|
+
.idea
|
|
40
|
+
.vscode
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# Python version/venv #
|
|
44
|
+
#######################
|
|
45
|
+
.python-version
|
|
46
|
+
|
|
47
|
+
# Python bytecode #
|
|
48
|
+
###################
|
|
49
|
+
__pycache__
|
|
50
|
+
*.pyc
|
|
51
|
+
*.pyo
|
|
52
|
+
|
|
53
|
+
# Python package stuff #
|
|
54
|
+
########################
|
|
55
|
+
*.egg-info
|
|
56
|
+
|
|
57
|
+
# Jupyter notebooks #
|
|
58
|
+
#####################
|
|
59
|
+
.ipynb_checkpoints
|
|
60
|
+
|
|
61
|
+
# db files #
|
|
62
|
+
#####################
|
|
63
|
+
*.db
|
|
64
|
+
|
|
65
|
+
/venv
|
|
66
|
+
/dist
|
|
67
|
+
/site
|
|
68
|
+
|
|
69
|
+
/models
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## [0.1.0] - 02-04-2026 - extracted from graphbrain
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
|
|
7
|
+
- Atomizer, a multilingual classifier for atom types.
|
|
8
|
+
- Can now parse all languages supported by spaCy.
|
|
9
|
+
|
|
10
|
+
### Changed
|
|
11
|
+
|
|
12
|
+
- Original alpha-beta parser from Graphbrain was extracted to create this plugin.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
|
|
3
|
+
Copyright (C) 2026 CNRS - Centre national de la recherche scientifique
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
|
13
|
+
all copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
21
|
+
THE SOFTWARE.
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: hyperbase-parser-ab
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Semantic Hypergraph AlphaBeta Parser
|
|
5
|
+
Project-URL: Homepage, https://hyperquest.ai/hyperbase
|
|
6
|
+
Author-email: "Telmo Menezes et al." <telmo@telmomenezes.net>
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Keywords: AI,Knowledge Representation,NLP,Natural Language Understanding,Parsing,Semantic Hypergraphs
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Environment :: Console
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
17
|
+
Requires-Python: >=3.10
|
|
18
|
+
Requires-Dist: hyperbase>=0.8.0
|
|
19
|
+
Requires-Dist: pip
|
|
20
|
+
Requires-Dist: scikit-learn>=1.3.0
|
|
21
|
+
Requires-Dist: spacy>=3.8.0
|
|
22
|
+
Requires-Dist: torch>=2.0.0
|
|
23
|
+
Requires-Dist: transformers>=4.46.0
|
|
24
|
+
Provides-Extra: dev
|
|
25
|
+
Requires-Dist: coverage>=7.4.3; extra == 'dev'
|
|
26
|
+
Requires-Dist: datasets>=4.0.0; extra == 'dev'
|
|
27
|
+
Requires-Dist: mypy>=1.8.0; extra == 'dev'
|
|
28
|
+
Requires-Dist: pre-commit>=3.6.2; extra == 'dev'
|
|
29
|
+
Requires-Dist: pytest>=9.0.0; extra == 'dev'
|
|
30
|
+
Requires-Dist: ruff>=0.2.2; extra == 'dev'
|
|
31
|
+
Requires-Dist: types-passlib>=1.7.7.20240106; extra == 'dev'
|
|
32
|
+
Description-Content-Type: text/markdown
|
|
33
|
+
|
|
34
|
+
# Hyperbase Alpha-Beta Parser
|
|
35
|
+
|
|
36
|
+
## A semantic hypergraph parser for natural language
|
|
37
|
+
|
|
38
|
+
The Alpha-Beta parser is a [Hyperbase](https://hyperquest.ai/hyperbase) plugin that converts natural language text into *Semantic Hypergraphs (SH)*. It works in two stages:
|
|
39
|
+
|
|
40
|
+
- **Alpha stage**: A multilingual neural token classifier (based on DistilBERT) assigns one of 39 semantic atom types to each token in a sentence -- for example, concepts, predicates, modifiers, builders, triggers and conjunctions.
|
|
41
|
+
- **Beta stage**: A rule-based engine combines classified atoms into ordered, recursive hyperedges using syntactic and semantic composition rules, producing structured representations that can be manipulated with Hyperbase.
|
|
42
|
+
|
|
43
|
+
## Supported languages
|
|
44
|
+
|
|
45
|
+
The parser supports any language with a [spaCy](https://spacy.io) model available, including English, French, German, Italian, Portuguese and Spanish.
|
|
46
|
+
|
|
47
|
+
While the parser is theoretically language-agnostic and could in principle support languages such as Mandarin, which differ substantially in morphological and syntactic structure, the authors' linguistic competence is limited to Germanic and Romance languages. We welcome the help of native speakers or domain experts in validating/improving support for other language families.
|
|
48
|
+
|
|
49
|
+
## Installation and manual
|
|
50
|
+
|
|
51
|
+
Installation instructions, the manual and more information can be found here: <https://hyperquest.ai/hyperbase>
|
|
52
|
+
|
|
53
|
+
## Contributing
|
|
54
|
+
|
|
55
|
+
Pull requests are welcome. For major changes, please open an issue first
|
|
56
|
+
to discuss what you would like to change.
|
|
57
|
+
|
|
58
|
+
Please make sure to update tests as appropriate.
|
|
59
|
+
|
|
60
|
+
## License
|
|
61
|
+
|
|
62
|
+
[MIT](https://choosealicense.com/licenses/mit/)
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# Hyperbase Alpha-Beta Parser
|
|
2
|
+
|
|
3
|
+
## A semantic hypergraph parser for natural language
|
|
4
|
+
|
|
5
|
+
The Alpha-Beta parser is a [Hyperbase](https://hyperquest.ai/hyperbase) plugin that converts natural language text into *Semantic Hypergraphs (SH)*. It works in two stages:
|
|
6
|
+
|
|
7
|
+
- **Alpha stage**: A multilingual neural token classifier (based on DistilBERT) assigns one of 39 semantic atom types to each token in a sentence -- for example, concepts, predicates, modifiers, builders, triggers and conjunctions.
|
|
8
|
+
- **Beta stage**: A rule-based engine combines classified atoms into ordered, recursive hyperedges using syntactic and semantic composition rules, producing structured representations that can be manipulated with Hyperbase.
|
|
9
|
+
|
|
10
|
+
## Supported languages
|
|
11
|
+
|
|
12
|
+
The parser supports any language with a [spaCy](https://spacy.io) model available, including English, French, German, Italian, Portuguese and Spanish.
|
|
13
|
+
|
|
14
|
+
While the parser is theoretically language-agnostic and could in principle support languages such as Mandarin, which differ substantially in morphological and syntactic structure, the authors' linguistic competence is limited to Germanic and Romance languages. We welcome the help of native speakers or domain experts in validating/improving support for other language families.
|
|
15
|
+
|
|
16
|
+
## Installation and manual
|
|
17
|
+
|
|
18
|
+
Installation instructions, the manual and more information can be found here: <https://hyperquest.ai/hyperbase>
|
|
19
|
+
|
|
20
|
+
## Contributing
|
|
21
|
+
|
|
22
|
+
Pull requests are welcome. For major changes, please open an issue first
|
|
23
|
+
to discuss what you would like to change.
|
|
24
|
+
|
|
25
|
+
Please make sure to update tests as appropriate.
|
|
26
|
+
|
|
27
|
+
## License
|
|
28
|
+
|
|
29
|
+
[MIT](https://choosealicense.com/licenses/mit/)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.1.0
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "hyperbase-parser-ab"
|
|
3
|
+
dynamic = ["version"]
|
|
4
|
+
description = "Semantic Hypergraph AlphaBeta Parser"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
license = "MIT"
|
|
7
|
+
requires-python = ">=3.10"
|
|
8
|
+
authors = [
|
|
9
|
+
{ name = "Telmo Menezes et al.", email = "telmo@telmomenezes.net" },
|
|
10
|
+
]
|
|
11
|
+
keywords = [
|
|
12
|
+
"NLP",
|
|
13
|
+
"AI",
|
|
14
|
+
"Knowledge Representation",
|
|
15
|
+
"Natural Language Understanding",
|
|
16
|
+
"Parsing",
|
|
17
|
+
"Semantic Hypergraphs",
|
|
18
|
+
]
|
|
19
|
+
classifiers = [
|
|
20
|
+
"Development Status :: 4 - Beta",
|
|
21
|
+
"Programming Language :: Python :: 3",
|
|
22
|
+
"Operating System :: OS Independent",
|
|
23
|
+
"Environment :: Console",
|
|
24
|
+
"Intended Audience :: Science/Research",
|
|
25
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
26
|
+
"Topic :: Scientific/Engineering :: Information Analysis",
|
|
27
|
+
]
|
|
28
|
+
dependencies = [
|
|
29
|
+
"hyperbase>=0.8.0",
|
|
30
|
+
"scikit-learn>=1.3.0",
|
|
31
|
+
"spacy>=3.8.0",
|
|
32
|
+
"torch>=2.0.0",
|
|
33
|
+
"transformers>=4.46.0",
|
|
34
|
+
"pip", # so that spaCy models can be easily installed with uv
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
[tool.uv.sources]
|
|
38
|
+
hyperbase = { workspace = true }
|
|
39
|
+
|
|
40
|
+
[project.optional-dependencies]
|
|
41
|
+
dev = [
|
|
42
|
+
"mypy>=1.8.0",
|
|
43
|
+
"ruff>=0.2.2",
|
|
44
|
+
"pre-commit>=3.6.2",
|
|
45
|
+
"types-passlib>=1.7.7.20240106",
|
|
46
|
+
"coverage>=7.4.3",
|
|
47
|
+
"datasets>=4.0.0",
|
|
48
|
+
"pytest>=9.0.0",
|
|
49
|
+
]
|
|
50
|
+
|
|
51
|
+
[project.urls]
|
|
52
|
+
Homepage = "https://hyperquest.ai/hyperbase"
|
|
53
|
+
|
|
54
|
+
[project.entry-points."hyperbase.parsers"]
|
|
55
|
+
alphabeta = "hyperbase_parser_ab:AlphaBetaParser"
|
|
56
|
+
|
|
57
|
+
[build-system]
|
|
58
|
+
requires = ["hatchling"]
|
|
59
|
+
build-backend = "hatchling.build"
|
|
60
|
+
|
|
61
|
+
[tool.hatch.version]
|
|
62
|
+
path = "VERSION"
|
|
63
|
+
pattern = "(?P<version>.+)"
|
|
64
|
+
|
|
65
|
+
[tool.hatch.build.targets.wheel]
|
|
66
|
+
packages = ["src/hyperbase_parser_ab"]
|
|
67
|
+
|
|
68
|
+
[tool.mypy]
|
|
69
|
+
strict = true
|
|
70
|
+
|
|
71
|
+
[tool.ruff]
|
|
72
|
+
target-version = "py310"
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
"""Generate alpha-stage training data.

Reads a JSONL file of annotated sentences (each with 'sentence', 'atoms',
'ignore', 'correct' and 'source' fields), re-parses every sentence with the
AlphaBetaParser, and writes one tab-separated feature row per atom/token
pair. Column order here must stay in sync with the TSV reader in
``Alpha.__init__`` (alpha.py), which consumes columns 0, 3, 4, 6, 8 and 19.
"""
import argparse
import json

from hyperbase import hedge
from hyperbase_parser_ab import AlphaBetaParser


if __name__ == '__main__':
    arg_parser = argparse.ArgumentParser(
        description='Generate alpha training data.')
    arg_parser.add_argument('infile', type=str, help='input jsonl file')
    arg_parser.add_argument('outfile', type=str, help='output tsv file')
    arg_parser.add_argument(
        '--lang', type=str, default='en', help='language (default: en)')
    args = arg_parser.parse_args()

    # Running counters reported at the end of the run.
    total_sentences = 0
    ignored_sentences = 0
    failed_parses = 0
    total_atoms = 0

    parser = AlphaBetaParser(lang=args.lang)

    with open(args.infile, 'r') as infile, open(args.outfile, 'w') as outfile:
        for line in infile.readlines():
            case = json.loads(line)
            sentence = case['sentence']
            atoms = case['atoms']
            parses = parser.parse_sentence(sentence)
            # parse_sentence appears to leave the spaCy Doc on parser.doc as
            # a side effect; only the first sentence of that Doc is used.
            spacy_sentence = list(parser.doc.sents)[0] if parser.doc else None
            if not spacy_sentence or not parses:
                failed_parses += 1
            elif case['ignore']:
                # Annotator marked this case as not usable for training.
                ignored_sentences += 1
            elif len(atoms) == len(spacy_sentence):
                # Only emit rows when annotation and tokenization align 1:1.
                total_sentences += 1
                total_atoms += len(atoms)

                for i in range(len(atoms)):
                    atom = atoms[i]
                    token = spacy_sentence[i]
                    atom_edge = hedge(atom)
                    if atom_edge is None:
                        # Unparseable atom annotation; skip the token.
                        continue

                    # Context features for the neighbouring tokens; empty
                    # strings / False at sentence boundaries.
                    word_before = ''
                    word_after = ''
                    pos_before = ''
                    pos_after = ''
                    tag_before = ''
                    tag_after = ''
                    dep_before = ''
                    dep_after = ''
                    punct_before = False
                    punct_after = False
                    if i > 0:
                        word_before = str(spacy_sentence[i - 1])
                        pos_before = spacy_sentence[i - 1].pos_
                        tag_before = spacy_sentence[i - 1].tag_
                        dep_before = spacy_sentence[i - 1].dep_
                        if spacy_sentence[i - 1].pos_ == 'PUNCT':
                            punct_before = True
                    if i < len(atoms) - 1:
                        word_after = str(spacy_sentence[i + 1])
                        pos_after = spacy_sentence[i + 1].pos_
                        tag_after = spacy_sentence[i + 1].tag_
                        dep_after = spacy_sentence[i + 1].dep_
                        if spacy_sentence[i + 1].pos_ == 'PUNCT':
                            punct_after = True

                    head = token.head
                    # NOTE(review): in spaCy, a root token's head is the
                    # token itself, never None — so is_root looks like it is
                    # always False here. Confirm whether the parser wraps
                    # tokens differently; `token.dep_ == 'ROOT'` may be the
                    # intended test.
                    is_root = head is None
                    has_lefts = token.n_lefts > 0
                    has_rights = token.n_rights > 0
                    # 26 tab-separated columns: label first, then token,
                    # head, boolean and context features, and provenance.
                    outfile.write(('{}' + '\t{}' * 25 + '\n').format(
                        atom_edge.mtype(),
                        str(token),
                        token.pos_,
                        token.tag_,
                        token.dep_,
                        str(head) if head else '',
                        head.pos_ if head else '',
                        head.tag_ if head else '',
                        head.dep_ if head else '',
                        is_root,
                        has_lefts,
                        has_rights,
                        token.ent_type_,
                        token.shape_[:2],
                        word_before,
                        word_after,
                        punct_before,
                        punct_after,
                        pos_before,
                        pos_after,
                        tag_before,
                        tag_after,
                        dep_before,
                        dep_after,
                        case['correct'],
                        case['source']))
            else:
                # Token/atom count mismatch: treat as a failed parse.
                failed_parses += 1
    print('sentences: {}; ignored: {}; failed: {}; atoms: {}'.format(
        total_sentences, ignored_sentences, failed_parses,
        total_atoms))
    print('done.')
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
import json
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
from numpy.typing import NDArray
|
|
5
|
+
from datasets import Dataset
|
|
6
|
+
from transformers import (
|
|
7
|
+
AutoTokenizer,
|
|
8
|
+
AutoModelForTokenClassification,
|
|
9
|
+
TrainingArguments,
|
|
10
|
+
Trainer
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def tokenize_and_align_labels(examples: dict[str, list]) -> dict[str, list]:
    """Tokenize each sample and align the original per-word labels
    to the new subword (tokenized) structure.

    Args:
        examples: batch dict with ``"tokens"`` (list of per-sentence word
            lists) and ``"labels"`` (list of per-word label-string lists).

    Returns:
        The tokenizer output dict extended with integer ``"labels"``
        aligned to the subword tokens; special tokens and padding get
        ``-100`` so the loss function ignores them.

    Relies on the module-level ``tokenizer`` and ``label_to_id``.
    """
    tokenized_outputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,  # inputs are pre-split into words
        return_offsets_mapping=True,
        padding="max_length",  # or "longest" / "do_not_pad"
        max_length=200  # adjust as needed
    )

    labels_aligned: list[list[int]] = []
    for i, labels in enumerate(examples["labels"]):
        # The tokenizer may split single words into multiple subwords.
        # We create a label list the same length as input_ids,
        # repeating the label for all subwords of the original token.
        word_ids: list[int | None] = tokenized_outputs.word_ids(batch_index=i)
        label_ids: list[int] = []

        for word_idx in word_ids:
            if word_idx is None:
                # This is a special token like [CLS], [SEP], or padding
                label_ids.append(-100)
            else:
                label_ids.append(label_to_id[labels[word_idx]])

        labels_aligned.append(label_ids)

    # We don't need offset_mapping during model training; blank it out so
    # the Dataset columns stay homogeneous.
    tokenized_outputs["offset_mapping"] = [None for _ in examples["tokens"]]

    tokenized_outputs["labels"] = labels_aligned
    return tokenized_outputs
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def compute_metrics(eval_pred: tuple[NDArray, NDArray]) -> dict[str, float]:
    """Compute token-level accuracy, ignoring ``-100`` label positions
    (special tokens and padding).

    Args:
        eval_pred: ``(logits, labels)`` as supplied by the HF ``Trainer``;
            ``logits`` has shape (batch, seq, num_labels), ``labels``
            has shape (batch, seq).

    Returns:
        ``{"accuracy": fraction of non-ignored tokens predicted correctly}``.
    """
    logits, labels = eval_pred
    predictions: NDArray = np.argmax(logits, axis=-1)

    # Score only real tokens. Accuracy is computed directly with NumPy:
    # the previous implementation called a metric obtained from
    # evaluate.load("accuracy"), but the `evaluate` module was never
    # imported anywhere in this file, so that path raised NameError.
    mask = labels != -100
    if not mask.any():
        # Degenerate batch with no scorable tokens.
        return {"accuracy": 0.0}
    accuracy = float((predictions[mask] == labels[mask]).mean())
    return {"accuracy": accuracy}
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
if __name__ == '__main__':
    # Local import: `evaluate` is only needed when the script actually
    # runs. The original module referenced `evaluate.load(...)` without
    # ever importing the package, which raised NameError at runtime.
    import evaluate

    # One JSON object per line, each with "words" (tokens) and "types"
    # (per-token atom-type label strings).
    with open("sentences.jsonl", "rt") as f:
        sentences: list[dict] = [json.loads(line) for line in f]

    dataset_dict: dict[str, list] = {
        "tokens": [sentence["words"] for sentence in sentences],
        "labels": [sentence["types"] for sentence in sentences]
    }

    full_dataset: Dataset = Dataset.from_dict(dataset_dict)

    # Collect the closed set of atom-type labels seen in the data.
    labels: set[str] = set()
    for sentence in sentences:
        labels |= set(sentence["types"])
    print(labels)
    # Sort before assigning ids so the label <-> id mapping is
    # deterministic across runs (plain set iteration order depends on
    # string hash randomization).
    label_to_id: dict[str, int] = {
        label: i for i, label in enumerate(sorted(labels))}
    id_to_label: dict[int, str] = {i: label for label, i in label_to_id.items()}

    dataset = full_dataset.train_test_split(test_size=0.25, seed=42)
    train_dataset = dataset["train"]
    test_dataset = dataset["test"]

    print("Num train samples:", len(train_dataset))
    print("Num test samples: ", len(test_dataset))

    model_checkpoint: str = "distilbert-base-multilingual-cased"
    tokenizer = AutoTokenizer.from_pretrained(
        model_checkpoint, use_fast=True, add_prefix_space=True)

    # Subword-tokenize and align the per-word labels (see
    # tokenize_and_align_labels).
    train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
    test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

    # Expose the datasets as PyTorch tensors for the Trainer.
    train_dataset.set_format("torch")
    test_dataset.set_format("torch")

    model = AutoModelForTokenClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(labels),
        id2label=id_to_label,
        label2id=label_to_id
    )

    accuracy_metric = evaluate.load("accuracy")

    training_args: TrainingArguments = TrainingArguments(
        # NOTE(review): directory name says "roberta" but the checkpoint is
        # multilingual DistilBERT; kept as-is to avoid changing on-disk paths.
        output_dir="./test-roberta-token-classifier",
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=5e-5,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
        report_to="none"  # Set to "tensorboard" if you want logs
    )

    trainer: Trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        processing_class=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()

    results: dict[str, float] = trainer.evaluate(test_dataset)  # type: ignore[arg-type]
    print("Test set results:", results)

    trainer.save_model("./token-classifier")
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from numpy.typing import NDArray
|
|
3
|
+
from scipy.sparse import spmatrix
|
|
4
|
+
from sklearn.ensemble import RandomForestClassifier
|
|
5
|
+
from sklearn.preprocessing import OneHotEncoder
|
|
6
|
+
from spacy.tokens import Span
|
|
7
|
+
|
|
8
|
+
from hyperbase_parser_ab.atomizer import Atomizer
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class Alpha(object):
|
|
12
|
+
def __init__(self, cases_str: str | None = None, use_atomizer: bool = False) -> None:
|
|
13
|
+
if use_atomizer:
|
|
14
|
+
self.atomizer: Atomizer | None = Atomizer()
|
|
15
|
+
elif cases_str:
|
|
16
|
+
self.atomizer = None
|
|
17
|
+
|
|
18
|
+
X: list[tuple[str, str, str, str, str]] = []
|
|
19
|
+
y: list[list[str]] = []
|
|
20
|
+
|
|
21
|
+
for line in cases_str.strip().split('\n'):
|
|
22
|
+
sline: str = line.strip()
|
|
23
|
+
if len(sline) > 0:
|
|
24
|
+
row: list[str] = sline.strip().split('\t')
|
|
25
|
+
true_value: str = row[0]
|
|
26
|
+
tag: str = row[3]
|
|
27
|
+
dep: str = row[4]
|
|
28
|
+
hpos: str = row[6]
|
|
29
|
+
hdep: str = row[8]
|
|
30
|
+
pos_after: str = row[19]
|
|
31
|
+
|
|
32
|
+
y.append([true_value])
|
|
33
|
+
X.append((tag, dep, hpos, hdep, pos_after))
|
|
34
|
+
|
|
35
|
+
if len(y) > 0:
|
|
36
|
+
self.empty: bool = False
|
|
37
|
+
|
|
38
|
+
self.encX: OneHotEncoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
|
|
39
|
+
self.encX.fit(np.array(X))
|
|
40
|
+
self.ency: OneHotEncoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
|
|
41
|
+
self.ency.fit(np.array(y))
|
|
42
|
+
|
|
43
|
+
X_: NDArray | spmatrix = self.encX.transform(np.array(X))
|
|
44
|
+
y_: NDArray | spmatrix = self.ency.transform(np.array(y))
|
|
45
|
+
|
|
46
|
+
self.clf: RandomForestClassifier = RandomForestClassifier(random_state=777)
|
|
47
|
+
self.clf.fit(X_, y_)
|
|
48
|
+
else:
|
|
49
|
+
self.empty = True
|
|
50
|
+
|
|
51
|
+
def predict(self, sentence: Span, features: list[tuple[str, str, str, str, str]]) -> tuple[str, ...] | list[str]:
|
|
52
|
+
if self.atomizer:
|
|
53
|
+
preds: list[tuple[str, str]] = self.atomizer.atomize(
|
|
54
|
+
sentence=str(sentence),
|
|
55
|
+
tokens=[str(token) for token in sentence])
|
|
56
|
+
atom_types: list[str] = [pred[1] for pred in preds]
|
|
57
|
+
|
|
58
|
+
# force known cases
|
|
59
|
+
for i in range(len(atom_types)):
|
|
60
|
+
if sentence[i].pos_ == 'VERB':
|
|
61
|
+
atom_types[i] = 'P'
|
|
62
|
+
return atom_types
|
|
63
|
+
else:
|
|
64
|
+
# an empty classifier always predicts 'C'
|
|
65
|
+
if self.empty:
|
|
66
|
+
return tuple('C' for _ in range(len(features)))
|
|
67
|
+
_features: NDArray | spmatrix = self.encX.transform(np.array(features))
|
|
68
|
+
preds_arr: NDArray | spmatrix = self.ency.inverse_transform(self.clf.predict(_features))
|
|
69
|
+
return tuple(pred[0] if pred else 'C' for pred in preds_arr)
|