gsapere 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138) hide show
  1. gsapere-0.2.0/.gitignore +185 -0
  2. gsapere-0.2.0/LICENSE +22 -0
  3. gsapere-0.2.0/PKG-INFO +381 -0
  4. gsapere-0.2.0/README.md +344 -0
  5. gsapere-0.2.0/documentation/README.md +19 -0
  6. gsapere-0.2.0/documentation/api/hgere.md +162 -0
  7. gsapere-0.2.0/documentation/api/pruner.md +139 -0
  8. gsapere-0.2.0/documentation/download-dataset.md +159 -0
  9. gsapere-0.2.0/documentation/multi_head_training.md +226 -0
  10. gsapere-0.2.0/documentation/ner_candidates_by_sent_len.png +0 -0
  11. gsapere-0.2.0/documentation/ner_candidates_by_sent_len_t0.1.png +0 -0
  12. gsapere-0.2.0/documentation/pruner.md +104 -0
  13. gsapere-0.2.0/documentation/refactoring/data_handling.md +320 -0
  14. gsapere-0.2.0/documentation/rulebased_pruner_prefiltering.md +124 -0
  15. gsapere-0.2.0/documentation/train_time_loss_weighting.md +132 -0
  16. gsapere-0.2.0/documentation/train_time_loss_weighting_default.png +0 -0
  17. gsapere-0.2.0/documentation/train_time_loss_weighting_steepness.png +0 -0
  18. gsapere-0.2.0/documentation/train_time_loss_weighting_turn.png +0 -0
  19. gsapere-0.2.0/l3s/README.md +32 -0
  20. gsapere-0.2.0/pyproject.toml +85 -0
  21. gsapere-0.2.0/src/gsapere/__init__.py +0 -0
  22. gsapere-0.2.0/src/gsapere/commands/__init__.py +0 -0
  23. gsapere-0.2.0/src/gsapere/commands/_cli_utils.py +101 -0
  24. gsapere-0.2.0/src/gsapere/commands/add_gold_annotations.py +83 -0
  25. gsapere-0.2.0/src/gsapere/commands/benchmark_pipeline.py +260 -0
  26. gsapere-0.2.0/src/gsapere/commands/download_dataset.py +432 -0
  27. gsapere-0.2.0/src/gsapere/commands/evaluate.py +0 -0
  28. gsapere-0.2.0/src/gsapere/commands/fit_rulebased_pruner.py +971 -0
  29. gsapere-0.2.0/src/gsapere/commands/infer_fixed_spans.py +309 -0
  30. gsapere-0.2.0/src/gsapere/commands/infer_hgere.py +0 -0
  31. gsapere-0.2.0/src/gsapere/commands/infer_pruner_augmented.py +277 -0
  32. gsapere-0.2.0/src/gsapere/commands/ner_length_distribution.py +131 -0
  33. gsapere-0.2.0/src/gsapere/commands/plot_ner_candidates_by_sent_len.py +330 -0
  34. gsapere-0.2.0/src/gsapere/commands/run_pipeline.py +331 -0
  35. gsapere-0.2.0/src/gsapere/commands/train_hgere.py +297 -0
  36. gsapere-0.2.0/src/gsapere/commands/train_pruner.py +91 -0
  37. gsapere-0.2.0/src/gsapere/config/__init__.py +34 -0
  38. gsapere-0.2.0/src/gsapere/config/cli_gen.py +384 -0
  39. gsapere-0.2.0/src/gsapere/data/__init__.py +0 -0
  40. gsapere-0.2.0/src/gsapere/data/base_dataset.py +327 -0
  41. gsapere-0.2.0/src/gsapere/data/collators.py +198 -0
  42. gsapere-0.2.0/src/gsapere/data/config.py +110 -0
  43. gsapere-0.2.0/src/gsapere/data/data_types.py +392 -0
  44. gsapere-0.2.0/src/gsapere/data/multi_dataset.py +176 -0
  45. gsapere-0.2.0/src/gsapere/data/pruner_dataset.py +430 -0
  46. gsapere-0.2.0/src/gsapere/data/relation_dataset.py +969 -0
  47. gsapere-0.2.0/src/gsapere/data/samplers.py +193 -0
  48. gsapere-0.2.0/src/gsapere/data/tokenizer_utils.py +198 -0
  49. gsapere-0.2.0/src/gsapere/data/vocabulary.py +98 -0
  50. gsapere-0.2.0/src/gsapere/documentation/__init__.py +0 -0
  51. gsapere-0.2.0/src/gsapere/documentation/generate_pruner_docs.py +488 -0
  52. gsapere-0.2.0/src/gsapere/evaluation/__init__.py +0 -0
  53. gsapere-0.2.0/src/gsapere/evaluation/hgere.py +575 -0
  54. gsapere-0.2.0/src/gsapere/evaluation/pruner.py +425 -0
  55. gsapere-0.2.0/src/gsapere/evaluation/threshold_analysis.py +826 -0
  56. gsapere-0.2.0/src/gsapere/hgere/__init__.py +0 -0
  57. gsapere-0.2.0/src/gsapere/hgere/config.py +599 -0
  58. gsapere-0.2.0/src/gsapere/hgere/evaluate.py +337 -0
  59. gsapere-0.2.0/src/gsapere/hgere/inference.py +605 -0
  60. gsapere-0.2.0/src/gsapere/hgere/train.py +647 -0
  61. gsapere-0.2.0/src/gsapere/hgere/train_setup.py +528 -0
  62. gsapere-0.2.0/src/gsapere/label_configs/ace.yaml +19 -0
  63. gsapere-0.2.0/src/gsapere/label_configs/ace04.yaml +17 -0
  64. gsapere-0.2.0/src/gsapere/label_configs/ace05.yaml +18 -0
  65. gsapere-0.2.0/src/gsapere/label_configs/default.yaml +24 -0
  66. gsapere-0.2.0/src/gsapere/label_configs/gsap.yaml +35 -0
  67. gsapere-0.2.0/src/gsapere/label_configs/gsap_abbr.yaml +36 -0
  68. gsapere-0.2.0/src/gsapere/label_configs/mler.yaml +13 -0
  69. gsapere-0.2.0/src/gsapere/label_configs/scier.yaml +17 -0
  70. gsapere-0.2.0/src/gsapere/label_configs/scier_abbr.yaml +18 -0
  71. gsapere-0.2.0/src/gsapere/label_configs/scierc.yaml +18 -0
  72. gsapere-0.2.0/src/gsapere/label_configs/scinlp.yaml +20 -0
  73. gsapere-0.2.0/src/gsapere/label_configs/scinlp_abbr.yaml +21 -0
  74. gsapere-0.2.0/src/gsapere/label_configs/somd.yaml +30 -0
  75. gsapere-0.2.0/src/gsapere/label_configs/unifiedsciere.yaml +17 -0
  76. gsapere-0.2.0/src/gsapere/labels.py +131 -0
  77. gsapere-0.2.0/src/gsapere/models/hgere.py +2380 -0
  78. gsapere-0.2.0/src/gsapere/models/model_ace_albert.py +545 -0
  79. gsapere-0.2.0/src/gsapere/models/model_ace_bert.py +444 -0
  80. gsapere-0.2.0/src/gsapere/models/modules_ace.py +384 -0
  81. gsapere-0.2.0/src/gsapere/models/span_classifier.py +556 -0
  82. gsapere-0.2.0/src/gsapere/pipeline/__init__.py +17 -0
  83. gsapere-0.2.0/src/gsapere/pipeline/config.py +178 -0
  84. gsapere-0.2.0/src/gsapere/pipeline/hgere_runner.py +471 -0
  85. gsapere-0.2.0/src/gsapere/pipeline/pipeline.py +146 -0
  86. gsapere-0.2.0/src/gsapere/pipeline/presets.py +65 -0
  87. gsapere-0.2.0/src/gsapere/pipeline/pruner_runner.py +336 -0
  88. gsapere-0.2.0/src/gsapere/pre_filter/__init__.py +12 -0
  89. gsapere-0.2.0/src/gsapere/pre_filter/config.py +44 -0
  90. gsapere-0.2.0/src/gsapere/pre_filter/filter.py +131 -0
  91. gsapere-0.2.0/src/gsapere/pre_filter/fit.py +101 -0
  92. gsapere-0.2.0/src/gsapere/pre_filter/statistics.py +242 -0
  93. gsapere-0.2.0/src/gsapere/pruner/__init__.py +0 -0
  94. gsapere-0.2.0/src/gsapere/pruner/config.py +358 -0
  95. gsapere-0.2.0/src/gsapere/pruner/evaluate.py +1057 -0
  96. gsapere-0.2.0/src/gsapere/pruner/train.py +745 -0
  97. gsapere-0.2.0/src/gsapere/resources/gsap_ere_vocabulary.json +1 -0
  98. gsapere-0.2.0/src/gsapere/utils.py +50 -0
  99. gsapere-0.2.0/tests/__init__.py +0 -0
  100. gsapere-0.2.0/tests/gsapere/__init__.py +0 -0
  101. gsapere-0.2.0/tests/gsapere/commands/__init__.py +0 -0
  102. gsapere-0.2.0/tests/gsapere/commands/test_download_dataset.py +54 -0
  103. gsapere-0.2.0/tests/gsapere/commands/test_train_hgere.py +409 -0
  104. gsapere-0.2.0/tests/gsapere/commands/test_train_pruner.py +213 -0
  105. gsapere-0.2.0/tests/gsapere/config/__init__.py +0 -0
  106. gsapere-0.2.0/tests/gsapere/config/test_cli_gen.py +273 -0
  107. gsapere-0.2.0/tests/gsapere/data/__init__.py +0 -0
  108. gsapere-0.2.0/tests/gsapere/data/conftest.py +305 -0
  109. gsapere-0.2.0/tests/gsapere/data/test_base_dataset.py +422 -0
  110. gsapere-0.2.0/tests/gsapere/data/test_data_types.py +410 -0
  111. gsapere-0.2.0/tests/gsapere/data/test_integration.py +151 -0
  112. gsapere-0.2.0/tests/gsapere/data/test_multi_dataset.py +255 -0
  113. gsapere-0.2.0/tests/gsapere/data/test_pruner_dataset.py +230 -0
  114. gsapere-0.2.0/tests/gsapere/data/test_relation_dataset.py +775 -0
  115. gsapere-0.2.0/tests/gsapere/data/test_samplers.py +118 -0
  116. gsapere-0.2.0/tests/gsapere/data/test_tokenizer_utils.py +716 -0
  117. gsapere-0.2.0/tests/gsapere/hgere/__init__.py +0 -0
  118. gsapere-0.2.0/tests/gsapere/hgere/test_config.py +156 -0
  119. gsapere-0.2.0/tests/gsapere/hgere/test_multi_head_model.py +417 -0
  120. gsapere-0.2.0/tests/gsapere/hgere/test_train_setup.py +332 -0
  121. gsapere-0.2.0/tests/gsapere/pipeline/__init__.py +0 -0
  122. gsapere-0.2.0/tests/gsapere/pipeline/test_pipeline_config.py +94 -0
  123. gsapere-0.2.0/tests/gsapere/pruner/__init__.py +0 -0
  124. gsapere-0.2.0/tests/gsapere/pruner/test_pruner_config.py +204 -0
  125. gsapere-0.2.0/tests/gsapere/test_labels.py +128 -0
  126. gsapere-0.2.0/tests/integration/__init__.py +0 -0
  127. gsapere-0.2.0/tests/integration/test_docker_api.py +173 -0
  128. gsapere-0.2.0/tests/pipeline/__init__.py +0 -0
  129. gsapere-0.2.0/tests/pipeline/conftest.py +147 -0
  130. gsapere-0.2.0/tests/pipeline/test_config.py +193 -0
  131. gsapere-0.2.0/tests/pipeline/test_hgere_runner.py +188 -0
  132. gsapere-0.2.0/tests/pipeline/test_pipeline.py +153 -0
  133. gsapere-0.2.0/tests/pipeline/test_pruner_runner.py +441 -0
  134. gsapere-0.2.0/tests/pipeline/test_run_pipeline_cli.py +590 -0
  135. gsapere-0.2.0/tests/pipeline/test_train_commands.py +125 -0
  136. gsapere-0.2.0/tests/test_evaluation_hgere.py +201 -0
  137. gsapere-0.2.0/tests/test_evaluation_pruner.py +576 -0
  138. gsapere-0.2.0/tests/test_threshold_analysis.py +75 -0
@@ -0,0 +1,185 @@
1
+ # Project related
2
+ reports/
3
+
4
+ # ignore saves
5
+ saves
6
+ datasets_copy/
7
+ /models/
8
+ slurm/
9
+ logs/
10
+ /data/
11
+ datasets/
12
+ pretrained_models/
13
+ uv.lock
14
+ pruner_predictions/
15
+ /pipeline/
16
+ /scripts/
17
+ output/
18
+ # ML related
19
+ wandb/
20
+
21
+ # Byte-compiled / optimized / DLL files
22
+ __pycache__/
23
+ *.py[cod]
24
+ *$py.class
25
+
26
+ # C extensions
27
+ *.so
28
+
29
+ # Distribution / packaging
30
+ .Python
31
+ build/
32
+ develop-eggs/
33
+ dist/
34
+ downloads/
35
+ eggs/
36
+ .eggs/
37
+ lib/
38
+ lib64/
39
+ parts/
40
+ sdist/
41
+ var/
42
+ wheels/
43
+ share/python-wheels/
44
+ *.egg-info/
45
+ .installed.cfg
46
+ *.egg
47
+ MANIFEST
48
+
49
+ # PyInstaller
50
+ # Usually these files are written by a python script from a template
51
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
52
+ *.manifest
53
+ *.spec
54
+
55
+ # Installer logs
56
+ pip-log.txt
57
+ pip-delete-this-directory.txt
58
+
59
+ # Unit test / coverage reports
60
+ htmlcov/
61
+ .tox/
62
+ .nox/
63
+ .coverage
64
+ .coverage.*
65
+ .cache
66
+ nosetests.xml
67
+ coverage.xml
68
+ *.cover
69
+ *.py,cover
70
+ .hypothesis/
71
+ .pytest_cache/
72
+ cover/
73
+
74
+ # Translations
75
+ *.mo
76
+ *.pot
77
+
78
+ # Django stuff:
79
+ *.log
80
+ local_settings.py
81
+ db.sqlite3
82
+ db.sqlite3-journal
83
+
84
+ # Flask stuff:
85
+ instance/
86
+ .webassets-cache
87
+
88
+ # Scrapy stuff:
89
+ .scrapy
90
+
91
+ # Sphinx documentation
92
+ docs/_build/
93
+
94
+ # PyBuilder
95
+ .pybuilder/
96
+ target/
97
+
98
+ # Jupyter Notebook
99
+ .ipynb_checkpoints
100
+ notebooks/
101
+
102
+ # IPython
103
+ profile_default/
104
+ ipython_config.py
105
+
106
+ # pyenv
107
+ # For a library or package, you might want to ignore these files since the code is
108
+ # intended to run in multiple environments; otherwise, check them in:
109
+ # .python-version
110
+
111
+ # pipenv
112
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
113
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
114
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
115
+ # install all needed dependencies.
116
+ #Pipfile.lock
117
+
118
+ # poetry
119
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
120
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
121
+ # commonly ignored for libraries.
122
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
123
+ #poetry.lock
124
+
125
+ # pdm
126
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
127
+ #pdm.lock
128
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
129
+ # in version control.
130
+ # https://pdm.fming.dev/#use-with-ide
131
+ .pdm.toml
132
+
133
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
134
+ __pypackages__/
135
+
136
+ # Celery stuff
137
+ celerybeat-schedule
138
+ celerybeat.pid
139
+
140
+ # SageMath parsed files
141
+ *.sage.py
142
+
143
+ # Environments
144
+ .env
145
+ .venv
146
+ env/
147
+ venv/
148
+ ENV/
149
+ env.bak/
150
+ venv.bak/
151
+
152
+ # Spyder project settings
153
+ .spyderproject
154
+ .spyproject
155
+
156
+ # Rope project settings
157
+ .ropeproject
158
+
159
+ # mkdocs documentation
160
+ /site
161
+
162
+ # mypy
163
+ .mypy_cache/
164
+ .dmypy.json
165
+ dmypy.json
166
+
167
+ # Pyre type checker
168
+ .pyre/
169
+
170
+ # pytype static type analyzer
171
+ .pytype/
172
+
173
+ # Cython debug symbols
174
+ cython_debug/
175
+
176
+ # PyCharm
177
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
178
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
179
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
180
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
181
+ #.idea/
182
+
183
+ # PyPI token
184
+ .pypi
185
+
gsapere-0.2.0/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 yanzhh
4
+ Copyright (c) 2026 ottowg
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in all
14
+ copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ SOFTWARE.
gsapere-0.2.0/PKG-INFO ADDED
@@ -0,0 +1,381 @@
1
+ Metadata-Version: 2.4
2
+ Name: gsapere
3
+ Version: 0.2.0
4
+ Summary: Entity and Relation Extraction on scientific text using HGERE with a span-pruning stage.
5
+ Author: Wolfgang Otto
6
+ License: MIT
7
+ License-File: LICENSE
8
+ Keywords: entity-recognition,information-extraction,nlp,relation-extraction,scientific-text
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Science/Research
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.9
14
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
15
+ Classifier: Topic :: Text Processing :: Linguistic
16
+ Requires-Python: <3.11,>=3.9
17
+ Requires-Dist: einops>=0.8.0
18
+ Requires-Dist: huggingface-hub>=0.20.0
19
+ Requires-Dist: matplotlib>=3.9.0
20
+ Requires-Dist: pandas>=2.0.0
21
+ Requires-Dist: pydantic>=2.0.0
22
+ Requires-Dist: pyyaml>=6.0
23
+ Requires-Dist: requests>=2.32.0
24
+ Requires-Dist: scikit-learn>=1.6.0
25
+ Requires-Dist: setuptools>=75.6.0
26
+ Requires-Dist: tabulate>=0.9.0
27
+ Requires-Dist: torch>=2.8.0
28
+ Requires-Dist: tqdm>=4.67.1
29
+ Requires-Dist: transformers
30
+ Requires-Dist: wandb>=0.19.9
31
+ Provides-Extra: dev
32
+ Requires-Dist: lint>=1.2.1; extra == 'dev'
33
+ Requires-Dist: pytest>=8.4.2; extra == 'dev'
34
+ Requires-Dist: ruff>=0.15.5; extra == 'dev'
35
+ Requires-Dist: twine>=6.1.0; extra == 'dev'
36
+ Description-Content-Type: text/markdown
37
+
38
+ # gsapere — Entity and Relation Extraction for Scientific Text
39
+
40
+ A fork of [HGERE](https://github.com/yanzhh/HGERE) adapted for scientific text, with a two-stage pipeline for **joint entity and relation extraction (ERE)**.
41
+
42
+ > **Paper under review.**
43
+ > Configs used for our experiments are in [`configs/`](configs/).
44
+
45
+ The pipeline consists of:
46
+
47
+ 1. **Rule-based pre-filter** *(optional)* — deterministically removes non-entity spans (punctuation, function-word sequences, etc.) before the neural pruner sees training data, reducing trivial negatives and speeding up training
48
+ 2. **Span Pruner** — a binary classifier that scores remaining candidate n-grams and filters them to a manageable set (target: ≥ 98 % entity recall)
49
+ 3. **HGERE** — a Hypergraph GNN that jointly predicts entity types and relations on the pruned candidates
50
+
51
+ Supported datasets: **GSAP-ERE**, **SciER**, **SciNLP**, **SciERC**
52
+
53
+ ---
54
+
55
+ ## Changes from the original
56
+
57
+ - Large-scale code restructuring: Pydantic-first configs, typed signatures throughout, proper package layout under `src/`
58
+ - All dependencies updated to current versions
59
+ - The `transformers` package version is **no longer hardcoded** — any compatible HuggingFace `transformers` version works
60
+ - Added rule-based pre-filter, span pruner stage, multi-dataset joint training, and full CLI entry points
61
+ - Tests for all major components
62
+
63
+ ---
64
+
65
+ ## Requirements
66
+
67
+ - **Python 3.9** (tested; `<3.11` required by some dependencies)
68
+ - CUDA 12.8 (adjust `pyproject.toml` for other CUDA versions)
69
+ - A GPU with at least ~24 GB VRAM for default batch sizes (tested on A40 / 40 GB)
70
+
71
+ ---
72
+
73
+ ## Installation
74
+
75
+ Install [uv](https://github.com/astral-sh/uv):
76
+
77
+ ```bash
78
+ curl -LsSf https://astral.sh/uv/install.sh | sh
79
+ ```
80
+
81
+ Clone the repository and install:
82
+
83
+ ```bash
84
+ git clone <repo-url>
85
+ cd HGERE
86
+ uv sync
87
+ ```
88
+
89
+ ### Datasets
90
+
91
+ Datasets are loaded from their original sources via the download command:
92
+
93
+ ```bash
94
+ uv run gsapere-download-dataset --list # list available datasets
95
+ uv run gsapere-download-dataset gsap-ere
96
+ uv run gsapere-download-dataset scier
97
+ uv run gsapere-download-dataset scinlp
98
+ uv run gsapere-download-dataset scierc
99
+ uv run gsapere-download-dataset --all # download everything
100
+ ```
101
+
102
+ See [documentation/download-dataset.md](documentation/download-dataset.md) for split details and manual download fallbacks.
103
+
104
+ #### GSAP-ERE
105
+
106
+ Fine-grained entity and relation extraction focused on machine learning — 100 annotated full-text ML publications, 63K entities, 35K relations, 10 entity types, 18 relation types.
107
+ DOI: <https://doi.org/10.60914/c4c1d-s0587>
108
+
109
+ > Otto et al., "GSAP-ERE: Fine-Grained Scholarly Entity and Relation Extraction Focused on Machine Learning", AAAI 2026.
110
+ > <https://ojs.aaai.org/index.php/AAAI/article/view/40537>
111
+
112
+ #### SciER
113
+
114
+ Entity and relation extraction dataset for datasets, methods, and tasks in scientific documents — 106 annotated full-text papers, 24k entities, 12k relations.
115
+
116
+ > Zhang et al., "SciER: An Entity and Relation Extraction Dataset for Datasets, Methods, and Tasks in Scientific Documents", EMNLP 2024.
117
+ > <https://aclanthology.org/2024.emnlp-main.726/>
118
+
119
+ #### SciNLP
120
+
121
+ Full-text entity and relation extraction benchmark for the NLP domain — 60 annotated ACL papers, 6,409 entities, 1,648 relations.
122
+
123
+ > "SciNLP: A Domain-Specific Benchmark for Full-Text Scientific Entity and Relation Extraction in NLP", EMNLP 2025.
124
+ > <https://aclanthology.org/2025.emnlp-main.732/>
125
+
126
+ #### SciERC
127
+
128
+ Scientific information extraction benchmark — 500 annotated AI abstracts,
129
+ 6 entity types, 7 relation types.
130
+
131
+ > Luan et al., "Multi-Task Identification of Entities, Relations, and Coreference for Scientific Knowledge Graph Construction", EMNLP 2018.
132
+ > <https://aclanthology.org/D18-1360/>
133
+
134
+ ---
135
+
136
+ ## Training
137
+
138
+ Training is a two-step process: first train the pruner, then train HGERE on the pruner's output.
139
+
140
+ ### Step 1 — Fit the rule-based pre-filter (optional)
141
+
142
+ ```bash
143
+ uv run gsapere-fit-rulebased-pruner configs/train/gsap/fit_rulebased_pruner.yaml
144
+ ```
145
+
146
+ This fits token n-gram patterns from the training data that deterministically exclude non-entity spans. The saved JSON file is referenced in the pruner training config to speed up training.
147
+
148
+ ### Step 2 — Train the span pruner
149
+
150
+ ```bash
151
+ uv run gsapere-train-pruner configs/train/gsap/train_gsap_pruner.yaml
152
+ ```
153
+
154
+ After training, run pruner inference on train/dev/test to produce the enriched input files for HGERE (see `scripts/pruner/`).
155
+
156
+ ### Step 3 — Train HGERE (single dataset)
157
+
158
+ ```bash
159
+ uv run gsapere-train-hgere configs/train/gsap/train_gsap_hgere.yaml
160
+ ```
161
+
162
+ Example config:
163
+
164
+ ```yaml
165
+ schema_version: "1.0"
166
+ label_set: gsap
167
+ model_dir: saves/hgere/gsap
168
+ base_model_name_or_path: pretrained_models/scibert_scivocab_uncased
169
+ ner_prediction_dir: saves/pruner/gsap/output
170
+ max_seq_length: 512
171
+ n_iter: 3
172
+ layernorm: true
173
+ attn_self: true
174
+
175
+ train_params:
176
+ learning_rate: 1e-5
177
+ num_train_epochs: 8
178
+ per_gpu_train_batch_size: 21
179
+ fp16: true
180
+ evaluate_during_training: true
181
+ eval_epochs: 1
182
+ loss_re_weight_alpha: 0.9
183
+ log_wandb: true
184
+ ```
185
+
186
+ ### Step 3 (alt) — Train HGERE on multiple datasets jointly
187
+
188
+ Multi-dataset mode trains a shared encoder with per-dataset NER and relation heads. Each dataset must have its own pruner output directory.
189
+
190
+ ```bash
191
+ uv run gsapere-train-hgere configs/multi-sciere-scinlp-gsap-ere/train/hgere/train_multi.yaml
192
+ ```
193
+
194
+ Example config:
195
+
196
+ ```yaml
197
+ schema_version: "1.0"
198
+ model_dir: saves/multi/hgere/run1
199
+ base_model_name_or_path: pretrained_models/scibert_scivocab_uncased
200
+ max_seq_length: 512
201
+ n_iter: 3
202
+ layernorm: true
203
+ attn_self: true
204
+ sampling_temperature: 0.8 # 0 = always largest dataset, 1 = proportional to size
205
+ seeds: [42, 43, 44] # run once per seed; _seed<n> appended to model_dir
206
+
207
+ datasets:
208
+ - label_set: scier
209
+ ner_prediction_dir: saves/pruner/scier/output
210
+ train_file: ent_pred_train.json
211
+ dev_file: ent_pred_dev.json
212
+ test_file: ent_pred_test.json
213
+ - label_set: scinlp
214
+ ner_prediction_dir: saves/pruner/scinlp/output
215
+ train_file: ent_pred_train.json
216
+ dev_file: ent_pred_dev.json # omit (or set to null) to skip dev evaluation for this dataset
217
+ - label_set: gsap
218
+ ner_prediction_dir: saves/pruner/gsap/output
219
+ train_file: ent_pred_train.json
220
+
221
+ train_params:
222
+ learning_rate: 1e-5
223
+ num_train_epochs: 8
224
+ per_gpu_train_batch_size: 21
225
+ fp16: true
226
+ evaluate_during_training: true
227
+ log_wandb: true
228
+ ```
229
+
230
+ ---
231
+
232
+ ## Inference
233
+
234
+ ### Full pipeline (pruner → HGERE)
235
+
236
+ ```bash
237
+ CUDA_VISIBLE_DEVICES=0 uv run gsapere-pipeline \
238
+ --config configs/inference/gsap-pipeline-best.yaml \
239
+ --input input/ \
240
+ --output output/
241
+ ```
242
+
243
+ `--input` can be a `.jsonl` file or a directory of `.jsonl` files.
244
+ Ready-to-use configs for all supported datasets are in [`configs/inference/`](configs/inference/).
245
+
246
+ The pipeline config combines pruner and HGERE settings in a single YAML file:
247
+
248
+ ```yaml
249
+ label_set: gsap
250
+
251
+ pruner:
252
+ model_dir: saves/pruner/gsap/best
253
+ base_model_name_or_path: pretrained_models/scibert_scivocab_uncased
254
+ model_type: bertspanmarkerpruner
255
+ max_seq_length: 256
256
+ per_gpu_eval_batch_size: 32
257
+ final_pruning:
258
+ method: threshold
259
+ threshold: 0.0005
260
+
261
+ hgere:
262
+ model_dir: saves/hgere/gsap/best
263
+ base_model_name_or_path: pretrained_models/scibert_scivocab_uncased
264
+ model_type: hyper
265
+ max_seq_length: 512
266
+ per_gpu_eval_batch_size: 32
267
+ n_iter: 3
268
+ layernorm: true
269
+ attn_self: true
270
+ pre_filter_params:
271
+ method: threshold
272
+ value: 0.0125
273
+ ```
274
+
275
+ ---
276
+
277
+ ## Docker API
278
+
279
+ The pipeline can be served as a REST API. Build and run with Docker (requires `--gpus all`):
280
+
281
+ ```bash
282
+ docker build -t gsapere-api .
283
+
284
+ docker run --gpus all \
285
+ -v /path/to/models:/app/models \
286
+ -v /path/to/config.yaml:/app/config.yaml \
287
+ -e PIPELINE_CONFIG=/app/config.yaml \
288
+ -p 8000:8000 \
289
+ gsapere-api
290
+ ```
291
+
292
+ Models and the pipeline config are mounted at runtime — the image itself contains only the code.
293
+
294
+ **Endpoints:**
295
+
296
+ | Method | Path | Description |
297
+ |---|---|---|
298
+ | `GET` | `/health` | Liveness check |
299
+ | `POST` | `/predict` | Run the pipeline on a batch of documents |
300
+
301
+ **Example request:**
302
+
303
+ ```bash
304
+ curl -X POST http://localhost:8000/predict \
305
+ -H "Content-Type: application/json" \
306
+ -d '{"documents": [{"doc_key": "doc1", "sentences": [["We", "train", "BERT", "."]]}]}'
307
+ ```
308
+
309
+ ---
310
+
311
+ ## CLI reference
312
+
313
+ | Command | Description |
314
+ |---|---|
315
+ | `gsapere-train-pruner` | Train the span pruner |
316
+ | `gsapere-train-hgere` | Train the HGERE ERE model |
317
+ | `gsapere-pipeline` | Run the full two-stage pipeline on new documents |
318
+ | `gsapere-download-dataset` | Download supported datasets |
319
+ | `gsapere-tune-pruner` | Threshold sweep and optimisation for the pruner |
320
+ | `gsapere-fit-rulebased-pruner` | Fit the rule-based pre-filter (rule-based pruner) |
321
+ | `infer-fixed-spans` | Run HGERE on fixed (gold) spans |
322
+ | `infer-pruner-augmented` | Run HGERE on pruner-predicted spans |
323
+ | `gsap-ere-benchmark-pipeline` | Benchmark pipeline throughput |
324
+ | `gsapere-fix-gold-annos` | Add gold annotations to prediction files |
325
+ | `gsapere-analysis-ner-length-distribution` | Analyse entity length distributions |
326
+ | `gsapere-generate-pruner-docs` | Regenerate parameter docs in `documentation/api/` |
327
+
328
+ ---
329
+
330
+ ## Development
331
+
332
+ ```bash
333
+ uv run pytest # run tests
334
+ uv run ruff format src/ tests/ # format
335
+ uv run ruff check src/ tests/ # lint
336
+ ```
337
+
338
+ ---
339
+
340
+ ## Building and publishing
341
+
342
+ ```bash
343
+ uv build # produces dist/ wheel + sdist
344
+ bash publish.sh # build + upload to PyPI (requires .pypi token file)
345
+ ```
346
+
347
+ ---
348
+
349
+ ## Citation
350
+
351
+ Please cite this work and the original HGERE:
352
+
353
+ ```bibtex
354
+ @article{Otto2026GSAP-ERE,
355
+ title = {{GSAP-ERE}: Fine-Grained Scholarly Entity and Relation Extraction Focused on Machine Learning},
356
+ author = {Otto, Wolfgang and Gan, Lu and Upadhyaya, Sharmila and Karmakar, Saurav and Dietze, Stefan},
357
+ journal = {Proceedings of the AAAI Conference on Artificial Intelligence},
358
+ volume = {40},
359
+ number = {38},
360
+ pages = {32600--32609},
361
+ year = {2026},
362
+ month = {Mar.},
363
+ doi = {10.1609/aaai.v40i38.40537},
364
+ url = {https://ojs.aaai.org/index.php/AAAI/article/view/40537},
365
+ }
366
+
367
+ @misc{yan2023joint,
368
+ title = {Joint Entity and Relation Extraction with Span Pruning and Hypergraph Neural Networks},
369
+ author = {Zhaohui Yan and Songlin Yang and Wei Liu and Kewei Tu},
370
+ year = {2023},
371
+ eprint = {2310.17238},
372
+ archivePrefix = {arXiv},
373
+ primaryClass = {cs.CL}
374
+ }
375
+ ```
376
+
377
+ ---
378
+
379
+ ## License
380
+
381
+ MIT — see [LICENSE](LICENSE).