PyPI - gsapere - Versions diffs - 0.2.0__tar.gz - Mend

gsapere 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (138) hide show

gsapere-0.2.0/.gitignore +185 -0
gsapere-0.2.0/LICENSE +22 -0
gsapere-0.2.0/PKG-INFO +381 -0
gsapere-0.2.0/README.md +344 -0
gsapere-0.2.0/documentation/README.md +19 -0
gsapere-0.2.0/documentation/api/hgere.md +162 -0
gsapere-0.2.0/documentation/api/pruner.md +139 -0
gsapere-0.2.0/documentation/download-dataset.md +159 -0
gsapere-0.2.0/documentation/multi_head_training.md +226 -0
gsapere-0.2.0/documentation/ner_candidates_by_sent_len.png +0 -0
gsapere-0.2.0/documentation/ner_candidates_by_sent_len_t0.1.png +0 -0
gsapere-0.2.0/documentation/pruner.md +104 -0
gsapere-0.2.0/documentation/refactoring/data_handling.md +320 -0
gsapere-0.2.0/documentation/rulebased_pruner_prefiltering.md +124 -0
gsapere-0.2.0/documentation/train_time_loss_weighting.md +132 -0
gsapere-0.2.0/documentation/train_time_loss_weighting_default.png +0 -0
gsapere-0.2.0/documentation/train_time_loss_weighting_steepness.png +0 -0
gsapere-0.2.0/documentation/train_time_loss_weighting_turn.png +0 -0
gsapere-0.2.0/l3s/README.md +32 -0
gsapere-0.2.0/pyproject.toml +85 -0
gsapere-0.2.0/src/gsapere/__init__.py +0 -0
gsapere-0.2.0/src/gsapere/commands/__init__.py +0 -0
gsapere-0.2.0/src/gsapere/commands/_cli_utils.py +101 -0
gsapere-0.2.0/src/gsapere/commands/add_gold_annotations.py +83 -0
gsapere-0.2.0/src/gsapere/commands/benchmark_pipeline.py +260 -0
gsapere-0.2.0/src/gsapere/commands/download_dataset.py +432 -0
gsapere-0.2.0/src/gsapere/commands/evaluate.py +0 -0
gsapere-0.2.0/src/gsapere/commands/fit_rulebased_pruner.py +971 -0
gsapere-0.2.0/src/gsapere/commands/infer_fixed_spans.py +309 -0
gsapere-0.2.0/src/gsapere/commands/infer_hgere.py +0 -0
gsapere-0.2.0/src/gsapere/commands/infer_pruner_augmented.py +277 -0
gsapere-0.2.0/src/gsapere/commands/ner_length_distribution.py +131 -0
gsapere-0.2.0/src/gsapere/commands/plot_ner_candidates_by_sent_len.py +330 -0
gsapere-0.2.0/src/gsapere/commands/run_pipeline.py +331 -0
gsapere-0.2.0/src/gsapere/commands/train_hgere.py +297 -0
gsapere-0.2.0/src/gsapere/commands/train_pruner.py +91 -0
gsapere-0.2.0/src/gsapere/config/__init__.py +34 -0
gsapere-0.2.0/src/gsapere/config/cli_gen.py +384 -0
gsapere-0.2.0/src/gsapere/data/__init__.py +0 -0
gsapere-0.2.0/src/gsapere/data/base_dataset.py +327 -0
gsapere-0.2.0/src/gsapere/data/collators.py +198 -0
gsapere-0.2.0/src/gsapere/data/config.py +110 -0
gsapere-0.2.0/src/gsapere/data/data_types.py +392 -0
gsapere-0.2.0/src/gsapere/data/multi_dataset.py +176 -0
gsapere-0.2.0/src/gsapere/data/pruner_dataset.py +430 -0
gsapere-0.2.0/src/gsapere/data/relation_dataset.py +969 -0
gsapere-0.2.0/src/gsapere/data/samplers.py +193 -0
gsapere-0.2.0/src/gsapere/data/tokenizer_utils.py +198 -0
gsapere-0.2.0/src/gsapere/data/vocabulary.py +98 -0
gsapere-0.2.0/src/gsapere/documentation/__init__.py +0 -0
gsapere-0.2.0/src/gsapere/documentation/generate_pruner_docs.py +488 -0
gsapere-0.2.0/src/gsapere/evaluation/__init__.py +0 -0
gsapere-0.2.0/src/gsapere/evaluation/hgere.py +575 -0
gsapere-0.2.0/src/gsapere/evaluation/pruner.py +425 -0
gsapere-0.2.0/src/gsapere/evaluation/threshold_analysis.py +826 -0
gsapere-0.2.0/src/gsapere/hgere/__init__.py +0 -0
gsapere-0.2.0/src/gsapere/hgere/config.py +599 -0
gsapere-0.2.0/src/gsapere/hgere/evaluate.py +337 -0
gsapere-0.2.0/src/gsapere/hgere/inference.py +605 -0
gsapere-0.2.0/src/gsapere/hgere/train.py +647 -0
gsapere-0.2.0/src/gsapere/hgere/train_setup.py +528 -0
gsapere-0.2.0/src/gsapere/label_configs/ace.yaml +19 -0
gsapere-0.2.0/src/gsapere/label_configs/ace04.yaml +17 -0
gsapere-0.2.0/src/gsapere/label_configs/ace05.yaml +18 -0
gsapere-0.2.0/src/gsapere/label_configs/default.yaml +24 -0
gsapere-0.2.0/src/gsapere/label_configs/gsap.yaml +35 -0
gsapere-0.2.0/src/gsapere/label_configs/gsap_abbr.yaml +36 -0
gsapere-0.2.0/src/gsapere/label_configs/mler.yaml +13 -0
gsapere-0.2.0/src/gsapere/label_configs/scier.yaml +17 -0
gsapere-0.2.0/src/gsapere/label_configs/scier_abbr.yaml +18 -0
gsapere-0.2.0/src/gsapere/label_configs/scierc.yaml +18 -0
gsapere-0.2.0/src/gsapere/label_configs/scinlp.yaml +20 -0
gsapere-0.2.0/src/gsapere/label_configs/scinlp_abbr.yaml +21 -0
gsapere-0.2.0/src/gsapere/label_configs/somd.yaml +30 -0
gsapere-0.2.0/src/gsapere/label_configs/unifiedsciere.yaml +17 -0
gsapere-0.2.0/src/gsapere/labels.py +131 -0
gsapere-0.2.0/src/gsapere/models/hgere.py +2380 -0
gsapere-0.2.0/src/gsapere/models/model_ace_albert.py +545 -0
gsapere-0.2.0/src/gsapere/models/model_ace_bert.py +444 -0
gsapere-0.2.0/src/gsapere/models/modules_ace.py +384 -0
gsapere-0.2.0/src/gsapere/models/span_classifier.py +556 -0
gsapere-0.2.0/src/gsapere/pipeline/__init__.py +17 -0
gsapere-0.2.0/src/gsapere/pipeline/config.py +178 -0
gsapere-0.2.0/src/gsapere/pipeline/hgere_runner.py +471 -0
gsapere-0.2.0/src/gsapere/pipeline/pipeline.py +146 -0
gsapere-0.2.0/src/gsapere/pipeline/presets.py +65 -0
gsapere-0.2.0/src/gsapere/pipeline/pruner_runner.py +336 -0
gsapere-0.2.0/src/gsapere/pre_filter/__init__.py +12 -0
gsapere-0.2.0/src/gsapere/pre_filter/config.py +44 -0
gsapere-0.2.0/src/gsapere/pre_filter/filter.py +131 -0
gsapere-0.2.0/src/gsapere/pre_filter/fit.py +101 -0
gsapere-0.2.0/src/gsapere/pre_filter/statistics.py +242 -0
gsapere-0.2.0/src/gsapere/pruner/__init__.py +0 -0
gsapere-0.2.0/src/gsapere/pruner/config.py +358 -0
gsapere-0.2.0/src/gsapere/pruner/evaluate.py +1057 -0
gsapere-0.2.0/src/gsapere/pruner/train.py +745 -0
gsapere-0.2.0/src/gsapere/resources/gsap_ere_vocabulary.json +1 -0
gsapere-0.2.0/src/gsapere/utils.py +50 -0
gsapere-0.2.0/tests/__init__.py +0 -0
gsapere-0.2.0/tests/gsapere/__init__.py +0 -0
gsapere-0.2.0/tests/gsapere/commands/__init__.py +0 -0
gsapere-0.2.0/tests/gsapere/commands/test_download_dataset.py +54 -0
gsapere-0.2.0/tests/gsapere/commands/test_train_hgere.py +409 -0
gsapere-0.2.0/tests/gsapere/commands/test_train_pruner.py +213 -0
gsapere-0.2.0/tests/gsapere/config/__init__.py +0 -0
gsapere-0.2.0/tests/gsapere/config/test_cli_gen.py +273 -0
gsapere-0.2.0/tests/gsapere/data/__init__.py +0 -0
gsapere-0.2.0/tests/gsapere/data/conftest.py +305 -0
gsapere-0.2.0/tests/gsapere/data/test_base_dataset.py +422 -0
gsapere-0.2.0/tests/gsapere/data/test_data_types.py +410 -0
gsapere-0.2.0/tests/gsapere/data/test_integration.py +151 -0
gsapere-0.2.0/tests/gsapere/data/test_multi_dataset.py +255 -0
gsapere-0.2.0/tests/gsapere/data/test_pruner_dataset.py +230 -0
gsapere-0.2.0/tests/gsapere/data/test_relation_dataset.py +775 -0
gsapere-0.2.0/tests/gsapere/data/test_samplers.py +118 -0
gsapere-0.2.0/tests/gsapere/data/test_tokenizer_utils.py +716 -0
gsapere-0.2.0/tests/gsapere/hgere/__init__.py +0 -0
gsapere-0.2.0/tests/gsapere/hgere/test_config.py +156 -0
gsapere-0.2.0/tests/gsapere/hgere/test_multi_head_model.py +417 -0
gsapere-0.2.0/tests/gsapere/hgere/test_train_setup.py +332 -0
gsapere-0.2.0/tests/gsapere/pipeline/__init__.py +0 -0
gsapere-0.2.0/tests/gsapere/pipeline/test_pipeline_config.py +94 -0
gsapere-0.2.0/tests/gsapere/pruner/__init__.py +0 -0
gsapere-0.2.0/tests/gsapere/pruner/test_pruner_config.py +204 -0
gsapere-0.2.0/tests/gsapere/test_labels.py +128 -0
gsapere-0.2.0/tests/integration/__init__.py +0 -0
gsapere-0.2.0/tests/integration/test_docker_api.py +173 -0
gsapere-0.2.0/tests/pipeline/__init__.py +0 -0
gsapere-0.2.0/tests/pipeline/conftest.py +147 -0
gsapere-0.2.0/tests/pipeline/test_config.py +193 -0
gsapere-0.2.0/tests/pipeline/test_hgere_runner.py +188 -0
gsapere-0.2.0/tests/pipeline/test_pipeline.py +153 -0
gsapere-0.2.0/tests/pipeline/test_pruner_runner.py +441 -0
gsapere-0.2.0/tests/pipeline/test_run_pipeline_cli.py +590 -0
gsapere-0.2.0/tests/pipeline/test_train_commands.py +125 -0
gsapere-0.2.0/tests/test_evaluation_hgere.py +201 -0
gsapere-0.2.0/tests/test_evaluation_pruner.py +576 -0
gsapere-0.2.0/tests/test_threshold_analysis.py +75 -0

gsapere-0.2.0/.gitignore ADDED Viewed

@@ -0,0 +1,185 @@
+# Project related
+reports/
+# ignore saves
+saves
+datasets_copy/
+/models/
+slurm/
+logs/
+/data/
+datasets/
+pretrained_models/
+uv.lock
+pruner_predictions/
+/pipeline/
+/scripts/
+output/
+# ML related
+wandb/
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+notebooks/
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+# PyPI token
+.pypi

gsapere-0.2.0/LICENSE ADDED Viewed

@@ -0,0 +1,22 @@
+MIT License
+Copyright (c) 2023 yanzhh
+Copyright (c) 2026 ottowg
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

gsapere-0.2.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,381 @@
+Metadata-Version: 2.4
+Name: gsapere
+Version: 0.2.0
+Summary: Entity and Relation Extraction on scientific text using HGERE with a span-pruning stage.
+Author: Wolfgang Otto
+License: MIT
+License-File: LICENSE
+Keywords: entity-recognition,information-extraction,nlp,relation-extraction,scientific-text
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Topic :: Text Processing :: Linguistic
+Requires-Python: <3.11,>=3.9
+Requires-Dist: einops>=0.8.0
+Requires-Dist: huggingface-hub>=0.20.0
+Requires-Dist: matplotlib>=3.9.0
+Requires-Dist: pandas>=2.0.0
+Requires-Dist: pydantic>=2.0.0
+Requires-Dist: pyyaml>=6.0
+Requires-Dist: requests>=2.32.0
+Requires-Dist: scikit-learn>=1.6.0
+Requires-Dist: setuptools>=75.6.0
+Requires-Dist: tabulate>=0.9.0
+Requires-Dist: torch>=2.8.0
+Requires-Dist: tqdm>=4.67.1
+Requires-Dist: transformers
+Requires-Dist: wandb>=0.19.9
+Provides-Extra: dev
+Requires-Dist: lint>=1.2.1; extra == 'dev'
+Requires-Dist: pytest>=8.4.2; extra == 'dev'
+Requires-Dist: ruff>=0.15.5; extra == 'dev'
+Requires-Dist: twine>=6.1.0; extra == 'dev'
+Description-Content-Type: text/markdown
+# gsapere — Entity and Relation Extraction for Scientific Text
+A fork of [HGERE](https://github.com/yanzhh/HGERE) adapted for scientific text, with a two-stage pipeline for **joint entity and relation extraction (ERE)**.
+> **Paper under review.**
+> Configs used for our experiments are in [`configs/`](configs/).
+The pipeline consists of:
+1. **Rule-based pre-filter** *(optional)* — removes deterministically non-entity spans (punctuation, function-word sequences, etc.) before the neural pruner sees training data, reducing trivial negatives and speeding up training
+2. **Span Pruner** — a binary classifier that scores remaining candidate n-grams and filters them to a manageable set (target: ≥ 98 % entity recall)
+3. **HGERE** — a Hypergraph GNN that jointly predicts entity types and relations on the pruned candidates
+Supported datasets: **GSAP-ERE**, **SciER**, **SciNLP**, **SciERC**
+---
+## Changes from the original
+- Large-scale code restructuring: Pydantic-first configs, typed signatures throughout, proper package layout under `src/`
+- All dependencies updated to current versions
+- The HuggingFace `transformers` package is **no longer hardcoded** — any compatible version works
+- Added rule-based pre-filter, span pruner stage, multi-dataset joint training, and full CLI entry points
+- Tests for all major components
+---
+## Requirements
+- **Python 3.9** (tested; `<3.11` required by some dependencies)
+- CUDA 12.8 (adjust `pyproject.toml` for other CUDA versions)
+- A GPU with at least ~24 GB VRAM for default batch sizes (tested on A40 / 40 GB)
+---
+## Installation
+Install [uv](https://github.com/astral-sh/uv):
+```bash
+curl -LsSf https://astral.sh/uv/install.sh | sh
+```
+Clone the repository and install:
+```bash
+git clone <repo-url>
+cd HGERE
+uv sync
+```
+### Datasets
+Datasets are loaded from their original sources via the download command:
+```bash
+uv run gsapere-download-dataset --list          # list available datasets
+uv run gsapere-download-dataset gsap-ere
+uv run gsapere-download-dataset scier
+uv run gsapere-download-dataset scinlp
+uv run gsapere-download-dataset scierc
+uv run gsapere-download-dataset --all           # download everything
+```
+See [documentation/download-dataset.md](documentation/download-dataset.md) for split details and manual download fallbacks.
+#### GSAP-ERE
+Fine-grained entity and relation extraction focused on machine learning — 100 annotated full-text ML publications, 63K entities, 35K relations, 10 entity types, 18 relation types.
+DOI: <https://doi.org/10.60914/c4c1d-s0587>
+> Otto et al., "GSAP-ERE: Fine-Grained Scholarly Entity and Relation Extraction Focused on Machine Learning", AAAI 2026.
+> <https://ojs.aaai.org/index.php/AAAI/article/view/40537>
+#### SciER
+Entity and relation extraction dataset for datasets, methods, and tasks in scientific documents — 106 annotated full-text papers, 24K entities, 12K relations.
+> Zhang et al., "SciER: An Entity and Relation Extraction Dataset for Datasets, Methods, and Tasks in Scientific Documents", EMNLP 2024.
+> <https://aclanthology.org/2024.emnlp-main.726/>
+#### SciNLP
+Full-text entity and relation extraction benchmark for the NLP domain — 60 annotated ACL papers, 6,409 entities, 1,648 relations.
+> "SciNLP: A Domain-Specific Benchmark for Full-Text Scientific Entity and Relation Extraction in NLP", EMNLP 2025.
+> <https://aclanthology.org/2025.emnlp-main.732/>
+#### SciERC
+Scientific information extraction benchmark — 500 annotated AI abstracts,
+6 entity types, 7 relation types.
+> Luan et al., "Multi-Task Identification of Entities, Relations, and Coreference for Scientific Knowledge Graph Construction", EMNLP 2018.
+> <https://aclanthology.org/D18-1360/>
+---
+## Training
+Training has two required stages, preceded by an optional pre-filter fit: first train the span pruner (Step 2), then train HGERE on the pruner's output (Step 3).
+### Step 1 — Fit the rule-based pre-filter (optional)
+```bash
+uv run gsapere-fit-rulebased-pruner configs/train/gsap/fit_rulebased_pruner.yaml
+```
+This fits token n-gram patterns from the training data that deterministically exclude non-entity spans. The saved JSON file is referenced in the pruner training config to speed up training.
+### Step 2 — Train the span pruner
+```bash
+uv run gsapere-train-pruner configs/train/gsap/train_gsap_pruner.yaml
+```
+After training, run pruner inference on train/dev/test to produce the enriched input files for HGERE (see `scripts/pruner/`).
+### Step 3 — Train HGERE (single dataset)
+```bash
+uv run gsapere-train-hgere configs/train/gsap/train_gsap_hgere.yaml
+```
+Example config:
+```yaml
+schema_version: "1.0"
+label_set: gsap
+model_dir: saves/hgere/gsap
+base_model_name_or_path: pretrained_models/scibert_scivocab_uncased
+ner_prediction_dir: saves/pruner/gsap/output
+max_seq_length: 512
+n_iter: 3
+layernorm: true
+attn_self: true
+train_params:
+  learning_rate: 1e-5
+  num_train_epochs: 8
+  per_gpu_train_batch_size: 21
+  fp16: true
+  evaluate_during_training: true
+  eval_epochs: 1
+  loss_re_weight_alpha: 0.9
+  log_wandb: true
+```
+### Step 3 (alt) — Train HGERE on multiple datasets jointly
+Multi-dataset mode trains a shared encoder with per-dataset NER and relation heads. Each dataset must have its own pruner output directory.
+```bash
+uv run gsapere-train-hgere configs/multi-sciere-scinlp-gsap-ere/train/hgere/train_multi.yaml
+```
+Example config:
+```yaml
+schema_version: "1.0"
+model_dir: saves/multi/hgere/run1
+base_model_name_or_path: pretrained_models/scibert_scivocab_uncased
+max_seq_length: 512
+n_iter: 3
+layernorm: true
+attn_self: true
+sampling_temperature: 0.8   # 0 = always largest dataset, 1 = proportional to size
+seeds: [42, 43, 44]          # run once per seed; _seed<n> appended to model_dir
+datasets:
+  - label_set: scier
+    ner_prediction_dir: saves/pruner/scier/output
+    train_file: ent_pred_train.json
+    dev_file: ent_pred_dev.json
+    test_file: ent_pred_test.json
+  - label_set: scinlp
+    ner_prediction_dir: saves/pruner/scinlp/output
+    train_file: ent_pred_train.json
+    dev_file: ent_pred_dev.json   # omit (null) to skip dev evaluation for this dataset
+  - label_set: gsap
+    ner_prediction_dir: saves/pruner/gsap/output
+    train_file: ent_pred_train.json
+train_params:
+  learning_rate: 1e-5
+  num_train_epochs: 8
+  per_gpu_train_batch_size: 21
+  fp16: true
+  evaluate_during_training: true
+  log_wandb: true
+```
+---
+## Inference
+### Full pipeline (pruner → HGERE)
+```bash
+CUDA_VISIBLE_DEVICES=0 uv run gsapere-pipeline \
+    --config configs/inference/gsap-pipeline-best.yaml \
+    --input input/ \
+    --output output/
+```
+`--input` can be a `.jsonl` file or a directory of `.jsonl` files.
+Ready-to-use configs for all supported datasets are in [`configs/inference/`](configs/inference/).
+The pipeline config combines pruner and HGERE settings in a single YAML file:
+```yaml
+label_set: gsap
+pruner:
+  model_dir: saves/pruner/gsap/best
+  base_model_name_or_path: pretrained_models/scibert_scivocab_uncased
+  model_type: bertspanmarkerpruner
+  max_seq_length: 256
+  per_gpu_eval_batch_size: 32
+  final_pruning:
+    method: threshold
+    threshold: 0.0005
+hgere:
+  model_dir: saves/hgere/gsap/best
+  base_model_name_or_path: pretrained_models/scibert_scivocab_uncased
+  model_type: hyper
+  max_seq_length: 512
+  per_gpu_eval_batch_size: 32
+  n_iter: 3
+  layernorm: true
+  attn_self: true
+  pre_filter_params:
+    method: threshold
+    value: 0.0125
+```
+---
+## Docker API
+The pipeline can be served as a REST API. Build and run with Docker (requires `--gpus all`):
+```bash
+docker build -t gsapere-api .
+docker run --gpus all \
+    -v /path/to/models:/app/models \
+    -v /path/to/config.yaml:/app/config.yaml \
+    -e PIPELINE_CONFIG=/app/config.yaml \
+    -p 8000:8000 \
+    gsapere-api
+```
+Models and the pipeline config are mounted at runtime — the image itself contains only the code.
+**Endpoints:**
+| Method | Path | Description |
+|---|---|---|
+| `GET` | `/health` | Liveness check |
+| `POST` | `/predict` | Run the pipeline on a batch of documents |
+**Example request:**
+```bash
+curl -X POST http://localhost:8000/predict \
+    -H "Content-Type: application/json" \
+    -d '{"documents": [{"doc_key": "doc1", "sentences": [["We", "train", "BERT", "."]]}]}'
+```
+---
+## CLI reference
+| Command | Description |
+|---|---|
+| `gsapere-train-pruner` | Train the span pruner |
+| `gsapere-train-hgere` | Train the HGERE ERE model |
+| `gsapere-pipeline` | Run the full two-stage pipeline on new documents |
+| `gsapere-download-dataset` | Download supported datasets |
+| `gsapere-tune-pruner` | Threshold sweep and optimisation for the pruner |
+| `gsapere-fit-rulebased-pruner` | Fit the rule-based pre-filter (rule-based pruner) |
+| `infer-fixed-spans` | Run HGERE on fixed (gold) spans |
+| `infer-pruner-augmented` | Run HGERE on pruner-predicted spans |
+| `gsap-ere-benchmark-pipeline` | Benchmark pipeline throughput |
+| `gsapere-fix-gold-annos` | Add gold annotations to prediction files |
+| `gsapere-analysis-ner-length-distribution` | Analyse entity length distributions |
+| `gsapere-generate-pruner-docs` | Regenerate parameter docs in `documentation/api/` |
+---
+## Development
+```bash
+uv run pytest                          # run tests
+uv run ruff format src/ tests/         # format
+uv run ruff check src/ tests/          # lint
+```
+---
+## Building and publishing
+```bash
+uv build                               # produces dist/ wheel + sdist
+bash publish.sh                        # build + upload to PyPI (requires .pypi token file)
+```
+---
+## Citation
+Please cite this work and the original HGERE:
+```bibtex
+@article{Otto2026GSAP-ERE,
+  title   = {{GSAP-ERE}: Fine-Grained Scholarly Entity and Relation Extraction Focused on Machine Learning},
+  author  = {Otto, Wolfgang and Gan, Lu and Upadhyaya, Sharmila and Karmakar, Saurav and Dietze, Stefan},
+  journal = {Proceedings of the AAAI Conference on Artificial Intelligence},
+  volume  = {40},
+  number  = {38},
+  pages   = {32600--32609},
+  year    = {2026},
+  month   = {Mar.},
+  doi     = {10.1609/aaai.v40i38.40537},
+  url     = {https://ojs.aaai.org/index.php/AAAI/article/view/40537},
+}
+@misc{yan2023joint,
+  title         = {Joint Entity and Relation Extraction with Span Pruning and Hypergraph Neural Networks},
+  author        = {Zhaohui Yan and Songlin Yang and Wei Liu and Kewei Tu},
+  year          = {2023},
+  eprint        = {2310.17238},
+  archivePrefix = {arXiv},
+  primaryClass  = {cs.CL}
+}
+```
+---
+## License
+MIT — see [LICENSE](LICENSE).