acatome-extract 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- acatome_extract-0.2.0/.github/workflows/publish.yml +33 -0
- acatome_extract-0.2.0/.gitignore +37 -0
- acatome_extract-0.2.0/CHANGELOG.md +11 -0
- acatome_extract-0.2.0/LICENSE +14 -0
- acatome_extract-0.2.0/PKG-INFO +133 -0
- acatome_extract-0.2.0/README.md +93 -0
- acatome_extract-0.2.0/pyproject.toml +87 -0
- acatome_extract-0.2.0/src/acatome_extract/__init__.py +7 -0
- acatome_extract-0.2.0/src/acatome_extract/bundle.py +55 -0
- acatome_extract-0.2.0/src/acatome_extract/chunker.py +152 -0
- acatome_extract-0.2.0/src/acatome_extract/cli.py +384 -0
- acatome_extract-0.2.0/src/acatome_extract/enrich.py +462 -0
- acatome_extract-0.2.0/src/acatome_extract/figures.py +70 -0
- acatome_extract-0.2.0/src/acatome_extract/ids.py +80 -0
- acatome_extract-0.2.0/src/acatome_extract/marker.py +558 -0
- acatome_extract-0.2.0/src/acatome_extract/pipeline.py +293 -0
- acatome_extract-0.2.0/src/acatome_extract/py.typed +0 -0
- acatome_extract-0.2.0/src/acatome_extract/watch.py +495 -0
- acatome_extract-0.2.0/tests/conftest.py +52 -0
- acatome_extract-0.2.0/tests/test_bundle.py +57 -0
- acatome_extract-0.2.0/tests/test_chunker.py +71 -0
- acatome_extract-0.2.0/tests/test_enrich.py +323 -0
- acatome_extract-0.2.0/tests/test_figures.py +70 -0
- acatome_extract-0.2.0/tests/test_gpu_embed.py +169 -0
- acatome_extract-0.2.0/tests/test_ids.py +94 -0
- acatome_extract-0.2.0/tests/test_marker.py +137 -0
- acatome_extract-0.2.0/tests/test_migrate.py +385 -0
- acatome_extract-0.2.0/tests/test_watch.py +314 -0
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
|
|
7
|
+
permissions:
|
|
8
|
+
id-token: write # trusted publishing (OIDC)
|
|
9
|
+
contents: read
|
|
10
|
+
|
|
11
|
+
jobs:
|
|
12
|
+
test:
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
steps:
|
|
15
|
+
- uses: actions/checkout@v4
|
|
16
|
+
- uses: astral-sh/setup-uv@v4
|
|
17
|
+
with:
|
|
18
|
+
version: "latest"
|
|
19
|
+
- run: uv venv
|
|
20
|
+
- run: uv pip install -e ".[dev]"
|
|
21
|
+
- run: uv run --no-sync pytest
|
|
22
|
+
|
|
23
|
+
publish:
|
|
24
|
+
needs: test
|
|
25
|
+
runs-on: ubuntu-latest
|
|
26
|
+
environment: pypi
|
|
27
|
+
steps:
|
|
28
|
+
- uses: actions/checkout@v4
|
|
29
|
+
- uses: astral-sh/setup-uv@v4
|
|
30
|
+
with:
|
|
31
|
+
version: "latest"
|
|
32
|
+
- run: uv build
|
|
33
|
+
- uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.egg-info/
|
|
6
|
+
*.egg
|
|
7
|
+
dist/
|
|
8
|
+
build/
|
|
9
|
+
|
|
10
|
+
# Virtual environments
|
|
11
|
+
.venv/
|
|
12
|
+
venv/
|
|
13
|
+
|
|
14
|
+
# Testing
|
|
15
|
+
.pytest_cache/
|
|
16
|
+
.coverage
|
|
17
|
+
htmlcov/
|
|
18
|
+
|
|
19
|
+
# IDE
|
|
20
|
+
.idea/
|
|
21
|
+
.vscode/
|
|
22
|
+
*.swp
|
|
23
|
+
*.swo
|
|
24
|
+
*~
|
|
25
|
+
|
|
26
|
+
# OS
|
|
27
|
+
.DS_Store
|
|
28
|
+
Thumbs.db
|
|
29
|
+
|
|
30
|
+
# UV
|
|
31
|
+
uv.lock
|
|
32
|
+
|
|
33
|
+
# Data
|
|
34
|
+
*.acatome/
|
|
35
|
+
inbox/
|
|
36
|
+
completed/
|
|
37
|
+
errors/
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
Copyright (c) 2026 Reto Stamm and Acatome Contributors
|
|
2
|
+
|
|
3
|
+
This program is free software: you can redistribute it and/or modify
|
|
4
|
+
it under the terms of the GNU General Public License as published by
|
|
5
|
+
the Free Software Foundation, either version 3 of the License, or
|
|
6
|
+
(at your option) any later version.
|
|
7
|
+
|
|
8
|
+
This program is distributed in the hope that it will be useful,
|
|
9
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
10
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
11
|
+
GNU General Public License for more details.
|
|
12
|
+
|
|
13
|
+
You should have received a copy of the GNU General Public License
|
|
14
|
+
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: acatome-extract
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: PDF extraction pipeline for scientific papers
|
|
5
|
+
Project-URL: Homepage, https://github.com/acatome/acatome-extract
|
|
6
|
+
Project-URL: Repository, https://github.com/acatome/acatome-extract
|
|
7
|
+
Project-URL: Issues, https://github.com/acatome/acatome-extract/issues
|
|
8
|
+
Author-email: Reto Stamm <reto@retostamm.com>
|
|
9
|
+
License-Expression: GPL-3.0-or-later
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: embeddings,extraction,nlp,pdf,scientific-papers,summarization
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering
|
|
19
|
+
Requires-Python: >=3.11
|
|
20
|
+
Requires-Dist: acatome-meta>=0.1.0
|
|
21
|
+
Requires-Dist: litellm>=1.40
|
|
22
|
+
Requires-Dist: marker-pdf>=1.0
|
|
23
|
+
Requires-Dist: precis-summary>=0.1.0
|
|
24
|
+
Requires-Dist: typer>=0.12
|
|
25
|
+
Requires-Dist: watchdog>=4.0
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: black>=24.0; extra == 'dev'
|
|
28
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
29
|
+
Requires-Dist: ruff>=0.5; extra == 'dev'
|
|
30
|
+
Provides-Extra: embeddings
|
|
31
|
+
Requires-Dist: sentence-transformers>=3.0; extra == 'embeddings'
|
|
32
|
+
Provides-Extra: gpu
|
|
33
|
+
Requires-Dist: sentence-transformers>=3.0; extra == 'gpu'
|
|
34
|
+
Requires-Dist: torch>=2.0; extra == 'gpu'
|
|
35
|
+
Provides-Extra: grobid
|
|
36
|
+
Requires-Dist: grobid-client-python>=0.0.7; extra == 'grobid'
|
|
37
|
+
Provides-Extra: store
|
|
38
|
+
Requires-Dist: acatome-store>=0.1.0; extra == 'store'
|
|
39
|
+
Description-Content-Type: text/markdown
|
|
40
|
+
|
|
41
|
+
# acatome-extract
|
|
42
|
+
|
|
43
|
+
PDF extraction and enrichment pipeline for scientific papers. Converts PDFs into structured, searchable bundles with block-level summaries and embeddings.
|
|
44
|
+
|
|
45
|
+
## Features
|
|
46
|
+
|
|
47
|
+
- **Marker PDF extraction** — structured block extraction with headings, tables, figures
|
|
48
|
+
- **Fitz fallback** — recursive character chunking when Marker is unavailable
|
|
49
|
+
- **LLM enrichment** — block and paper summaries via Ollama or litellm
|
|
50
|
+
- **Embeddings** — sentence-transformer embeddings for semantic search
|
|
51
|
+
- **File watcher** — `acatome-extract watch` monitors an inbox folder
|
|
52
|
+
- **Bundle format** — `.acatome` companion files for sharing pre-built extractions
|
|
53
|
+
- **CLI** — `acatome-extract` command for extract, enrich, and watch workflows
|
|
54
|
+
|
|
55
|
+
## Installation
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
uv pip install -e .
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
With GPU acceleration:
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
uv pip install -e ".[gpu]"
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Usage
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
from acatome_extract.pipeline import extract
|
|
71
|
+
|
|
72
|
+
bundle = extract("/path/to/paper.pdf")
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## CLI
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
# Extract (RAKE summaries included automatically, no LLM needed)
|
|
79
|
+
acatome-extract extract paper.pdf
|
|
80
|
+
acatome-extract extract --type datasheet TI_LM317.pdf # non-article types
|
|
81
|
+
|
|
82
|
+
# Enrich — embeddings only by default; add --summarize for LLM summaries
|
|
83
|
+
acatome-extract enrich /path/to/bundle
|
|
84
|
+
acatome-extract enrich --summarize /path/to/bundle # enable LLM summaries
|
|
85
|
+
acatome-extract enrich --summarize --skip-existing dir/ # incremental LLM pass
|
|
86
|
+
|
|
87
|
+
# Watch — extract + embed + ingest; LLM summaries off by default
|
|
88
|
+
acatome-extract watch ~/papers/inbox
|
|
89
|
+
acatome-extract watch ~/papers/inbox --summarize # enable LLM summaries
|
|
90
|
+
|
|
91
|
+
# Migrate old bundles to new summaries dict format + add RAKE
|
|
92
|
+
acatome-extract migrate ~/.acatome/papers
|
|
93
|
+
acatome-extract migrate ~/.acatome/papers --dry-run # preview changes
|
|
94
|
+
|
|
95
|
+
# Supplements
|
|
96
|
+
acatome-extract attach parent-slug supplement.pdf --name s1
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### Summaries
|
|
100
|
+
|
|
101
|
+
Extraction always generates **RAKE** (extractive keyword) summaries — instant, no LLM required. LLM-based summaries are opt-in via `--summarize` and require an Ollama or litellm-compatible model.
|
|
102
|
+
|
|
103
|
+
RAKE summaries are used as the default for search and display. To add LLM summaries later:
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
acatome-extract enrich --summarize --skip-existing ~/.acatome/papers
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
### Sidecar metadata
|
|
110
|
+
|
|
111
|
+
Place a `<stem>.meta.json` alongside any PDF to override metadata:
|
|
112
|
+
|
|
113
|
+
```json
|
|
114
|
+
{"type": "datasheet", "title": "LM317 Regulator", "author": "Texas Instruments", "year": 2022}
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
Supported fields: `type`, `title`, `author` (string or list), `year`, `doi`, `abstract`, `journal`.
|
|
118
|
+
|
|
119
|
+
## Dependencies
|
|
120
|
+
|
|
121
|
+
- **acatome-meta** — metadata lookup and verification
|
|
122
|
+
- **marker-pdf** — structured PDF extraction
|
|
123
|
+
- **litellm** / **Ollama** — LLM-based enrichment
|
|
124
|
+
|
|
125
|
+
## Testing
|
|
126
|
+
|
|
127
|
+
```bash
|
|
128
|
+
uv run python -m pytest tests/ -v
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
## License
|
|
132
|
+
|
|
133
|
+
GPL-3.0-or-later — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
# acatome-extract
|
|
2
|
+
|
|
3
|
+
PDF extraction and enrichment pipeline for scientific papers. Converts PDFs into structured, searchable bundles with block-level summaries and embeddings.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Marker PDF extraction** — structured block extraction with headings, tables, figures
|
|
8
|
+
- **Fitz fallback** — recursive character chunking when Marker is unavailable
|
|
9
|
+
- **LLM enrichment** — block and paper summaries via Ollama or litellm
|
|
10
|
+
- **Embeddings** — sentence-transformer embeddings for semantic search
|
|
11
|
+
- **File watcher** — `acatome-extract watch` monitors an inbox folder
|
|
12
|
+
- **Bundle format** — `.acatome` companion files for sharing pre-built extractions
|
|
13
|
+
- **CLI** — `acatome-extract` command for extract, enrich, and watch workflows
|
|
14
|
+
|
|
15
|
+
## Installation
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
uv pip install -e .
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
With GPU acceleration:
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
uv pip install -e ".[gpu]"
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Usage
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
from acatome_extract.pipeline import extract
|
|
31
|
+
|
|
32
|
+
bundle = extract("/path/to/paper.pdf")
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## CLI
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
# Extract (RAKE summaries included automatically, no LLM needed)
|
|
39
|
+
acatome-extract extract paper.pdf
|
|
40
|
+
acatome-extract extract --type datasheet TI_LM317.pdf # non-article types
|
|
41
|
+
|
|
42
|
+
# Enrich — embeddings only by default; add --summarize for LLM summaries
|
|
43
|
+
acatome-extract enrich /path/to/bundle
|
|
44
|
+
acatome-extract enrich --summarize /path/to/bundle # enable LLM summaries
|
|
45
|
+
acatome-extract enrich --summarize --skip-existing dir/ # incremental LLM pass
|
|
46
|
+
|
|
47
|
+
# Watch — extract + embed + ingest; LLM summaries off by default
|
|
48
|
+
acatome-extract watch ~/papers/inbox
|
|
49
|
+
acatome-extract watch ~/papers/inbox --summarize # enable LLM summaries
|
|
50
|
+
|
|
51
|
+
# Migrate old bundles to new summaries dict format + add RAKE
|
|
52
|
+
acatome-extract migrate ~/.acatome/papers
|
|
53
|
+
acatome-extract migrate ~/.acatome/papers --dry-run # preview changes
|
|
54
|
+
|
|
55
|
+
# Supplements
|
|
56
|
+
acatome-extract attach parent-slug supplement.pdf --name s1
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
### Summaries
|
|
60
|
+
|
|
61
|
+
Extraction always generates **RAKE** (extractive keyword) summaries — instant, no LLM required. LLM-based summaries are opt-in via `--summarize` and require an Ollama or litellm-compatible model.
|
|
62
|
+
|
|
63
|
+
RAKE summaries are used as the default for search and display. To add LLM summaries later:
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
acatome-extract enrich --summarize --skip-existing ~/.acatome/papers
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### Sidecar metadata
|
|
70
|
+
|
|
71
|
+
Place a `<stem>.meta.json` alongside any PDF to override metadata:
|
|
72
|
+
|
|
73
|
+
```json
|
|
74
|
+
{"type": "datasheet", "title": "LM317 Regulator", "author": "Texas Instruments", "year": 2022}
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
Supported fields: `type`, `title`, `author` (string or list), `year`, `doi`, `abstract`, `journal`.
|
|
78
|
+
|
|
79
|
+
## Dependencies
|
|
80
|
+
|
|
81
|
+
- **acatome-meta** — metadata lookup and verification
|
|
82
|
+
- **marker-pdf** — structured PDF extraction
|
|
83
|
+
- **litellm** / **Ollama** — LLM-based enrichment
|
|
84
|
+
|
|
85
|
+
## Testing
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
uv run python -m pytest tests/ -v
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
## License
|
|
92
|
+
|
|
93
|
+
GPL-3.0-or-later — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "acatome-extract"
|
|
3
|
+
version = "0.2.0"
|
|
4
|
+
description = "PDF extraction pipeline for scientific papers"
|
|
5
|
+
requires-python = ">=3.11"
|
|
6
|
+
license = "GPL-3.0-or-later"
|
|
7
|
+
authors = [{name = "Reto Stamm", email = "reto@retostamm.com"}]
|
|
8
|
+
readme = "README.md"
|
|
9
|
+
keywords = ["pdf", "extraction", "scientific-papers", "nlp", "summarization", "embeddings"]
|
|
10
|
+
classifiers = [
|
|
11
|
+
"Development Status :: 3 - Alpha",
|
|
12
|
+
"Intended Audience :: Science/Research",
|
|
13
|
+
"License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
|
|
14
|
+
"Programming Language :: Python :: 3",
|
|
15
|
+
"Programming Language :: Python :: 3.11",
|
|
16
|
+
"Programming Language :: Python :: 3.12",
|
|
17
|
+
"Topic :: Scientific/Engineering",
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
dependencies = [
|
|
21
|
+
"acatome-meta>=0.1.0",
|
|
22
|
+
"marker-pdf>=1.0",
|
|
23
|
+
"precis-summary>=0.1.0",
|
|
24
|
+
"typer>=0.12",
|
|
25
|
+
"litellm>=1.40",
|
|
26
|
+
"watchdog>=4.0",
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
[project.urls]
|
|
30
|
+
Homepage = "https://github.com/acatome/acatome-extract"
|
|
31
|
+
Repository = "https://github.com/acatome/acatome-extract"
|
|
32
|
+
Issues = "https://github.com/acatome/acatome-extract/issues"
|
|
33
|
+
|
|
34
|
+
[project.optional-dependencies]
|
|
35
|
+
store = [
|
|
36
|
+
"acatome-store>=0.1.0",
|
|
37
|
+
]
|
|
38
|
+
embeddings = [
|
|
39
|
+
"sentence-transformers>=3.0",
|
|
40
|
+
]
|
|
41
|
+
gpu = [
|
|
42
|
+
"sentence-transformers>=3.0",
|
|
43
|
+
"torch>=2.0",
|
|
44
|
+
]
|
|
45
|
+
grobid = [
|
|
46
|
+
"grobid-client-python>=0.0.7",
|
|
47
|
+
]
|
|
48
|
+
dev = [
|
|
49
|
+
"pytest>=8.0",
|
|
50
|
+
"black>=24.0",
|
|
51
|
+
"ruff>=0.5",
|
|
52
|
+
]
|
|
53
|
+
|
|
54
|
+
[project.scripts]
|
|
55
|
+
acatome-extract = "acatome_extract.cli:app"
|
|
56
|
+
|
|
57
|
+
[build-system]
|
|
58
|
+
requires = ["hatchling"]
|
|
59
|
+
build-backend = "hatchling.build"
|
|
60
|
+
|
|
61
|
+
[tool.hatch.build.targets.wheel]
|
|
62
|
+
packages = ["src/acatome_extract"]
|
|
63
|
+
|
|
64
|
+
[tool.pytest.ini_options]
|
|
65
|
+
testpaths = ["tests"]
|
|
66
|
+
|
|
67
|
+
[tool.black]
|
|
68
|
+
line-length = 88
|
|
69
|
+
|
|
70
|
+
[tool.ruff]
|
|
71
|
+
line-length = 88
|
|
72
|
+
|
|
73
|
+
[tool.bumpversion]
|
|
74
|
+
current_version = "0.2.0"
|
|
75
|
+
commit = true
|
|
76
|
+
tag = true
|
|
77
|
+
tag_name = "v{new_version}"
|
|
78
|
+
|
|
79
|
+
[[tool.bumpversion.files]]
|
|
80
|
+
filename = "pyproject.toml"
|
|
81
|
+
search = 'version = "{current_version}"'
|
|
82
|
+
replace = 'version = "{new_version}"'
|
|
83
|
+
|
|
84
|
+
[[tool.bumpversion.files]]
|
|
85
|
+
filename = "src/acatome_extract/__init__.py"
|
|
86
|
+
search = '__version__ = "{current_version}"'
|
|
87
|
+
replace = '__version__ = "{new_version}"'
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
"""acatome-extract: PDF extraction pipeline for scientific papers."""
|
|
2
|
+
|
|
3
|
+
from acatome_extract.bundle import read_bundle, write_bundle
|
|
4
|
+
from acatome_extract.pipeline import extract, extract_dir
|
|
5
|
+
|
|
6
|
+
__all__ = ["extract", "extract_dir", "read_bundle", "write_bundle"]
|
|
7
|
+
__version__ = "0.2.0"
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""Read/write .acatome bundle files (gzipped JSON)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import gzip
|
|
6
|
+
import json
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def write_bundle(data: dict[str, Any], path: str | Path) -> Path:
|
|
12
|
+
"""Write a bundle dict as gzipped JSON.
|
|
13
|
+
|
|
14
|
+
Args:
|
|
15
|
+
data: Bundle dict (header + blocks + enrichment_meta).
|
|
16
|
+
path: Output path (should end in .acatome).
|
|
17
|
+
|
|
18
|
+
Returns:
|
|
19
|
+
Path to written file.
|
|
20
|
+
"""
|
|
21
|
+
path = Path(path)
|
|
22
|
+
path.parent.mkdir(parents=True, exist_ok=True)
|
|
23
|
+
with gzip.open(path, "wt", encoding="utf-8") as f:
|
|
24
|
+
json.dump(data, f, ensure_ascii=False, separators=(",", ":"))
|
|
25
|
+
return path
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def read_bundle(path: str | Path) -> dict[str, Any]:
|
|
29
|
+
"""Read a .acatome bundle file.
|
|
30
|
+
|
|
31
|
+
Args:
|
|
32
|
+
path: Path to .acatome file.
|
|
33
|
+
|
|
34
|
+
Returns:
|
|
35
|
+
Parsed bundle dict.
|
|
36
|
+
"""
|
|
37
|
+
path = Path(path)
|
|
38
|
+
with gzip.open(path, "rt", encoding="utf-8") as f:
|
|
39
|
+
return json.load(f)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def update_bundle(data: dict[str, Any], path: str | Path) -> Path:
    """Persist a bundle dict that the caller has already modified.

    This simply delegates to ``write_bundle``; it performs no merging with
    whatever is currently stored at *path*.

    Args:
        data: Bundle dict (header + blocks + enrichment_meta).
        path: Path to .acatome file.

    Returns:
        Path to written file.
    """
    written = write_bundle(data, path)
    return written
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
"""Recursive character text splitter for document chunking.
|
|
2
|
+
|
|
3
|
+
Splits text into chunks of roughly ``chunk_size`` characters, preferring
|
|
4
|
+
to break at natural boundaries (paragraphs → newlines → sentences → words).
|
|
5
|
+
Adjacent chunks overlap by ``chunk_overlap`` characters to preserve context
|
|
6
|
+
across chunk boundaries.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import re
|
|
12
|
+
|
|
13
|
+
# Default separators, tried in order (prefer paragraph → line → sentence → clause → word)
|
|
14
|
+
DEFAULT_SEPARATORS: list[str] = ["\n\n", "\n", ". ", ", ", " "]
|
|
15
|
+
|
|
16
|
+
# Reasonable defaults for academic papers
|
|
17
|
+
DEFAULT_CHUNK_SIZE = 800
|
|
18
|
+
DEFAULT_CHUNK_OVERLAP = 150
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def split_text(
    text: str,
    chunk_size: int = DEFAULT_CHUNK_SIZE,
    chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
    separators: list[str] | None = None,
) -> list[str]:
    """Break *text* into chunks of roughly *chunk_size* characters.

    Separators are tried in the order given (``DEFAULT_SEPARATORS`` when
    *separators* is ``None``). The first one that actually divides the text
    is used; resulting chunks that are still too long are split again with
    the separators that come after it. Neighbouring chunks repeat up to
    *chunk_overlap* characters of shared context.

    Args:
        text: The text to split.
        chunk_size: Target maximum chunk length in characters.
        chunk_overlap: Maximum number of characters repeated between
            adjacent chunks.
        separators: Ordered separator strings, coarsest first.

    Returns:
        A list of non-empty, whitespace-stripped strings. Blank input gives
        an empty list; input no longer than *chunk_size* gives one chunk.
    """
    trimmed = text.strip()
    if not trimmed:
        return []

    # Note: the length check uses the raw text, not the stripped one.
    if len(text) <= chunk_size:
        return [trimmed]

    if separators is None:
        separators = list(DEFAULT_SEPARATORS)
    return _recursive_split(text, chunk_size, chunk_overlap, separators)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _recursive_split(
    text: str,
    chunk_size: int,
    chunk_overlap: int,
    separators: list[str],
) -> list[str]:
    """Split *text* with the first separator that divides it, then recurse.

    Pieces produced by the chosen separator are merged back into chunks via
    ``_merge_pieces``. Any merged chunk that is still longer than
    *chunk_size* is split again using only the separators that follow the
    chosen one; when none remain the chunk is kept whole.

    Returns:
        Whitespace-stripped, non-empty chunks in document order.
    """
    if len(text) <= chunk_size:
        trimmed = text.strip()
        return [trimmed] if trimmed else []

    for index, separator in enumerate(separators):
        fragments = _split_keeping_sep(text, separator)
        if len(fragments) < 2:
            # This separator does not occur in the text; try a finer one.
            continue

        finer_separators = separators[index + 1 :]
        output: list[str] = []
        for candidate in _merge_pieces(
            fragments, chunk_size, chunk_overlap, separator
        ):
            if len(candidate) > chunk_size and finer_separators:
                output.extend(
                    _recursive_split(
                        candidate, chunk_size, chunk_overlap, finer_separators
                    )
                )
                continue
            # Either the chunk fits, or there is nothing finer to split on
            # (an oversize chunk is kept whole rather than cut mid-word).
            trimmed = candidate.strip()
            if trimmed:
                output.append(trimmed)
        return output

    # None of the separators occurs in the text.
    trimmed = text.strip()
    return [trimmed] if trimmed else []
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _split_keeping_sep(text: str, sep: str) -> list[str]:
|
|
95
|
+
"""Split text by *sep*, keeping the separator at the start of each piece
|
|
96
|
+
(except the first)."""
|
|
97
|
+
parts = text.split(sep)
|
|
98
|
+
if len(parts) <= 1:
|
|
99
|
+
return parts
|
|
100
|
+
|
|
101
|
+
result = [parts[0]]
|
|
102
|
+
for part in parts[1:]:
|
|
103
|
+
result.append(sep + part)
|
|
104
|
+
return result
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def _merge_pieces(
|
|
108
|
+
pieces: list[str],
|
|
109
|
+
chunk_size: int,
|
|
110
|
+
chunk_overlap: int,
|
|
111
|
+
sep: str,
|
|
112
|
+
) -> list[str]:
|
|
113
|
+
"""Greedily merge adjacent pieces into chunks up to *chunk_size*.
|
|
114
|
+
|
|
115
|
+
When starting a new chunk, includes up to *chunk_overlap* characters
|
|
116
|
+
from the tail of the previous chunk.
|
|
117
|
+
"""
|
|
118
|
+
chunks: list[str] = []
|
|
119
|
+
current: list[str] = []
|
|
120
|
+
current_len = 0
|
|
121
|
+
|
|
122
|
+
for piece in pieces:
|
|
123
|
+
piece_len = len(piece)
|
|
124
|
+
|
|
125
|
+
if current and current_len + piece_len > chunk_size:
|
|
126
|
+
# Flush current buffer
|
|
127
|
+
chunk_text = "".join(current)
|
|
128
|
+
if chunk_text.strip():
|
|
129
|
+
chunks.append(chunk_text)
|
|
130
|
+
|
|
131
|
+
# Build overlap from end of current buffer
|
|
132
|
+
overlap_pieces: list[str] = []
|
|
133
|
+
overlap_len = 0
|
|
134
|
+
for prev in reversed(current):
|
|
135
|
+
if overlap_len + len(prev) > chunk_overlap:
|
|
136
|
+
break
|
|
137
|
+
overlap_pieces.insert(0, prev)
|
|
138
|
+
overlap_len += len(prev)
|
|
139
|
+
|
|
140
|
+
current = overlap_pieces
|
|
141
|
+
current_len = overlap_len
|
|
142
|
+
|
|
143
|
+
current.append(piece)
|
|
144
|
+
current_len += piece_len
|
|
145
|
+
|
|
146
|
+
# Flush remaining
|
|
147
|
+
if current:
|
|
148
|
+
chunk_text = "".join(current)
|
|
149
|
+
if chunk_text.strip():
|
|
150
|
+
chunks.append(chunk_text)
|
|
151
|
+
|
|
152
|
+
return chunks
|