acatome-extract 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,33 @@
1
# Runs the test suite and, if it passes, builds the package and uploads it to
# PyPI via trusted publishing whenever a GitHub release is published.
name: Publish to PyPI

on:
  release:
    types: [published]

# Least privilege by default: jobs only get read access to the repository.
# The OIDC token permission is granted solely to the publish job below, so
# the test job (which installs and runs third-party code) cannot mint one.
permissions:
  contents: read

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: astral-sh/setup-uv@v4
        with:
          version: "latest"
      - run: uv venv
      - run: uv pip install -e ".[dev]"
      - run: uv run --no-sync pytest

  publish:
    needs: test
    runs-on: ubuntu-latest
    environment: pypi
    permissions:
      id-token: write # trusted publishing (OIDC)
      contents: read
    steps:
      - uses: actions/checkout@v4
      - uses: astral-sh/setup-uv@v4
        with:
          version: "latest"
      - run: uv build
      - uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,37 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.egg-info/
6
+ *.egg
7
+ dist/
8
+ build/
9
+
10
+ # Virtual environments
11
+ .venv/
12
+ venv/
13
+
14
+ # Testing
15
+ .pytest_cache/
16
+ .coverage
17
+ htmlcov/
18
+
19
+ # IDE
20
+ .idea/
21
+ .vscode/
22
+ *.swp
23
+ *.swo
24
+ *~
25
+
26
+ # OS
27
+ .DS_Store
28
+ Thumbs.db
29
+
30
+ # UV
31
+ uv.lock
32
+
33
+ # Data
34
+ *.acatome/
35
+ inbox/
36
+ completed/
37
+ errors/
@@ -0,0 +1,11 @@
1
+ # Changelog
2
+
3
+ All notable changes to **acatome-extract** will be documented in this file.
4
+
5
+ Format follows [Keep a Changelog](https://keepachangelog.com/).
6
+
7
+ ## [0.1.0] — 2026-03-11
8
+
9
+ ### Added
10
+
11
+ - Initial release.
@@ -0,0 +1,14 @@
1
+ Copyright (c) 2026 Reto Stamm and Acatome Contributors
2
+
3
+ This program is free software: you can redistribute it and/or modify
4
+ it under the terms of the GNU General Public License as published by
5
+ the Free Software Foundation, either version 3 of the License, or
6
+ (at your option) any later version.
7
+
8
+ This program is distributed in the hope that it will be useful,
9
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ GNU General Public License for more details.
12
+
13
+ You should have received a copy of the GNU General Public License
14
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
@@ -0,0 +1,133 @@
1
+ Metadata-Version: 2.4
2
+ Name: acatome-extract
3
+ Version: 0.2.0
4
+ Summary: PDF extraction pipeline for scientific papers
5
+ Project-URL: Homepage, https://github.com/acatome/acatome-extract
6
+ Project-URL: Repository, https://github.com/acatome/acatome-extract
7
+ Project-URL: Issues, https://github.com/acatome/acatome-extract/issues
8
+ Author-email: Reto Stamm <reto@retostamm.com>
9
+ License-Expression: GPL-3.0-or-later
10
+ License-File: LICENSE
11
+ Keywords: embeddings,extraction,nlp,pdf,scientific-papers,summarization
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Scientific/Engineering
19
+ Requires-Python: >=3.11
20
+ Requires-Dist: acatome-meta>=0.1.0
21
+ Requires-Dist: litellm>=1.40
22
+ Requires-Dist: marker-pdf>=1.0
23
+ Requires-Dist: precis-summary>=0.1.0
24
+ Requires-Dist: typer>=0.12
25
+ Requires-Dist: watchdog>=4.0
26
+ Provides-Extra: dev
27
+ Requires-Dist: black>=24.0; extra == 'dev'
28
+ Requires-Dist: pytest>=8.0; extra == 'dev'
29
+ Requires-Dist: ruff>=0.5; extra == 'dev'
30
+ Provides-Extra: embeddings
31
+ Requires-Dist: sentence-transformers>=3.0; extra == 'embeddings'
32
+ Provides-Extra: gpu
33
+ Requires-Dist: sentence-transformers>=3.0; extra == 'gpu'
34
+ Requires-Dist: torch>=2.0; extra == 'gpu'
35
+ Provides-Extra: grobid
36
+ Requires-Dist: grobid-client-python>=0.0.7; extra == 'grobid'
37
+ Provides-Extra: store
38
+ Requires-Dist: acatome-store>=0.1.0; extra == 'store'
39
+ Description-Content-Type: text/markdown
40
+
41
+ # acatome-extract
42
+
43
+ PDF extraction and enrichment pipeline for scientific papers. Converts PDFs into structured, searchable bundles with block-level summaries and embeddings.
44
+
45
+ ## Features
46
+
47
+ - **Marker PDF extraction** — structured block extraction with headings, tables, figures
48
+ - **Fitz fallback** — recursive character chunking when Marker is unavailable
49
+ - **LLM enrichment** — block and paper summaries via Ollama or litellm
50
+ - **Embeddings** — sentence-transformer embeddings for semantic search
51
+ - **File watcher** — `acatome-extract watch` monitors an inbox folder
52
+ - **Bundle format** — `.acatome` companion files for sharing pre-built extractions
53
+ - **CLI** — `acatome-extract` command for extract, enrich, and watch workflows
54
+
55
+ ## Installation
56
+
57
+ ```bash
58
+ uv pip install -e .
59
+ ```
60
+
61
+ With GPU acceleration:
62
+
63
+ ```bash
64
+ uv pip install -e ".[gpu]"
65
+ ```
66
+
67
+ ## Usage
68
+
69
+ ```python
70
+ from acatome_extract.pipeline import extract
71
+
72
+ bundle = extract("/path/to/paper.pdf")
73
+ ```
74
+
75
+ ## CLI
76
+
77
+ ```bash
78
+ # Extract (RAKE summaries included automatically, no LLM needed)
79
+ acatome-extract extract paper.pdf
80
+ acatome-extract extract --type datasheet TI_LM317.pdf # non-article types
81
+
82
+ # Enrich — embeddings only by default; add --summarize for LLM summaries
83
+ acatome-extract enrich /path/to/bundle
84
+ acatome-extract enrich --summarize /path/to/bundle # enable LLM summaries
85
+ acatome-extract enrich --summarize --skip-existing dir/ # incremental LLM pass
86
+
87
+ # Watch — extract + embed + ingest; LLM summaries off by default
88
+ acatome-extract watch ~/papers/inbox
89
+ acatome-extract watch ~/papers/inbox --summarize # enable LLM summaries
90
+
91
+ # Migrate old bundles to new summaries dict format + add RAKE
92
+ acatome-extract migrate ~/.acatome/papers
93
+ acatome-extract migrate ~/.acatome/papers --dry-run # preview changes
94
+
95
+ # Supplements
96
+ acatome-extract attach parent-slug supplement.pdf --name s1
97
+ ```
98
+
99
+ ### Summaries
100
+
101
+ Extraction always generates **RAKE** (extractive keyword) summaries — instant, no LLM required. LLM-based summaries are opt-in via `--summarize` and require an Ollama or litellm-compatible model.
102
+
103
+ RAKE summaries are used as the default for search and display. To add LLM summaries later:
104
+
105
+ ```bash
106
+ acatome-extract enrich --summarize --skip-existing ~/.acatome/papers
107
+ ```
108
+
109
+ ### Sidecar metadata
110
+
111
+ Place a `<stem>.meta.json` alongside any PDF to override metadata:
112
+
113
+ ```json
114
+ {"type": "datasheet", "title": "LM317 Regulator", "author": "Texas Instruments", "year": 2022}
115
+ ```
116
+
117
+ Supported fields: `type`, `title`, `author` (string or list), `year`, `doi`, `abstract`, `journal`.
118
+
119
+ ## Dependencies
120
+
121
+ - **acatome-meta** — metadata lookup and verification
122
+ - **marker-pdf** — structured PDF extraction
123
+ - **litellm** / **Ollama** — LLM-based enrichment
124
+
125
+ ## Testing
126
+
127
+ ```bash
128
+ uv run python -m pytest tests/ -v
129
+ ```
130
+
131
+ ## License
132
+
133
+ GPL-3.0-or-later — see [LICENSE](LICENSE).
@@ -0,0 +1,93 @@
1
+ # acatome-extract
2
+
3
+ PDF extraction and enrichment pipeline for scientific papers. Converts PDFs into structured, searchable bundles with block-level summaries and embeddings.
4
+
5
+ ## Features
6
+
7
+ - **Marker PDF extraction** — structured block extraction with headings, tables, figures
8
+ - **Fitz fallback** — recursive character chunking when Marker is unavailable
9
+ - **LLM enrichment** — block and paper summaries via Ollama or litellm
10
+ - **Embeddings** — sentence-transformer embeddings for semantic search
11
+ - **File watcher** — `acatome-extract watch` monitors an inbox folder
12
+ - **Bundle format** — `.acatome` companion files for sharing pre-built extractions
13
+ - **CLI** — `acatome-extract` command for extract, enrich, and watch workflows
14
+
15
+ ## Installation
16
+
17
+ ```bash
18
+ uv pip install -e .
19
+ ```
20
+
21
+ With GPU acceleration:
22
+
23
+ ```bash
24
+ uv pip install -e ".[gpu]"
25
+ ```
26
+
27
+ ## Usage
28
+
29
+ ```python
30
+ from acatome_extract.pipeline import extract
31
+
32
+ bundle = extract("/path/to/paper.pdf")
33
+ ```
34
+
35
+ ## CLI
36
+
37
+ ```bash
38
+ # Extract (RAKE summaries included automatically, no LLM needed)
39
+ acatome-extract extract paper.pdf
40
+ acatome-extract extract --type datasheet TI_LM317.pdf # non-article types
41
+
42
+ # Enrich — embeddings only by default; add --summarize for LLM summaries
43
+ acatome-extract enrich /path/to/bundle
44
+ acatome-extract enrich --summarize /path/to/bundle # enable LLM summaries
45
+ acatome-extract enrich --summarize --skip-existing dir/ # incremental LLM pass
46
+
47
+ # Watch — extract + embed + ingest; LLM summaries off by default
48
+ acatome-extract watch ~/papers/inbox
49
+ acatome-extract watch ~/papers/inbox --summarize # enable LLM summaries
50
+
51
+ # Migrate old bundles to new summaries dict format + add RAKE
52
+ acatome-extract migrate ~/.acatome/papers
53
+ acatome-extract migrate ~/.acatome/papers --dry-run # preview changes
54
+
55
+ # Supplements
56
+ acatome-extract attach parent-slug supplement.pdf --name s1
57
+ ```
58
+
59
+ ### Summaries
60
+
61
+ Extraction always generates **RAKE** (extractive keyword) summaries — instant, no LLM required. LLM-based summaries are opt-in via `--summarize` and require an Ollama or litellm-compatible model.
62
+
63
+ RAKE summaries are used as the default for search and display. To add LLM summaries later:
64
+
65
+ ```bash
66
+ acatome-extract enrich --summarize --skip-existing ~/.acatome/papers
67
+ ```
68
+
69
+ ### Sidecar metadata
70
+
71
+ Place a `<stem>.meta.json` alongside any PDF to override metadata:
72
+
73
+ ```json
74
+ {"type": "datasheet", "title": "LM317 Regulator", "author": "Texas Instruments", "year": 2022}
75
+ ```
76
+
77
+ Supported fields: `type`, `title`, `author` (string or list), `year`, `doi`, `abstract`, `journal`.
78
+
79
+ ## Dependencies
80
+
81
+ - **acatome-meta** — metadata lookup and verification
82
+ - **marker-pdf** — structured PDF extraction
83
+ - **litellm** / **Ollama** — LLM-based enrichment
84
+
85
+ ## Testing
86
+
87
+ ```bash
88
+ uv run python -m pytest tests/ -v
89
+ ```
90
+
91
+ ## License
92
+
93
+ GPL-3.0-or-later — see [LICENSE](LICENSE).
@@ -0,0 +1,87 @@
1
+ [project]
2
+ name = "acatome-extract"
3
+ version = "0.2.0"
4
+ description = "PDF extraction pipeline for scientific papers"
5
+ requires-python = ">=3.11"
6
+ license = "GPL-3.0-or-later"
7
+ authors = [{name = "Reto Stamm", email = "reto@retostamm.com"}]
8
+ readme = "README.md"
9
+ keywords = ["pdf", "extraction", "scientific-papers", "nlp", "summarization", "embeddings"]
10
+ classifiers = [
11
+ "Development Status :: 3 - Alpha",
12
+ "Intended Audience :: Science/Research",
13
+ "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
14
+ "Programming Language :: Python :: 3",
15
+ "Programming Language :: Python :: 3.11",
16
+ "Programming Language :: Python :: 3.12",
17
+ "Topic :: Scientific/Engineering",
18
+ ]
19
+
20
+ dependencies = [
21
+ "acatome-meta>=0.1.0",
22
+ "marker-pdf>=1.0",
23
+ "precis-summary>=0.1.0",
24
+ "typer>=0.12",
25
+ "litellm>=1.40",
26
+ "watchdog>=4.0",
27
+ ]
28
+
29
+ [project.urls]
30
+ Homepage = "https://github.com/acatome/acatome-extract"
31
+ Repository = "https://github.com/acatome/acatome-extract"
32
+ Issues = "https://github.com/acatome/acatome-extract/issues"
33
+
34
+ [project.optional-dependencies]
35
+ store = [
36
+ "acatome-store>=0.1.0",
37
+ ]
38
+ embeddings = [
39
+ "sentence-transformers>=3.0",
40
+ ]
41
+ gpu = [
42
+ "sentence-transformers>=3.0",
43
+ "torch>=2.0",
44
+ ]
45
+ grobid = [
46
+ "grobid-client-python>=0.0.7",
47
+ ]
48
+ dev = [
49
+ "pytest>=8.0",
50
+ "black>=24.0",
51
+ "ruff>=0.5",
52
+ ]
53
+
54
+ [project.scripts]
55
+ acatome-extract = "acatome_extract.cli:app"
56
+
57
+ [build-system]
58
+ requires = ["hatchling"]
59
+ build-backend = "hatchling.build"
60
+
61
+ [tool.hatch.build.targets.wheel]
62
+ packages = ["src/acatome_extract"]
63
+
64
+ [tool.pytest.ini_options]
65
+ testpaths = ["tests"]
66
+
67
+ [tool.black]
68
+ line-length = 88
69
+
70
+ [tool.ruff]
71
+ line-length = 88
72
+
73
+ [tool.bumpversion]
74
+ current_version = "0.2.0"
75
+ commit = true
76
+ tag = true
77
+ tag_name = "v{new_version}"
78
+
79
+ [[tool.bumpversion.files]]
80
+ filename = "pyproject.toml"
81
+ search = 'version = "{current_version}"'
82
+ replace = 'version = "{new_version}"'
83
+
84
+ [[tool.bumpversion.files]]
85
+ filename = "src/acatome_extract/__init__.py"
86
+ search = '__version__ = "{current_version}"'
87
+ replace = '__version__ = "{new_version}"'
@@ -0,0 +1,7 @@
1
+ """acatome-extract: PDF extraction pipeline for scientific papers."""
2
+
3
+ from acatome_extract.bundle import read_bundle, write_bundle
4
+ from acatome_extract.pipeline import extract, extract_dir
5
+
6
+ __all__ = ["extract", "extract_dir", "read_bundle", "write_bundle"]
7
+ __version__ = "0.2.0"
@@ -0,0 +1,55 @@
1
+ """Read/write .acatome bundle files (gzipped JSON)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import gzip
6
+ import json
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+
11
def write_bundle(data: dict[str, Any], path: str | Path) -> Path:
    """Write a bundle dict as gzipped JSON.

    The payload is first written to a temporary sibling file and only then
    moved over *path*. A failure part-way through (for example a value that
    JSON cannot encode) therefore never truncates or corrupts a bundle that
    already exists at *path*.

    Args:
        data: Bundle dict (header + blocks + enrichment_meta).
        path: Output path (should end in .acatome). Missing parent
            directories are created.

    Returns:
        Path to written file.

    Raises:
        TypeError: If *data* contains a value that JSON cannot serialise.
        OSError: If the directory or file cannot be created or replaced.
    """
    import os

    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)

    # Same directory as the target so the final rename stays on one
    # filesystem; the PID keeps concurrent writers from sharing a temp file.
    tmp_path = path.with_name(f".{path.name}.{os.getpid()}.tmp")
    try:
        with gzip.open(tmp_path, "wt", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, separators=(",", ":"))
        os.replace(tmp_path, path)
    except BaseException:
        # Do not leave a half-written temp file behind; re-raise unchanged.
        tmp_path.unlink(missing_ok=True)
        raise
    return path
26
+
27
+
28
def read_bundle(path: str | Path) -> dict[str, Any]:
    """Load a .acatome bundle (gzip-compressed, UTF-8 encoded JSON).

    Args:
        path: Location of the .acatome file.

    Returns:
        The decoded JSON content of the bundle.
    """
    bundle_file = Path(path)
    with gzip.open(bundle_file, "rt", encoding="utf-8") as stream:
        bundle = json.load(stream)
    return bundle
40
+
41
+
42
def update_bundle(
    data: dict[str, Any],
    path: str | Path,
) -> Path:
    """Persist a bundle dict that the caller has already modified.

    This simply delegates to ``write_bundle``; it exists so that call sites
    can express "save changes to an existing bundle" by name.

    Args:
        data: Bundle dict (header + blocks + enrichment_meta).
        path: Path to .acatome file.

    Returns:
        Path to written file.
    """
    written = write_bundle(data, path)
    return written
@@ -0,0 +1,152 @@
1
+ """Recursive character text splitter for document chunking.
2
+
3
+ Splits text into chunks of roughly ``chunk_size`` characters, preferring
4
+ to break at natural boundaries (paragraphs → newlines → sentences → words).
5
+ Adjacent chunks overlap by ``chunk_overlap`` characters to preserve context
6
+ across chunk boundaries.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import re
12
+
13
# Reasonable defaults for academic papers (both measured in characters).
DEFAULT_CHUNK_SIZE = 800
DEFAULT_CHUNK_OVERLAP = 150

# Separators tried in order, from the coarsest boundary to the finest.
DEFAULT_SEPARATORS: list[str] = [
    "\n\n",  # paragraph
    "\n",  # line
    ". ",  # sentence
    ", ",  # clause
    " ",  # word
]
19
+
20
+
21
def split_text(
    text: str,
    chunk_size: int = DEFAULT_CHUNK_SIZE,
    chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
    separators: list[str] | None = None,
) -> list[str]:
    """Break *text* into stripped chunks of roughly *chunk_size* characters.

    Blank input yields an empty list, and text that already fits within
    *chunk_size* is returned as a single stripped chunk. Anything longer is
    handed to the recursive splitter, which tries the separators in order
    and falls back to finer ones for pieces that are still too long.
    Neighbouring chunks may share up to *chunk_overlap* characters.

    A chunk can exceed *chunk_size* when no remaining separator is able to
    break it down further; text is never cut in the middle of a word.

    Args:
        text: The text to split.
        chunk_size: Target maximum chunk length in characters.
        chunk_overlap: Maximum number of characters repeated between
            adjacent chunks.
        separators: Separators to try, coarsest first. ``None`` selects a
            copy of ``DEFAULT_SEPARATORS``.

    Returns:
        A list of non-empty, whitespace-stripped chunks.
    """
    trimmed = text.strip()
    if not trimmed:
        return []

    # The length check deliberately uses the untrimmed text.
    if len(text) <= chunk_size:
        return [trimmed]

    if separators is None:
        active_separators = list(DEFAULT_SEPARATORS)
    else:
        active_separators = separators

    return _recursive_split(text, chunk_size, chunk_overlap, active_separators)
47
+
48
+
49
def _recursive_split(
    text: str,
    chunk_size: int,
    chunk_overlap: int,
    separators: list[str],
) -> list[str]:
    """Split *text* with the first separator that occurs in it.

    The pieces produced by that separator are merged back into chunks of up
    to *chunk_size* characters. Any merged chunk that is still too long is
    split again using only the separators that come after the one just used.
    When no finer separator is left, the oversized chunk is kept whole.

    Returns a list of non-empty, whitespace-stripped chunks.
    """
    if len(text) > chunk_size:
        for idx, sep in enumerate(separators):
            pieces = _split_keeping_sep(text, sep)
            if len(pieces) < 2:
                # This separator does not occur in the text; try the next.
                continue

            finer_seps = separators[idx + 1 :]
            out: list[str] = []
            for chunk in _merge_pieces(pieces, chunk_size, chunk_overlap, sep):
                if len(chunk) > chunk_size and finer_seps:
                    out.extend(
                        _recursive_split(chunk, chunk_size, chunk_overlap, finer_seps)
                    )
                    continue
                # Either the chunk fits, or nothing finer is left to split
                # it with (it is kept whole rather than cut mid-word).
                cleaned = chunk.strip()
                if cleaned:
                    out.append(cleaned)
            return out

    # The text already fits, or none of the separators occur in it.
    cleaned = text.strip()
    return [cleaned] if cleaned else []
92
+
93
+
94
+ def _split_keeping_sep(text: str, sep: str) -> list[str]:
95
+ """Split text by *sep*, keeping the separator at the start of each piece
96
+ (except the first)."""
97
+ parts = text.split(sep)
98
+ if len(parts) <= 1:
99
+ return parts
100
+
101
+ result = [parts[0]]
102
+ for part in parts[1:]:
103
+ result.append(sep + part)
104
+ return result
105
+
106
+
107
+ def _merge_pieces(
108
+ pieces: list[str],
109
+ chunk_size: int,
110
+ chunk_overlap: int,
111
+ sep: str,
112
+ ) -> list[str]:
113
+ """Greedily merge adjacent pieces into chunks up to *chunk_size*.
114
+
115
+ When starting a new chunk, includes up to *chunk_overlap* characters
116
+ from the tail of the previous chunk.
117
+ """
118
+ chunks: list[str] = []
119
+ current: list[str] = []
120
+ current_len = 0
121
+
122
+ for piece in pieces:
123
+ piece_len = len(piece)
124
+
125
+ if current and current_len + piece_len > chunk_size:
126
+ # Flush current buffer
127
+ chunk_text = "".join(current)
128
+ if chunk_text.strip():
129
+ chunks.append(chunk_text)
130
+
131
+ # Build overlap from end of current buffer
132
+ overlap_pieces: list[str] = []
133
+ overlap_len = 0
134
+ for prev in reversed(current):
135
+ if overlap_len + len(prev) > chunk_overlap:
136
+ break
137
+ overlap_pieces.insert(0, prev)
138
+ overlap_len += len(prev)
139
+
140
+ current = overlap_pieces
141
+ current_len = overlap_len
142
+
143
+ current.append(piece)
144
+ current_len += piece_len
145
+
146
+ # Flush remaining
147
+ if current:
148
+ chunk_text = "".join(current)
149
+ if chunk_text.strip():
150
+ chunks.append(chunk_text)
151
+
152
+ return chunks