paper-harvest 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- paper_harvest-0.2.0/PKG-INFO +144 -0
- paper_harvest-0.2.0/README.md +135 -0
- paper_harvest-0.2.0/pyproject.toml +23 -0
- paper_harvest-0.2.0/setup.cfg +4 -0
- paper_harvest-0.2.0/src/paper_harvest/__init__.py +1 -0
- paper_harvest-0.2.0/src/paper_harvest/cli.py +231 -0
- paper_harvest-0.2.0/src/paper_harvest/core/__init__.py +0 -0
- paper_harvest-0.2.0/src/paper_harvest/core/article_layout.py +67 -0
- paper_harvest-0.2.0/src/paper_harvest/core/md/__init__.py +0 -0
- paper_harvest-0.2.0/src/paper_harvest/core/md/mineru.py +572 -0
- paper_harvest-0.2.0/src/paper_harvest/core/models.py +54 -0
- paper_harvest-0.2.0/src/paper_harvest/core/pdf/__init__.py +0 -0
- paper_harvest-0.2.0/src/paper_harvest/core/pdf/download.py +59 -0
- paper_harvest-0.2.0/src/paper_harvest/core/tex/__init__.py +0 -0
- paper_harvest-0.2.0/src/paper_harvest/core/tex/arxiv.py +133 -0
- paper_harvest-0.2.0/src/paper_harvest/core/url/__init__.py +0 -0
- paper_harvest-0.2.0/src/paper_harvest/core/url/parse.py +73 -0
- paper_harvest-0.2.0/src/paper_harvest.egg-info/PKG-INFO +144 -0
- paper_harvest-0.2.0/src/paper_harvest.egg-info/SOURCES.txt +22 -0
- paper_harvest-0.2.0/src/paper_harvest.egg-info/dependency_links.txt +1 -0
- paper_harvest-0.2.0/src/paper_harvest.egg-info/entry_points.txt +2 -0
- paper_harvest-0.2.0/src/paper_harvest.egg-info/requires.txt +3 -0
- paper_harvest-0.2.0/src/paper_harvest.egg-info/top_level.txt +1 -0
- paper_harvest-0.2.0/tests/test_parse.py +36 -0
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: paper-harvest
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Harvest arXiv papers into organized article directories with PDF, TeX, and Markdown.
|
|
5
|
+
Requires-Python: >=3.9
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
Provides-Extra: dev
|
|
8
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
9
|
+
|
|
10
|
+
# paper-harvest
|
|
11
|
+
|
|
12
|
+
Collect arXiv papers into a unified article directory: PDF, TeX source, optional Markdown, and a structured manifest.
|
|
13
|
+
|
|
14
|
+
Python 3.9+, stdlib only at runtime. Two ways to use it:
|
|
15
|
+
|
|
16
|
+
| Audience | How |
|
|
17
|
+
|----------|-----|
|
|
18
|
+
| Terminal / automation | `uvx paper-harvest` (PyPI) |
|
|
19
|
+
| Cursor / Agent | Install the [skill zip](#cursor-skill) |
|
|
20
|
+
|
|
21
|
+
## Quick start
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
uvx paper-harvest 'https://arxiv.org/abs/2403.18074' --json
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
From a local checkout:
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
uvx --from . paper-harvest 'https://arxiv.org/abs/2403.18074' --json
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
Install permanently:
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
uv tool install paper-harvest
|
|
37
|
+
paper-harvest 'https://arxiv.org/abs/2403.18074' --json
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
**Input:** arXiv URLs only (`arxiv.org`, `abs` / `pdf` / `src`).
|
|
41
|
+
|
|
42
|
+
## Output
|
|
43
|
+
|
|
44
|
+
```text
|
|
45
|
+
<article_id>/
|
|
46
|
+
pdf/<article_id>.pdf
|
|
47
|
+
tex/
|
|
48
|
+
arXiv-<id>.tar.gz
|
|
49
|
+
source/
|
|
50
|
+
md/
|
|
51
|
+
<article_id>.md
|
|
52
|
+
assets/
|
|
53
|
+
mineru_result.json
|
|
54
|
+
harvest_result.json
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
Each step reports `ok`, `skipped`, or `failed` in `harvest_result.json`. Failed steps do not remove earlier artifacts.
|
|
58
|
+
|
|
59
|
+
## CLI flags
|
|
60
|
+
|
|
61
|
+
| Flag | Effect |
|
|
62
|
+
|------|--------|
|
|
63
|
+
| `--target-dir <dir>` | Output root (default: `.`) |
|
|
64
|
+
| `--skip-tex` | Skip TeX source download |
|
|
65
|
+
| `--skip-md` | Skip Markdown extraction |
|
|
66
|
+
| `--force` | Overwrite existing `md/` |
|
|
67
|
+
| `--json` | Print structured JSON to stdout |
|
|
68
|
+
| `--timeout <sec>` | Download timeout (default: 120) |
|
|
69
|
+
|
|
70
|
+
## Markdown (MinerU)
|
|
71
|
+
|
|
72
|
+
Markdown is optional and uses [MinerU](https://mineru.net/).
|
|
73
|
+
|
|
74
|
+
1. **With token** → precision API (better quality, larger PDFs)
|
|
75
|
+
2. **Without token** → lightweight API (PDFs ≤ 10 MB)
|
|
76
|
+
3. **Otherwise** → `md` is `skipped` or `failed`; PDF and TeX are kept
|
|
77
|
+
|
|
78
|
+
Configure a token via any one of:
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
export MINERU_API_TOKEN='your-token'
|
|
82
|
+
echo 'your-token' > ~/.mineru_token && chmod 600 ~/.mineru_token
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
Also checked: `./.mineru_token`, `~/.config/mineru/token`. Never commit token files.
|
|
86
|
+
|
|
87
|
+
To re-run only the Markdown step after adding a token:
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
uvx paper-harvest '<same-arxiv-url>' --skip-tex --force --json
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## Cursor Skill
|
|
94
|
+
|
|
95
|
+
The skill is Agent instructions only; execution still goes through the CLI.
|
|
96
|
+
|
|
97
|
+
**Install** — download `paper-harvest-<version>-skill.zip` from releases:
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
unzip paper-harvest-0.2.0-skill.zip -d ~/.cursor/skills/
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
**Build locally:**
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
uv build
|
|
107
|
+
python tools/build_skill_zip.py
|
|
108
|
+
# → dist/paper-harvest-0.2.0-skill.zip
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
Skill sources live in `skill/` (`SKILL.md`, `references/`, `scripts/harvest.sh`).
|
|
112
|
+
|
|
113
|
+
## Project layout
|
|
114
|
+
|
|
115
|
+
```text
|
|
116
|
+
paper-harvest/
|
|
117
|
+
pyproject.toml
|
|
118
|
+
src/paper_harvest/ # CLI + core
|
|
119
|
+
skill/ # Agent skill (shipped as zip)
|
|
120
|
+
docs/spec.md # product specification
|
|
121
|
+
tools/build_skill_zip.py
|
|
122
|
+
tests/
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
## Development
|
|
126
|
+
|
|
127
|
+
```bash
|
|
128
|
+
uv sync --extra dev
|
|
129
|
+
uv run pytest
|
|
130
|
+
uv build
|
|
131
|
+
uvx --from . paper-harvest --help
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
## Publish
|
|
135
|
+
|
|
136
|
+
1. Bump `version` in `pyproject.toml` and `src/paper_harvest/__init__.py`
|
|
137
|
+
2. `uv run pytest && uv build && python tools/build_skill_zip.py`
|
|
138
|
+
3. Tag `v0.x.y`, push, attach `dist/*-skill.zip` to GitHub Release
|
|
139
|
+
4. `uv publish` for PyPI (`uvx paper-harvest`)
|
|
140
|
+
|
|
141
|
+
## Docs
|
|
142
|
+
|
|
143
|
+
- `docs/spec.md` — product / engineering specification
|
|
144
|
+
- `skill/references/` — Agent-facing reference (manifest schema, MinerU setup)
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
# paper-harvest
|
|
2
|
+
|
|
3
|
+
Collect arXiv papers into a unified article directory: PDF, TeX source, optional Markdown, and a structured manifest.
|
|
4
|
+
|
|
5
|
+
Python 3.9+, stdlib only at runtime. Two ways to use it:
|
|
6
|
+
|
|
7
|
+
| Audience | How |
|
|
8
|
+
|----------|-----|
|
|
9
|
+
| Terminal / automation | `uvx paper-harvest` (PyPI) |
|
|
10
|
+
| Cursor / Agent | Install the [skill zip](#cursor-skill) |
|
|
11
|
+
|
|
12
|
+
## Quick start
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
uvx paper-harvest 'https://arxiv.org/abs/2403.18074' --json
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
From a local checkout:
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
uvx --from . paper-harvest 'https://arxiv.org/abs/2403.18074' --json
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
Install permanently:
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
uv tool install paper-harvest
|
|
28
|
+
paper-harvest 'https://arxiv.org/abs/2403.18074' --json
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
**Input:** arXiv URLs only (`arxiv.org`, `abs` / `pdf` / `src`).
|
|
32
|
+
|
|
33
|
+
## Output
|
|
34
|
+
|
|
35
|
+
```text
|
|
36
|
+
<article_id>/
|
|
37
|
+
pdf/<article_id>.pdf
|
|
38
|
+
tex/
|
|
39
|
+
arXiv-<id>.tar.gz
|
|
40
|
+
source/
|
|
41
|
+
md/
|
|
42
|
+
<article_id>.md
|
|
43
|
+
assets/
|
|
44
|
+
mineru_result.json
|
|
45
|
+
harvest_result.json
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Each step reports `ok`, `skipped`, or `failed` in `harvest_result.json`. Failed steps do not remove earlier artifacts.
|
|
49
|
+
|
|
50
|
+
## CLI flags
|
|
51
|
+
|
|
52
|
+
| Flag | Effect |
|
|
53
|
+
|------|--------|
|
|
54
|
+
| `--target-dir <dir>` | Output root (default: `.`) |
|
|
55
|
+
| `--skip-tex` | Skip TeX source download |
|
|
56
|
+
| `--skip-md` | Skip Markdown extraction |
|
|
57
|
+
| `--force` | Overwrite existing `md/` |
|
|
58
|
+
| `--json` | Print structured JSON to stdout |
|
|
59
|
+
| `--timeout <sec>` | Download timeout (default: 120) |
|
|
60
|
+
|
|
61
|
+
## Markdown (MinerU)
|
|
62
|
+
|
|
63
|
+
Markdown is optional and uses [MinerU](https://mineru.net/).
|
|
64
|
+
|
|
65
|
+
1. **With token** → precision API (better quality, larger PDFs)
|
|
66
|
+
2. **Without token** → lightweight API (PDFs ≤ 10 MB)
|
|
67
|
+
3. **Otherwise** → `md` is `skipped` or `failed`; PDF and TeX are kept
|
|
68
|
+
|
|
69
|
+
Configure a token via any one of:
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
export MINERU_API_TOKEN='your-token'
|
|
73
|
+
echo 'your-token' > ~/.mineru_token && chmod 600 ~/.mineru_token
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
Also checked: `./.mineru_token`, `~/.config/mineru/token`. Never commit token files.
|
|
77
|
+
|
|
78
|
+
To re-run only the Markdown step after adding a token:
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
uvx paper-harvest '<same-arxiv-url>' --skip-tex --force --json
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## Cursor Skill
|
|
85
|
+
|
|
86
|
+
The skill is Agent instructions only; execution still goes through the CLI.
|
|
87
|
+
|
|
88
|
+
**Install** — download `paper-harvest-<version>-skill.zip` from releases:
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
unzip paper-harvest-0.2.0-skill.zip -d ~/.cursor/skills/
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
**Build locally:**
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
uv build
|
|
98
|
+
python tools/build_skill_zip.py
|
|
99
|
+
# → dist/paper-harvest-0.2.0-skill.zip
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
Skill sources live in `skill/` (`SKILL.md`, `references/`, `scripts/harvest.sh`).
|
|
103
|
+
|
|
104
|
+
## Project layout
|
|
105
|
+
|
|
106
|
+
```text
|
|
107
|
+
paper-harvest/
|
|
108
|
+
pyproject.toml
|
|
109
|
+
src/paper_harvest/ # CLI + core
|
|
110
|
+
skill/ # Agent skill (shipped as zip)
|
|
111
|
+
docs/spec.md # product specification
|
|
112
|
+
tools/build_skill_zip.py
|
|
113
|
+
tests/
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
## Development
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
uv sync --extra dev
|
|
120
|
+
uv run pytest
|
|
121
|
+
uv build
|
|
122
|
+
uvx --from . paper-harvest --help
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
## Publish
|
|
126
|
+
|
|
127
|
+
1. Bump `version` in `pyproject.toml` and `src/paper_harvest/__init__.py`
|
|
128
|
+
2. `uv run pytest && uv build && python tools/build_skill_zip.py`
|
|
129
|
+
3. Tag `v0.x.y`, push, attach `dist/*-skill.zip` to GitHub Release
|
|
130
|
+
4. `uv publish` for PyPI (`uvx paper-harvest`)
|
|
131
|
+
|
|
132
|
+
## Docs
|
|
133
|
+
|
|
134
|
+
- `docs/spec.md` — product / engineering specification
|
|
135
|
+
- `skill/references/` — Agent-facing reference (manifest schema, MinerU setup)
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "paper-harvest"
|
|
7
|
+
version = "0.2.0"
|
|
8
|
+
description = "Harvest arXiv papers into organized article directories with PDF, TeX, and Markdown."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
dependencies = []
|
|
12
|
+
|
|
13
|
+
[project.scripts]
|
|
14
|
+
paper-harvest = "paper_harvest.cli:main"
|
|
15
|
+
|
|
16
|
+
[project.optional-dependencies]
|
|
17
|
+
dev = ["pytest>=7"]
|
|
18
|
+
|
|
19
|
+
[tool.setuptools.packages.find]
|
|
20
|
+
where = ["src"]
|
|
21
|
+
|
|
22
|
+
[tool.pytest.ini_options]
|
|
23
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Package version string; the README's publish steps bump it together with
# `version` in pyproject.toml.
__version__ = "0.2.0"
|
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
"""Harvest arXiv papers into organized article directories."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import json
|
|
7
|
+
import sys
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from paper_harvest.core.article_layout import build_article_layout
|
|
11
|
+
from paper_harvest.core.models import HarvestResult, StepResult
|
|
12
|
+
from paper_harvest.core.url.parse import parse_source
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def main(argv: list[str] | None = None) -> int:
|
|
16
|
+
parser = argparse.ArgumentParser(
|
|
17
|
+
prog="paper-harvest",
|
|
18
|
+
description="Harvest arXiv papers into organized article directories.",
|
|
19
|
+
)
|
|
20
|
+
parser.add_argument(
|
|
21
|
+
"source",
|
|
22
|
+
help="arXiv URL",
|
|
23
|
+
)
|
|
24
|
+
parser.add_argument(
|
|
25
|
+
"--target-dir",
|
|
26
|
+
default=".",
|
|
27
|
+
help="Output root directory (default: current directory)",
|
|
28
|
+
)
|
|
29
|
+
parser.add_argument(
|
|
30
|
+
"--skip-tex",
|
|
31
|
+
action="store_true",
|
|
32
|
+
help="Skip TeX source download",
|
|
33
|
+
)
|
|
34
|
+
parser.add_argument(
|
|
35
|
+
"--skip-md",
|
|
36
|
+
action="store_true",
|
|
37
|
+
help="Skip Markdown extraction",
|
|
38
|
+
)
|
|
39
|
+
parser.add_argument(
|
|
40
|
+
"--force",
|
|
41
|
+
action="store_true",
|
|
42
|
+
help="Overwrite existing md/ output",
|
|
43
|
+
)
|
|
44
|
+
parser.add_argument(
|
|
45
|
+
"--json",
|
|
46
|
+
action="store_true",
|
|
47
|
+
help="Output structured JSON result to stdout",
|
|
48
|
+
)
|
|
49
|
+
parser.add_argument(
|
|
50
|
+
"--timeout",
|
|
51
|
+
type=int,
|
|
52
|
+
default=120,
|
|
53
|
+
help="Download timeout in seconds (default: 120)",
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
args = parser.parse_args(argv)
|
|
57
|
+
return run(args)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def run(args: argparse.Namespace) -> int:
    """Run the harvest pipeline for one parsed command line.

    The PDF download always runs; the TeX fetch and the Markdown extraction
    run unless ``--skip-tex`` / ``--skip-md`` were given. The combined result
    is written to ``harvest_result.json`` in the article directory and then
    printed to stdout, as JSON with ``--json`` or as a text summary otherwise.

    Args:
        args: Namespace produced by the parser built in :func:`main`.

    Returns:
        ``1`` when the source cannot be parsed or any step reports
        ``failed``; ``0`` otherwise.
    """
    try:
        source = parse_source(args.source)
    except ValueError as exc:
        sys.stderr.write(f"paper-harvest: {exc}\n")
        return 1

    layout = build_article_layout(args.target_dir, source.article_id_candidate)

    pdf_result = step_download_pdf(source, layout, timeout=args.timeout)

    tex_result: StepResult = (
        StepResult(status="skipped", message="--skip-tex")
        if args.skip_tex
        else step_fetch_tex(source, layout, timeout=args.timeout)
    )

    md_result: StepResult
    if not args.skip_md:
        # Markdown extraction works from the PDF the first step produced.
        downloaded_pdf = Path(pdf_result.path) if pdf_result.path else None
        md_result = step_extract_md(
            source,
            layout,
            pdf_path=downloaded_pdf,
            force=args.force,
        )
    else:
        md_result = StepResult(status="skipped", message="--skip-md")

    manifest_file = layout.article_dir / "harvest_result.json"
    result = HarvestResult(
        tool_name="paper-harvest",
        article_id=layout.article_id,
        article_dir=str(layout.article_dir),
        source=source,
        pdf=pdf_result,
        tex=tex_result,
        md=md_result,
        manifest_path=str(manifest_file),
    )

    # Serialize once; the same text goes to the manifest file and, with
    # --json, to stdout.
    manifest_text = json.dumps(result.to_dict(), ensure_ascii=False, indent=2)
    layout.article_dir.mkdir(parents=True, exist_ok=True)
    manifest_file.write_text(manifest_text + "\n", encoding="utf-8")

    if args.json:
        print(manifest_text)
    else:
        print_summary(result)

    step_statuses = (result.pdf.status, result.tex.status, result.md.status)
    return 1 if "failed" in step_statuses else 0
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def step_download_pdf(source, layout, *, timeout: int) -> StepResult:
    """Download the article PDF and report the outcome as a ``StepResult``.

    Args:
        source: Parsed source, forwarded to ``download_pdf``.
        layout: Article layout, forwarded to ``download_pdf``.
        timeout: Timeout value forwarded to ``download_pdf``.

    Returns:
        An ``ok`` result whose ``path`` is the downloaded PDF, or a ``failed``
        result carrying the exception text and its class name as
        ``error_code``. Exceptions are never propagated.
    """
    try:
        # Imported lazily inside the guarded region, so an import failure is
        # reported as a failed step like any other error.
        from paper_harvest.core.pdf.download import download_pdf

        downloaded = download_pdf(source, layout, timeout=timeout)
        return StepResult(status="ok", message="pdf downloaded", path=str(downloaded.pdf_path))
    except Exception as exc:
        failure_code = exc_to_code(exc)
        return StepResult(status="failed", message=str(exc), error_code=failure_code)
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def step_fetch_tex(source, layout, *, timeout: int) -> StepResult:
    """Fetch the arXiv TeX source and report the outcome as a ``StepResult``.

    Args:
        source: Parsed source, forwarded to ``fetch_arxiv_source``.
        layout: Article layout, forwarded to ``fetch_arxiv_source``.
        timeout: Timeout value forwarded to ``fetch_arxiv_source``.

    Returns:
        An ``ok`` result whose ``path`` is the extracted directory and whose
        ``artifacts`` list holds the archive path, or a ``failed`` result
        carrying the exception text and its class name as ``error_code``.
        Exceptions are never propagated.
    """
    try:
        # Imported lazily inside the guarded region, so an import failure is
        # reported as a failed step like any other error.
        from paper_harvest.core.tex.arxiv import fetch_arxiv_source

        fetched = fetch_arxiv_source(source, layout, timeout=timeout)
        archive_artifacts = [str(fetched.archive_path)]
        return StepResult(
            status="ok",
            message="tex source fetched",
            path=str(fetched.extracted_dir),
            artifacts=archive_artifacts,
        )
    except Exception as exc:
        failure_code = exc_to_code(exc)
        return StepResult(status="failed", message=str(exc), error_code=failure_code)
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def step_extract_md(
    source,
    layout,
    *,
    pdf_path: Path | None,
    force: bool,
) -> StepResult:
    """Extract Markdown from the downloaded PDF and report a ``StepResult``.

    Args:
        source: Parsed source, forwarded to ``extract_markdown``.
        layout: Article layout, forwarded to ``extract_markdown``.
        pdf_path: Location of the PDF to convert; ``None`` when no PDF was
            produced by the download step.
        force: Forwarded to ``extract_markdown`` as ``force``.

    Returns:
        * ``failed`` when ``pdf_path`` is ``None`` or does not exist, when the
          MinerU module cannot be imported, or when extraction raises.
        * ``skipped`` when a ``MineruError`` carries error code ``"-30001"``.
        * ``ok`` with the Markdown path as both ``path`` and sole artifact.

        Exceptions are never propagated.
    """
    if pdf_path is None or not pdf_path.exists():
        return StepResult(
            status="failed",
            message="no PDF available for Markdown extraction",
        )

    # Import in its own guarded step. If the import shared a ``try`` with the
    # ``except MineruError`` clause, an import failure would leave
    # ``MineruError`` unbound and evaluating that clause would raise
    # UnboundLocalError, masking the real error instead of reporting it.
    try:
        from paper_harvest.core.md.mineru import MineruError, extract_markdown
    except Exception as exc:
        return StepResult(
            status="failed",
            message=str(exc),
            error_code=exc_to_code(exc),
        )

    try:
        md_result = extract_markdown(
            pdf_path,
            layout,
            source,
            force=force,
        )
        return StepResult(
            status="ok",
            message="markdown extracted",
            path=str(md_result.markdown_path),
            artifacts=[str(md_result.markdown_path)],
        )
    except MineruError as exc:
        # Prefer the error code on the exception; fall back to the class name.
        code = exc.error_code or exc_to_code(exc)
        if code in ("-30001",):
            # This code is reported as a skip rather than a failure.
            return StepResult(
                status="skipped",
                message=f"lightweight API not applicable: {exc}",
                error_code=code,
            )
        return StepResult(
            status="failed",
            message=str(exc),
            error_code=code,
        )
    except Exception as exc:
        return StepResult(
            status="failed",
            message=str(exc),
            error_code=exc_to_code(exc),
        )
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def exc_to_code(exc: Exception) -> str:
    """Return the class name of *exc*, used as a generic error code."""
    exception_class = exc.__class__
    return exception_class.__name__
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def print_summary(result: HarvestResult) -> None:
    """Print a line-oriented text summary of *result* to stdout.

    The output lists the article id and directory, then one status line per
    step (pdf, tex, md) followed by that step's path when it has one, and
    finally the manifest path.
    """
    print(f"article : {result.article_id}")
    print(f"directory: {result.article_dir}")

    for step_name in ("pdf", "tex", "md"):
        step = getattr(result, step_name)
        print(f"{step_name} : [{step.status}] {step.message}")
        if step.path:
            print(f" {step.path}")

    print(f"manifest : {result.manifest_path}")
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
# Script entry point: exit the interpreter with the status returned by main().
if __name__ == "__main__":
    raise SystemExit(main())
|
|
File without changes
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass(frozen=True)
class ArticleLayout:
    """Immutable set of paths describing one article's output directory tree.

    Instances are produced by ``build_article_layout``; creating the
    directories on disk is left to ``ensure_variant_dir`` and
    ``ensure_article_layout``.
    """

    # Normalized article identifier; also the name of the article directory.
    article_id: str
    # Resolved output root under which the article directory lives.
    target_dir: Path
    # ``target_dir / article_id``.
    article_dir: Path
    # ``article_dir / "pdf"``.
    pdf_dir: Path
    # ``article_dir / "md"``.
    md_dir: Path
    # ``article_dir / "tex"``.
    tex_dir: Path
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def normalize_article_id(value: str) -> str:
    """Turn *value* into a lowercase identifier made of ``[a-z0-9_]``.

    Each run of characters other than ASCII letters and digits collapses to a
    single underscore; leading and trailing underscores are then removed.

    Raises:
        ValueError: If nothing is left after normalization.
    """
    collapsed = re.sub(r"[^A-Za-z0-9]+", "_", value)
    slug = collapsed.strip("_").lower()
    if slug:
        return slug
    raise ValueError("article id is empty")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def build_article_layout(output_root: str | Path, article_id: str) -> ArticleLayout:
    """Compute the directory layout for one article without touching disk.

    Args:
        output_root: Root directory for output; ``~`` is expanded and the
            path is resolved to an absolute one.
        article_id: Raw identifier, normalized with ``normalize_article_id``.

    Returns:
        An ``ArticleLayout`` whose article directory is
        ``<resolved root>/<normalized id>`` with ``pdf``, ``md`` and ``tex``
        subdirectories.

    Raises:
        ValueError: If the identifier is empty after normalization.
    """
    # Normalize first so an unusable id is rejected before any path work.
    slug = normalize_article_id(article_id)
    root = Path(output_root).expanduser().resolve()
    home = root / slug
    variant_dirs = {f"{kind}_dir": home / kind for kind in ("pdf", "md", "tex")}
    return ArticleLayout(
        article_id=slug,
        target_dir=root,
        article_dir=home,
        **variant_dirs,
    )
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def ensure_variant_dir(layout: ArticleLayout, variant: str) -> Path:
    """Create the directory for one artifact variant and return it.

    Args:
        layout: Layout providing ``pdf_dir``, ``md_dir`` and ``tex_dir``.
        variant: One of ``"pdf"``, ``"md"`` or ``"tex"``.

    Returns:
        The variant's directory, created (with parents) if it was missing.

    Raises:
        ValueError: If *variant* is not one of the supported names.
    """
    if variant not in ("pdf", "md", "tex"):
        raise ValueError(f"Unsupported article variant: {variant}")

    # Each supported variant maps onto the layout attribute "<variant>_dir".
    chosen_dir = getattr(layout, f"{variant}_dir")
    chosen_dir.mkdir(parents=True, exist_ok=True)
    return chosen_dir
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def ensure_article_layout(layout: ArticleLayout) -> ArticleLayout:
    """Create every directory named by *layout* and return *layout* unchanged.

    The article directory is created first, then its ``pdf``, ``md`` and
    ``tex`` subdirectories; directories that already exist are left alone.
    """
    for attribute in ("article_dir", "pdf_dir", "md_dir", "tex_dir"):
        getattr(layout, attribute).mkdir(parents=True, exist_ok=True)
    return layout
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def to_public_path(path: str | Path, target_dir: str | Path) -> str:
    """Express *path* relative to *target_dir* when it lies inside it.

    Both arguments have ``~`` expanded and are resolved to absolute paths
    before comparison.

    Returns:
        The path relative to the resolved *target_dir* when *path* is located
        under it, otherwise the resolved absolute path, as a string.
    """
    absolute = Path(path).expanduser().resolve()
    base = Path(target_dir).expanduser().resolve()
    if absolute.is_relative_to(base):
        return str(absolute.relative_to(base))
    return str(absolute)
|
|
File without changes
|