paper-harvest 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,144 @@
1
+ Metadata-Version: 2.4
2
+ Name: paper-harvest
3
+ Version: 0.2.0
4
+ Summary: Harvest arXiv papers into organized article directories with PDF, TeX, and Markdown.
5
+ Requires-Python: >=3.9
6
+ Description-Content-Type: text/markdown
7
+ Provides-Extra: dev
8
+ Requires-Dist: pytest>=7; extra == "dev"
9
+
10
+ # paper-harvest
11
+
12
+ Collect arXiv papers into a unified article directory: PDF, TeX source, optional Markdown, and a structured manifest.
13
+
14
+ Python 3.9+, stdlib only at runtime. Two ways to use it:
15
+
16
+ | Audience | How |
17
+ |----------|-----|
18
+ | Terminal / automation | `uvx paper-harvest` (PyPI) |
19
+ | Cursor / Agent | Install the [skill zip](#cursor-skill) |
20
+
21
+ ## Quick start
22
+
23
+ ```bash
24
+ uvx paper-harvest 'https://arxiv.org/abs/2403.18074' --json
25
+ ```
26
+
27
+ From a local checkout:
28
+
29
+ ```bash
30
+ uvx --from . paper-harvest 'https://arxiv.org/abs/2403.18074' --json
31
+ ```
32
+
33
+ Install permanently:
34
+
35
+ ```bash
36
+ uv tool install paper-harvest
37
+ paper-harvest 'https://arxiv.org/abs/2403.18074' --json
38
+ ```
39
+
40
+ **Input:** arXiv URLs only (`arxiv.org`, `abs` / `pdf` / `src`).
41
+
42
+ ## Output
43
+
44
+ ```text
45
+ <article_id>/
46
+ pdf/<article_id>.pdf
47
+ tex/
48
+ arXiv-<id>.tar.gz
49
+ source/
50
+ md/
51
+ <article_id>.md
52
+ assets/
53
+ mineru_result.json
54
+ harvest_result.json
55
+ ```
56
+
57
+ Each step reports `ok`, `skipped`, or `failed` in `harvest_result.json`. Failed steps do not remove earlier artifacts.
58
+
59
+ ## CLI flags
60
+
61
+ | Flag | Effect |
62
+ |------|--------|
63
+ | `--target-dir <dir>` | Output root (default: `.`) |
64
+ | `--skip-tex` | Skip TeX source download |
65
+ | `--skip-md` | Skip Markdown extraction |
66
+ | `--force` | Overwrite existing `md/` |
67
+ | `--json` | Print structured JSON to stdout |
68
+ | `--timeout <sec>` | Download timeout (default: 120) |
69
+
70
+ ## Markdown (MinerU)
71
+
72
+ Markdown is optional and uses [MinerU](https://mineru.net/).
73
+
74
+ 1. **With token** → precision API (better quality, larger PDFs)
75
+ 2. **Without token** → lightweight API (PDFs ≤ 10 MB)
76
+ 3. **If extraction cannot run or fails** → `md` is reported as `skipped` or `failed`; PDF and TeX are kept
77
+
78
+ Configure a token via any one of:
79
+
80
+ ```bash
81
+ export MINERU_API_TOKEN='your-token'
82
+ echo 'your-token' > ~/.mineru_token && chmod 600 ~/.mineru_token
83
+ ```
84
+
85
+ Also checked: `./.mineru_token`, `~/.config/mineru/token`. Never commit token files.
86
+
87
+ To redo only the Markdown extraction after adding a token (the PDF step still runs; TeX is skipped):
88
+
89
+ ```bash
90
+ uvx paper-harvest '<same-arxiv-url>' --skip-tex --force --json
91
+ ```
92
+
93
+ ## Cursor Skill
94
+
95
+ The skill is Agent instructions only; execution still goes through the CLI.
96
+
97
+ **Install** — download `paper-harvest-<version>-skill.zip` from releases:
98
+
99
+ ```bash
100
+ unzip paper-harvest-0.2.0-skill.zip -d ~/.cursor/skills/
101
+ ```
102
+
103
+ **Build locally:**
104
+
105
+ ```bash
106
+ uv build
107
+ python tools/build_skill_zip.py
108
+ # → dist/paper-harvest-0.2.0-skill.zip
109
+ ```
110
+
111
+ Skill sources live in `skill/` (`SKILL.md`, `references/`, `scripts/harvest.sh`).
112
+
113
+ ## Project layout
114
+
115
+ ```text
116
+ paper-harvest/
117
+ pyproject.toml
118
+ src/paper_harvest/ # CLI + core
119
+ skill/ # Agent skill (shipped as zip)
120
+ docs/spec.md # product specification
121
+ tools/build_skill_zip.py
122
+ tests/
123
+ ```
124
+
125
+ ## Development
126
+
127
+ ```bash
128
+ uv sync --extra dev
129
+ uv run pytest
130
+ uv build
131
+ uvx --from . paper-harvest --help
132
+ ```
133
+
134
+ ## Publish
135
+
136
+ 1. Bump `version` in `pyproject.toml` and `src/paper_harvest/__init__.py`
137
+ 2. `uv run pytest && uv build && python tools/build_skill_zip.py`
138
+ 3. Tag `v0.x.y`, push, attach `dist/*-skill.zip` to GitHub Release
139
+ 4. `uv publish` for PyPI (`uvx paper-harvest`)
140
+
141
+ ## Docs
142
+
143
+ - `docs/spec.md` — product / engineering specification
144
+ - `skill/references/` — Agent-facing reference (manifest schema, MinerU setup)
@@ -0,0 +1,135 @@
1
+ # paper-harvest
2
+
3
+ Collect arXiv papers into a unified article directory: PDF, TeX source, optional Markdown, and a structured manifest.
4
+
5
+ Python 3.9+, stdlib only at runtime. Two ways to use it:
6
+
7
+ | Audience | How |
8
+ |----------|-----|
9
+ | Terminal / automation | `uvx paper-harvest` (PyPI) |
10
+ | Cursor / Agent | Install the [skill zip](#cursor-skill) |
11
+
12
+ ## Quick start
13
+
14
+ ```bash
15
+ uvx paper-harvest 'https://arxiv.org/abs/2403.18074' --json
16
+ ```
17
+
18
+ From a local checkout:
19
+
20
+ ```bash
21
+ uvx --from . paper-harvest 'https://arxiv.org/abs/2403.18074' --json
22
+ ```
23
+
24
+ Install permanently:
25
+
26
+ ```bash
27
+ uv tool install paper-harvest
28
+ paper-harvest 'https://arxiv.org/abs/2403.18074' --json
29
+ ```
30
+
31
+ **Input:** arXiv URLs only (`arxiv.org`, `abs` / `pdf` / `src`).
32
+
33
+ ## Output
34
+
35
+ ```text
36
+ <article_id>/
37
+ pdf/<article_id>.pdf
38
+ tex/
39
+ arXiv-<id>.tar.gz
40
+ source/
41
+ md/
42
+ <article_id>.md
43
+ assets/
44
+ mineru_result.json
45
+ harvest_result.json
46
+ ```
47
+
48
+ Each step reports `ok`, `skipped`, or `failed` in `harvest_result.json`. Failed steps do not remove earlier artifacts.
49
+
50
+ ## CLI flags
51
+
52
+ | Flag | Effect |
53
+ |------|--------|
54
+ | `--target-dir <dir>` | Output root (default: `.`) |
55
+ | `--skip-tex` | Skip TeX source download |
56
+ | `--skip-md` | Skip Markdown extraction |
57
+ | `--force` | Overwrite existing `md/` |
58
+ | `--json` | Print structured JSON to stdout |
59
+ | `--timeout <sec>` | Download timeout (default: 120) |
60
+
61
+ ## Markdown (MinerU)
62
+
63
+ Markdown is optional and uses [MinerU](https://mineru.net/).
64
+
65
+ 1. **With token** → precision API (better quality, larger PDFs)
66
+ 2. **Without token** → lightweight API (PDFs ≤ 10 MB)
67
+ 3. **If extraction cannot run or fails** → `md` is reported as `skipped` or `failed`; PDF and TeX are kept
68
+
69
+ Configure a token via any one of:
70
+
71
+ ```bash
72
+ export MINERU_API_TOKEN='your-token'
73
+ echo 'your-token' > ~/.mineru_token && chmod 600 ~/.mineru_token
74
+ ```
75
+
76
+ Also checked: `./.mineru_token`, `~/.config/mineru/token`. Never commit token files.
77
+
78
+ To redo only the Markdown extraction after adding a token (the PDF step still runs; TeX is skipped):
79
+
80
+ ```bash
81
+ uvx paper-harvest '<same-arxiv-url>' --skip-tex --force --json
82
+ ```
83
+
84
+ ## Cursor Skill
85
+
86
+ The skill is Agent instructions only; execution still goes through the CLI.
87
+
88
+ **Install** — download `paper-harvest-<version>-skill.zip` from releases:
89
+
90
+ ```bash
91
+ unzip paper-harvest-0.2.0-skill.zip -d ~/.cursor/skills/
92
+ ```
93
+
94
+ **Build locally:**
95
+
96
+ ```bash
97
+ uv build
98
+ python tools/build_skill_zip.py
99
+ # → dist/paper-harvest-0.2.0-skill.zip
100
+ ```
101
+
102
+ Skill sources live in `skill/` (`SKILL.md`, `references/`, `scripts/harvest.sh`).
103
+
104
+ ## Project layout
105
+
106
+ ```text
107
+ paper-harvest/
108
+ pyproject.toml
109
+ src/paper_harvest/ # CLI + core
110
+ skill/ # Agent skill (shipped as zip)
111
+ docs/spec.md # product specification
112
+ tools/build_skill_zip.py
113
+ tests/
114
+ ```
115
+
116
+ ## Development
117
+
118
+ ```bash
119
+ uv sync --extra dev
120
+ uv run pytest
121
+ uv build
122
+ uvx --from . paper-harvest --help
123
+ ```
124
+
125
+ ## Publish
126
+
127
+ 1. Bump `version` in `pyproject.toml` and `src/paper_harvest/__init__.py`
128
+ 2. `uv run pytest && uv build && python tools/build_skill_zip.py`
129
+ 3. Tag `v0.x.y`, push, attach `dist/*-skill.zip` to GitHub Release
130
+ 4. `uv publish` for PyPI (`uvx paper-harvest`)
131
+
132
+ ## Docs
133
+
134
+ - `docs/spec.md` — product / engineering specification
135
+ - `skill/references/` — Agent-facing reference (manifest schema, MinerU setup)
@@ -0,0 +1,23 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "paper-harvest"
7
+ version = "0.2.0"
8
+ description = "Harvest arXiv papers into organized article directories with PDF, TeX, and Markdown."
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ dependencies = []
12
+
13
+ [project.scripts]
14
+ paper-harvest = "paper_harvest.cli:main"
15
+
16
+ [project.optional-dependencies]
17
+ dev = ["pytest>=7"]
18
+
19
+ [tool.setuptools.packages.find]
20
+ where = ["src"]
21
+
22
+ [tool.pytest.ini_options]
23
+ testpaths = ["tests"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1 @@
1
+ __version__ = "0.2.0"
@@ -0,0 +1,231 @@
1
+ """Harvest arXiv papers into organized article directories."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import json
7
+ import sys
8
+ from pathlib import Path
9
+
10
+ from paper_harvest.core.article_layout import build_article_layout
11
+ from paper_harvest.core.models import HarvestResult, StepResult
12
+ from paper_harvest.core.url.parse import parse_source
13
+
14
+
15
+ def main(argv: list[str] | None = None) -> int:
16
+ parser = argparse.ArgumentParser(
17
+ prog="paper-harvest",
18
+ description="Harvest arXiv papers into organized article directories.",
19
+ )
20
+ parser.add_argument(
21
+ "source",
22
+ help="arXiv URL",
23
+ )
24
+ parser.add_argument(
25
+ "--target-dir",
26
+ default=".",
27
+ help="Output root directory (default: current directory)",
28
+ )
29
+ parser.add_argument(
30
+ "--skip-tex",
31
+ action="store_true",
32
+ help="Skip TeX source download",
33
+ )
34
+ parser.add_argument(
35
+ "--skip-md",
36
+ action="store_true",
37
+ help="Skip Markdown extraction",
38
+ )
39
+ parser.add_argument(
40
+ "--force",
41
+ action="store_true",
42
+ help="Overwrite existing md/ output",
43
+ )
44
+ parser.add_argument(
45
+ "--json",
46
+ action="store_true",
47
+ help="Output structured JSON result to stdout",
48
+ )
49
+ parser.add_argument(
50
+ "--timeout",
51
+ type=int,
52
+ default=120,
53
+ help="Download timeout in seconds (default: 120)",
54
+ )
55
+
56
+ args = parser.parse_args(argv)
57
+ return run(args)
58
+
59
+
60
def run(args: argparse.Namespace) -> int:
    """Execute the harvest pipeline for already-parsed CLI arguments.

    The PDF step always runs; the TeX and Markdown steps run unless
    ``args.skip_tex`` / ``args.skip_md`` are set. The combined result is
    written to ``harvest_result.json`` inside the article directory and
    then printed, as JSON when ``args.json`` is set and as a text summary
    otherwise.

    Args:
        args: Namespace carrying ``source``, ``target_dir``, ``skip_tex``,
            ``skip_md``, ``force``, ``json`` and ``timeout``.

    Returns:
        ``1`` when the source or article id is rejected, when any step
        reports ``failed``, or when the manifest cannot be written;
        ``0`` otherwise.
    """
    try:
        source = parse_source(args.source)
        # build_article_layout raises ValueError for an id that normalizes
        # to an empty string; report it like any other bad input instead
        # of letting a traceback escape.
        layout = build_article_layout(args.target_dir, source.article_id_candidate)
    except ValueError as exc:
        sys.stderr.write(f"paper-harvest: {exc}\n")
        return 1

    pdf_result = step_download_pdf(source, layout, timeout=args.timeout)

    tex_result: StepResult
    if args.skip_tex:
        tex_result = StepResult(status="skipped", message="--skip-tex")
    else:
        tex_result = step_fetch_tex(source, layout, timeout=args.timeout)

    md_result: StepResult
    if args.skip_md:
        md_result = StepResult(status="skipped", message="--skip-md")
    else:
        md_result = step_extract_md(
            source,
            layout,
            pdf_path=Path(pdf_result.path) if pdf_result.path else None,
            force=args.force,
        )

    manifest_path = layout.article_dir / "harvest_result.json"
    result = HarvestResult(
        tool_name="paper-harvest",
        article_id=layout.article_id,
        article_dir=str(layout.article_dir),
        source=source,
        pdf=pdf_result,
        tex=tex_result,
        md=md_result,
        manifest_path=str(manifest_path),
    )

    # Serialize once; the same text goes to the manifest file and to stdout.
    manifest_text = json.dumps(result.to_dict(), ensure_ascii=False, indent=2)

    manifest_written = True
    try:
        layout.article_dir.mkdir(parents=True, exist_ok=True)
        manifest_path.write_text(manifest_text + "\n", encoding="utf-8")
    except OSError as exc:
        # Still print the result below so the caller sees what happened in
        # each step, but signal the problem through stderr and exit code.
        manifest_written = False
        sys.stderr.write(f"paper-harvest: cannot write manifest: {exc}\n")

    if args.json:
        print(manifest_text)
    else:
        print_summary(result)

    failed = any(
        step.status == "failed" for step in (result.pdf, result.tex, result.md)
    )
    return 1 if failed or not manifest_written else 0
113
+
114
+
115
def step_download_pdf(
    source,
    layout,
    *,
    timeout: int,
) -> StepResult:
    """Run the PDF download step and describe its outcome.

    Returns an ``ok`` result whose ``path`` is the downloaded PDF, or a
    ``failed`` result carrying the exception text and its class name as
    ``error_code``. Exceptions from the step are not propagated.
    """
    try:
        # Imported inside the try so that an import problem is also
        # reported as a failed step.
        from paper_harvest.core.pdf.download import download_pdf

        downloaded = download_pdf(source, layout, timeout=timeout)
        outcome = StepResult(
            status="ok",
            message="pdf downloaded",
            path=str(downloaded.pdf_path),
        )
    except Exception as error:
        outcome = StepResult(
            status="failed",
            message=str(error),
            error_code=exc_to_code(error),
        )
    return outcome
136
+
137
+
138
def step_fetch_tex(
    source,
    layout,
    *,
    timeout: int,
) -> StepResult:
    """Run the TeX source step and describe its outcome.

    Returns an ``ok`` result whose ``path`` is the extracted source
    directory and whose ``artifacts`` list holds the archive path, or a
    ``failed`` result carrying the exception text and its class name as
    ``error_code``. Exceptions from the step are not propagated.
    """
    try:
        # Imported inside the try so that an import problem is also
        # reported as a failed step.
        from paper_harvest.core.tex.arxiv import fetch_arxiv_source

        fetched = fetch_arxiv_source(source, layout, timeout=timeout)
        outcome = StepResult(
            status="ok",
            message="tex source fetched",
            path=str(fetched.extracted_dir),
            artifacts=[str(fetched.archive_path)],
        )
    except Exception as error:
        outcome = StepResult(
            status="failed",
            message=str(error),
            error_code=exc_to_code(error),
        )
    return outcome
160
+
161
+
162
def step_extract_md(
    source,
    layout,
    *,
    pdf_path: Path | None,
    force: bool,
) -> StepResult:
    """Run the Markdown extraction step and describe its outcome.

    Args:
        source: Parsed source, passed through to ``extract_markdown``.
        layout: Article layout, passed through to ``extract_markdown``.
        pdf_path: PDF to convert; ``None`` or a missing file fails the step.
        force: Passed through to ``extract_markdown`` as ``force``.

    Returns:
        ``ok`` with the Markdown path on success; ``skipped`` when MinerU
        reports error code ``-30001``; ``failed`` in every other case,
        including a missing PDF or an import failure of the MinerU module.
        Exceptions from the step are not propagated.
    """
    if pdf_path is None or not pdf_path.exists():
        return StepResult(
            status="failed",
            message="no PDF available for Markdown extraction",
        )

    # The import has its own try block. If it failed inside the extraction
    # try below, ``MineruError`` would be unbound when Python evaluates the
    # ``except MineruError`` clause, raising UnboundLocalError instead of
    # returning a failed step.
    try:
        from paper_harvest.core.md.mineru import MineruError, extract_markdown
    except Exception as exc:
        return StepResult(
            status="failed",
            message=str(exc),
            error_code=exc_to_code(exc),
        )

    try:
        md_result = extract_markdown(
            pdf_path,
            layout,
            source,
            force=force,
        )
        return StepResult(
            status="ok",
            message="markdown extracted",
            path=str(md_result.markdown_path),
            artifacts=[str(md_result.markdown_path)],
        )
    except MineruError as exc:
        # Prefer the code carried by the error; fall back to the class name.
        code = exc.error_code or exc_to_code(exc)
        if code in ("-30001",):
            # NOTE(review): -30001 is treated as "lightweight API not
            # applicable" here; confirm the meaning against the MinerU API.
            return StepResult(
                status="skipped",
                message=f"lightweight API not applicable: {exc}",
                error_code=code,
            )
        return StepResult(
            status="failed",
            message=str(exc),
            error_code=code,
        )
    except Exception as exc:
        return StepResult(
            status="failed",
            message=str(exc),
            error_code=exc_to_code(exc),
        )
209
+
210
+
211
def exc_to_code(exc: Exception) -> str:
    """Return the class name of *exc*, used as a step's fallback error code."""
    exc_class = type(exc)
    return exc_class.__name__
213
+
214
+
215
def print_summary(result: HarvestResult) -> None:
    """Print a plain-text summary of *result* to stdout.

    Emits the article id and directory, then one status line per step
    (pdf, tex, md) followed by that step's path when it has one, and
    finally the manifest path.
    """
    print(f"article : {result.article_id}")
    print(f"directory: {result.article_dir}")
    for step_name in ("pdf", "tex", "md"):
        step = getattr(result, step_name)
        print(f"{step_name} : [{step.status}] {step.message}")
        if step.path:
            print(f" {step.path}")
    print(f"manifest : {result.manifest_path}")
228
+
229
+
230
# Script entry point: exit with the status code returned by main().
if __name__ == "__main__":
    sys.exit(main())
File without changes
@@ -0,0 +1,67 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+ import re
6
+
7
+
8
@dataclass(frozen=True)
class ArticleLayout:
    """Immutable set of paths describing one article's output directory."""

    # Normalized identifier, also used as the article directory name.
    article_id: str
    # Output root under which the article directory lives.
    target_dir: Path
    # <target_dir>/<article_id>
    article_dir: Path
    # Per-variant subdirectories of article_dir.
    pdf_dir: Path
    md_dir: Path
    tex_dir: Path
16
+
17
+
18
def normalize_article_id(value: str) -> str:
    """Turn *value* into a lowercase identifier of ASCII letters, digits and ``_``.

    Each run of characters outside ``A-Z``, ``a-z`` and ``0-9`` becomes a
    single underscore, and leading/trailing underscores are removed.

    Raises:
        ValueError: If nothing is left after normalization.
    """
    underscored = re.sub(r"[^A-Za-z0-9]+", "_", value)
    slug = underscored.strip("_").lower()
    if slug:
        return slug
    raise ValueError("article id is empty")
23
+
24
+
25
def build_article_layout(output_root: str | Path, article_id: str) -> ArticleLayout:
    """Compute the directory layout for *article_id* under *output_root*.

    The id is normalized first (a ``ValueError`` from normalization
    propagates), and the root is user-expanded and resolved to an absolute
    path. No directories are created here.
    """
    slug = normalize_article_id(article_id)
    root = Path(output_root).expanduser().resolve()
    base = root / slug
    variant_dirs = {f"{name}_dir": base / name for name in ("pdf", "md", "tex")}
    return ArticleLayout(
        article_id=slug,
        target_dir=root,
        article_dir=base,
        **variant_dirs,
    )
37
+
38
+
39
def ensure_variant_dir(layout: ArticleLayout, variant: str) -> Path:
    """Create and return the directory for one variant of *layout*.

    Args:
        layout: Layout whose ``pdf_dir`` / ``md_dir`` / ``tex_dir`` is used.
        variant: One of ``"pdf"``, ``"md"`` or ``"tex"``.

    Raises:
        ValueError: If *variant* is not one of the supported names.
    """
    known_variants = (("pdf", "pdf_dir"), ("md", "md_dir"), ("tex", "tex_dir"))
    for name, attribute in known_variants:
        if variant == name:
            variant_dir = getattr(layout, attribute)
            break
    else:
        raise ValueError(f"Unsupported article variant: {variant}")

    variant_dir.mkdir(parents=True, exist_ok=True)
    return variant_dir
51
+
52
+
53
def ensure_article_layout(layout: ArticleLayout) -> ArticleLayout:
    """Create the article directory and its pdf/md/tex subdirectories.

    Existing directories are left alone. Returns the same *layout* object.
    """
    for attribute in ("article_dir", "pdf_dir", "md_dir", "tex_dir"):
        getattr(layout, attribute).mkdir(parents=True, exist_ok=True)
    return layout
59
+
60
+
61
def to_public_path(path: str | Path, target_dir: str | Path) -> str:
    """Express *path* relative to *target_dir* when it lies inside it.

    Both arguments are user-expanded and resolved first. If the resolved
    path is not under the resolved target directory, the resolved absolute
    path is returned instead.
    """
    absolute = Path(path).expanduser().resolve()
    root = Path(target_dir).expanduser().resolve()
    try:
        relative = absolute.relative_to(root)
    except ValueError:
        return str(absolute)
    return str(relative)