antifile 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- antifile-0.1.0/.gitignore +8 -0
- antifile-0.1.0/PKG-INFO +94 -0
- antifile-0.1.0/README.md +62 -0
- antifile-0.1.0/pyproject.toml +55 -0
- antifile-0.1.0/src/antifile/__init__.py +25 -0
- antifile-0.1.0/src/antifile/_base.py +49 -0
- antifile-0.1.0/src/antifile/bibfile.py +137 -0
- antifile-0.1.0/src/antifile/cli.py +256 -0
- antifile-0.1.0/src/antifile/dispatch.py +94 -0
- antifile-0.1.0/src/antifile/entry.py +45 -0
- antifile-0.1.0/src/antifile/epub/__init__.py +142 -0
- antifile-0.1.0/src/antifile/epub/_meta.py +220 -0
- antifile-0.1.0/src/antifile/pdf/__init__.py +147 -0
- antifile-0.1.0/src/antifile/pdf/_fallback.py +126 -0
- antifile-0.1.0/src/antifile/pdf/_identifiers.py +69 -0
- antifile-0.1.0/src/antifile/pdf/_llm.py +167 -0
- antifile-0.1.0/src/antifile/pdf/_resolvers.py +166 -0
- antifile-0.1.0/src/antifile/pdf/_text.py +51 -0
- antifile-0.1.0/src/antifile/preview.py +28 -0
- antifile-0.1.0/src/antifile/url/__init__.py +7 -0
- antifile-0.1.0/src/antifile/url/_meta.py +209 -0
- antifile-0.1.0/tests/test_epub.py +165 -0
- antifile-0.1.0/tests/test_extractors.py +179 -0
- antifile-0.1.0/uv.lock +343 -0
antifile-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: antifile
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Extract BibTeX metadata from PDFs, EPUBs, URLs, and identifiers
|
|
5
|
+
Project-URL: Homepage, https://github.com/vxvware/antifile
|
|
6
|
+
Project-URL: Repository, https://github.com/vxvware/antifile
|
|
7
|
+
Project-URL: Issues, https://github.com/vxvware/antifile/issues
|
|
8
|
+
Author-email: VxVware <trevor.j.vincent@gmail.com>
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
Keywords: arxiv,bibliography,bibtex,citations,doi,epub,isbn,metadata,pdf
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Environment :: Console
|
|
13
|
+
Classifier: Intended Audience :: End Users/Desktop
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
|
+
Classifier: Topic :: Text Processing :: Markup :: LaTeX
|
|
21
|
+
Classifier: Topic :: Utilities
|
|
22
|
+
Requires-Python: >=3.11
|
|
23
|
+
Requires-Dist: bibtexparser>=2.0.0b7
|
|
24
|
+
Requires-Dist: httpx>=0.27
|
|
25
|
+
Requires-Dist: pymupdf>=1.24
|
|
26
|
+
Requires-Dist: rapidfuzz>=3.0
|
|
27
|
+
Requires-Dist: rich>=13.0
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
30
|
+
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
31
|
+
Description-Content-Type: text/markdown
|
|
32
|
+
|
|
33
|
+
# antifile
|
|
34
|
+
|
|
35
|
+
Extract BibTeX metadata from PDFs, EPUBs, URLs, and identifiers (DOI, arXiv, ISBN)
|
|
36
|
+
and append it to a `.bib` file.
|
|
37
|
+
|
|
38
|
+
## Install
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
pip install antifile
|
|
42
|
+
# or
|
|
43
|
+
uv tool install antifile
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Usage
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
antifile INPUT -o refs.bib
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
`INPUT` can be:
|
|
53
|
+
|
|
54
|
+
- a **PDF** or **EPUB** file — `antifile paper.pdf -o refs.bib`
|
|
55
|
+
- a **folder** of PDFs/EPUBs — `antifile ~/Downloads/papers -o refs.bib` (add `--recursive` to descend into subfolders)
|
|
56
|
+
- a **URL** — `antifile https://example.com/article -o refs.bib`
|
|
57
|
+
- a **DOI** — `antifile 10.1145/3292500 -o refs.bib`
|
|
58
|
+
- an **arXiv ID** — `antifile arXiv:1706.03762 -o refs.bib`
|
|
59
|
+
- an **ISBN** — `antifile 9780262033848 -o refs.bib`
|
|
60
|
+
|
|
61
|
+
Entries are appended with de-duplication: a new entry matching an existing one
|
|
62
|
+
(by DOI, arXiv ID, ISBN, or normalized title+author) fills in any missing fields
|
|
63
|
+
rather than creating a duplicate. Pass `--no-merge` to skip on duplicate, or
|
|
64
|
+
`--force` to append anyway with an auto-suffixed key.
|
|
65
|
+
|
|
66
|
+
### Options
|
|
67
|
+
|
|
68
|
+
| flag | effect |
|
|
69
|
+
|------|--------|
|
|
70
|
+
| `-o, --output FILE` | target `.bib` (required; created if missing) |
|
|
71
|
+
| `--method {auto,doi,arxiv,isbn,crossref,llm,claude-code}` | force a PDF extraction method (default: `auto`) |
|
|
72
|
+
| `--recursive` | recurse into subfolders for folder input |
|
|
73
|
+
| `--no-preview` | skip the first-page PDF preview |
|
|
74
|
+
| `--no-merge` | on duplicate, skip instead of filling missing fields |
|
|
75
|
+
| `--force` | append even if a duplicate exists |
|
|
76
|
+
|
|
77
|
+
### LLM-assisted extraction
|
|
78
|
+
|
|
79
|
+
When a PDF has no resolvable identifier, antifile can fall back to an LLM to read
|
|
80
|
+
the first page. Set whichever API key you have — it's picked up from the
|
|
81
|
+
environment:
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
export ANTHROPIC_API_KEY=... # or OPENAI_API_KEY, or GEMINI_API_KEY / GOOGLE_API_KEY
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## Related
|
|
88
|
+
|
|
89
|
+
- [antilibrary](https://github.com/vxvware/antilibrary) — manage BibTeX libraries from the terminal (can call antifile via `--add-from-files`).
|
|
90
|
+
- [antifind](https://github.com/vxvware/antifind) — online metadata search → BibTeX.
|
|
91
|
+
|
|
92
|
+
## License
|
|
93
|
+
|
|
94
|
+
MIT
|
antifile-0.1.0/README.md
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# antifile
|
|
2
|
+
|
|
3
|
+
Extract BibTeX metadata from PDFs, EPUBs, URLs, and identifiers (DOI, arXiv, ISBN)
|
|
4
|
+
and append it to a `.bib` file.
|
|
5
|
+
|
|
6
|
+
## Install
|
|
7
|
+
|
|
8
|
+
```bash
|
|
9
|
+
pip install antifile
|
|
10
|
+
# or
|
|
11
|
+
uv tool install antifile
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
## Usage
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
antifile INPUT -o refs.bib
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
`INPUT` can be:
|
|
21
|
+
|
|
22
|
+
- a **PDF** or **EPUB** file — `antifile paper.pdf -o refs.bib`
|
|
23
|
+
- a **folder** of PDFs/EPUBs — `antifile ~/Downloads/papers -o refs.bib` (add `--recursive` to descend into subfolders)
|
|
24
|
+
- a **URL** — `antifile https://example.com/article -o refs.bib`
|
|
25
|
+
- a **DOI** — `antifile 10.1145/3292500 -o refs.bib`
|
|
26
|
+
- an **arXiv ID** — `antifile arXiv:1706.03762 -o refs.bib`
|
|
27
|
+
- an **ISBN** — `antifile 9780262033848 -o refs.bib`
|
|
28
|
+
|
|
29
|
+
Entries are appended with de-duplication: a new entry matching an existing one
|
|
30
|
+
(by DOI, arXiv ID, ISBN, or normalized title+author) fills in any missing fields
|
|
31
|
+
rather than creating a duplicate. Pass `--no-merge` to skip on duplicate, or
|
|
32
|
+
`--force` to append anyway with an auto-suffixed key.
|
|
33
|
+
|
|
34
|
+
### Options
|
|
35
|
+
|
|
36
|
+
| flag | effect |
|
|
37
|
+
|------|--------|
|
|
38
|
+
| `-o, --output FILE` | target `.bib` (required; created if missing) |
|
|
39
|
+
| `--method {auto,doi,arxiv,isbn,crossref,llm,claude-code}` | force a PDF extraction method (default: `auto`) |
|
|
40
|
+
| `--recursive` | recurse into subfolders for folder input |
|
|
41
|
+
| `--no-preview` | skip the first-page PDF preview |
|
|
42
|
+
| `--no-merge` | on duplicate, skip instead of filling missing fields |
|
|
43
|
+
| `--force` | append even if a duplicate exists |
|
|
44
|
+
|
|
45
|
+
### LLM-assisted extraction
|
|
46
|
+
|
|
47
|
+
When a PDF has no resolvable identifier, antifile can fall back to an LLM to read
|
|
48
|
+
the first page. Set whichever API key you have — it's picked up from the
|
|
49
|
+
environment:
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
export ANTHROPIC_API_KEY=... # or OPENAI_API_KEY, or GEMINI_API_KEY / GOOGLE_API_KEY
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Related
|
|
56
|
+
|
|
57
|
+
- [antilibrary](https://github.com/vxvware/antilibrary) — manage BibTeX libraries from the terminal (can call antifile via `--add-from-files`).
|
|
58
|
+
- [antifind](https://github.com/vxvware/antifind) — online metadata search → BibTeX.
|
|
59
|
+
|
|
60
|
+
## License
|
|
61
|
+
|
|
62
|
+
MIT
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "antifile"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Extract BibTeX metadata from PDFs, EPUBs, URLs, and identifiers"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.11"
|
|
7
|
+
license = "MIT"
|
|
8
|
+
authors = [
|
|
9
|
+
{ name = "VxVware", email = "trevor.j.vincent@gmail.com" },
|
|
10
|
+
]
|
|
11
|
+
keywords = ["bibtex", "bibliography", "citations", "pdf", "epub", "doi", "arxiv", "isbn", "metadata"]
|
|
12
|
+
classifiers = [
|
|
13
|
+
"Development Status :: 3 - Alpha",
|
|
14
|
+
"Environment :: Console",
|
|
15
|
+
"Intended Audience :: Science/Research",
|
|
16
|
+
"Intended Audience :: End Users/Desktop",
|
|
17
|
+
"Operating System :: OS Independent",
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"Programming Language :: Python :: 3.11",
|
|
20
|
+
"Programming Language :: Python :: 3.12",
|
|
21
|
+
"Programming Language :: Python :: 3.13",
|
|
22
|
+
"Topic :: Text Processing :: Markup :: LaTeX",
|
|
23
|
+
"Topic :: Utilities",
|
|
24
|
+
]
|
|
25
|
+
dependencies = [
|
|
26
|
+
"bibtexparser>=2.0.0b7",
|
|
27
|
+
"rapidfuzz>=3.0",
|
|
28
|
+
"rich>=13.0",
|
|
29
|
+
"httpx>=0.27",
|
|
30
|
+
"pymupdf>=1.24",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[project.optional-dependencies]
|
|
34
|
+
dev = [
|
|
35
|
+
"pytest>=8.0",
|
|
36
|
+
"ruff>=0.4",
|
|
37
|
+
]
|
|
38
|
+
|
|
39
|
+
[project.urls]
|
|
40
|
+
Homepage = "https://github.com/vxvware/antifile"
|
|
41
|
+
Repository = "https://github.com/vxvware/antifile"
|
|
42
|
+
Issues = "https://github.com/vxvware/antifile/issues"
|
|
43
|
+
|
|
44
|
+
[project.scripts]
|
|
45
|
+
antifile = "antifile.cli:main"
|
|
46
|
+
|
|
47
|
+
[build-system]
|
|
48
|
+
requires = ["hatchling"]
|
|
49
|
+
build-backend = "hatchling.build"
|
|
50
|
+
|
|
51
|
+
[tool.hatch.build.targets.wheel]
|
|
52
|
+
packages = ["src/antifile"]
|
|
53
|
+
|
|
54
|
+
[tool.ruff]
|
|
55
|
+
line-length = 100
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""antifile — extract BibTeX metadata from PDFs, URLs, and identifiers."""
|
|
2
|
+
|
|
3
|
+
from antifile.entry import Entry
|
|
4
|
+
from antifile._base import (
|
|
5
|
+
ExtractionResult,
|
|
6
|
+
Identifiers,
|
|
7
|
+
ExtractorError,
|
|
8
|
+
ExtractorUnavailableError,
|
|
9
|
+
NetworkError,
|
|
10
|
+
PDFError,
|
|
11
|
+
)
|
|
12
|
+
from antifile.dispatch import extract
|
|
13
|
+
from antifile import bibfile
|
|
14
|
+
|
|
15
|
+
# Names re-exported as the package's public API.
__all__ = [
    # Entry points
    "extract",
    "bibfile",
    # Data types
    "Entry",
    "ExtractionResult",
    "Identifiers",
    # Exceptions
    "ExtractorError",
    "ExtractorUnavailableError",
    "NetworkError",
    "PDFError",
]
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""Base types for the extractor system."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from antifile.entry import Entry
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ExtractorError(Exception):
    """Common base class for every extractor exception."""


class ExtractorUnavailableError(ExtractorError):
    """Signals that a required dependency is missing."""


class NetworkError(ExtractorError):
    """Signals an API or network failure."""


class PDFError(ExtractorError):
    """Signals a failure while reading a PDF."""
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class Identifiers:
    """Bibliographic identifiers found in a document's text.

    Every list starts out empty; each instance gets its own lists.
    """

    dois: list[str] = field(default_factory=list)
    arxiv_ids: list[str] = field(default_factory=list)
    isbns: list[str] = field(default_factory=list)

    @property
    def has_any(self) -> bool:
        """True when at least one of the three identifier lists is non-empty."""
        return any((self.dois, self.arxiv_ids, self.isbns))
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass
class ExtractionResult:
    """Outcome of a single extraction attempt."""

    # Extracted entry, or None when the attempt produced nothing.
    entry: Entry | None
    # Label of the method/source that produced this result.
    source: str
    # Confidence score for `entry`.
    # NOTE(review): presumably in the range 0.0-1.0 — confirm against producers.
    confidence: float
    # Identifiers discovered along the way (empty by default).
    identifiers: Identifiers = field(default_factory=Identifiers)
    # Non-fatal problems collected during the attempt.
    errors: list[str] = field(default_factory=list)
    # PDF the result was extracted from, when applicable.
    pdf_path: Path | None = None
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
"""Read, dedup, and append BibTeX entries to a .bib file."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import bibtexparser
|
|
8
|
+
|
|
9
|
+
from antifile.entry import Entry
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _btp_to_entry(e, source: Path | None = None) -> Entry:
    """Convert a parsed bibtexparser entry into an ``Entry``.

    Field key/value pairs are copied into a plain dict, and ``source`` is
    recorded as the resulting entry's ``source_file``.
    """
    field_map = {}
    for item in e.fields:
        field_map[item.key] = item.value
    return Entry(
        key=e.key,
        entry_type=e.entry_type,
        fields=field_map,
        source_file=source,
    )
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# Every read and write of the .bib file uses this one explicit encoding, so the
# result no longer depends on the platform's locale default.
_BIB_ENCODING = "utf-8"


def load(path: Path) -> list[Entry]:
    """Load all entries from a .bib file.

    Returns an empty list when the file is missing or contains only
    whitespace. The file is decoded as UTF-8.
    """
    if not path.exists():
        return []
    text = path.read_text(encoding=_BIB_ENCODING)
    if not text.strip():
        return []
    lib = bibtexparser.parse_string(text)
    return [_btp_to_entry(e, path) for e in lib.entries]


def _norm(s: str) -> str:
    """Normalize a field value for comparison.

    Leading/trailing whitespace is dropped, internal whitespace runs
    (including newlines from values wrapped across lines) collapse to a single
    space, and the result is lowercased.
    """
    return " ".join(s.split()).lower()


def find_duplicate(entry: Entry, existing: list[Entry]) -> Entry | None:
    """Return the first entry in `existing` that matches `entry`, else None.

    An existing entry matches when any of the following holds:

    - it has the same citation key;
    - it has the same non-empty normalized ``doi``, ``eprint`` or ``isbn``;
    - both normalized ``title`` and ``author`` are non-empty and equal.
    """
    # NOTE(review): a bare key collision counts as a duplicate here, although the
    # README only lists identifiers and title+author — confirm this is intended.
    wanted_ids = {
        name: _norm(entry.fields.get(name, "")) for name in ("doi", "eprint", "isbn")
    }
    new_title = _norm(entry.fields.get("title", ""))
    new_author = _norm(entry.fields.get("author", ""))

    for e in existing:
        if e.key == entry.key:
            return e
        for name, value in wanted_ids.items():
            # Empty identifiers never match, otherwise two entries that both
            # lack a DOI would be considered duplicates.
            if value and _norm(e.fields.get(name, "")) == value:
                return e
        if (
            new_title
            and new_author
            and _norm(e.fields.get("title", "")) == new_title
            and _norm(e.fields.get("author", "")) == new_author
        ):
            return e
    return None


def make_unique_key(base: str, existing: list[Entry]) -> str:
    """Return `base`, or a suffixed variant that is not already used as a key.

    Suffixes ``a``..``z`` are tried first; if all are taken, the first free
    numeric suffix starting at 1 is used.
    """
    keys = {e.key for e in existing}
    if base not in keys:
        return base
    for suffix in "abcdefghijklmnopqrstuvwxyz":
        candidate = f"{base}{suffix}"
        if candidate not in keys:
            return candidate
    n = 1
    while f"{base}{n}" in keys:
        n += 1
    return f"{base}{n}"


def _append_entry(path: Path, entry: Entry) -> None:
    """Append `entry` as BibTeX to `path`, creating parent directories as needed.

    A blank line separates the new entry from existing content. On return,
    ``entry.source_file`` is set to `path`.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    text = path.read_text(encoding=_BIB_ENCODING) if path.exists() else ""
    separator = "\n\n" if text.strip() else ""
    with open(path, "a", encoding=_BIB_ENCODING) as f:
        f.write(separator + entry.to_bibtex() + "\n")
    entry.source_file = path


def _rewrite(path: Path, entries: list[Entry]) -> None:
    """Replace the contents of `path` with `entries` rendered as BibTeX.

    Entries are separated by a blank line; an empty list yields an empty file.
    """
    content = "\n\n".join(e.to_bibtex() for e in entries)
    path.write_text((content + "\n") if content else "", encoding=_BIB_ENCODING)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def merge_missing(target: Entry, source: Entry) -> bool:
    """Fill empty/absent fields on `target` from `source`.

    A field is copied only when `source` has real content for it (not empty
    and not whitespace-only) and `target` has no content for it. Fields that
    `target` already has are never overwritten.

    Returns True if at least one field on `target` was set.
    """
    changed = False
    for k, v in source.fields.items():
        # Whitespace-only values carry no information; copying them would
        # report a change and make the caller rewrite the file for nothing.
        if not v or not str(v).strip():
            continue
        if not target.fields.get(k, "").strip():
            target.fields[k] = v
            changed = True
    return changed
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
class AppendResult:
    """Outcome of an append: a status string plus the entries involved."""

    __slots__ = ("status", "entry", "duplicate_of")

    def __init__(self, status: str, entry: Entry, duplicate_of: Entry | None = None):
        # `status` is one of "added", "skipped" or "merged".
        self.status, self.entry, self.duplicate_of = status, entry, duplicate_of
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def append(
    path: Path,
    entry: Entry,
    *,
    merge: bool = True,
    force: bool = False,
) -> AppendResult:
    """Add `entry` to the .bib file at `path`, de-duplicating first.

    - With `force`, no duplicate lookup happens: the entry is always appended.
    - Without a duplicate (or with `force`), the key is made unique against
      the keys already in the file and the entry is appended; status is
      ``"added"``.
    - With a duplicate and `merge` True (default), missing/empty fields of the
      existing entry are filled from `entry`. If anything changed the file is
      rewritten and status is ``"merged"``; otherwise ``"skipped"``.
    - With a duplicate and `merge` False, status is ``"skipped"``.
    """
    existing = load(path)
    dup = None if force else find_duplicate(entry, existing)

    if dup is None:
        entry.key = make_unique_key(entry.key, existing)
        _append_entry(path, entry)
        return AppendResult("added", entry)

    if merge and merge_missing(dup, entry):
        # NOTE(review): the file is regenerated from the parsed entries only;
        # presumably anything else in it (comments, @string blocks) is dropped
        # — confirm.
        _rewrite(path, existing)
        return AppendResult("merged", dup, duplicate_of=dup)

    return AppendResult("skipped", dup, duplicate_of=dup)
|
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
"""antifile CLI — extract BibTeX from PDFs / URLs / identifiers and append to a .bib file."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import os
|
|
7
|
+
import sys
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from rich.console import Console
|
|
11
|
+
from rich.syntax import Syntax
|
|
12
|
+
|
|
13
|
+
from antifile import bibfile
|
|
14
|
+
from antifile._base import ExtractionResult, ExtractorUnavailableError, NetworkError
|
|
15
|
+
from antifile.dispatch import extract as dispatch_extract
|
|
16
|
+
from antifile.pdf import (
|
|
17
|
+
extract_from_pdf,
|
|
18
|
+
extract_text,
|
|
19
|
+
extract_probable_title,
|
|
20
|
+
extract_via_claude_code,
|
|
21
|
+
extract_via_llm,
|
|
22
|
+
find_identifiers,
|
|
23
|
+
resolve_arxiv,
|
|
24
|
+
resolve_doi,
|
|
25
|
+
resolve_isbn,
|
|
26
|
+
search_crossref_by_title,
|
|
27
|
+
)
|
|
28
|
+
from antifile.preview import show_pdf_first_page
|
|
29
|
+
|
|
30
|
+
# Shared console used for all CLI output in this module.
console = Console()

# Values accepted by --method.
_METHODS = ("auto", "doi", "arxiv", "isbn", "crossref", "llm", "claude-code")
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _api_keys_from_env() -> dict[str, str]:
|
|
35
|
+
keys: dict[str, str] = {}
|
|
36
|
+
if v := os.environ.get("ANTHROPIC_API_KEY"):
|
|
37
|
+
keys["anthropic"] = v
|
|
38
|
+
if v := os.environ.get("OPENAI_API_KEY"):
|
|
39
|
+
keys["openai"] = v
|
|
40
|
+
if v := os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY"):
|
|
41
|
+
keys["gemini"] = v
|
|
42
|
+
return keys
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _extract_pdf_with_method(
    pdf_path: Path, method: str, api_keys: dict[str, str]
) -> ExtractionResult:
    """Run one specific extraction method against a PDF.

    ``"auto"`` delegates to ``extract_from_pdf``. Every other method works on
    the text of the first three pages: identifier methods try up to three
    found identifiers of that kind, ``"crossref"`` searches by the probable
    title, and the LLM methods read the text directly.

    The returned result carries the identifiers that were found; a produced
    entry gets its ``file`` field set to the PDF path.
    """
    if method == "auto":
        return extract_from_pdf(pdf_path, api_keys=api_keys)

    from antifile._base import Identifiers

    def _first_hit(resolver, candidates):
        # Try at most three candidates, stopping at the first truthy result.
        outcome = None
        for candidate in candidates[:3]:
            outcome = resolver(candidate)
            if outcome:
                break
        return outcome

    text = extract_text(pdf_path, max_pages=3)
    identifiers = find_identifiers(text) if text.strip() else Identifiers()

    entry = None
    if method == "doi":
        entry = _first_hit(resolve_doi, identifiers.dois)
    elif method == "arxiv":
        entry = _first_hit(resolve_arxiv, identifiers.arxiv_ids)
    elif method == "isbn":
        entry = _first_hit(resolve_isbn, identifiers.isbns)
    elif method == "crossref":
        title = extract_probable_title(text)
        if title:
            entry = search_crossref_by_title(title)
    elif method == "llm":
        entry = extract_via_llm(text, api_keys)
    elif method == "claude-code":
        entry = extract_via_claude_code(text)

    if entry:
        entry.fields["file"] = str(pdf_path)

    # Title-search and LLM results get a lower score than identifier lookups.
    if not entry:
        confidence = 0.0
    elif method in ("llm", "claude-code", "crossref"):
        confidence = 0.7
    else:
        confidence = 0.95

    return ExtractionResult(
        entry=entry,
        source=method,
        confidence=confidence,
        identifiers=identifiers,
        pdf_path=pdf_path,
    )
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _extract_one(
    input_str: str, method: str, api_keys: dict[str, str]
) -> ExtractionResult:
    """Extract from one input, applying ``--method`` only to existing PDF files.

    Any other input, and every input when the method is ``"auto"``, goes
    through the generic dispatcher.
    """
    candidate = Path(input_str).expanduser()
    if method == "auto":
        return dispatch_extract(input_str, api_keys=api_keys)

    is_existing_pdf = (
        candidate.exists()
        and candidate.is_file()
        and candidate.suffix.lower() == ".pdf"
    )
    if is_existing_pdf:
        return _extract_pdf_with_method(candidate, method, api_keys)
    return dispatch_extract(input_str, api_keys=api_keys)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _looks_like_path(s: str) -> bool:
|
|
104
|
+
return s.startswith(("/", "~", "./", "../")) or "/" in s
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def _process(
    input_str: str,
    output: Path,
    method: str,
    api_keys: dict[str, str],
    show_preview: bool,
    merge: bool,
    force: bool,
    counters: dict[str, int],
    label: str | None = None,
) -> None:
    """Extract one input, print the outcome, and append the entry to `output`.

    Exactly one of the `counters` keys ``"added"``, ``"merged"``,
    ``"duplicates"``, ``"skipped"`` or ``"failed"`` is incremented per call.
    `label`, when given, replaces `input_str` in the progress line.

    Extraction errors are reported and counted as failures, not raised, so a
    batch run continues with the next input.
    """
    # File names, exception messages and citation keys may contain square
    # brackets, which Rich would otherwise interpret as markup tags.
    from rich.markup import escape

    display = label or input_str
    console.print(f"[cyan]→[/] {escape(display)}")

    path = Path(input_str).expanduser()

    # Detect path-like inputs that don't exist before trying to classify them
    # as URLs/identifiers — saves a confusing "Could not classify" message.
    if _looks_like_path(input_str) and not path.exists():
        console.print(f" [yellow]skipped:[/] path not found ({escape(str(path))})")
        counters["skipped"] += 1
        return

    is_pdf = path.exists() and path.is_file() and path.suffix.lower() == ".pdf"

    if show_preview and is_pdf:
        show_pdf_first_page(path)

    try:
        result = _extract_one(input_str, method, api_keys)
    except Exception as e:
        # CLI boundary: any extraction error is reported, never propagated.
        console.print(f" [red]error:[/] {escape(str(e))}")
        counters["failed"] += 1
        return

    for err in result.errors:
        console.print(f" [yellow]warning:[/] {escape(str(err))}")

    if result.entry is None:
        if result.source == "unknown":
            console.print(" [yellow]skipped:[/] unrecognized input")
            counters["skipped"] += 1
        else:
            console.print(
                f" [red]no entry extracted[/] (source={escape(str(result.source))})"
            )
            counters["failed"] += 1
        return

    conf_color = "green" if result.confidence >= 0.9 else "yellow"
    console.print(
        f" [dim]source:[/] {escape(str(result.source))} "
        f"[{conf_color}]confidence: {result.confidence:.0%}[/]"
    )
    console.print(
        Syntax(result.entry.to_bibtex(), "bibtex", theme="monokai", line_numbers=False, padding=(0, 1))
    )

    res = bibfile.append(output, result.entry, merge=merge, force=force)
    if res.status == "added":
        console.print(
            f" [green]added[/] [bold]{escape(str(res.entry.key))}[/] → {escape(str(output))}"
        )
        counters["added"] += 1
    elif res.status == "merged":
        console.print(
            f" [green]merged[/] missing fields into [bold]{escape(str(res.entry.key))}[/]"
        )
        counters["merged"] += 1
    else:
        # Duplicates are skipped without a per-item message; they are only
        # counted here (the caller decides how to report the total).
        counters["duplicates"] += 1
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
_SUPPORTED_SUFFIXES = (".pdf", ".epub")
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def _iter_supported(folder: Path, recursive: bool) -> list[Path]:
|
|
183
|
+
pattern = "**/*" if recursive else "*"
|
|
184
|
+
return sorted(
|
|
185
|
+
p for p in folder.glob(pattern)
|
|
186
|
+
if p.is_file() and p.suffix.lower() in _SUPPORTED_SUFFIXES
|
|
187
|
+
)
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def _build_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the ``antifile`` command."""
    parser = argparse.ArgumentParser(
        prog="antifile",
        description="Extract BibTeX from PDFs, EPUBs, URLs, DOIs, arXiv IDs, or ISBNs into a .bib file.",
    )
    parser.add_argument("input", help="PDF/EPUB path, folder, URL, DOI, arXiv ID, or ISBN")
    parser.add_argument(
        "-o",
        "--output",
        required=True,
        type=Path,
        help="Target .bib file (created if missing; entries appended with dedup)",
    )
    parser.add_argument(
        "--method",
        choices=_METHODS,
        default="auto",
        help="Force a specific extraction method (PDF only). Default: auto.",
    )
    parser.add_argument(
        "--recursive",
        action="store_true",
        help="Recurse into subfolders for folder input",
    )
    parser.add_argument(
        "--no-preview",
        action="store_true",
        help="Skip first-page PDF preview",
    )
    parser.add_argument(
        "--no-merge",
        action="store_true",
        help="On duplicate, skip without filling missing fields (default: fill missing)",
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Append even if a duplicate exists (auto-suffixes the key)",
    )
    return parser


def main(argv: list[str] | None = None) -> int:
    """Run the CLI and return the process exit code.

    A folder input is expanded to its supported files and each one is
    processed in turn (Ctrl-C stops the batch but still prints the summary);
    any other input is processed as a single item.

    Returns 0 when nothing failed, 1 otherwise. An empty folder returns 0
    without printing a summary.
    """
    args = _build_parser().parse_args(argv)

    api_keys = _api_keys_from_env()
    show_preview = not args.no_preview
    merge = not args.no_merge

    counters = dict.fromkeys(("added", "merged", "duplicates", "failed", "skipped"), 0)

    input_str: str = args.input
    path = Path(input_str).expanduser()

    # Positional arguments shared by every _process call after the input itself.
    shared = (args.output, args.method, api_keys, show_preview, merge, args.force, counters)

    if path.exists() and path.is_dir():
        files = _iter_supported(path, args.recursive)
        if not files:
            console.print(f"[dim]No PDF/EPUB files found in {path}[/]")
            return 0
        total = len(files)
        console.print(f"[cyan]Found {total} file(s) in {path}[/]")
        for i, f in enumerate(files, 1):
            try:
                _process(str(f), *shared, label=f"[{i}/{total}] {f.name}")
            except KeyboardInterrupt:
                console.print("\n[dim]stopped[/]")
                break
    else:
        _process(input_str, *shared)

    console.print()
    console.print(
        f"[green]added: {counters['added']}[/] "
        f"[green]merged: {counters['merged']}[/] "
        f"[dim]duplicates: {counters['duplicates']}[/] "
        f"[yellow]skipped: {counters['skipped']}[/] "
        f"[red]failed: {counters['failed']}[/]"
    )
    return 1 if counters["failed"] != 0 else 0
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
if __name__ == "__main__":
|
|
256
|
+
sys.exit(main())
|