ardive 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ardive/__init__.py +1 -0
- ardive/arxiv.py +69 -0
- ardive/cli.py +91 -0
- ardive/llm.py +109 -0
- ardive-0.1.0.dist-info/METADATA +92 -0
- ardive-0.1.0.dist-info/RECORD +10 -0
- ardive-0.1.0.dist-info/WHEEL +5 -0
- ardive-0.1.0.dist-info/entry_points.txt +2 -0
- ardive-0.1.0.dist-info/licenses/LICENSE +21 -0
- ardive-0.1.0.dist-info/top_level.txt +1 -0
ardive/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.1.0"
|
ardive/arxiv.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""Fetch paper metadata and text from arXiv."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import tempfile
|
|
6
|
+
import urllib.request
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
|
|
9
|
+
import arxiv
|
|
10
|
+
|
|
11
|
+
# Module-level arXiv API client, created once at import time and reused by
# every lookup in this module.
_client = arxiv.Client()
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class Paper:
    """Metadata and text of a single arXiv paper."""

    # arXiv identifier of the paper.
    id: str
    # Paper title.
    title: str
    # Author names combined into one display string.
    authors: str
    # Abstract text.
    abstract: str
    # Body text of the paper; empty unless the creator supplies it.
    full_text: str = ""
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _result_to_paper(result: arxiv.Result, full_text: str = "") -> Paper:
    """Convert an arXiv API result into a ``Paper``.

    The title and summary are stripped of surrounding whitespace and the
    author names are joined with ", ". ``full_text`` is stored unchanged.
    """
    short_id = result.get_short_id()
    title = result.title.strip()
    author_names = [author.name for author in result.authors]
    abstract = result.summary.strip()
    return Paper(short_id, title, ", ".join(author_names), abstract, full_text)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _pdf_text(url: str) -> str:
    """Download the PDF at ``url`` and return its extracted text.

    The text of each page is joined with newlines and the result is stripped;
    a page whose extraction yields nothing contributes an empty string.
    """
    import io

    from pypdf import PdfReader

    req = urllib.request.Request(url, headers={"User-Agent": "arDive"})
    with urllib.request.urlopen(req) as resp:
        data = resp.read()

    # Parse from memory rather than re-opening a NamedTemporaryFile by name:
    # a temporary file that is still open cannot be opened a second time on
    # Windows, so the by-name approach only works on Unix-like systems.
    reader = PdfReader(io.BytesIO(data))
    return "\n".join(page.extract_text() or "" for page in reader.pages).strip()
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def fetch_paper(arxiv_id: str) -> Paper:
    """Fetch one paper by arXiv ID, including full text from its PDF.

    Raises:
        LookupError: if the search for ``arxiv_id`` yields no result.
    """
    try:
        result = next(_client.results(arxiv.Search(id_list=[arxiv_id])))
    except StopIteration:
        # The exhausted iterator is an implementation detail; keep it out of
        # the reported exception context.
        raise LookupError(f"No arXiv paper found with id '{arxiv_id}'.") from None

    # NOTE(review): the arxiv library types `pdf_url` as optional; when it is
    # missing, skip the download and rely on the abstract fallback below.
    text = _pdf_text(result.pdf_url) if result.pdf_url else ""

    # Fall back to the abstract if text extraction came up empty.
    return _result_to_paper(result, full_text=text or result.summary.strip())
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def search_topic(query: str, n: int) -> list[Paper]:
    """Search arXiv by topic; returns metadata + abstracts (no PDF download).

    At most ``n`` results are requested, sorted by relevance.

    Raises:
        LookupError: if the search returns no papers.
    """
    request = arxiv.Search(
        query=query,
        max_results=n,
        sort_by=arxiv.SortCriterion.Relevance,
    )
    found = []
    for entry in _client.results(request):
        found.append(_result_to_paper(entry))
    if found:
        return found
    raise LookupError(f"No arXiv papers found for topic '{query}'.")
|
ardive/cli.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""arDive command-line interface."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import sys
|
|
7
|
+
|
|
8
|
+
# Maps each value accepted by `sum --section` (the keys) to the section name
# that is handed to the summarizer (the values).
SECTIONS = {
    "abstract": "abstract",
    "intro": "introduction",
    "methodology": "methodology",
    "related": "related works",
    "citations": "citations / references",
}
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def positive_int(value: str) -> int:
    """argparse ``type=`` converter that accepts only integers greater than zero.

    Raises:
        argparse.ArgumentTypeError: if ``value`` is not an integer or is <= 0.
    """
    try:
        n = int(value)
    except ValueError:
        # Without this, argparse reports the generic
        # "invalid positive_int value: ..." instead of the message below.
        raise argparse.ArgumentTypeError("must be a positive integer") from None
    if n <= 0:
        raise argparse.ArgumentTypeError("must be a positive integer")
    return n
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _build_parser() -> argparse.ArgumentParser:
    """Assemble the ``ardive`` argument parser and its three sub-commands.

    ``sum``, ``comp`` and ``dig`` each inherit the ``--eli5`` flag from a
    shared parent parser. A sub-command is required and is stored on the
    namespace as ``command``.
    """
    # Parent parser carrying the one flag every sub-command shares.
    shared = argparse.ArgumentParser(add_help=False)
    shared.add_argument("--eli5", action="store_true", help="explain like I'm 5")

    root = argparse.ArgumentParser(
        prog="ardive",
        description="Pull papers from arXiv and summarize / explain them with a local open-source model.",
    )
    commands = root.add_subparsers(dest="command", required=True)

    # sum: one paper, optionally narrowed to a section and a bullet cap.
    sum_cmd = commands.add_parser(
        "sum", parents=[shared], help="summarize a paper in bullet points"
    )
    sum_cmd.add_argument("arxiv_id", help="arXiv id, e.g. 2410.12345")
    sum_cmd.add_argument(
        "--section", choices=list(SECTIONS), help="focus on one section"
    )
    sum_cmd.add_argument(
        "--max-bullets", type=positive_int, help="cap the number of bullets"
    )

    # comp: one or more ids are accepted here; the minimum of two is
    # enforced when the command runs.
    comp_cmd = commands.add_parser(
        "comp", parents=[shared], help="compare two or more papers"
    )
    comp_cmd.add_argument("arxiv_ids", nargs="+", help="two or more arXiv ids")

    # dig: topic search with a configurable number of papers.
    dig_cmd = commands.add_parser(
        "dig", parents=[shared], help="digest a topic of papers"
    )
    dig_cmd.add_argument("topic", help="topic / search query")
    dig_cmd.add_argument(
        "-n", "--num", type=positive_int, default=8, help="papers to pull (default 8)"
    )

    return root
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _run(args: argparse.Namespace) -> str:
    """Execute the parsed sub-command and return the text to print.

    Raises:
        ValueError: if ``comp`` receives fewer than two ids, or if
            ``args.command`` is not one of ``sum``, ``comp`` or ``dig``.
    """
    # Imported lazily so `ardive --help` works without the model deps running.
    from . import arxiv, llm

    command = args.command

    if command == "sum":
        paper = arxiv.fetch_paper(args.arxiv_id)
        section = None
        if args.section:
            section = SECTIONS[args.section]
        return llm.summarize(paper, section, args.max_bullets, args.eli5)

    if command == "comp":
        ids = args.arxiv_ids
        if len(ids) < 2:
            raise ValueError("comp needs at least two arXiv ids")
        fetched = []
        for paper_id in ids:
            fetched.append(arxiv.fetch_paper(paper_id))
        return llm.compare(fetched, args.eli5)

    if command == "dig":
        found = arxiv.search_topic(args.topic, args.num)
        return llm.digest(args.topic, found, args.eli5)

    raise ValueError(f"unknown command: {args.command}")
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def main() -> None:
    """Console entry point: parse the arguments, run the command, print the result.

    Any ``Exception`` raised while running or printing is reported as
    ``error: <message>`` on stderr, and the process exits with status 1.
    """
    parsed = _build_parser().parse_args()
    try:
        output = _run(parsed)
        print(output)
    except Exception as exc:  # clean message, no traceback for expected failures
        print(f"error: {exc}", file=sys.stderr)
        sys.exit(1)
|
ardive/llm.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""Open-source LLM (Ollama) backed summarize / compare / digest builders.
|
|
2
|
+
|
|
3
|
+
Uses a local Ollama server (https://ollama.com) — free, no API key. Swap the
|
|
4
|
+
model with the ARDIVE_MODEL env var; point at a remote server with OLLAMA_HOST.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import os
|
|
10
|
+
|
|
11
|
+
import ollama
|
|
12
|
+
|
|
13
|
+
from .arxiv import Paper
|
|
14
|
+
|
|
15
|
+
# Model name passed to Ollama; override with the ARDIVE_MODEL env var.
MODEL = os.environ.get("ARDIVE_MODEL", "llama3.2")
# Context window. Papers are long; raise this (and your RAM) for big papers.
# Read from ARDIVE_NUM_CTX; a non-integer value raises ValueError at import.
NUM_CTX = int(os.environ.get("ARDIVE_NUM_CTX", "8192"))
# Char budget for supplied paper text, leaving room for the prompt + response so
# the model sees the instruction and the start of the paper instead of truncating.
# Holds back 1500 of NUM_CTX and multiplies the rest by 4 (presumably a
# chars-per-token estimate), never dropping below 2000 characters.
INPUT_CHARS = max(2000, (NUM_CTX - 1500) * 4)

# System message sent with every chat request.
SYSTEM = (
    "You are arDive, a research-paper explainer. You read arXiv papers and "
    "respond in clear Markdown bullet points. Be accurate and concise; do not "
    "invent results that are not in the provided text."
)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _eli5_clause(eli5: bool) -> str:
|
|
30
|
+
if not eli5:
|
|
31
|
+
return ""
|
|
32
|
+
return (
|
|
33
|
+
" Explain like I'm 5: use plain language and everyday analogies, and "
|
|
34
|
+
"avoid jargon and equations."
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _bullets_clause(max_bullets: int | None) -> str:
|
|
39
|
+
return f" Use at most {max_bullets} bullet points." if max_bullets else ""
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _ask(user: str) -> str:
    """Send ``user`` to the configured Ollama model and return its reply.

    The request carries the module's system message and asks for a context
    window of ``NUM_CTX``. The reply text is returned stripped.

    Raises:
        RuntimeError: if a ``ConnectionError`` or ``ollama.ResponseError`` is
            raised by the chat call. The original exception is kept as
            ``__cause__``.
    """
    try:
        response = ollama.chat(
            model=MODEL,
            messages=[
                {"role": "system", "content": SYSTEM},
                {"role": "user", "content": user},
            ],
            options={"num_ctx": NUM_CTX},
        )
    except ConnectionError as exc:
        # Chain explicitly so the underlying failure stays inspectable.
        raise RuntimeError(
            "could not reach Ollama. Install it from https://ollama.com, then "
            "run `ollama serve` and `ollama pull " + MODEL + "`."
        ) from exc
    except ollama.ResponseError as exc:
        raise RuntimeError(
            f"Ollama error: {exc}. Try `ollama pull {MODEL}`."
        ) from exc
    return response["message"]["content"].strip()
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _clip(text: str, max_chars: int) -> str:
|
|
63
|
+
if len(text) <= max_chars:
|
|
64
|
+
return text
|
|
65
|
+
return text[:max_chars] + "\n[... text truncated to fit the context window ...]"
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _paper_block(paper: Paper, body: str) -> str:
|
|
69
|
+
return f"Title: {paper.title}\nAuthors: {paper.authors}\n\n{body}"
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def summarize(
    paper: Paper,
    section: str | None,
    max_bullets: int | None,
    eli5: bool,
) -> str:
    """Ask the model for a bullet-point summary of ``paper``.

    Args:
        paper: the paper to summarize; its ``full_text`` is clipped to
            ``INPUT_CHARS`` before being sent.
        section: if truthy, the prompt asks to focus only on that section.
        max_bullets: if truthy, the prompt caps the number of bullets.
        eli5: if true, the prompt asks for a plain-language explanation.

    Returns:
        The model's reply.
    """
    pieces = ["Summarize the following paper in bullet-point form."]
    if section:
        pieces.append(f" Focus only on the {section} section of the paper.")
    pieces.append(_bullets_clause(max_bullets))
    pieces.append(_eli5_clause(eli5))
    prompt_head = "".join(pieces)

    clipped = _clip(paper.full_text, INPUT_CHARS)
    return _ask(f"{prompt_head}\n\n{_paper_block(paper, clipped)}")
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def compare(papers: list[Paper], eli5: bool) -> str:
    """Ask the model for a bullet-point comparison of ``papers``.

    ``INPUT_CHARS`` is divided evenly between the papers and each paper's
    ``full_text`` is clipped to its share. Paper blocks are separated by a
    ``---`` line.
    """
    prompt_head = (
        "Compare the following papers in bullet-point form. Cover what they "
        "share, how they differ, and their relative strengths and weaknesses."
    ) + _eli5_clause(eli5)

    # max(1, ...) guards the division when the list is empty.
    budget = INPUT_CHARS // max(1, len(papers))
    sections = [
        _paper_block(paper, _clip(paper.full_text, budget)) for paper in papers
    ]
    joined = "\n\n---\n\n".join(sections)
    return _ask(f"{prompt_head}\n\n{joined}")
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def digest(query: str, papers: list[Paper], eli5: bool) -> str:
    """Ask the model for a bullet-point digest of the literature on ``query``.

    Args:
        query: the topic, quoted verbatim in the prompt.
        papers: the papers to digest; only title, authors and abstract are sent.
        eli5: if true, the prompt asks for a plain-language explanation.

    Returns:
        The model's reply.
    """
    instruction = (
        f"Digest the arXiv literature on '{query}' in bullet-point form. "
        "Cover the main themes, the most notable papers, and open questions."
        + _eli5_clause(eli5)
    )
    # Share the input budget evenly between the papers, as compare() does, so
    # a large number of abstracts cannot outgrow the budget. Abstracts that
    # already fit their share are sent unchanged.
    per_paper = INPUT_CHARS // max(1, len(papers))
    blocks = "\n\n".join(
        _paper_block(p, _clip(p.abstract, per_paper)) for p in papers
    )
    return _ask(f"{instruction}\n\n{blocks}")
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ardive
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Command-line agent that pulls papers from arXiv and summarizes / explains them with a local open-source model.
|
|
5
|
+
Author-email: Rohan Kosalge <rohankosalge06@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/rohankosalge/arDive
|
|
8
|
+
Project-URL: Repository, https://github.com/rohankosalge/arDive
|
|
9
|
+
Keywords: arxiv,summarization,cli,ollama,llm
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Environment :: Console
|
|
12
|
+
Classifier: Topic :: Scientific/Engineering
|
|
13
|
+
Requires-Python: >=3.10
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
License-File: LICENSE
|
|
16
|
+
Requires-Dist: ollama>=0.3
|
|
17
|
+
Requires-Dist: arxiv>=2.1
|
|
18
|
+
Requires-Dist: pypdf>=4.0
|
|
19
|
+
Dynamic: license-file
|
|
20
|
+
|
|
21
|
+
# arDive: a simple dive into arXiv
|
|
22
|
+
|
|
23
|
+
A small command-line agent that pulls papers from arXiv and uses Llama3.2 to
|
|
24
|
+
summarize, explain, compare, and digest them. Anyone can install and use it without a paid plan.
|
|
25
|
+
|
|
26
|
+
## Install
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
# 1. Install Ollama (free, runs models locally): https://ollama.com
|
|
30
|
+
ollama pull llama3.2 # or any open model you like
|
|
31
|
+
|
|
32
|
+
# 2. Install arDive
|
|
33
|
+
pip install ardive
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
That's it. arDive talks to the local Ollama server. Pick a
|
|
37
|
+
different model with `ARDIVE_MODEL` (e.g. `export ARDIVE_MODEL=qwen2.5`), or
|
|
38
|
+
point at a remote Ollama with `OLLAMA_HOST`.
|
|
39
|
+
|
|
40
|
+
### From source
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
git clone https://github.com/rohankosalge/arDive
|
|
44
|
+
cd arDive
|
|
45
|
+
pip install -e .
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Usage
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
# Summarize a paper as bullet points
|
|
52
|
+
ardive sum 1234.56789
|
|
53
|
+
|
|
54
|
+
# Focus on one section, cap the bullets
|
|
55
|
+
ardive sum 1234.56789 --section methodology --max-bullets 5
|
|
56
|
+
|
|
57
|
+
# Explain like I'm 5 (works on every command)
|
|
58
|
+
ardive sum 1234.56789 --eli5
|
|
59
|
+
|
|
60
|
+
# Compare two or more papers
|
|
61
|
+
ardive comp 1234.56789 9876.54321
|
|
62
|
+
|
|
63
|
+
# Digest a topic (searches arXiv, default 8 papers)
|
|
64
|
+
ardive dig "diffusion models for protein folding"
|
|
65
|
+
ardive dig "graph neural networks" -n 12
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### Commands
|
|
69
|
+
|
|
70
|
+
| Command | What it does |
|
|
71
|
+
| --- | --- |
|
|
72
|
+
| `sum <id>` | Bullet-point summary of one paper (full PDF text). |
|
|
73
|
+
| `comp <id> <id> [...]` | Compare two or more papers. |
|
|
74
|
+
| `dig <topic>` | Search arXiv by topic and digest the top results. |
|
|
75
|
+
|
|
76
|
+
### Flags
|
|
77
|
+
|
|
78
|
+
- `--eli5` — explain in plain, jargon-free language (all commands).
|
|
79
|
+
- `--section {abstract,intro,methodology,related,citations}` — `sum` only; focus on one section.
|
|
80
|
+
- `--max-bullets N` — `sum` only; cap the number of bullets (positive integer).
|
|
81
|
+
- `-n/--num N` — `dig` only; how many papers to pull (default 8).
|
|
82
|
+
|
|
83
|
+
## How it works
|
|
84
|
+
|
|
85
|
+
`sum` and `comp` download each paper's PDF and extract its full text; `dig`
|
|
86
|
+
searches arXiv and works from abstracts. The text is sent to a local
|
|
87
|
+
open-source model via Ollama (default `llama3.2`) with a prompt tailored to the
|
|
88
|
+
command, and the bullet-point response is printed to stdout.
|
|
89
|
+
|
|
90
|
+
Long papers can exceed the model's context window and be truncated. arDive asks
|
|
91
|
+
Ollama for an 8192-token window by default; raise it (at the cost of more RAM)
|
|
92
|
+
with `export ARDIVE_NUM_CTX=16384`.
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
ardive/__init__.py,sha256=kUR5RAFc7HCeiqdlX36dZOHkUI5wI6V_43RpEcD8b-0,22
|
|
2
|
+
ardive/arxiv.py,sha256=0vicbSuyTu6yO2ei0lXXSKfwxw-QG-5SvHa0aOGfaCs,1949
|
|
3
|
+
ardive/cli.py,sha256=X_RPjvSwj0BlG73-NDiyuYSiXpts5gE2IuD2sQ4_gcU,2814
|
|
4
|
+
ardive/llm.py,sha256=ajiY8aE71yGFMcXB28kxS7FEPZ56r4SS6OHT2ewJs5A,3619
|
|
5
|
+
ardive-0.1.0.dist-info/licenses/LICENSE,sha256=ZxD8S4Ut4ORpsXtj0iXNgXV3fyn5EE3tDdH_QtMBrU4,1070
|
|
6
|
+
ardive-0.1.0.dist-info/METADATA,sha256=5Lg-88MPGery2oBi0ou7b5uBVlt0WDEUu-wlib-XvJQ,2993
|
|
7
|
+
ardive-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
8
|
+
ardive-0.1.0.dist-info/entry_points.txt,sha256=MmcwypMni9-LYbUGoUaynMonpyrtjJ2BUEFZW0nnx2Y,43
|
|
9
|
+
ardive-0.1.0.dist-info/top_level.txt,sha256=BGRIa1zv_vvqhk-HZh2Z1DQlgUNJ1T12kFt9VTY2wYo,7
|
|
10
|
+
ardive-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Rohan Kosalge
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
ardive
|