paper-wiki 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- paper_wiki-0.2.0/PKG-INFO +101 -0
- paper_wiki-0.2.0/README.md +76 -0
- paper_wiki-0.2.0/paper_wiki/__init__.py +1 -0
- paper_wiki-0.2.0/paper_wiki/cli.py +143 -0
- paper_wiki-0.2.0/paper_wiki/deepxiv_client.py +61 -0
- paper_wiki-0.2.0/paper_wiki/llm.py +47 -0
- paper_wiki-0.2.0/paper_wiki/prompts.py +101 -0
- paper_wiki-0.2.0/paper_wiki/wiki.py +167 -0
- paper_wiki-0.2.0/paper_wiki.egg-info/PKG-INFO +101 -0
- paper_wiki-0.2.0/paper_wiki.egg-info/SOURCES.txt +14 -0
- paper_wiki-0.2.0/paper_wiki.egg-info/dependency_links.txt +1 -0
- paper_wiki-0.2.0/paper_wiki.egg-info/entry_points.txt +2 -0
- paper_wiki-0.2.0/paper_wiki.egg-info/requires.txt +5 -0
- paper_wiki-0.2.0/paper_wiki.egg-info/top_level.txt +1 -0
- paper_wiki-0.2.0/pyproject.toml +42 -0
- paper_wiki-0.2.0/setup.cfg +4 -0
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: paper-wiki
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Deep paper reading wiki: search, summarize, and analyze arXiv papers into a three-layer markdown wiki
|
|
5
|
+
Author: mblank
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/mblank5/paper-wiki
|
|
8
|
+
Project-URL: Repository, https://github.com/mblank5/paper-wiki
|
|
9
|
+
Keywords: arxiv,paper,wiki,summarization,cli
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Environment :: Console
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
Requires-Dist: click>=8.0
|
|
21
|
+
Requires-Dist: openai>=1.0
|
|
22
|
+
Requires-Dist: python-dotenv>=1.0
|
|
23
|
+
Requires-Dist: rich>=13.0
|
|
24
|
+
Requires-Dist: requests>=2.28
|
|
25
|
+
|
|
26
|
+
# paper-wiki
|
|
27
|
+
|
|
28
|
+
Deep paper reading wiki: search, summarize, and analyze arXiv papers into a three-layer markdown wiki.
|
|
29
|
+
|
|
30
|
+
## Features
|
|
31
|
+
|
|
32
|
+
- Search arXiv papers via the [deepxiv](https://data.rag.ac.cn) CLI
|
|
33
|
+
- Download paper PDFs
|
|
34
|
+
- Generate per-section Chinese summaries using an LLM
|
|
35
|
+
- Produce a full paper analysis with 5 dimensions:
|
|
36
|
+
- ELI5 (小学生也能懂)
|
|
37
|
+
- Core innovations (核心创新点)
|
|
38
|
+
- Test datasets & metrics (测试集与指标)
|
|
39
|
+
- Core weaknesses (核心缺点)
|
|
40
|
+
- Future directions (后期演进方向)
|
|
41
|
+
|
|
42
|
+
## Three-Layer Wiki
|
|
43
|
+
|
|
44
|
+
```
|
|
45
|
+
wiki/{arxiv_id}/
|
|
46
|
+
├── README.md # Layer 1: full summary + 5 analysis aspects
|
|
47
|
+
├── sections/ # Layer 2: per-section Chinese summaries
|
|
48
|
+
│ ├── 01_introduction.md
|
|
49
|
+
│ ├── 02_method.md
|
|
50
|
+
│ └── ...
|
|
51
|
+
└── paper.pdf # Layer 3: original PDF
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Install
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
pip install paper-wiki
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
### Prerequisites
|
|
61
|
+
|
|
62
|
+
- [deepxiv](https://pypi.org/project/deepxiv/) CLI installed and configured (`pip install deepxiv`)
|
|
63
|
+
- OpenAI-compatible API endpoint
|
|
64
|
+
|
|
65
|
+
## Quick Start
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
# Configure API
|
|
69
|
+
export OPENAI_API_KEY=sk-xxx
|
|
70
|
+
export OPENAI_BASE_URL=https://api.openai.com/v1
|
|
71
|
+
export OPENAI_MODEL=gpt-4o
|
|
72
|
+
|
|
73
|
+
# Process a paper by arXiv ID
|
|
74
|
+
paper-wiki process 2409.05591
|
|
75
|
+
|
|
76
|
+
# Or by URL
|
|
77
|
+
paper-wiki process https://arxiv.org/pdf/2604.27393
|
|
78
|
+
|
|
79
|
+
# Search and interactively select
|
|
80
|
+
paper-wiki search "RAG long context"
|
|
81
|
+
|
|
82
|
+
# Search and auto-select first result
|
|
83
|
+
paper-wiki search "transformer memory" --first
|
|
84
|
+
|
|
85
|
+
# List processed papers
|
|
86
|
+
paper-wiki list
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
## Configuration
|
|
90
|
+
|
|
91
|
+
Create a `.env` file in your working directory:
|
|
92
|
+
|
|
93
|
+
```env
|
|
94
|
+
OPENAI_API_KEY=sk-xxx
|
|
95
|
+
OPENAI_BASE_URL=https://api.openai.com/v1
|
|
96
|
+
OPENAI_MODEL=gpt-4o
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## License
|
|
100
|
+
|
|
101
|
+
MIT
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# paper-wiki
|
|
2
|
+
|
|
3
|
+
Deep paper reading wiki: search, summarize, and analyze arXiv papers into a three-layer markdown wiki.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- Search arXiv papers via the [deepxiv](https://data.rag.ac.cn) CLI
|
|
8
|
+
- Download paper PDFs
|
|
9
|
+
- Generate per-section Chinese summaries using an LLM
|
|
10
|
+
- Produce a full paper analysis with 5 dimensions:
|
|
11
|
+
- ELI5 (小学生也能懂)
|
|
12
|
+
- Core innovations (核心创新点)
|
|
13
|
+
- Test datasets & metrics (测试集与指标)
|
|
14
|
+
- Core weaknesses (核心缺点)
|
|
15
|
+
- Future directions (后期演进方向)
|
|
16
|
+
|
|
17
|
+
## Three-Layer Wiki
|
|
18
|
+
|
|
19
|
+
```
|
|
20
|
+
wiki/{arxiv_id}/
|
|
21
|
+
├── README.md # Layer 1: full summary + 5 analysis aspects
|
|
22
|
+
├── sections/ # Layer 2: per-section Chinese summaries
|
|
23
|
+
│ ├── 01_introduction.md
|
|
24
|
+
│ ├── 02_method.md
|
|
25
|
+
│ └── ...
|
|
26
|
+
└── paper.pdf # Layer 3: original PDF
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## Install
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
pip install paper-wiki
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
### Prerequisites
|
|
36
|
+
|
|
37
|
+
- [deepxiv](https://pypi.org/project/deepxiv/) CLI installed and configured (`pip install deepxiv`)
|
|
38
|
+
- OpenAI-compatible API endpoint
|
|
39
|
+
|
|
40
|
+
## Quick Start
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
# Configure API
|
|
44
|
+
export OPENAI_API_KEY=sk-xxx
|
|
45
|
+
export OPENAI_BASE_URL=https://api.openai.com/v1
|
|
46
|
+
export OPENAI_MODEL=gpt-4o
|
|
47
|
+
|
|
48
|
+
# Process a paper by arXiv ID
|
|
49
|
+
paper-wiki process 2409.05591
|
|
50
|
+
|
|
51
|
+
# Or by URL
|
|
52
|
+
paper-wiki process https://arxiv.org/pdf/2604.27393
|
|
53
|
+
|
|
54
|
+
# Search and interactively select
|
|
55
|
+
paper-wiki search "RAG long context"
|
|
56
|
+
|
|
57
|
+
# Search and auto-select first result
|
|
58
|
+
paper-wiki search "transformer memory" --first
|
|
59
|
+
|
|
60
|
+
# List processed papers
|
|
61
|
+
paper-wiki list
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Configuration
|
|
65
|
+
|
|
66
|
+
Create a `.env` file in your working directory:
|
|
67
|
+
|
|
68
|
+
```env
|
|
69
|
+
OPENAI_API_KEY=sk-xxx
|
|
70
|
+
OPENAI_BASE_URL=https://api.openai.com/v1
|
|
71
|
+
OPENAI_MODEL=gpt-4o
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## License
|
|
75
|
+
|
|
76
|
+
MIT
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""paper-wiki: Deep paper reading wiki generator."""
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
"""CLI entry point for paper-wiki."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
import click
|
|
6
|
+
from rich.console import Console
|
|
7
|
+
from rich.table import Table
|
|
8
|
+
|
|
9
|
+
from . import deepxiv_client as dx
|
|
10
|
+
from .wiki import process_paper, list_papers
|
|
11
|
+
|
|
12
|
+
console = Console()
|
|
13
|
+
|
|
14
|
+
_ARXIV_ID_RE = re.compile(r"(?:^|/)(\d{4}\.\d{4,5})(?:$|\.pdf|v\d+)")
|
|
15
|
+
_ARXIV_URL_RE = re.compile(r"arxiv\.org/(?:abs|pdf)/(\d{4}\.\d{4,5})")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _extract_arxiv_id(text: str) -> str | None:
|
|
19
|
+
"""Extract arXiv ID from raw ID, URL, or return None."""
|
|
20
|
+
text = text.strip()
|
|
21
|
+
# Pure ID like 2604.27393
|
|
22
|
+
if re.fullmatch(r"\d{4}\.\d{4,5}", text):
|
|
23
|
+
return text
|
|
24
|
+
# URL like https://arxiv.org/abs/2604.27393 or .../pdf/2604.27393.pdf
|
|
25
|
+
m = _ARXIV_URL_RE.search(text)
|
|
26
|
+
if m:
|
|
27
|
+
return m.group(1)
|
|
28
|
+
return None
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@click.group()
@click.option("--output-dir", default="wiki", envvar="PAPER_WIKI_DIR", help="Wiki output directory")
@click.pass_context
def cli(ctx, output_dir):
    """paper-wiki: Deep paper reading wiki generator."""
    # The docstring above is shown by click as the group's help text.
    # ensure_object() creates the shared context dict on first use and returns
    # it; sub-commands read "output_dir" back via ctx.obj.
    shared = ctx.ensure_object(dict)
    shared["output_dir"] = output_dir
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@cli.command()
@click.argument("arxiv_id")
@click.option("--model", default=None, help="Override LLM model name")
@click.pass_context
def process(ctx, arxiv_id, model):
    """Process an arXiv paper by ID or URL into a wiki page."""
    wiki_root = ctx.obj["output_dir"]

    # Normalise a URL to a bare ID; input that is not recognised is handed to
    # the pipeline unchanged.
    recognised = _extract_arxiv_id(arxiv_id)
    if recognised:
        arxiv_id = recognised

    console.print(f"[bold blue]Processing paper:[/] {arxiv_id}")
    try:
        result_dir = process_paper(arxiv_id, output_dir=wiki_root, model=model)
        console.print(f"[bold green]Done![/] Wiki page: {result_dir}/README.md")
    except Exception as e:
        # Top-level CLI boundary: report the failure and exit non-zero.
        console.print(f"[bold red]Error:[/] {e}")
        raise SystemExit(1)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@cli.command()
@click.argument("query")
@click.option("--limit", "-l", default=5, help="Number of search results")
@click.option("--first", is_flag=True, help="Auto-select the first result")
@click.option("--model", default=None, help="Override LLM model name")
@click.pass_context
def search(ctx, query, limit, first, model):
    """Search for papers and process into wiki.

    If QUERY looks like an arXiv ID (e.g. 2604.27393) or arXiv URL,
    skip search and process directly.
    """
    output_dir = ctx.obj["output_dir"]

    def _process_and_report(paper_id):
        # Same error reporting as the `process` command: print the failure and
        # exit non-zero instead of dumping a traceback.
        try:
            paper_dir = process_paper(paper_id, output_dir=output_dir, model=model)
        except Exception as e:
            console.print(f"[bold red]Error:[/] {e}")
            raise SystemExit(1)
        console.print(f"[bold green]Done![/] Wiki page: {paper_dir}/README.md")

    # Detect direct arXiv ID / URL
    direct_id = _extract_arxiv_id(query)
    if direct_id:
        console.print(f"[bold blue]Detected arXiv ID:[/] {direct_id}, processing directly...")
        _process_and_report(direct_id)
        return

    console.print(f"[bold blue]Searching:[/] {query}")

    try:
        results = dx.search(query, limit=limit)
    except Exception as e:
        console.print(f"[bold red]Search error:[/] {e}")
        raise SystemExit(1)

    if not results:
        console.print("[yellow]No results found.[/]")
        return

    # Display results
    table = Table(title="Search Results")
    table.add_column("#", style="bold")
    table.add_column("arXiv ID")
    table.add_column("Title")
    table.add_column("Date")

    for i, r in enumerate(results, 1):
        # `or ""` also covers keys that are present but null in the JSON.
        arxiv_id = r.get("arxiv_id") or r.get("id") or ""
        title = (r.get("title") or "")[:60]
        date = (r.get("publish_at") or "")[:10]
        table.add_row(str(i), str(arxiv_id), title, date)

    console.print(table)

    if first:
        selected = results[0]
    else:
        choice = click.prompt("Select a paper (#) or press Enter to cancel", default="")
        if not choice:
            return
        try:
            index = int(choice)
        except ValueError:
            index = 0
        # Explicit range check: without it "0" or a negative number would index
        # from the end of the list and silently select the wrong paper.
        if not 1 <= index <= len(results):
            console.print("[red]Invalid selection.[/]")
            return
        selected = results[index - 1]

    arxiv_id = selected.get("arxiv_id") or selected.get("id") or ""
    if not arxiv_id:
        console.print("[bold red]Error:[/] selected result has no arXiv ID")
        raise SystemExit(1)

    console.print(f"\n[bold blue]Processing:[/] {selected.get('title', arxiv_id)}")
    _process_and_report(arxiv_id)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
@cli.command(name="list")
@click.pass_context
def list_cmd(ctx):
    """List all processed papers in the wiki."""
    wiki_root = ctx.obj["output_dir"]
    entries = list_papers(wiki_root)

    if not entries:
        console.print(f"[yellow]No papers found in {wiki_root}/[/]")
        return

    listing = Table(title="Processed Papers")
    listing.add_column("arXiv ID", style="bold")
    for heading in ("Title", "Path"):
        listing.add_column(heading)

    # One row per paper; titles are cut to 60 characters for display.
    for entry in entries:
        listing.add_row(entry["id"], entry["title"][:60], entry["path"])

    console.print(listing)
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""Wrapper around the deepxiv CLI for paper search and retrieval."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import subprocess
|
|
5
|
+
|
|
6
|
+
import requests
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class DeepxivError(Exception):
    """Error from deepxiv CLI."""


def _run(cmd: list[str]) -> str:
    """Run *cmd* as a subprocess and return its captured standard output.

    Args:
        cmd: Argument vector; executed directly, without a shell.

    Returns:
        The process's stdout decoded as text.

    Raises:
        DeepxivError: If the executable cannot be started (for example the
            ``deepxiv`` CLI is not installed or not on PATH) or the process
            exits with a non-zero status. The message contains the command
            line followed by stderr, or the exit code when stderr is empty.
    """
    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
    except OSError as exc:
        # FileNotFoundError / PermissionError would otherwise escape as a raw
        # OS error instead of this module's own exception type.
        raise DeepxivError(
            f"{' '.join(cmd)}\ncould not start '{cmd[0]}' "
            f"(is the deepxiv CLI installed and on PATH?): {exc}"
        ) from exc
    if result.returncode != 0:
        msg = result.stderr.strip() or f"deepxiv exited with code {result.returncode}"
        raise DeepxivError(f"{' '.join(cmd)}\n{msg}")
    return result.stdout
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def search(query: str, limit: int = 10) -> list[dict]:
    """Search papers on arXiv. Returns list of result dicts.

    Runs ``deepxiv search <query> -f json -l <limit>`` and parses its JSON
    output. The payload may be either a bare list of results or an object
    holding the list under ``"result"`` (preferred) or ``"results"``.

    Args:
        query: Free-text search query.
        limit: Maximum number of results requested from the CLI.

    Returns:
        The list of result dicts; an empty list when the payload has no
        result list.

    Raises:
        DeepxivError: If the CLI call fails.
        json.JSONDecodeError: If the CLI output is not valid JSON.
    """
    out = _run(["deepxiv", "search", query, "-f", "json", "-l", str(limit)])
    data = json.loads(out)
    if isinstance(data, dict):
        data = data.get("result", data.get("results", []))
    # Anything that is not a list (null, string, nested object) would break
    # callers that iterate over the results, so it is treated as "no results".
    return data if isinstance(data, list) else []
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def get_metadata(arxiv_id: str) -> dict:
    """Return the parsed JSON output of ``deepxiv paper <id> --head -f json``.

    Other code in this package reads the keys ``title``, ``authors``,
    ``publish_at``, ``sections`` and ``src_url`` from the result.
    """
    command = ["deepxiv", "paper", arxiv_id, "--head", "-f", "json"]
    return json.loads(_run(command))
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def get_brief(arxiv_id: str) -> dict:
    """Return the parsed JSON output of ``deepxiv paper <id> --brief -f json``.

    NOTE(review): the original docstring lists title, TLDR, keywords,
    citations and github_url as contents; the schema is not visible here.
    """
    command = ["deepxiv", "paper", arxiv_id, "--brief", "-f", "json"]
    return json.loads(_run(command))
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def get_section(arxiv_id: str, section_name: str) -> str:
    """Return the raw output of ``deepxiv paper <id> --section <name>``.

    The CLI output is returned unparsed; the original docstring describes it
    as the section's full markdown content.
    """
    command = ["deepxiv", "paper", arxiv_id, "--section", section_name]
    return _run(command)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def get_sections_list(arxiv_id: str) -> list[dict]:
    """Return the ``sections`` entry of the paper metadata.

    Falls back to an empty list when the metadata has no ``sections`` key.
    """
    return get_metadata(arxiv_id).get("sections", [])
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def download_pdf(arxiv_id: str, dest_path: str) -> str:
    """Download paper PDF to dest_path. Returns dest_path.

    The URL is taken from the metadata's ``src_url`` field, falling back to
    ``https://arxiv.org/pdf/<arxiv_id>`` when that field is missing or empty.

    The response is streamed to ``<dest_path>.part`` in chunks and moved into
    place only after the download has completed. A failed or interrupted
    download therefore never leaves a truncated PDF at ``dest_path``, and the
    whole file is never held in memory.

    Args:
        arxiv_id: Paper identifier passed to the metadata lookup.
        dest_path: Destination file path; its parent directory must exist.

    Returns:
        ``dest_path``, unchanged.

    Raises:
        DeepxivError: If the metadata lookup fails.
        requests.RequestException: On network errors or a non-2xx response.
        OSError: If the file cannot be written.
    """
    import os  # local import: only needed for the atomic rename and cleanup

    meta = get_metadata(arxiv_id)
    url = meta.get("src_url") or f"https://arxiv.org/pdf/{arxiv_id}"
    partial_path = f"{dest_path}.part"
    try:
        with requests.get(url, timeout=120, stream=True) as resp:
            resp.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in resp.iter_content(chunk_size=64 * 1024):
                    f.write(chunk)
        os.replace(partial_path, dest_path)
    except BaseException:
        # Remove the incomplete temporary file, then let the error propagate.
        try:
            os.remove(partial_path)
        except OSError:
            pass
        raise
    return dest_path
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""OpenAI-compatible LLM client for paper analysis."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
from dotenv import load_dotenv
|
|
6
|
+
from openai import OpenAI
|
|
7
|
+
|
|
8
|
+
load_dotenv()
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _default_client() -> OpenAI:
    """Build an OpenAI client from environment configuration.

    Reads ``OPENAI_API_KEY`` (default: empty string) and ``OPENAI_BASE_URL``
    (default: ``https://api.openai.com/v1``).
    """
    settings = {
        "api_key": os.getenv("OPENAI_API_KEY", ""),
        "base_url": os.getenv("OPENAI_BASE_URL", "https://api.openai.com/v1"),
    }
    return OpenAI(**settings)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Module-level cache for the client; populated on the first get_client() call.
_client: OpenAI | None = None


def get_client() -> OpenAI:
    """Return the cached OpenAI client, creating it on first use."""
    global _client
    if _client is not None:
        return _client
    _client = _default_client()
    return _client
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def chat(
    system_prompt: str,
    user_content: str,
    model: str | None = None,
    temperature: float = 0.3,
    max_tokens: int = 4096,
) -> str:
    """Run one system+user chat completion and return the reply text.

    Args:
        system_prompt: Content of the ``system`` message.
        user_content: Content of the ``user`` message.
        model: Model name; when falsy, ``OPENAI_MODEL`` is read from the
            environment, defaulting to ``gpt-4o``.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Completion token limit forwarded to the API.

    Returns:
        The first choice's message content, or ``""`` when it is empty/None.
    """
    # NOTE(review): the wiki pipeline passes the whole prompt as the system
    # message with an empty user message — confirm the configured endpoint
    # accepts an empty user turn.
    api = get_client()
    chosen_model = model if model else os.getenv("OPENAI_MODEL", "gpt-4o")
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_content},
    ]
    response = api.chat.completions.create(
        model=chosen_model,
        temperature=temperature,
        max_tokens=max_tokens,
        messages=conversation,
    )
    reply = response.choices[0].message.content
    return reply if reply else ""
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
"""LLM prompt templates for paper analysis."""
|
|
2
|
+
|
|
3
|
+
# Persona preamble prepended to every prompt template in this module.
# English gist: "You are a senior AI/CS research-paper analyst, skilled at
# academic analysis and plain-language explanation in Chinese."
SYSTEM_ROLE = "你是一位资深AI/CS领域的研究论文分析专家,擅长用中文进行学术分析和通俗讲解。"
|
|
4
|
+
|
|
5
|
+
SECTION_SUMMARY_PROMPT = SYSTEM_ROLE + """
|
|
6
|
+
|
|
7
|
+
请对以下论文章节内容进行深度总结。要求:
|
|
8
|
+
1. 用中文撰写,300-500字
|
|
9
|
+
2. 准确概括该章节的核心内容、方法论和关键发现
|
|
10
|
+
3. 保留重要的技术细节和数学概念(如有)
|
|
11
|
+
4. 语言专业但清晰易懂
|
|
12
|
+
|
|
13
|
+
章节名称:{section_name}
|
|
14
|
+
|
|
15
|
+
章节内容:
|
|
16
|
+
{content}
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
FULL_SUMMARY_PROMPT = SYSTEM_ROLE + """
|
|
20
|
+
|
|
21
|
+
以下是一篇论文各章节的中文总结。请基于这些章节总结,撰写一篇完整的论文概述。要求:
|
|
22
|
+
1. 用中文撰写,800-1200字
|
|
23
|
+
2. 涵盖论文的研究背景、核心方法、实验结果和结论
|
|
24
|
+
3. 逻辑连贯,层次分明
|
|
25
|
+
4. 面向具有一定学术背景的读者
|
|
26
|
+
|
|
27
|
+
论文标题:{title}
|
|
28
|
+
|
|
29
|
+
各章节总结:
|
|
30
|
+
{sections}
|
|
31
|
+
"""
|
|
32
|
+
|
|
33
|
+
ELI5_PROMPT = SYSTEM_ROLE + """
|
|
34
|
+
|
|
35
|
+
请用小学生(10岁左右)能听懂的方式,解释以下论文的核心内容。要求:
|
|
36
|
+
1. 用中文撰写,200-400字
|
|
37
|
+
2. 用生活中的比喻来解释复杂概念
|
|
38
|
+
3. 不要使用专业术语,如果必须使用则要给出通俗解释
|
|
39
|
+
4. 让孩子能理解这篇论文"解决了一个什么问题"以及"怎么解决的"
|
|
40
|
+
|
|
41
|
+
论文标题:{title}
|
|
42
|
+
|
|
43
|
+
论文概述:
|
|
44
|
+
{summary}
|
|
45
|
+
"""
|
|
46
|
+
|
|
47
|
+
INNOVATIONS_PROMPT = SYSTEM_ROLE + """
|
|
48
|
+
|
|
49
|
+
请分析以下论文的核心创新点。要求:
|
|
50
|
+
1. 用中文列出3-5个核心创新点
|
|
51
|
+
2. 每个创新点用简洁的一句话概括,然后附上简短说明
|
|
52
|
+
3. 重点区分"真正的新贡献"和"已有方法的改进"
|
|
53
|
+
4. 按重要性排序
|
|
54
|
+
|
|
55
|
+
论文标题:{title}
|
|
56
|
+
|
|
57
|
+
论文概述:
|
|
58
|
+
{summary}
|
|
59
|
+
"""
|
|
60
|
+
|
|
61
|
+
TEST_METRICS_PROMPT = SYSTEM_ROLE + """
|
|
62
|
+
|
|
63
|
+
请提取以下论文中使用的测试集和评估指标。要求:
|
|
64
|
+
1. 列出论文使用的主要测试/评估数据集
|
|
65
|
+
2. 列出主要评估指标及论文报告的具体数值
|
|
66
|
+
3. 如有对比实验,列出基线方法和本文方法的结果对比
|
|
67
|
+
4. 用中文撰写,但数据集名称和指标名称保留英文原文
|
|
68
|
+
|
|
69
|
+
论文标题:{title}
|
|
70
|
+
|
|
71
|
+
论文概述:
|
|
72
|
+
{summary}
|
|
73
|
+
"""
|
|
74
|
+
|
|
75
|
+
WEAKNESSES_PROMPT = SYSTEM_ROLE + """
|
|
76
|
+
|
|
77
|
+
请分析以下论文的核心缺点和局限性。要求:
|
|
78
|
+
1. 用中文列出3-5个主要缺点
|
|
79
|
+
2. 从以下维度分析:方法论的局限性、实验设计的不足、适用范围的限制、潜在的偏差
|
|
80
|
+
3. 客观公正,既不过分苛刻也不回避问题
|
|
81
|
+
4. 每个缺点给出具体论据支持
|
|
82
|
+
|
|
83
|
+
论文标题:{title}
|
|
84
|
+
|
|
85
|
+
论文概述:
|
|
86
|
+
{summary}
|
|
87
|
+
"""
|
|
88
|
+
|
|
89
|
+
FUTURE_DIRECTIONS_PROMPT = SYSTEM_ROLE + """
|
|
90
|
+
|
|
91
|
+
请基于以下论文的内容,分析其后期可能的研究演进方向。要求:
|
|
92
|
+
1. 用中文列出3-5个未来研究方向
|
|
93
|
+
2. 包括论文作者自己提到的方向和你基于领域知识的合理推测
|
|
94
|
+
3. 每个方向说明:要解决什么问题、可能的思路、预期影响
|
|
95
|
+
4. 结合当前AI领域的最新发展趋势
|
|
96
|
+
|
|
97
|
+
论文标题:{title}
|
|
98
|
+
|
|
99
|
+
论文概述:
|
|
100
|
+
{summary}
|
|
101
|
+
"""
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
"""Wiki generation orchestration: fetch paper, summarize, assemble."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
import os
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from rich.progress import Progress, SpinnerColumn, TextColumn
|
|
8
|
+
|
|
9
|
+
from . import deepxiv_client as dx
|
|
10
|
+
from . import llm
|
|
11
|
+
from . import prompts
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _slug(name: str) -> str:
    """Convert section name to a file-system friendly slug.

    A leading section number (``"1. "``, ``"3.1 "``) or appendix label
    (``"Appendix A "``, ``"Appendix B: "``) is removed. The rest is
    lower-cased and every run of characters outside ``a-z0-9`` becomes a
    single underscore.

    Returns:
        The slug, or ``"section"`` when nothing usable remains (empty or
        fully non-ASCII names).
    """
    # The prefix must end in a dot/colon, whitespace or end of string. This
    # keeps real words intact: "2D Convolution" keeps its "2", and
    # "Appendix Additional Results" is not read as "Appendix A" + "dditional".
    s = re.sub(
        r"^(?:\d+(?:\.\d+)*(?:\.\s*|\s+|$)|Appendix\s+[A-Z](?:[.:]\s*|\s+|$))",
        "",
        name,
    )
    s = s.strip().lower()
    s = re.sub(r"[^a-z0-9]+", "_", s)
    return s.strip("_") or "section"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def process_paper(arxiv_id: str, output_dir: str = "wiki", model: str | None = None) -> Path:
    """Full pipeline: fetch paper, summarize sections, generate wiki.

    Steps:
      1. Fetch paper metadata through the deepxiv client.
      2. Download the PDF to ``<output_dir>/<arxiv_id>/paper.pdf``.
      3. Summarize every section listed in the metadata with the LLM and write
         each summary to ``sections/NN_<slug>.md``.
      4. Generate a full-paper summary from the section summaries.
      5. Generate the five analysis aspects from the full summary.
      6. Assemble and write ``README.md``.

    The output directories are created only after the metadata fetch has
    succeeded, so an unknown ID does not leave empty directories behind.

    Args:
        arxiv_id: Paper identifier; also used as the directory name.
        output_dir: Root directory of the wiki.
        model: Optional LLM model override forwarded to ``llm.chat``.

    Returns:
        Path of the paper's wiki directory.

    Raises:
        Whatever the deepxiv client, the PDF download, the LLM client or the
        file system raises; nothing is caught here.
    """
    paper_dir = Path(output_dir) / arxiv_id
    sections_dir = paper_dir / "sections"

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        transient=True,
    ) as progress:
        # 1. Fetch metadata
        t = progress.add_task("Fetching paper metadata...", total=None)
        metadata = dx.get_metadata(arxiv_id)
        # `or` fallbacks also cover keys that are present but null.
        title = metadata.get("title") or arxiv_id
        raw_authors = metadata.get("authors") or []
        if isinstance(raw_authors, str):
            authors = raw_authors
        else:
            # Entries may be dicts with a "name" key or plain values.
            authors = ", ".join(
                str(a.get("name", "")) if isinstance(a, dict) else str(a)
                for a in raw_authors
            )
        pub_date = (metadata.get("publish_at") or "")[:10]
        sections_list = metadata.get("sections") or []
        progress.update(t, completed=True)

        # Create the output tree only once the paper is known to exist.
        sections_dir.mkdir(parents=True, exist_ok=True)

        # 2. Download PDF
        progress.update(t, description="Downloading PDF...")
        dx.download_pdf(arxiv_id, str(paper_dir / "paper.pdf"))
        progress.update(t, completed=True)

        # 3. Summarize each section
        section_summaries: list[tuple[str, str, str]] = []  # (name, filename, summary)
        for i, sec in enumerate(sections_list, 1):
            sec_name = sec["name"]
            progress.update(t, description=f"Summarizing section: {sec_name}...")
            filename = f"{i:02d}_{_slug(sec_name)}.md"

            content = dx.get_section(arxiv_id, sec_name)
            # The complete prompt is sent as the system message; the user
            # message is intentionally empty.
            summary = llm.chat(
                prompts.SECTION_SUMMARY_PROMPT.format(section_name=sec_name, content=content),
                "",
                model=model,
            )

            sec_path = sections_dir / filename
            sec_path.write_text(f"# {sec_name}\n\n{summary}\n", encoding="utf-8")
            section_summaries.append((sec_name, filename, summary))

        progress.update(t, completed=True)

        # 4. Full paper summary from section summaries
        progress.update(t, description="Generating full paper summary...")
        all_sections_text = "\n\n---\n\n".join(
            f"### {name}\n{summary}" for name, _, summary in section_summaries
        )
        full_summary = llm.chat(
            prompts.FULL_SUMMARY_PROMPT.format(title=title, sections=all_sections_text),
            "",
            model=model,
        )
        progress.update(t, completed=True)

        # 5. Five analysis aspects
        aspects = {}
        aspect_defs = [
            ("eli5", "小学生也能懂", prompts.ELI5_PROMPT),
            ("innovations", "核心创新点", prompts.INNOVATIONS_PROMPT),
            ("test_metrics", "测试集与指标", prompts.TEST_METRICS_PROMPT),
            ("weaknesses", "核心缺点", prompts.WEAKNESSES_PROMPT),
            ("future", "后期演进方向", prompts.FUTURE_DIRECTIONS_PROMPT),
        ]
        for key, label, prompt_tmpl in aspect_defs:
            progress.update(t, description=f"Analyzing: {label}...")
            aspects[key] = llm.chat(
                prompt_tmpl.format(title=title, summary=full_summary),
                "",
                model=model,
            )
        progress.update(t, completed=True)

    # 6. Assemble README.md
    section_links = "\n".join(
        f"- [{name}](sections/{fname})" for name, fname, _ in section_summaries
    )

    # The first line must stay "# <title>": list_papers() reads it back as the
    # paper title.
    readme = f"""# {title}

> Authors: {authors} | Published: {pub_date} | arXiv: [{arxiv_id}](https://arxiv.org/abs/{arxiv_id})

## 论文概述

{full_summary}

## 小学生也能懂

{aspects["eli5"]}

## 核心创新点

{aspects["innovations"]}

## 测试集与指标

{aspects["test_metrics"]}

## 核心缺点

{aspects["weaknesses"]}

## 后期演进方向

{aspects["future"]}

---

## 章节详解

{section_links}

## 原文

- [PDF](paper.pdf)
"""

    readme_path = paper_dir / "README.md"
    readme_path.write_text(readme, encoding="utf-8")
    return paper_dir
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def list_papers(output_dir: str = "wiki") -> list[dict]:
    """List all processed papers in the wiki directory.

    A paper is any direct sub-directory of *output_dir* that contains a
    ``README.md`` file. The title is the README's first line with its leading
    markdown heading marker removed.

    Args:
        output_dir: Root directory of the wiki.

    Returns:
        One dict per paper with keys ``id`` (directory name), ``title`` and
        ``path``, sorted by directory path. Empty when *output_dir* does not
        exist or is not a directory.
    """
    wiki = Path(output_dir)
    if not wiki.is_dir():
        return []
    papers = []
    for d in sorted(wiki.iterdir()):
        readme = d / "README.md"
        if not (d.is_dir() and readme.is_file()):
            continue
        # Only the first line is needed, so do not read the whole file.
        with readme.open(encoding="utf-8") as f:
            first_line = f.readline()
        # Remove only the heading marker ("# "). str.lstrip("# ") would treat
        # its argument as a character set and also eat a title's own leading
        # "#", e.g. "# #1 Ranked" -> "1 Ranked".
        title = re.sub(r"^#+\s*", "", first_line.strip()).strip()
        papers.append({"id": d.name, "title": title, "path": str(d)})
    return papers
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: paper-wiki
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Deep paper reading wiki: search, summarize, and analyze arXiv papers into a three-layer markdown wiki
|
|
5
|
+
Author: mblank
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/mblank5/paper-wiki
|
|
8
|
+
Project-URL: Repository, https://github.com/mblank5/paper-wiki
|
|
9
|
+
Keywords: arxiv,paper,wiki,summarization,cli
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Environment :: Console
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
Requires-Dist: click>=8.0
|
|
21
|
+
Requires-Dist: openai>=1.0
|
|
22
|
+
Requires-Dist: python-dotenv>=1.0
|
|
23
|
+
Requires-Dist: rich>=13.0
|
|
24
|
+
Requires-Dist: requests>=2.28
|
|
25
|
+
|
|
26
|
+
# paper-wiki
|
|
27
|
+
|
|
28
|
+
Deep paper reading wiki: search, summarize, and analyze arXiv papers into a three-layer markdown wiki.
|
|
29
|
+
|
|
30
|
+
## Features
|
|
31
|
+
|
|
32
|
+
- Search arXiv papers via the [deepxiv](https://data.rag.ac.cn) CLI
|
|
33
|
+
- Download paper PDFs
|
|
34
|
+
- Generate per-section Chinese summaries using an LLM
|
|
35
|
+
- Produce a full paper analysis with 5 dimensions:
|
|
36
|
+
- ELI5 (小学生也能懂)
|
|
37
|
+
- Core innovations (核心创新点)
|
|
38
|
+
- Test datasets & metrics (测试集与指标)
|
|
39
|
+
- Core weaknesses (核心缺点)
|
|
40
|
+
- Future directions (后期演进方向)
|
|
41
|
+
|
|
42
|
+
## Three-Layer Wiki
|
|
43
|
+
|
|
44
|
+
```
|
|
45
|
+
wiki/{arxiv_id}/
|
|
46
|
+
├── README.md # Layer 1: full summary + 5 analysis aspects
|
|
47
|
+
├── sections/ # Layer 2: per-section Chinese summaries
|
|
48
|
+
│ ├── 01_introduction.md
|
|
49
|
+
│ ├── 02_method.md
|
|
50
|
+
│ └── ...
|
|
51
|
+
└── paper.pdf # Layer 3: original PDF
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Install
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
pip install paper-wiki
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
### Prerequisites
|
|
61
|
+
|
|
62
|
+
- [deepxiv](https://pypi.org/project/deepxiv/) CLI installed and configured (`pip install deepxiv`)
|
|
63
|
+
- OpenAI-compatible API endpoint
|
|
64
|
+
|
|
65
|
+
## Quick Start
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
# Configure API
|
|
69
|
+
export OPENAI_API_KEY=sk-xxx
|
|
70
|
+
export OPENAI_BASE_URL=https://api.openai.com/v1
|
|
71
|
+
export OPENAI_MODEL=gpt-4o
|
|
72
|
+
|
|
73
|
+
# Process a paper by arXiv ID
|
|
74
|
+
paper-wiki process 2409.05591
|
|
75
|
+
|
|
76
|
+
# Or by URL
|
|
77
|
+
paper-wiki process https://arxiv.org/pdf/2604.27393
|
|
78
|
+
|
|
79
|
+
# Search and interactively select
|
|
80
|
+
paper-wiki search "RAG long context"
|
|
81
|
+
|
|
82
|
+
# Search and auto-select first result
|
|
83
|
+
paper-wiki search "transformer memory" --first
|
|
84
|
+
|
|
85
|
+
# List processed papers
|
|
86
|
+
paper-wiki list
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
## Configuration
|
|
90
|
+
|
|
91
|
+
Create a `.env` file in your working directory:
|
|
92
|
+
|
|
93
|
+
```env
|
|
94
|
+
OPENAI_API_KEY=sk-xxx
|
|
95
|
+
OPENAI_BASE_URL=https://api.openai.com/v1
|
|
96
|
+
OPENAI_MODEL=gpt-4o
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## License
|
|
100
|
+
|
|
101
|
+
MIT
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
paper_wiki/__init__.py
|
|
4
|
+
paper_wiki/cli.py
|
|
5
|
+
paper_wiki/deepxiv_client.py
|
|
6
|
+
paper_wiki/llm.py
|
|
7
|
+
paper_wiki/prompts.py
|
|
8
|
+
paper_wiki/wiki.py
|
|
9
|
+
paper_wiki.egg-info/PKG-INFO
|
|
10
|
+
paper_wiki.egg-info/SOURCES.txt
|
|
11
|
+
paper_wiki.egg-info/dependency_links.txt
|
|
12
|
+
paper_wiki.egg-info/entry_points.txt
|
|
13
|
+
paper_wiki.egg-info/requires.txt
|
|
14
|
+
paper_wiki.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
paper_wiki
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "paper-wiki"
|
|
3
|
+
version = "0.2.0"
|
|
4
|
+
description = "Deep paper reading wiki: search, summarize, and analyze arXiv papers into a three-layer markdown wiki"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
license = "MIT"
|
|
7
|
+
requires-python = ">=3.10"
|
|
8
|
+
authors = [
|
|
9
|
+
{ name = "mblank" },
|
|
10
|
+
]
|
|
11
|
+
keywords = ["arxiv", "paper", "wiki", "summarization", "cli"]
|
|
12
|
+
classifiers = [
|
|
13
|
+
"Development Status :: 3 - Alpha",
|
|
14
|
+
"Environment :: Console",
|
|
15
|
+
"Intended Audience :: Science/Research",
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"Programming Language :: Python :: 3.10",
|
|
18
|
+
"Programming Language :: Python :: 3.11",
|
|
19
|
+
"Programming Language :: Python :: 3.12",
|
|
20
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
21
|
+
]
|
|
22
|
+
dependencies = [
|
|
23
|
+
"click>=8.0",
|
|
24
|
+
"openai>=1.0",
|
|
25
|
+
"python-dotenv>=1.0",
|
|
26
|
+
"rich>=13.0",
|
|
27
|
+
"requests>=2.28",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
[project.urls]
|
|
31
|
+
Homepage = "https://github.com/mblank5/paper-wiki"
|
|
32
|
+
Repository = "https://github.com/mblank5/paper-wiki"
|
|
33
|
+
|
|
34
|
+
[project.scripts]
|
|
35
|
+
paper-wiki = "paper_wiki.cli:cli"
|
|
36
|
+
|
|
37
|
+
[build-system]
|
|
38
|
+
requires = ["setuptools>=68.0"]
|
|
39
|
+
build-backend = "setuptools.build_meta"
|
|
40
|
+
|
|
41
|
+
[tool.setuptools.packages.find]
|
|
42
|
+
include = ["paper_wiki*"]
|