arxiv-ingest 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arxiv_ingest-0.2.0/.claude/commands/arxiv-ingest.md +66 -0
- arxiv_ingest-0.2.0/.github/workflows/daily.yml +76 -0
- arxiv_ingest-0.2.0/.gitignore +5 -0
- arxiv_ingest-0.2.0/PKG-INFO +136 -0
- arxiv_ingest-0.2.0/README.md +117 -0
- arxiv_ingest-0.2.0/config.yaml +43 -0
- arxiv_ingest-0.2.0/config.yaml.example +59 -0
- arxiv_ingest-0.2.0/main.py +69 -0
- arxiv_ingest-0.2.0/pyproject.toml +40 -0
- arxiv_ingest-0.2.0/scripts/__init__.py +0 -0
- arxiv_ingest-0.2.0/scripts/fetch.py +123 -0
- arxiv_ingest-0.2.0/scripts/generate.py +230 -0
- arxiv_ingest-0.2.0/tests/__init__.py +0 -0
- arxiv_ingest-0.2.0/tests/test_fetch.py +72 -0
- arxiv_ingest-0.2.0/tests/test_generate.py +124 -0
- arxiv_ingest-0.2.0/uv.lock +424 -0
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
# /arxiv-ingest
|
|
2
|
+
|
|
3
|
+
arXivから論文を収集し、research-wikiへ自動取り込みするワークフロー。
|
|
4
|
+
|
|
5
|
+
## 実行ステップ
|
|
6
|
+
|
|
7
|
+
### Step 1: 論文を収集する
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
cd ~/workspace/arxiv-ingest
|
|
11
|
+
uv run python scripts/fetch.py
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
`data/fetched.json` に論文リストが保存される。
|
|
15
|
+
|
|
16
|
+
### Step 2: スケルトンファイルを生成する
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
uv run python scripts/generate.py
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
`research-wiki/sources/{Category}/{slug}.md`(メタデータ)と
|
|
23
|
+
`research-wiki/evidence/{Category}/{slug}.md`(空のテンプレート)と
|
|
24
|
+
`research-wiki/wiki/papers/{Category}/{slug}.md`(空のテンプレート)が作成される。
|
|
25
|
+
|
|
26
|
+
### Step 3: 各論文の evidence と wiki を Claude が記入する
|
|
27
|
+
|
|
28
|
+
`data/fetched.json` を読み込み、各論文について以下を実行する:
|
|
29
|
+
|
|
30
|
+
1. 論文のアブストラクト(`data/fetched.json` の `abstract` フィールド)を読む
|
|
31
|
+
2. PDF URL(`pdf_url` フィールド)から論文を取得して内容を把握する(可能な場合)
|
|
32
|
+
3. `research-wiki/evidence/{Category}/{slug}.md` を更新する:
|
|
33
|
+
- 主要な主張(3〜5点)
|
|
34
|
+
- 主要な貢献
|
|
35
|
+
- 制限・注意点
|
|
36
|
+
- ベンチマーク結果(記載があれば)
|
|
37
|
+
4. `research-wiki/wiki/papers/{Category}/{slug}.md` を更新する:
|
|
38
|
+
- ソースからの事実(evidence の要点)
|
|
39
|
+
- 現時点の解釈(手法の意義・他研究との関係)
|
|
40
|
+
- 関連ページ(既存の wiki ページへのリンク)
|
|
41
|
+
- 未解決の問い
|
|
42
|
+
|
|
43
|
+
### Step 4: インデックスを更新する
|
|
44
|
+
|
|
45
|
+
`research-wiki/wiki/index.md` と `research-wiki/index/recent.md` に追加した論文を記録する。
|
|
46
|
+
|
|
47
|
+
`research-wiki/log.md` に以下の形式で追記する:
|
|
48
|
+
|
|
49
|
+
```
|
|
50
|
+
## {今日の日付}
|
|
51
|
+
|
|
52
|
+
- **論文追加**: {タイトル} ({第一著者} et al., {年})
|
|
53
|
+
- `sources/{Category}/{slug}.md` 作成
|
|
54
|
+
- `evidence/{Category}/{slug}.md` 作成
|
|
55
|
+
- `wiki/papers/{Category}/{slug}.md` 作成
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## 設定変更
|
|
59
|
+
|
|
60
|
+
キーワードや出力先は `~/workspace/arxiv-ingest/config.yaml` を編集する。
|
|
61
|
+
|
|
62
|
+
## 注意
|
|
63
|
+
|
|
64
|
+
- `evidence/` と `wiki/` のテンプレートに `(Claude Code が論文を読んで記入)` と書いてある箇所をすべて実際の内容に書き換えること
|
|
65
|
+
- アブストラクトだけで十分に理解できる論文は PDF 取得を省略してよい
|
|
66
|
+
- 既存のファイルが存在する場合は上書きせず、差分を確認してから更新する
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
name: Daily arXiv Ingest
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
schedule:
|
|
5
|
+
- cron: '0 1 * * 1-5' # 月〜金 JST 10:00 (UTC 01:00)
|
|
6
|
+
workflow_dispatch:
|
|
7
|
+
inputs:
|
|
8
|
+
days_back:
|
|
9
|
+
description: 'Fetch papers from last N days'
|
|
10
|
+
default: '1'
|
|
11
|
+
required: false
|
|
12
|
+
|
|
13
|
+
jobs:
|
|
14
|
+
ingest:
|
|
15
|
+
runs-on: ubuntu-latest
|
|
16
|
+
permissions:
|
|
17
|
+
contents: write
|
|
18
|
+
|
|
19
|
+
steps:
|
|
20
|
+
- name: Checkout arxiv-ingest
|
|
21
|
+
uses: actions/checkout@v4
|
|
22
|
+
|
|
23
|
+
- name: Checkout research-wiki
|
|
24
|
+
uses: actions/checkout@v4
|
|
25
|
+
with:
|
|
26
|
+
repository: ${{ vars.WIKI_REPO }} # 例: yamadan96/research-wiki
|
|
27
|
+
token: ${{ secrets.GH_PAT }}
|
|
28
|
+
path: research-wiki
|
|
29
|
+
|
|
30
|
+
- name: Install uv
|
|
31
|
+
uses: astral-sh/setup-uv@v5
|
|
32
|
+
with:
|
|
33
|
+
enable-cache: true
|
|
34
|
+
|
|
35
|
+
- name: Install dependencies
|
|
36
|
+
run: uv sync
|
|
37
|
+
|
|
38
|
+
- name: Override days_back (manual trigger)
  if: github.event_name == 'workflow_dispatch'
  env:
    # The dispatch input is handed over as an environment variable rather
    # than being expanded into the script text, so its value is only ever
    # treated as data and can never be executed as Python code.
    DAYS_BACK: ${{ inputs.days_back }}
  run: |
    python - <<'EOF'
    import os
    import re
    import sys

    raw = os.environ.get("DAYS_BACK", "").strip()
    # Accept plain ASCII digits only; anything else is rejected up front.
    if not re.fullmatch(r"[0-9]+", raw) or int(raw) < 1:
        sys.exit(f"days_back must be a positive integer, got {raw!r}")

    path = "config.yaml"
    with open(path, encoding="utf-8") as fh:
        text = fh.read()

    # Rewrite the whole top-level `days_back:` line (trailing comment included).
    text, replaced = re.subn(
        r"^days_back:.*$", f"days_back: {int(raw)}", text, flags=re.MULTILINE
    )
    if not replaced:
        # Without this check the override would be silently ignored.
        sys.exit("no top-level 'days_back:' key found in config.yaml")

    with open(path, "w", encoding="utf-8") as fh:
        fh.write(text)
    EOF
|
|
48
|
+
|
|
49
|
+
- name: Fetch papers from arXiv
|
|
50
|
+
run: uv run python scripts/fetch.py
|
|
51
|
+
|
|
52
|
+
- name: Generate research-wiki skeletons
  env:
    # Checkout path of the wiki repository; read by the inline script below.
    OUTPUT_DIR: research-wiki
  run: |
    # Run inside the uv-managed environment so PyYAML (a project dependency
    # installed by `uv sync`) is guaranteed to be importable.
    uv run python - <<'EOF'
    import os

    import yaml

    with open("config.yaml", encoding="utf-8") as fh:
        cfg = yaml.safe_load(fh) or {}

    # Point the generator at the checked-out wiki instead of the default path.
    cfg["output_dir"] = os.environ["OUTPUT_DIR"]

    with open("config.yaml", "w", encoding="utf-8") as fh:
        yaml.safe_dump(cfg, fh, allow_unicode=True)
    EOF
    uv run python scripts/generate.py
|
|
63
|
+
|
|
64
|
+
- name: Commit new files to research-wiki
|
|
65
|
+
working-directory: research-wiki
|
|
66
|
+
run: |
|
|
67
|
+
git config user.name "arxiv-ingest[bot]"
|
|
68
|
+
git config user.email "arxiv-ingest@users.noreply.github.com"
|
|
69
|
+
git add -A
|
|
70
|
+
if git diff --staged --quiet; then
|
|
71
|
+
echo "No new papers to commit."
|
|
72
|
+
else
|
|
73
|
+
COUNT=$(git diff --staged --name-only | grep '^sources/' | wc -l | tr -d ' ')
|
|
74
|
+
git commit -m "auto: arXiv ingest $(date +%Y-%m-%d) [${COUNT} papers]"
|
|
75
|
+
git push
|
|
76
|
+
fi
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: arxiv-ingest
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Fetch recent arXiv papers and scaffold them into a structured research wiki
|
|
5
|
+
Project-URL: Homepage, https://github.com/yamadan96/arxiv-ingest
|
|
6
|
+
Project-URL: Repository, https://github.com/yamadan96/arxiv-ingest
|
|
7
|
+
License: MIT
|
|
8
|
+
Keywords: arxiv,knowledge-management,papers,research,wiki
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Science/Research
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
14
|
+
Requires-Python: >=3.11
|
|
15
|
+
Requires-Dist: arxiv>=4.0.0
|
|
16
|
+
Requires-Dist: pyyaml>=6.0.3
|
|
17
|
+
Requires-Dist: rich>=15.0.0
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
|
|
20
|
+
# arxiv-ingest
|
|
21
|
+
|
|
22
|
+
**Fetch recent arXiv papers by keyword and scaffold them into structured research notes.**
|
|
23
|
+
|
|
24
|
+
Works standalone as a CLI, or integrates with Claude Code for LLM-assisted evidence extraction and wiki generation.
|
|
25
|
+
|
|
26
|
+
## Features
|
|
27
|
+
|
|
28
|
+
- **Keyword-based collection** — list topics in `config.yaml`, no code needed
|
|
29
|
+
- **3-layer output** — `sources/` (metadata) · `evidence/` (claims) · `wiki/` (synthesis)
|
|
30
|
+
- **Safe re-runs** — already-filled files are never overwritten
|
|
31
|
+
- **GitHub Actions template** — auto-runs Mon–Fri, `workflow_dispatch` for on-demand runs
|
|
32
|
+
- **Any wiki** — point `output_dir` at any directory
|
|
33
|
+
|
|
34
|
+
## Quick start
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
# Install (requires Python 3.11+)
|
|
38
|
+
git clone https://github.com/yamadan96/arxiv-ingest
|
|
39
|
+
cd arxiv-ingest
|
|
40
|
+
uv sync # or: pip install -e .
|
|
41
|
+
|
|
42
|
+
# Configure
|
|
43
|
+
cp config.yaml.example config.yaml
|
|
44
|
+
# Edit config.yaml: set keywords, output_dir, and category mappings
|
|
45
|
+
|
|
46
|
+
# Fetch papers and generate skeletons
|
|
47
|
+
arxiv-ingest run # fetch + generate in one step
|
|
48
|
+
|
|
49
|
+
# Or step by step:
|
|
50
|
+
arxiv-ingest fetch # saves data/fetched.json
|
|
51
|
+
arxiv-ingest generate # creates skeleton files in output_dir
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Configuration (`config.yaml`)
|
|
55
|
+
|
|
56
|
+
```yaml
|
|
57
|
+
output_dir: "../research-wiki" # path to your wiki directory
|
|
58
|
+
max_results: 20 # papers per keyword
|
|
59
|
+
days_back: 7 # look back N days
|
|
60
|
+
|
|
61
|
+
keywords:
|
|
62
|
+
- "Vision-Language Model transformer"
|
|
63
|
+
- "LoRA PEFT fine-tuning language model"
|
|
64
|
+
- "LLM reasoning chain-of-thought"
|
|
65
|
+
|
|
66
|
+
# Only accept papers in these arXiv categories:
|
|
67
|
+
allowed_arxiv_categories: [cs.CV, cs.CL, cs.LG, cs.AI, cs.RO, stat.ML]
|
|
68
|
+
|
|
69
|
+
# Primary category must be in this list (prevents off-topic papers):
|
|
70
|
+
require_primary_in: [cs.CV, cs.CL, cs.AI, cs.RO, stat.ML]
|
|
71
|
+
|
|
72
|
+
# arXiv category → wiki folder name:
|
|
73
|
+
category_map:
|
|
74
|
+
cs.CV: "Multimodal"
|
|
75
|
+
cs.CL: "Post_Training"
|
|
76
|
+
cs.AI: "Reasoning"
|
|
77
|
+
cs.RO: "Physical_AI"
|
|
78
|
+
stat.ML: "Pretraining"
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
See `config.yaml.example` for the full reference with all options documented.
|
|
82
|
+
|
|
83
|
+
## Output format
|
|
84
|
+
|
|
85
|
+
Skeleton files are created at:
|
|
86
|
+
|
|
87
|
+
```
|
|
88
|
+
output_dir/
|
|
89
|
+
├── sources/{Category}/{slug}.md # Metadata (title, authors, abstract, URL)
|
|
90
|
+
├── evidence/{Category}/{slug}.md # Claims & benchmarks — fill this in
|
|
91
|
+
└── wiki/papers/{Category}/{slug}.md # Synthesis & interpretation — fill this in
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
`sources/` files are written once and never touched again.
|
|
95
|
+
`evidence/` and `wiki/` files are only (re-)created when they still contain the unfilled template placeholder, so your edits are safe across re-runs.
|
|
96
|
+
|
|
97
|
+
## Using with Claude Code
|
|
98
|
+
|
|
99
|
+
Copy the skill file so `/arxiv-ingest` is available as a Claude Code command:
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
cp .claude/commands/arxiv-ingest.md ~/.claude/commands/
|
|
103
|
+
# or into your project's .claude/commands/
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
After running `arxiv-ingest run`, invoke `/arxiv-ingest` in Claude Code.
|
|
107
|
+
Claude reads each paper's abstract (and PDF when available) and fills in the `evidence/` and `wiki/` files automatically.
|
|
108
|
+
|
|
109
|
+
## GitHub Actions: daily auto-ingest
|
|
110
|
+
|
|
111
|
+
### Setup
|
|
112
|
+
|
|
113
|
+
1. Fork or use this repo as a template
|
|
114
|
+
2. Create a GitHub fine-grained personal access token (PAT) with **Contents: Read and write** permission on your wiki repo
|
|
115
|
+
3. Add to your repo's **Settings → Secrets and variables → Actions**:
|
|
116
|
+
|
|
117
|
+
| Kind | Name | Value |
|
|
118
|
+
|------|------|-------|
|
|
119
|
+
| Secret | `GH_PAT` | your PAT |
|
|
120
|
+
| Variable | `WIKI_REPO` | `your-name/research-wiki` |
|
|
121
|
+
|
|
122
|
+
4. Go to **Actions → Daily arXiv Ingest → Run workflow** to verify
|
|
123
|
+
|
|
124
|
+
### Schedule
|
|
125
|
+
|
|
126
|
+
- **Auto**: Mon–Fri UTC 01:00 (JST 10:00) — `cron: '0 1 * * 1-5'`
|
|
127
|
+
- **Manual**: `workflow_dispatch` with optional `days_back` override
|
|
128
|
+
|
|
129
|
+
## Requirements
|
|
130
|
+
|
|
131
|
+
- Python 3.11+
|
|
132
|
+
- [uv](https://docs.astral.sh/uv/) (recommended) or pip
|
|
133
|
+
|
|
134
|
+
## License
|
|
135
|
+
|
|
136
|
+
MIT
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
# arxiv-ingest
|
|
2
|
+
|
|
3
|
+
**Fetch recent arXiv papers by keyword and scaffold them into structured research notes.**
|
|
4
|
+
|
|
5
|
+
Works standalone as a CLI, or integrates with Claude Code for LLM-assisted evidence extraction and wiki generation.
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
- **Keyword-based collection** — list topics in `config.yaml`, no code needed
|
|
10
|
+
- **3-layer output** — `sources/` (metadata) · `evidence/` (claims) · `wiki/` (synthesis)
|
|
11
|
+
- **Safe re-runs** — already-filled files are never overwritten
|
|
12
|
+
- **GitHub Actions template** — auto-runs Mon–Fri, `workflow_dispatch` for on-demand runs
|
|
13
|
+
- **Any wiki** — point `output_dir` at any directory
|
|
14
|
+
|
|
15
|
+
## Quick start
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
# Install (requires Python 3.11+)
|
|
19
|
+
git clone https://github.com/yamadan96/arxiv-ingest
|
|
20
|
+
cd arxiv-ingest
|
|
21
|
+
uv sync # or: pip install -e .
|
|
22
|
+
|
|
23
|
+
# Configure
|
|
24
|
+
cp config.yaml.example config.yaml
|
|
25
|
+
# Edit config.yaml: set keywords, output_dir, and category mappings
|
|
26
|
+
|
|
27
|
+
# Fetch papers and generate skeletons
|
|
28
|
+
arxiv-ingest run # fetch + generate in one step
|
|
29
|
+
|
|
30
|
+
# Or step by step:
|
|
31
|
+
arxiv-ingest fetch # saves data/fetched.json
|
|
32
|
+
arxiv-ingest generate # creates skeleton files in output_dir
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## Configuration (`config.yaml`)
|
|
36
|
+
|
|
37
|
+
```yaml
|
|
38
|
+
output_dir: "../research-wiki" # path to your wiki directory
|
|
39
|
+
max_results: 20 # papers per keyword
|
|
40
|
+
days_back: 7 # look back N days
|
|
41
|
+
|
|
42
|
+
keywords:
|
|
43
|
+
- "Vision-Language Model transformer"
|
|
44
|
+
- "LoRA PEFT fine-tuning language model"
|
|
45
|
+
- "LLM reasoning chain-of-thought"
|
|
46
|
+
|
|
47
|
+
# Only accept papers in these arXiv categories:
|
|
48
|
+
allowed_arxiv_categories: [cs.CV, cs.CL, cs.LG, cs.AI, cs.RO, stat.ML]
|
|
49
|
+
|
|
50
|
+
# Primary category must be in this list (prevents off-topic papers):
|
|
51
|
+
require_primary_in: [cs.CV, cs.CL, cs.AI, cs.RO, stat.ML]
|
|
52
|
+
|
|
53
|
+
# arXiv category → wiki folder name:
|
|
54
|
+
category_map:
|
|
55
|
+
cs.CV: "Multimodal"
|
|
56
|
+
cs.CL: "Post_Training"
|
|
57
|
+
cs.AI: "Reasoning"
|
|
58
|
+
cs.RO: "Physical_AI"
|
|
59
|
+
stat.ML: "Pretraining"
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
See `config.yaml.example` for the full reference with all options documented.
|
|
63
|
+
|
|
64
|
+
## Output format
|
|
65
|
+
|
|
66
|
+
Skeleton files are created at:
|
|
67
|
+
|
|
68
|
+
```
|
|
69
|
+
output_dir/
|
|
70
|
+
├── sources/{Category}/{slug}.md # Metadata (title, authors, abstract, URL)
|
|
71
|
+
├── evidence/{Category}/{slug}.md # Claims & benchmarks — fill this in
|
|
72
|
+
└── wiki/papers/{Category}/{slug}.md # Synthesis & interpretation — fill this in
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
`sources/` files are written once and never touched again.
|
|
76
|
+
`evidence/` and `wiki/` files are only (re-)created when they still contain the unfilled template placeholder, so your edits are safe across re-runs.
|
|
77
|
+
|
|
78
|
+
## Using with Claude Code
|
|
79
|
+
|
|
80
|
+
Copy the skill file so `/arxiv-ingest` is available as a Claude Code command:
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
cp .claude/commands/arxiv-ingest.md ~/.claude/commands/
|
|
84
|
+
# or into your project's .claude/commands/
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
After running `arxiv-ingest run`, invoke `/arxiv-ingest` in Claude Code.
|
|
88
|
+
Claude reads each paper's abstract (and PDF when available) and fills in the `evidence/` and `wiki/` files automatically.
|
|
89
|
+
|
|
90
|
+
## GitHub Actions: daily auto-ingest
|
|
91
|
+
|
|
92
|
+
### Setup
|
|
93
|
+
|
|
94
|
+
1. Fork or use this repo as a template
|
|
95
|
+
2. Create a GitHub fine-grained personal access token (PAT) with **Contents: Read and write** permission on your wiki repo
|
|
96
|
+
3. Add to your repo's **Settings → Secrets and variables → Actions**:
|
|
97
|
+
|
|
98
|
+
| Kind | Name | Value |
|
|
99
|
+
|------|------|-------|
|
|
100
|
+
| Secret | `GH_PAT` | your PAT |
|
|
101
|
+
| Variable | `WIKI_REPO` | `your-name/research-wiki` |
|
|
102
|
+
|
|
103
|
+
4. Go to **Actions → Daily arXiv Ingest → Run workflow** to verify
|
|
104
|
+
|
|
105
|
+
### Schedule
|
|
106
|
+
|
|
107
|
+
- **Auto**: Mon–Fri UTC 01:00 (JST 10:00) — `cron: '0 1 * * 1-5'`
|
|
108
|
+
- **Manual**: `workflow_dispatch` with optional `days_back` override
|
|
109
|
+
|
|
110
|
+
## Requirements
|
|
111
|
+
|
|
112
|
+
- Python 3.11+
|
|
113
|
+
- [uv](https://docs.astral.sh/uv/) (recommended) or pip
|
|
114
|
+
|
|
115
|
+
## License
|
|
116
|
+
|
|
117
|
+
MIT
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# arxiv-ingest configuration
|
|
2
|
+
# Edit keywords and output_dir to match your setup
|
|
3
|
+
|
|
4
|
+
output_dir: "../research-wiki" # path to your research-wiki directory
|
|
5
|
+
|
|
6
|
+
max_results: 20 # per keyword
|
|
7
|
+
days_back: 7 # fetch papers from the last N days
|
|
8
|
+
|
|
9
|
+
keywords:
|
|
10
|
+
- "vision language model transformer"
|
|
11
|
+
- "LoRA PEFT fine-tuning language model"
|
|
12
|
+
- "masked autoencoder self-supervised learning"
|
|
13
|
+
- "embodied AI robot learning manipulation"
|
|
14
|
+
- "large language model reasoning chain-of-thought"
|
|
15
|
+
- "multimodal foundation model"
|
|
16
|
+
|
|
17
|
+
# arXiv category filter — only accept papers in these categories
|
|
18
|
+
allowed_arxiv_categories:
|
|
19
|
+
- cs.CV
|
|
20
|
+
- cs.CL
|
|
21
|
+
- cs.LG
|
|
22
|
+
- cs.AI
|
|
23
|
+
- cs.RO
|
|
24
|
+
- stat.ML
|
|
25
|
+
|
|
26
|
+
# arXiv category → research-wiki category mapping
|
|
27
|
+
category_map:
|
|
28
|
+
cs.CV: "Multimodal"
|
|
29
|
+
cs.CL: "Post_Training"
|
|
30
|
+
cs.AI: "Reasoning"
|
|
31
|
+
cs.RO: "Physical_AI"
|
|
32
|
+
stat.ML: "Pretraining"
|
|
33
|
+
# cs.LG は幅広すぎるため除外 — キーワードマッチで拾う
|
|
34
|
+
|
|
35
|
+
# cs.LG 論文はキーワードが一致した場合のみ受け入れる
|
|
36
|
+
# allowed_arxiv_categories に cs.LG を残しつつ、
|
|
37
|
+
# primary category が cs.LG 単独の場合はスキップ
|
|
38
|
+
require_primary_in:
|
|
39
|
+
- cs.CV
|
|
40
|
+
- cs.CL
|
|
41
|
+
- cs.AI
|
|
42
|
+
- cs.RO
|
|
43
|
+
- stat.ML
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# arxiv-ingest configuration
|
|
2
|
+
# Copy this file to config.yaml and edit to match your setup.
|
|
3
|
+
#
|
|
4
|
+
# Quick start:
|
|
5
|
+
# cp config.yaml.example config.yaml
|
|
6
|
+
# arxiv-ingest run
|
|
7
|
+
|
|
8
|
+
# Path to your research wiki directory (relative to this config file).
|
|
9
|
+
# The tool creates sources/, evidence/, and wiki/papers/ subdirectories here.
|
|
10
|
+
output_dir: "../research-wiki"
|
|
11
|
+
|
|
12
|
+
# How many papers to fetch per keyword (arXiv API limit: 100)
|
|
13
|
+
max_results: 20
|
|
14
|
+
|
|
15
|
+
# Fetch papers published within the last N days
|
|
16
|
+
days_back: 7
|
|
17
|
+
|
|
18
|
+
# Keywords to search for on arXiv. Each keyword is queried independently.
|
|
19
|
+
# Use specific phrases for higher precision, e.g. "vision language model grounding"
|
|
20
|
+
# rather than just "vision".
|
|
21
|
+
keywords:
|
|
22
|
+
- "Vision-Language Model transformer"
|
|
23
|
+
- "LoRA PEFT fine-tuning language model"
|
|
24
|
+
- "masked autoencoder self-supervised learning"
|
|
25
|
+
- "embodied AI robot learning manipulation"
|
|
26
|
+
- "large language model reasoning chain-of-thought"
|
|
27
|
+
- "multimodal foundation model"
|
|
28
|
+
|
|
29
|
+
# Only accept papers whose arXiv categories include at least one of these.
|
|
30
|
+
# Remove this section to accept all categories.
|
|
31
|
+
allowed_arxiv_categories:
|
|
32
|
+
- cs.CV # Computer Vision
|
|
33
|
+
- cs.CL # Computation and Language (NLP)
|
|
34
|
+
- cs.LG # Machine Learning
|
|
35
|
+
- cs.AI # Artificial Intelligence
|
|
36
|
+
- cs.RO # Robotics
|
|
37
|
+
- stat.ML # Statistics - Machine Learning
|
|
38
|
+
|
|
39
|
+
# Only accept papers whose PRIMARY category is in this list.
|
|
40
|
+
# This prevents off-topic papers (e.g. pure math) from being included
|
|
41
|
+
# even if they mention your keywords.
|
|
42
|
+
# Remove or leave empty to skip this filter.
|
|
43
|
+
require_primary_in:
|
|
44
|
+
- cs.CV
|
|
45
|
+
- cs.CL
|
|
46
|
+
- cs.AI
|
|
47
|
+
- cs.RO
|
|
48
|
+
- stat.ML
|
|
49
|
+
|
|
50
|
+
# Map arXiv categories to your wiki's category folder names.
|
|
51
|
+
# Papers are placed in wiki/papers/{category}/ based on their primary arXiv category.
|
|
52
|
+
# Add or rename categories to match your wiki structure.
|
|
53
|
+
category_map:
|
|
54
|
+
cs.CV: "Multimodal"
|
|
55
|
+
cs.CL: "Post_Training"
|
|
56
|
+
cs.LG: "Architecture"
|
|
57
|
+
cs.AI: "Reasoning"
|
|
58
|
+
cs.RO: "Physical_AI"
|
|
59
|
+
stat.ML: "Pretraining"
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""arxiv-ingest CLI — fetch and scaffold arXiv papers into a research wiki."""
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def cmd_fetch(args: list[str]) -> None:
    """Run the fetch stage, presenting ``args`` as its command line.

    ``sys.argv`` is replaced with ``["fetch", *args]`` before the stage's
    ``main`` is invoked; it is not restored afterwards.

    Args:
        args: Arguments that followed the ``fetch`` sub-command.
    """
    # Imported inside the function, so the stage module is only loaded when
    # this command actually runs.
    import scripts.fetch as fetch_stage

    sys.argv = ["fetch", *args]
    fetch_stage.main()
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def cmd_generate(args: list[str]) -> None:
    """Run the generate stage.

    Args:
        args: Arguments that followed the ``generate`` sub-command. They are
            accepted so this handler can be called like the other command
            handlers, but they are not forwarded to the stage.
    """
    # Imported inside the function, so the stage module is only loaded when
    # this command actually runs.
    import scripts.generate as generate_stage

    generate_stage.main()
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def cmd_run(args: list[str]) -> None:
    """Fetch + generate in one step.

    Each stage is started with its own ``sys.argv`` so that the ``run``
    sub-command token of this CLI is never seen by a stage's ``main``.
    ``sys.argv`` is not restored afterwards.

    Args:
        args: Arguments that followed the ``run`` sub-command. They are
            presented to the fetch stage exactly as ``cmd_fetch`` would
            present them; the generate stage receives no extra arguments,
            matching ``cmd_generate``.
    """
    from scripts.fetch import main as fetch_main
    from scripts.generate import main as generate_main

    # Same argv shape that cmd_fetch sets up before calling the fetch stage.
    sys.argv = ["fetch", *args]
    fetch_main()

    # The generate stage is never given CLI arguments by this tool.
    sys.argv = ["generate"]
    generate_main()
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def print_help() -> None:
    """Write the usage summary for the ``arxiv-ingest`` CLI to stdout.

    The text lists the three sub-commands, the ``--help`` option and a short
    note about ``config.yaml``. It ends with one blank line.
    """
    help_lines = [
        "arxiv-ingest — fetch and scaffold arXiv papers into a research wiki",
        "",
        "Usage:",
        " arxiv-ingest fetch Fetch recent papers from arXiv (saves data/fetched.json)",
        " arxiv-ingest generate Generate skeleton files from fetched.json",
        " arxiv-ingest run fetch + generate in one step",
        "",
        "Options:",
        " --help Show this help message",
        "",
        "Configuration:",
        " Edit config.yaml to set keywords, output_dir, and category mappings.",
        " Copy config.yaml.example to config.yaml to get started.",
    ]
    # Every line is newline-terminated; print() then adds the final blank line.
    print("\n".join(help_lines) + "\n")
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def main() -> None:
    """Entry point of the CLI: dispatch ``sys.argv`` to a sub-command handler.

    With no arguments, or with ``--help`` / ``-h`` / ``help`` as the first
    argument, the usage text is printed and the function returns. An
    unrecognised first argument is reported on stderr, the usage text is
    printed, and the process exits with status 1. Otherwise the matching
    handler is called with the remaining arguments.
    """
    argv = sys.argv[1:]

    # Nothing to do, or help explicitly requested: show usage and stop.
    if len(argv) == 0 or argv[0] in ("--help", "-h", "help"):
        print_help()
        return

    name, *remaining = argv

    handlers = {
        "fetch": cmd_fetch,
        "generate": cmd_generate,
        "run": cmd_run,
    }

    handler = handlers.get(name)
    if handler is None:
        print(f"Unknown command: {name}", file=sys.stderr)
        print_help()
        sys.exit(1)

    handler(remaining)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
if __name__ == "__main__":
|
|
69
|
+
main()
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "arxiv-ingest"
|
|
3
|
+
version = "0.2.0"
|
|
4
|
+
description = "Fetch recent arXiv papers and scaffold them into a structured research wiki"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.11"
|
|
7
|
+
license = { text = "MIT" }
|
|
8
|
+
keywords = ["arxiv", "research", "papers", "wiki", "knowledge-management"]
|
|
9
|
+
classifiers = [
|
|
10
|
+
"Development Status :: 3 - Alpha",
|
|
11
|
+
"Intended Audience :: Science/Research",
|
|
12
|
+
"License :: OSI Approved :: MIT License",
|
|
13
|
+
"Programming Language :: Python :: 3",
|
|
14
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
15
|
+
]
|
|
16
|
+
dependencies = [
|
|
17
|
+
"arxiv>=4.0.0",
|
|
18
|
+
"pyyaml>=6.0.3",
|
|
19
|
+
"rich>=15.0.0",
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
[project.scripts]
|
|
23
|
+
arxiv-ingest = "main:main"
|
|
24
|
+
|
|
25
|
+
[project.urls]
|
|
26
|
+
Homepage = "https://github.com/yamadan96/arxiv-ingest"
|
|
27
|
+
Repository = "https://github.com/yamadan96/arxiv-ingest"
|
|
28
|
+
|
|
29
|
+
[build-system]
|
|
30
|
+
requires = ["hatchling"]
|
|
31
|
+
build-backend = "hatchling.build"
|
|
32
|
+
|
|
33
|
+
[tool.hatch.build.targets.wheel]
|
|
34
|
+
packages = ["."]
|
|
35
|
+
include = ["main.py", "scripts/*.py"]
|
|
36
|
+
|
|
37
|
+
[dependency-groups]
|
|
38
|
+
dev = [
|
|
39
|
+
"pytest>=9.1.0",
|
|
40
|
+
]
|
|
File without changes
|