notations-cli 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- notations_cli-0.1.0/.github/workflows/publish.yml +28 -0
- notations_cli-0.1.0/.github/workflows/tests.yml +31 -0
- notations_cli-0.1.0/.gitignore +15 -0
- notations_cli-0.1.0/LICENSE +21 -0
- notations_cli-0.1.0/PKG-INFO +108 -0
- notations_cli-0.1.0/README.md +84 -0
- notations_cli-0.1.0/logo.png +0 -0
- notations_cli-0.1.0/notations/__init__.py +3 -0
- notations_cli-0.1.0/notations/cli.py +185 -0
- notations_cli-0.1.0/notations/extractor.py +176 -0
- notations_cli-0.1.0/notations/renderer.py +112 -0
- notations_cli-0.1.0/notations/template.html +219 -0
- notations_cli-0.1.0/pyproject.toml +39 -0
- notations_cli-0.1.0/tests/__init__.py +0 -0
- notations_cli-0.1.0/tests/test_extractor.py +59 -0
- notations_cli-0.1.0/tests/test_renderer.py +76 -0
- notations_cli-0.1.0/uv.lock +673 -0
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
|
|
7
|
+
permissions:
|
|
8
|
+
id-token: write
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
publish:
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
environment: pypi
|
|
14
|
+
steps:
|
|
15
|
+
- uses: actions/checkout@v4
|
|
16
|
+
|
|
17
|
+
- uses: actions/setup-python@v5
|
|
18
|
+
with:
|
|
19
|
+
python-version: "3.12"
|
|
20
|
+
|
|
21
|
+
- name: Install build dependencies
|
|
22
|
+
run: pip install build
|
|
23
|
+
|
|
24
|
+
- name: Build package
|
|
25
|
+
run: python -m build
|
|
26
|
+
|
|
27
|
+
- name: Publish to PyPI
|
|
28
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
name: Tests
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
|
|
13
|
+
strategy:
|
|
14
|
+
matrix:
|
|
15
|
+
python-version: ["3.10", "3.11", "3.12"]
|
|
16
|
+
|
|
17
|
+
steps:
|
|
18
|
+
- uses: actions/checkout@v4
|
|
19
|
+
|
|
20
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
21
|
+
uses: actions/setup-python@v5
|
|
22
|
+
with:
|
|
23
|
+
python-version: ${{ matrix.python-version }}
|
|
24
|
+
|
|
25
|
+
- name: Install dependencies
|
|
26
|
+
run: |
|
|
27
|
+
python -m pip install --upgrade pip
|
|
28
|
+
pip install .[test]
|
|
29
|
+
|
|
30
|
+
- name: Run tests
|
|
31
|
+
run: pytest
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Takashi Ishida
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: notations-cli
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Extract notation tables from arXiv papers using LLMs
|
|
5
|
+
Project-URL: Homepage, https://github.com/takashiishida/notations-cli
|
|
6
|
+
Project-URL: Repository, https://github.com/takashiishida/notations-cli
|
|
7
|
+
Author: Takashi Ishida
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Keywords: arxiv,latex,llm,notation
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering
|
|
17
|
+
Requires-Python: >=3.10
|
|
18
|
+
Requires-Dist: arxiv-to-prompt>=0.11.0
|
|
19
|
+
Requires-Dist: click>=8.0.0
|
|
20
|
+
Requires-Dist: openai>=1.99.2
|
|
21
|
+
Provides-Extra: test
|
|
22
|
+
Requires-Dist: pytest; extra == 'test'
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
|
|
25
|
+
<p align="center">
|
|
26
|
+
<img src="logo.png" width="200" />
|
|
27
|
+
</p>
|
|
28
|
+
|
|
29
|
+
<h1 align="center">notations-cli</h1>
|
|
30
|
+
|
|
31
|
+
<p align="center">
|
|
32
|
+
<a href="https://github.com/takashiishida/notations-cli/actions/workflows/tests.yml"><img src="https://github.com/takashiishida/notations-cli/actions/workflows/tests.yml/badge.svg" alt="Tests"></a>
|
|
33
|
+
<a href="https://pypi.org/project/notations-cli/"><img src="https://img.shields.io/pypi/v/notations-cli?color=blue" alt="PyPI"></a>
|
|
34
|
+
<a href="https://pepy.tech/project/notations-cli"><img src="https://img.shields.io/pepy/dt/notations-cli?color=orange" alt="Downloads"></a>
|
|
35
|
+
<a href="https://github.com/takashiishida/notations-cli/blob/main/LICENSE"><img src="https://img.shields.io/github/license/takashiishida/notations-cli?color=green" alt="License"></a>
|
|
36
|
+
</p>
|
|
37
|
+
|
|
38
|
+
Extract notation tables from arXiv papers using LLMs.
|
|
39
|
+
|
|
40
|
+
## Installation
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
pip install notations-cli
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
Or with uv:
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
uv add notations-cli
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Usage
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
# From arXiv ID
|
|
56
|
+
notations 2006.11239
|
|
57
|
+
|
|
58
|
+
# From arXiv URL
|
|
59
|
+
notations https://arxiv.org/abs/2006.11239
|
|
60
|
+
|
|
61
|
+
# From local .tex files
|
|
62
|
+
notations /path/to/tex/folder
|
|
63
|
+
|
|
64
|
+
# With options
|
|
65
|
+
notations 2006.11239 --model gpt-5.2-2025-12-11 --provider openai
|
|
66
|
+
notations 2006.11239 --model anthropic/claude-sonnet-4.5 --provider openrouter
|
|
67
|
+
notations 2006.11239 --output my_paper # custom base name → .json/.html/.md
|
|
68
|
+
notations 2006.11239 --terminal # also print table to terminal
|
|
69
|
+
notations 2006.11239 --no-comments # strip LaTeX comments first
|
|
70
|
+
notations 2006.11239 --no-expand-macros # disable macro expansion
|
|
71
|
+
notations 2006.11239 --no-filter-body # keep all extracted notations
|
|
72
|
+
|
|
73
|
+
# Re-render from existing JSON (no LLM call)
|
|
74
|
+
notations 2006_11239_notations.json
|
|
75
|
+
notations 2006_11239_notations.json -t # also print to terminal
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
By default, LaTeX macro definitions (`\newcommand`, etc.) are expanded inline before processing. Use `--no-expand-macros` to disable this.
|
|
79
|
+
|
|
80
|
+
Notations are also filtered by default to only include symbols that appear in the document body (`\begin{document}...\end{document}`), removing artifacts from preamble-only macro definitions. Use `--no-filter-body` to disable this.
|
|
81
|
+
|
|
82
|
+
## Configuration
|
|
83
|
+
|
|
84
|
+
Set your API key as an environment variable:
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
# For OpenAI
|
|
88
|
+
export OPENAI_API_KEY=...
|
|
89
|
+
|
|
90
|
+
# For OpenRouter
|
|
91
|
+
export OPENROUTER_API_KEY=...
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## Output
|
|
95
|
+
|
|
96
|
+
Generates a self-contained HTML file with:
|
|
97
|
+
- Paper metadata (title, authors, arXiv link)
|
|
98
|
+
- Searchable notation table
|
|
99
|
+
- LaTeX rendering via KaTeX
|
|
100
|
+
|
|
101
|
+
## Development
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
git clone https://github.com/takashiishida/notations-cli.git
|
|
105
|
+
cd notations-cli
|
|
106
|
+
uv sync
|
|
107
|
+
uv run pytest
|
|
108
|
+
```
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="logo.png" width="200" />
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
<h1 align="center">notations-cli</h1>
|
|
6
|
+
|
|
7
|
+
<p align="center">
|
|
8
|
+
<a href="https://github.com/takashiishida/notations-cli/actions/workflows/tests.yml"><img src="https://github.com/takashiishida/notations-cli/actions/workflows/tests.yml/badge.svg" alt="Tests"></a>
|
|
9
|
+
<a href="https://pypi.org/project/notations-cli/"><img src="https://img.shields.io/pypi/v/notations-cli?color=blue" alt="PyPI"></a>
|
|
10
|
+
<a href="https://pepy.tech/project/notations-cli"><img src="https://img.shields.io/pepy/dt/notations-cli?color=orange" alt="Downloads"></a>
|
|
11
|
+
<a href="https://github.com/takashiishida/notations-cli/blob/main/LICENSE"><img src="https://img.shields.io/github/license/takashiishida/notations-cli?color=green" alt="License"></a>
|
|
12
|
+
</p>
|
|
13
|
+
|
|
14
|
+
Extract notation tables from arXiv papers using LLMs.
|
|
15
|
+
|
|
16
|
+
## Installation
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install notations-cli
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
Or with uv:
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
uv add notations-cli
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## Usage
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
# From arXiv ID
|
|
32
|
+
notations 2006.11239
|
|
33
|
+
|
|
34
|
+
# From arXiv URL
|
|
35
|
+
notations https://arxiv.org/abs/2006.11239
|
|
36
|
+
|
|
37
|
+
# From local .tex files
|
|
38
|
+
notations /path/to/tex/folder
|
|
39
|
+
|
|
40
|
+
# With options
|
|
41
|
+
notations 2006.11239 --model gpt-5.2-2025-12-11 --provider openai
|
|
42
|
+
notations 2006.11239 --model anthropic/claude-sonnet-4.5 --provider openrouter
|
|
43
|
+
notations 2006.11239 --output my_paper # custom base name → .json/.html/.md
|
|
44
|
+
notations 2006.11239 --terminal # also print table to terminal
|
|
45
|
+
notations 2006.11239 --no-comments # strip LaTeX comments first
|
|
46
|
+
notations 2006.11239 --no-expand-macros # disable macro expansion
|
|
47
|
+
notations 2006.11239 --no-filter-body # keep all extracted notations
|
|
48
|
+
|
|
49
|
+
# Re-render from existing JSON (no LLM call)
|
|
50
|
+
notations 2006_11239_notations.json
|
|
51
|
+
notations 2006_11239_notations.json -t # also print to terminal
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
By default, LaTeX macro definitions (`\newcommand`, etc.) are expanded inline before processing. Use `--no-expand-macros` to disable this.
|
|
55
|
+
|
|
56
|
+
Notations are also filtered by default to only include symbols that appear in the document body (`\begin{document}...\end{document}`), removing artifacts from preamble-only macro definitions. Use `--no-filter-body` to disable this.
|
|
57
|
+
|
|
58
|
+
## Configuration
|
|
59
|
+
|
|
60
|
+
Set your API key as an environment variable:
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
# For OpenAI
|
|
64
|
+
export OPENAI_API_KEY=...
|
|
65
|
+
|
|
66
|
+
# For OpenRouter
|
|
67
|
+
export OPENROUTER_API_KEY=...
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## Output
|
|
71
|
+
|
|
72
|
+
Generates a self-contained HTML file with:
|
|
73
|
+
- Paper metadata (title, authors, arXiv link)
|
|
74
|
+
- Searchable notation table
|
|
75
|
+
- LaTeX rendering via KaTeX
|
|
76
|
+
|
|
77
|
+
## Development
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
git clone https://github.com/takashiishida/notations-cli.git
|
|
81
|
+
cd notations-cli
|
|
82
|
+
uv sync
|
|
83
|
+
uv run pytest
|
|
84
|
+
```
|
|
Binary file
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
"""Command-line interface for notations."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
import sys
|
|
6
|
+
|
|
7
|
+
import click
|
|
8
|
+
from arxiv_to_prompt import process_latex_source
|
|
9
|
+
|
|
10
|
+
from .extractor import SUPPORTED_OPENAI_MODELS, extract_notations, filter_notations_by_body, get_paper_metadata
|
|
11
|
+
from .renderer import render_html, render_json, render_markdown, render_terminal
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@click.command(context_settings={"help_option_names": ["-h", "--help"]})
@click.argument("source")
@click.option(
    "--model",
    "-m",
    default="gpt-5.2",
    help=f"Model to use for extraction (default: gpt-5.2). OpenAI: {', '.join(SUPPORTED_OPENAI_MODELS)}. OpenRouter: any model.",
)
@click.option(
    "--provider",
    "-p",
    type=click.Choice(["openai", "openrouter"]),
    default="openai",
    help="API provider (default: openai)",
)
@click.option(
    "--terminal",
    "-t",
    is_flag=True,
    help="Also print notation table to terminal",
)
@click.option(
    "--output",
    "-o",
    type=click.Path(),
    help="Base name for output files (default: <arxiv_id>_notations). Produces .json, .html, .md.",
)
@click.option(
    "--no-comments",
    is_flag=True,
    help="Remove comments from LaTeX source before processing",
)
@click.option(
    "--no-expand-macros",
    is_flag=True,
    help="Disable macro expansion (\\newcommand etc.) in LaTeX source",
)
@click.option(
    "--no-filter-body",
    is_flag=True,
    help="Disable filtering notations by document body presence",
)
@click.option(
    "--no-cache",
    is_flag=True,
    help="Disable caching of downloaded LaTeX source (re-download from arXiv)",
)
def main(source: str, model: str, provider: str, terminal: bool, output: str | None, no_comments: bool, no_expand_macros: bool, no_filter_body: bool, no_cache: bool) -> None:
    """Extract notation table from an arXiv paper.

    SOURCE can be:

    \b
    - arXiv ID (e.g., 2006.11239)
    - arXiv URL (e.g., https://arxiv.org/abs/2006.11239)
    - Local folder path containing .tex files
    - A .json file from a previous extraction (skips LLM call)
    """
    # Check if source is a JSON file (re-render mode).
    # In this mode no LaTeX is fetched and no LLM call is made: the
    # previously extracted notations/metadata are re-rendered directly.
    if source.endswith(".json") and os.path.isfile(source):
        click.echo(f"Loading notations from {source}...", err=True)
        try:
            with open(source, encoding="utf-8") as f:
                data = json.load(f)
            notations = data["notations"]
            metadata = data["metadata"]
        except (json.JSONDecodeError, KeyError) as e:
            # Exit code 1 on malformed JSON or missing keys; all progress
            # messages go to stderr so stdout stays clean.
            click.echo(f"Error loading JSON: {e}", err=True)
            sys.exit(1)

        click.echo(f"Loaded {len(notations)} notations", err=True)

        # Derive base name from JSON filename
        base = output or os.path.splitext(source)[0]
    else:
        # Determine if source is local or arxiv.
        # Heuristic: a path separator with no "arxiv.org" means a local
        # .tex folder.  NOTE(review): old-style arXiv IDs like
        # "math/0211159" also contain "/" and would be treated as local
        # paths here — confirm whether those need to be supported.
        is_local = "/" in source and "arxiv.org" not in source

        # Extract arxiv_id for metadata and default filename
        if is_local:
            arxiv_id = None
            metadata = {
                "title": "Local Paper",
                "authors": [],
                "arxiv_id": "local",
            }
        else:
            # Clean up arxiv_id: strip abs/pdf URL prefixes and a .pdf suffix
            # so the rest of the pipeline sees a bare ID.
            arxiv_id = source
            arxiv_id = arxiv_id.replace("https://arxiv.org/abs/", "")
            arxiv_id = arxiv_id.replace("https://arxiv.org/pdf/", "")
            arxiv_id = arxiv_id.replace(".pdf", "")

            click.echo(f"Fetching metadata for arXiv:{arxiv_id}...", err=True)
            metadata = get_paper_metadata(arxiv_id)
            click.echo(f"Title: {metadata['title']}", err=True)

        # Get LaTeX source (message varies with cache settings).
        if is_local:
            click.echo("Fetching LaTeX source...", err=True)
        elif no_cache:
            click.echo("Fetching LaTeX source (cache disabled)...", err=True)
        else:
            click.echo("Fetching LaTeX source (using cache if available)...", err=True)
        try:
            if is_local:
                latex_source = process_latex_source(
                    local_folder=source,
                    keep_comments=not no_comments,
                    expand_macros_flag=not no_expand_macros,
                )
            else:
                latex_source = process_latex_source(
                    arxiv_id=arxiv_id,
                    keep_comments=not no_comments,
                    expand_macros_flag=not no_expand_macros,
                    use_cache=not no_cache,
                )
        except Exception as e:
            click.echo(f"Error fetching LaTeX source: {e}", err=True)
            sys.exit(1)

        click.echo(f"LaTeX source: {len(latex_source)} characters", err=True)

        # Extract notations using LLM
        click.echo(f"Extracting notations using {model} via {provider}...", err=True)
        try:
            notations = extract_notations(
                latex_source=latex_source,
                model=model,
                provider=provider,
            )
        except Exception as e:
            click.echo(f"Error extracting notations: {e}", err=True)
            sys.exit(1)

        click.echo(f"Found {len(notations)} notations", err=True)

        # Filter notations by document body presence (drops symbols that only
        # occurred in the preamble, e.g. inside expanded macro definitions).
        if not no_filter_body:
            before = len(notations)
            notations = filter_notations_by_body(notations, latex_source)
            click.echo(f"Filtered to {len(notations)}/{before} notations (body presence)", err=True)

        # Derive base name: dots/slashes in the ID are not filename-friendly.
        base_id = arxiv_id if arxiv_id else "local"
        base_id = base_id.replace("/", "_").replace(".", "_")
        base = output or f"{base_id}_notations"

    # Write all three output files (both fresh-extraction and re-render modes
    # reach this point).
    json_path = f"{base}.json"
    html_path = f"{base}.html"
    md_path = f"{base}.md"

    with open(json_path, "w", encoding="utf-8") as f:
        f.write(render_json(notations, metadata))
    click.echo(f"Written to {json_path}", err=True)

    with open(html_path, "w", encoding="utf-8") as f:
        f.write(render_html(notations, metadata))
    click.echo(f"Written to {html_path}", err=True)

    with open(md_path, "w", encoding="utf-8") as f:
        f.write(render_markdown(notations, metadata))
    click.echo(f"Written to {md_path}", err=True)

    if terminal:
        render_terminal(notations, metadata)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
"""LLM-based notation extraction from LaTeX source."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
from openai import OpenAI
|
|
6
|
+
|
|
7
|
+
# Supported OpenAI models for notation extraction
# (validated in extract_notations when provider == "openai"; OpenRouter
# accepts any model string and is not checked against this list).
SUPPORTED_OPENAI_MODELS = [
    "gpt-5.2",
    "gpt-5.1",
    "gpt-5-mini",
    "gpt-5-nano",
]

# System prompt sent verbatim to the model (as "instructions" for the
# Responses API, or the system message for Chat Completions).  The model is
# asked to reply with a bare JSON array, which extract_notations parses.
SYSTEM_PROMPT = """You are a notation extractor for academic papers. Given a LaTeX source, extract all mathematical notations used in the paper.

For each notation, provide:
- symbol: The LaTeX representation (e.g., "\\theta", "\\mathcal{D}", "x_i")
- description: A concise description of what it represents

Output a JSON array of objects with "symbol" and "description" fields.

Rules:
1. Include all meaningful mathematical symbols, variables, and operators defined or used in the paper
2. Do not include standard LaTeX commands that aren't paper-specific notations
3. For indexed variables like x_1, x_2, ..., x_n, just include the general form x_i or x_n
4. Keep descriptions concise but informative
5. If a symbol is used with different meanings in different contexts, create separate entries
6. In descriptions, use $...$ for inline math (e.g., "Model parameters with dimension $d$")

Output ONLY the JSON array, no other text."""
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def extract_notations(
    latex_source: str,
    model: str = "gpt-5.2",
    provider: str = "openai",
) -> list[dict]:
    """Extract notations from LaTeX source using an LLM.

    Args:
        latex_source: The LaTeX source code of the paper
        model: The model to use for extraction.  For the "openai" provider
            it must be one of SUPPORTED_OPENAI_MODELS (the previous default,
            "gpt-4o", was not in that list, so the default call always
            raised; the default now matches the CLI's "gpt-5.2").
        provider: The API provider ("openai" or "openrouter")

    Returns:
        List of notation dicts with "symbol" and "description" keys

    Raises:
        ValueError: For an unsupported OpenAI model, an unknown provider,
            or a missing API key environment variable.
        json.JSONDecodeError: If the model's reply is not valid JSON.
    """
    if provider == "openai":
        if model not in SUPPORTED_OPENAI_MODELS:
            supported = ", ".join(SUPPORTED_OPENAI_MODELS)
            raise ValueError(f"Unsupported OpenAI model: {model}. Supported: {supported}")
        base_url = None
        api_key = os.environ.get("OPENAI_API_KEY")
    elif provider == "openrouter":
        base_url = "https://openrouter.ai/api/v1"
        api_key = os.environ.get("OPENROUTER_API_KEY")
    else:
        raise ValueError(f"Unknown provider: {provider}")

    if not api_key:
        env_var = "OPENAI_API_KEY" if provider == "openai" else "OPENROUTER_API_KEY"
        # The CLI has no --api-key option, so point users at the env var only.
        raise ValueError(f"No API key provided. Set the {env_var} environment variable")

    client = OpenAI(api_key=api_key, base_url=base_url)

    # Check if it's a reasoning model (gpt-5 series, o1, o3, o4); those use
    # the Responses API and do not accept a temperature parameter.
    is_reasoning_model = any(x in model for x in ["gpt-5", "o1", "o3", "o4"])

    if is_reasoning_model:
        # Use Responses API for reasoning models
        response = client.responses.create(
            model=model,
            instructions=SYSTEM_PROMPT,
            input=latex_source,
            reasoning={"effort": "low"},
        )
        content = response.output_text.strip()
    else:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": latex_source},
            ],
            # Deterministic output for reproducible extractions.
            temperature=0,
        )
        content = response.choices[0].message.content.strip()

    # Handle markdown code blocks if present.  Drop the opening fence line
    # (possibly "```json"), and drop the closing fence only if one actually
    # exists — the old code unconditionally removed the last line, which
    # destroyed data when the model omitted the closing fence.
    if content.startswith("```"):
        lines = content.splitlines()
        if len(lines) > 1 and lines[-1].strip().startswith("```"):
            lines = lines[:-1]
        content = "\n".join(lines[1:])

    return json.loads(content)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def filter_notations_by_body(
    notations: list[dict], latex_source: str
) -> list[dict]:
    """Filter notations to only those whose symbol appears in the document body.

    This removes notations extracted from preamble-only macro definitions
    that were expanded away and don't appear in the actual document.

    Args:
        notations: List of notation dicts with "symbol" key
        latex_source: Full LaTeX source including preamble

    Returns:
        Filtered list of notations
    """
    import re

    begin_match = re.search(r"\\begin\{document\}", latex_source)
    end_match = re.search(r"\\end\{document\}", latex_source)
    if begin_match is None or end_match is None:
        # Without document delimiters we cannot separate preamble from body,
        # so keep the full list untouched.
        return notations

    body_text = latex_source[begin_match.end() : end_match.start()]

    def _bare_symbol(entry: dict) -> str:
        """Return the entry's symbol with any surrounding $...$ removed."""
        sym = entry.get("symbol", "")
        if sym.startswith("$") and sym.endswith("$"):
            sym = sym[1:-1]
        return sym

    # Keep an entry only when its (unwrapped, non-empty) symbol occurs
    # verbatim somewhere between \begin{document} and \end{document}.
    return [n for n in notations if (sym := _bare_symbol(n)) and sym in body_text]
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def get_paper_metadata(arxiv_id: str) -> dict:
    """Fetch basic metadata for an arXiv paper from the arXiv Atom API.

    Args:
        arxiv_id: The arXiv ID (e.g., "2006.11239"); abs/pdf URLs and a
            trailing ".pdf" are also accepted and normalized away.

    Returns:
        Dict with "title", "authors", and "arxiv_id" keys.  Returns
        "Unknown"/empty placeholders when the API has no matching entry.

    Raises:
        urllib.error.URLError: On network failure or timeout.
    """
    import urllib.request
    import xml.etree.ElementTree as ET

    # Clean up arxiv_id (same normalization the CLI applies).
    arxiv_id = arxiv_id.replace("https://arxiv.org/abs/", "")
    arxiv_id = arxiv_id.replace("https://arxiv.org/pdf/", "")
    arxiv_id = arxiv_id.replace(".pdf", "")

    # Use HTTPS (the plain-http endpoint just redirects there) and bound the
    # wait so a stalled connection cannot hang the CLI indefinitely.
    url = f"https://export.arxiv.org/api/query?id_list={arxiv_id}"

    with urllib.request.urlopen(url, timeout=30) as response:
        xml_data = response.read()

    root = ET.fromstring(xml_data)
    ns = {"atom": "http://www.w3.org/2005/Atom"}

    entry = root.find("atom:entry", ns)
    if entry is None:
        return {"title": "Unknown", "authors": [], "arxiv_id": arxiv_id}

    title = entry.find("atom:title", ns)
    # Guard both a missing <title> element and one with empty text; the Atom
    # feed wraps long titles, so collapse internal newlines.
    if title is not None and title.text:
        title_text = title.text.strip().replace("\n", " ")
    else:
        title_text = "Unknown"

    authors = entry.findall("atom:author", ns)
    author_names = []
    for author in authors:
        name = author.find("atom:name", ns)
        if name is not None and name.text:
            author_names.append(name.text)

    return {
        "title": title_text,
        "authors": author_names,
        "arxiv_id": arxiv_id,
    }