notations-cli 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,28 @@
1
# GitHub Actions workflow: build and publish the package to PyPI when a
# GitHub release is published.
name: Publish to PyPI

on:
  release:
    types: [published]

# id-token: write lets the job mint an OIDC token; pypa/gh-action-pypi-publish
# uses this for PyPI "trusted publishing", so no API-token secret is configured.
permissions:
  id-token: write

jobs:
  publish:
    runs-on: ubuntu-latest
    # Deployment environment named "pypi" (can carry protection rules).
    environment: pypi
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install build dependencies
        run: pip install build

      # Builds sdist and wheel into dist/, which the publish action uploads.
      - name: Build package
        run: python -m build

      - name: Publish to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,31 @@
1
# GitHub Actions workflow: run the pytest suite on pushes and pull requests
# targeting main, across all supported Python versions.
name: Tests

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:
  test:
    runs-on: ubuntu-latest

    # One job per Python version; matches Requires-Python >=3.10 in metadata.
    strategy:
      matrix:
        python-version: ["3.10", "3.11", "3.12"]

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      # Installs the package plus its [test] extra (pytest).
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install .[test]

      - name: Run tests
        run: pytest
@@ -0,0 +1,15 @@
1
+ # Generated output
2
+ *_notations.html
3
+ *_notations.json
4
+ *_notations.md
5
+
6
+ # Virtual environment
7
+ .venv/
8
+
9
+ # Python
10
+ __pycache__/
11
+ *.pyc
12
+ *.egg-info/
13
+ dist/
14
+ build/
15
+ .python-version
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Takashi Ishida
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,108 @@
1
+ Metadata-Version: 2.4
2
+ Name: notations-cli
3
+ Version: 0.1.0
4
+ Summary: Extract notation tables from arXiv papers using LLMs
5
+ Project-URL: Homepage, https://github.com/takashiishida/notations-cli
6
+ Project-URL: Repository, https://github.com/takashiishida/notations-cli
7
+ Author: Takashi Ishida
8
+ License-Expression: MIT
9
+ License-File: LICENSE
10
+ Keywords: arxiv,latex,llm,notation
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Topic :: Scientific/Engineering
17
+ Requires-Python: >=3.10
18
+ Requires-Dist: arxiv-to-prompt>=0.11.0
19
+ Requires-Dist: click>=8.0.0
20
+ Requires-Dist: openai>=1.99.2
21
+ Provides-Extra: test
22
+ Requires-Dist: pytest; extra == 'test'
23
+ Description-Content-Type: text/markdown
24
+
25
+ <p align="center">
26
+ <img src="logo.png" width="200" />
27
+ </p>
28
+
29
+ <h1 align="center">notations-cli</h1>
30
+
31
+ <p align="center">
32
+ <a href="https://github.com/takashiishida/notations-cli/actions/workflows/tests.yml"><img src="https://github.com/takashiishida/notations-cli/actions/workflows/tests.yml/badge.svg" alt="Tests"></a>
33
+ <a href="https://pypi.org/project/notations-cli/"><img src="https://img.shields.io/pypi/v/notations-cli?color=blue" alt="PyPI"></a>
34
+ <a href="https://pepy.tech/project/notations-cli"><img src="https://img.shields.io/pepy/dt/notations-cli?color=orange" alt="Downloads"></a>
35
+ <a href="https://github.com/takashiishida/notations-cli/blob/main/LICENSE"><img src="https://img.shields.io/github/license/takashiishida/notations-cli?color=green" alt="License"></a>
36
+ </p>
37
+
38
+ Extract notation tables from arXiv papers using LLMs.
39
+
40
+ ## Installation
41
+
42
+ ```bash
43
+ pip install notations-cli
44
+ ```
45
+
46
+ Or with uv:
47
+
48
+ ```bash
49
+ uv add notations-cli
50
+ ```
51
+
52
+ ## Usage
53
+
54
+ ```bash
55
+ # From arXiv ID
56
+ notations 2006.11239
57
+
58
+ # From arXiv URL
59
+ notations https://arxiv.org/abs/2006.11239
60
+
61
+ # From local .tex files
62
+ notations /path/to/tex/folder
63
+
64
+ # With options
65
+ notations 2006.11239 --model gpt-5.2-2025-12-11 --provider openai
66
+ notations 2006.11239 --model anthropic/claude-sonnet-4.5 --provider openrouter
67
+ notations 2006.11239 --output my_paper # custom base name → .json/.html/.md
68
+ notations 2006.11239 --terminal # also print table to terminal
69
+ notations 2006.11239 --no-comments # strip LaTeX comments first
70
+ notations 2006.11239 --no-expand-macros # disable macro expansion
71
+ notations 2006.11239 --no-filter-body # keep all extracted notations
72
+
73
+ # Re-render from existing JSON (no LLM call)
74
+ notations 2006_11239_notations.json
75
+ notations 2006_11239_notations.json -t # also print to terminal
76
+ ```
77
+
78
+ By default, LaTeX macro definitions (`\newcommand`, etc.) are expanded inline before processing. Use `--no-expand-macros` to disable this.
79
+
80
+ Notations are also filtered by default to only include symbols that appear in the document body (`\begin{document}...\end{document}`), removing artifacts from preamble-only macro definitions. Use `--no-filter-body` to disable this.
81
+
82
+ ## Configuration
83
+
84
+ Set your API key as an environment variable:
85
+
86
+ ```bash
87
+ # For OpenAI
88
+ export OPENAI_API_KEY=...
89
+
90
+ # For OpenRouter
91
+ export OPENROUTER_API_KEY=...
92
+ ```
93
+
94
+ ## Output
95
+
96
+ Generates a self-contained HTML file with:
97
+ - Paper metadata (title, authors, arXiv link)
98
+ - Searchable notation table
99
+ - LaTeX rendering via KaTeX
100
+
101
+ ## Development
102
+
103
+ ```bash
104
+ git clone https://github.com/takashiishida/notations-cli.git
105
+ cd notations-cli
106
+ uv sync
107
+ uv run pytest
108
+ ```
@@ -0,0 +1,84 @@
1
+ <p align="center">
2
+ <img src="logo.png" width="200" />
3
+ </p>
4
+
5
+ <h1 align="center">notations-cli</h1>
6
+
7
+ <p align="center">
8
+ <a href="https://github.com/takashiishida/notations-cli/actions/workflows/tests.yml"><img src="https://github.com/takashiishida/notations-cli/actions/workflows/tests.yml/badge.svg" alt="Tests"></a>
9
+ <a href="https://pypi.org/project/notations-cli/"><img src="https://img.shields.io/pypi/v/notations-cli?color=blue" alt="PyPI"></a>
10
+ <a href="https://pepy.tech/project/notations-cli"><img src="https://img.shields.io/pepy/dt/notations-cli?color=orange" alt="Downloads"></a>
11
+ <a href="https://github.com/takashiishida/notations-cli/blob/main/LICENSE"><img src="https://img.shields.io/github/license/takashiishida/notations-cli?color=green" alt="License"></a>
12
+ </p>
13
+
14
+ Extract notation tables from arXiv papers using LLMs.
15
+
16
+ ## Installation
17
+
18
+ ```bash
19
+ pip install notations-cli
20
+ ```
21
+
22
+ Or with uv:
23
+
24
+ ```bash
25
+ uv add notations-cli
26
+ ```
27
+
28
+ ## Usage
29
+
30
+ ```bash
31
+ # From arXiv ID
32
+ notations 2006.11239
33
+
34
+ # From arXiv URL
35
+ notations https://arxiv.org/abs/2006.11239
36
+
37
+ # From local .tex files
38
+ notations /path/to/tex/folder
39
+
40
+ # With options
41
+ notations 2006.11239 --model gpt-5.2-2025-12-11 --provider openai
42
+ notations 2006.11239 --model anthropic/claude-sonnet-4.5 --provider openrouter
43
+ notations 2006.11239 --output my_paper # custom base name → .json/.html/.md
44
+ notations 2006.11239 --terminal # also print table to terminal
45
+ notations 2006.11239 --no-comments # strip LaTeX comments first
46
+ notations 2006.11239 --no-expand-macros # disable macro expansion
47
+ notations 2006.11239 --no-filter-body # keep all extracted notations
48
+
49
+ # Re-render from existing JSON (no LLM call)
50
+ notations 2006_11239_notations.json
51
+ notations 2006_11239_notations.json -t # also print to terminal
52
+ ```
53
+
54
+ By default, LaTeX macro definitions (`\newcommand`, etc.) are expanded inline before processing. Use `--no-expand-macros` to disable this.
55
+
56
+ Notations are also filtered by default to only include symbols that appear in the document body (`\begin{document}...\end{document}`), removing artifacts from preamble-only macro definitions. Use `--no-filter-body` to disable this.
57
+
58
+ ## Configuration
59
+
60
+ Set your API key as an environment variable:
61
+
62
+ ```bash
63
+ # For OpenAI
64
+ export OPENAI_API_KEY=...
65
+
66
+ # For OpenRouter
67
+ export OPENROUTER_API_KEY=...
68
+ ```
69
+
70
+ ## Output
71
+
72
+ Generates a self-contained HTML file with:
73
+ - Paper metadata (title, authors, arXiv link)
74
+ - Searchable notation table
75
+ - LaTeX rendering via KaTeX
76
+
77
+ ## Development
78
+
79
+ ```bash
80
+ git clone https://github.com/takashiishida/notations-cli.git
81
+ cd notations-cli
82
+ uv sync
83
+ uv run pytest
84
+ ```
Binary file
@@ -0,0 +1,3 @@
1
+ """Extract notation tables from arXiv papers using LLMs."""
2
+
3
+ __version__ = "0.1.0"
@@ -0,0 +1,185 @@
1
+ """Command-line interface for notations."""
2
+
3
+ import json
4
+ import os
5
+ import sys
6
+
7
+ import click
8
+ from arxiv_to_prompt import process_latex_source
9
+
10
+ from .extractor import SUPPORTED_OPENAI_MODELS, extract_notations, filter_notations_by_body, get_paper_metadata
11
+ from .renderer import render_html, render_json, render_markdown, render_terminal
12
+
13
+
14
def _normalize_arxiv_id(source: str) -> str:
    """Strip arXiv abs/pdf URL prefixes and a trailing .pdf, leaving the bare ID."""
    arxiv_id = source
    arxiv_id = arxiv_id.replace("https://arxiv.org/abs/", "")
    arxiv_id = arxiv_id.replace("https://arxiv.org/pdf/", "")
    arxiv_id = arxiv_id.replace(".pdf", "")
    return arxiv_id


@click.command(context_settings={"help_option_names": ["-h", "--help"]})
@click.argument("source")
@click.option(
    "--model",
    "-m",
    default="gpt-5.2",
    help=f"Model to use for extraction (default: gpt-5.2). OpenAI: {', '.join(SUPPORTED_OPENAI_MODELS)}. OpenRouter: any model.",
)
@click.option(
    "--provider",
    "-p",
    type=click.Choice(["openai", "openrouter"]),
    default="openai",
    help="API provider (default: openai)",
)
@click.option(
    "--terminal",
    "-t",
    is_flag=True,
    help="Also print notation table to terminal",
)
@click.option(
    "--output",
    "-o",
    type=click.Path(),
    help="Base name for output files (default: <arxiv_id>_notations). Produces .json, .html, .md.",
)
@click.option(
    "--no-comments",
    is_flag=True,
    help="Remove comments from LaTeX source before processing",
)
@click.option(
    "--no-expand-macros",
    is_flag=True,
    help="Disable macro expansion (\\newcommand etc.) in LaTeX source",
)
@click.option(
    "--no-filter-body",
    is_flag=True,
    help="Disable filtering notations by document body presence",
)
@click.option(
    "--no-cache",
    is_flag=True,
    help="Disable caching of downloaded LaTeX source (re-download from arXiv)",
)
def main(source: str, model: str, provider: str, terminal: bool, output: str | None, no_comments: bool, no_expand_macros: bool, no_filter_body: bool, no_cache: bool):
    """Extract notation table from an arXiv paper.

    SOURCE can be:

    \b
    - arXiv ID (e.g., 2006.11239)
    - arXiv URL (e.g., https://arxiv.org/abs/2006.11239)
    - Local folder path containing .tex files
    - A .json file from a previous extraction (skips LLM call)
    """
    # Re-render mode: an existing JSON file from a previous run is rendered
    # to HTML/Markdown again without any LLM call.
    if source.endswith(".json") and os.path.isfile(source):
        click.echo(f"Loading notations from {source}...", err=True)
        try:
            with open(source, encoding="utf-8") as f:
                data = json.load(f)
            notations = data["notations"]
            metadata = data["metadata"]
        except (json.JSONDecodeError, KeyError) as e:
            click.echo(f"Error loading JSON: {e}", err=True)
            sys.exit(1)

        click.echo(f"Loaded {len(notations)} notations", err=True)

        # Derive base name from JSON filename
        base = output or os.path.splitext(source)[0]
    else:
        # A local source is a folder of .tex files, so check the filesystem
        # directly. (The previous heuristic — "contains a slash and not
        # arxiv.org" — misclassified old-style arXiv IDs such as
        # "hep-th/9901001" as local paths.)
        is_local = os.path.isdir(source)

        # Extract arxiv_id for metadata and the default output filename.
        if is_local:
            arxiv_id = None
            metadata = {
                "title": "Local Paper",
                "authors": [],
                "arxiv_id": "local",
            }
        else:
            arxiv_id = _normalize_arxiv_id(source)

            click.echo(f"Fetching metadata for arXiv:{arxiv_id}...", err=True)
            metadata = get_paper_metadata(arxiv_id)
            click.echo(f"Title: {metadata['title']}", err=True)

        # Get LaTeX source (progress message varies with cache settings).
        if is_local:
            click.echo("Fetching LaTeX source...", err=True)
        elif no_cache:
            click.echo("Fetching LaTeX source (cache disabled)...", err=True)
        else:
            click.echo("Fetching LaTeX source (using cache if available)...", err=True)
        try:
            if is_local:
                latex_source = process_latex_source(
                    local_folder=source,
                    keep_comments=not no_comments,
                    expand_macros_flag=not no_expand_macros,
                )
            else:
                latex_source = process_latex_source(
                    arxiv_id=arxiv_id,
                    keep_comments=not no_comments,
                    expand_macros_flag=not no_expand_macros,
                    use_cache=not no_cache,
                )
        except Exception as e:
            click.echo(f"Error fetching LaTeX source: {e}", err=True)
            sys.exit(1)

        click.echo(f"LaTeX source: {len(latex_source)} characters", err=True)

        # Extract notations using the LLM.
        click.echo(f"Extracting notations using {model} via {provider}...", err=True)
        try:
            notations = extract_notations(
                latex_source=latex_source,
                model=model,
                provider=provider,
            )
        except Exception as e:
            click.echo(f"Error extracting notations: {e}", err=True)
            sys.exit(1)

        click.echo(f"Found {len(notations)} notations", err=True)

        # Drop notations whose symbol never appears in the document body
        # (artifacts of preamble-only macro definitions).
        if not no_filter_body:
            before = len(notations)
            notations = filter_notations_by_body(notations, latex_source)
            click.echo(f"Filtered to {len(notations)}/{before} notations (body presence)", err=True)

        # Derive base name; "." and "/" are replaced so old- and new-style
        # IDs both yield a flat, filesystem-safe name.
        base_id = arxiv_id if arxiv_id else "local"
        base_id = base_id.replace("/", "_").replace(".", "_")
        base = output or f"{base_id}_notations"

    # Write all three output files
    json_path = f"{base}.json"
    html_path = f"{base}.html"
    md_path = f"{base}.md"

    with open(json_path, "w", encoding="utf-8") as f:
        f.write(render_json(notations, metadata))
    click.echo(f"Written to {json_path}", err=True)

    with open(html_path, "w", encoding="utf-8") as f:
        f.write(render_html(notations, metadata))
    click.echo(f"Written to {html_path}", err=True)

    with open(md_path, "w", encoding="utf-8") as f:
        f.write(render_markdown(notations, metadata))
    click.echo(f"Written to {md_path}", err=True)

    if terminal:
        render_terminal(notations, metadata)


if __name__ == "__main__":
    main()
@@ -0,0 +1,176 @@
1
+ """LLM-based notation extraction from LaTeX source."""
2
+
3
+ import json
4
+ import os
5
+ from openai import OpenAI
6
+
7
# Allowlist of OpenAI model names accepted by extract_notations when
# provider == "openai"; OpenRouter accepts arbitrary model strings.
SUPPORTED_OPENAI_MODELS = [
    "gpt-5.2",
    "gpt-5.1",
    "gpt-5-mini",
    "gpt-5-nano",
]

# System prompt sent with the LaTeX source; instructs the model to emit a
# bare JSON array of {"symbol", "description"} objects, which
# extract_notations parses with json.loads.
SYSTEM_PROMPT = """You are a notation extractor for academic papers. Given a LaTeX source, extract all mathematical notations used in the paper.

For each notation, provide:
- symbol: The LaTeX representation (e.g., "\\theta", "\\mathcal{D}", "x_i")
- description: A concise description of what it represents

Output a JSON array of objects with "symbol" and "description" fields.

Rules:
1. Include all meaningful mathematical symbols, variables, and operators defined or used in the paper
2. Do not include standard LaTeX commands that aren't paper-specific notations
3. For indexed variables like x_1, x_2, ..., x_n, just include the general form x_i or x_n
4. Keep descriptions concise but informative
5. If a symbol is used with different meanings in different contexts, create separate entries
6. In descriptions, use $...$ for inline math (e.g., "Model parameters with dimension $d$")

Output ONLY the JSON array, no other text."""
32
+
33
+
34
def extract_notations(
    latex_source: str,
    model: str = "gpt-5.2",
    provider: str = "openai",
) -> list[dict]:
    """Extract notations from LaTeX source using an LLM.

    Args:
        latex_source: The LaTeX source code of the paper
        model: The model to use for extraction. Default is "gpt-5.2",
            matching the CLI default. (The previous default, "gpt-4o",
            was not in SUPPORTED_OPENAI_MODELS, so calling with defaults
            always raised ValueError.)
        provider: The API provider ("openai" or "openrouter")

    Returns:
        List of notation dicts with "symbol" and "description" keys

    Raises:
        ValueError: For an unsupported OpenAI model, an unknown provider,
            or a missing API key.
        json.JSONDecodeError: If the model's reply is not valid JSON.
    """
    if provider == "openai":
        # Only an allowlist of OpenAI models is accepted; OpenRouter passes
        # any model string through unchanged.
        if model not in SUPPORTED_OPENAI_MODELS:
            supported = ", ".join(SUPPORTED_OPENAI_MODELS)
            raise ValueError(f"Unsupported OpenAI model: {model}. Supported: {supported}")
        base_url = None
        api_key = os.environ.get("OPENAI_API_KEY")
    elif provider == "openrouter":
        base_url = "https://openrouter.ai/api/v1"
        api_key = os.environ.get("OPENROUTER_API_KEY")
    else:
        raise ValueError(f"Unknown provider: {provider}")

    if not api_key:
        # NOTE: the old message said "or pass --api-key", but no such CLI
        # option exists; the environment variable is the only mechanism.
        env_var = "OPENAI_API_KEY" if provider == "openai" else "OPENROUTER_API_KEY"
        raise ValueError(f"No API key provided. Set the {env_var} environment variable.")

    client = OpenAI(api_key=api_key, base_url=base_url)

    # Reasoning-capable families (gpt-5 series, o1/o3/o4) go through the
    # Responses API; everything else uses Chat Completions.
    is_reasoning_model = any(x in model for x in ["gpt-5", "o1", "o3", "o4"])

    if is_reasoning_model:
        response = client.responses.create(
            model=model,
            instructions=SYSTEM_PROMPT,
            input=latex_source,
            reasoning={"effort": "low"},
        )
        content = response.output_text.strip()
    else:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": latex_source},
            ],
            temperature=0,  # deterministic extraction as far as the API allows
        )
        content = response.choices[0].message.content.strip()

    # Strip a markdown code fence if the model wrapped its JSON in one.
    # Only drop the final line when it really is the closing fence, so a
    # reply missing ``` at the end doesn't lose its last line of JSON.
    if content.startswith("```"):
        lines = content.split("\n")
        if lines and lines[-1].strip() == "```":
            lines = lines[:-1]
        content = "\n".join(lines[1:])

    return json.loads(content)
96
+
97
+
98
def filter_notations_by_body(
    notations: list[dict], latex_source: str
) -> list[dict]:
    """Keep only notations whose symbol occurs in the document body.

    Preamble-only macro definitions that were expanded away can leave
    behind symbols that never appear between \\begin{document} and
    \\end{document}; those entries are dropped here.

    Args:
        notations: List of notation dicts with a "symbol" key
        latex_source: Full LaTeX source including preamble

    Returns:
        Filtered list of notations, or the input unchanged when the
        document-body markers cannot be located.
    """
    import re

    start_match = re.search(r"\\begin\{document\}", latex_source)
    end_match = re.search(r"\\end\{document\}", latex_source)
    # Without both markers the body cannot be isolated; keep everything.
    if start_match is None or end_match is None:
        return notations

    body = latex_source[start_match.end() : end_match.start()]

    def appears_in_body(entry: dict) -> bool:
        sym = entry.get("symbol", "")
        # Remove a single pair of $...$ delimiters if present.
        if sym.startswith("$") and sym.endswith("$"):
            sym = sym[1:-1]
        return bool(sym) and sym in body

    return [entry for entry in notations if appears_in_body(entry)]
131
+
132
+
133
def get_paper_metadata(arxiv_id: str) -> dict:
    """Fetch basic metadata for an arXiv paper from the arXiv Atom API.

    Args:
        arxiv_id: The arXiv ID (e.g., "2006.11239"); abs/pdf URLs and a
            trailing ".pdf" are also accepted and normalized.

    Returns:
        Dict with "title", "authors" (list of names), and "arxiv_id".
        Returns {"title": "Unknown", "authors": [], ...} when the API
        response contains no entry for the ID.
    """
    import urllib.request
    import xml.etree.ElementTree as ET

    # Normalize URLs / file suffixes down to a bare arXiv ID.
    arxiv_id = arxiv_id.replace("https://arxiv.org/abs/", "")
    arxiv_id = arxiv_id.replace("https://arxiv.org/pdf/", "")
    arxiv_id = arxiv_id.replace(".pdf", "")

    # Use HTTPS (previously plain http://) and bound the request with a
    # timeout so a stalled connection cannot hang the CLI indefinitely.
    url = f"https://export.arxiv.org/api/query?id_list={arxiv_id}"

    with urllib.request.urlopen(url, timeout=30) as response:
        xml_data = response.read()

    root = ET.fromstring(xml_data)
    ns = {"atom": "http://www.w3.org/2005/Atom"}

    entry = root.find("atom:entry", ns)
    if entry is None:
        return {"title": "Unknown", "authors": [], "arxiv_id": arxiv_id}

    title = entry.find("atom:title", ns)
    # arXiv titles may contain hard line breaks; flatten them to spaces.
    title_text = title.text.strip().replace("\n", " ") if title is not None else "Unknown"

    authors = entry.findall("atom:author", ns)
    author_names = []
    for author in authors:
        name = author.find("atom:name", ns)
        if name is not None:
            author_names.append(name.text)

    return {
        "title": title_text,
        "authors": author_names,
        "arxiv_id": arxiv_id,
    }