arxiv-to-prompt 0.2.2__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {arxiv_to_prompt-0.2.2/src/arxiv_to_prompt.egg-info → arxiv_to_prompt-0.4.0}/PKG-INFO +17 -11
- {arxiv_to_prompt-0.2.2 → arxiv_to_prompt-0.4.0}/README.md +16 -10
- {arxiv_to_prompt-0.2.2 → arxiv_to_prompt-0.4.0}/pyproject.toml +1 -1
- {arxiv_to_prompt-0.2.2 → arxiv_to_prompt-0.4.0}/src/arxiv_to_prompt/__init__.py +5 -1
- arxiv_to_prompt-0.4.0/src/arxiv_to_prompt/cli.py +71 -0
- {arxiv_to_prompt-0.2.2 → arxiv_to_prompt-0.4.0}/src/arxiv_to_prompt/core.py +52 -18
- {arxiv_to_prompt-0.2.2 → arxiv_to_prompt-0.4.0/src/arxiv_to_prompt.egg-info}/PKG-INFO +17 -11
- {arxiv_to_prompt-0.2.2 → arxiv_to_prompt-0.4.0}/tests/test_core.py +25 -0
- arxiv_to_prompt-0.2.2/src/arxiv_to_prompt/cli.py +0 -43
- {arxiv_to_prompt-0.2.2 → arxiv_to_prompt-0.4.0}/LICENSE +0 -0
- {arxiv_to_prompt-0.2.2 → arxiv_to_prompt-0.4.0}/setup.cfg +0 -0
- {arxiv_to_prompt-0.2.2 → arxiv_to_prompt-0.4.0}/src/arxiv_to_prompt.egg-info/SOURCES.txt +0 -0
- {arxiv_to_prompt-0.2.2 → arxiv_to_prompt-0.4.0}/src/arxiv_to_prompt.egg-info/dependency_links.txt +0 -0
- {arxiv_to_prompt-0.2.2 → arxiv_to_prompt-0.4.0}/src/arxiv_to_prompt.egg-info/entry_points.txt +0 -0
- {arxiv_to_prompt-0.2.2 → arxiv_to_prompt-0.4.0}/src/arxiv_to_prompt.egg-info/requires.txt +0 -0
- {arxiv_to_prompt-0.2.2 → arxiv_to_prompt-0.4.0}/src/arxiv_to_prompt.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: arxiv-to-prompt
|
|
3
|
-
Version: 0.2.2
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: transform arXiv papers into a single latex prompt for LLMs
|
|
5
5
|
Author: Takashi Ishida
|
|
6
6
|
License: MIT
|
|
@@ -17,12 +17,15 @@ Requires-Dist: pytest>=7.0.0; extra == "test"
|
|
|
17
17
|
Requires-Dist: pytest-cov>=4.0.0; extra == "test"
|
|
18
18
|
Dynamic: license-file
|
|
19
19
|
|
|
20
|
-
|
|
20
|
+
<div align="center">
|
|
21
|
+
<img src="logo.png#gh-light-mode-only" alt="" width="475"><img src="logo.png#gh-dark-mode-only" alt="" width="475">
|
|
21
22
|
|
|
22
|
-
[](https://pypi.org/project/arxiv-to-prompt/)
|
|
23
24
|
[](https://github.com/takashiishida/arxiv-to-prompt/actions)
|
|
24
25
|
[](https://opensource.org/licenses/MIT)
|
|
25
26
|
[](https://github.com/takashiishida/arxiv-to-prompt/releases)
|
|
27
|
+
[](https://pepy.tech/project/arxiv-to-prompt)
|
|
28
|
+
</div>
|
|
26
29
|
|
|
27
30
|
A command-line tool to transform arXiv papers into a single LaTeX source that can be used as a prompt for asking LLMs questions about the paper. It downloads the source files, automatically finds the main tex file containing `\documentclass`, and flattens multiple files into a single coherent source by resolving `\input` and `\include` commands. The tool also provides options to remove LaTeX comments and appendix sections from the output (which can be useful to shorten the prompt).
|
|
28
31
|
|
|
@@ -48,6 +51,9 @@ arxiv-to-prompt 2303.08774 --no-appendix
|
|
|
48
51
|
# Combine options (no comments and no appendix)
|
|
49
52
|
arxiv-to-prompt 2303.08774 --no-comments --no-appendix
|
|
50
53
|
|
|
54
|
+
# Process a local folder containing TeX files (instead of downloading from arXiv)
|
|
55
|
+
arxiv-to-prompt --local-folder /path/to/tex/files
|
|
56
|
+
|
|
51
57
|
# Copy to clipboard
|
|
52
58
|
arxiv-to-prompt 2303.08774 | pbcopy
|
|
53
59
|
|
|
@@ -55,7 +61,7 @@ arxiv-to-prompt 2303.08774 | pbcopy
|
|
|
55
61
|
arxiv-to-prompt 1706.03762 | llm -s "explain this paper"
|
|
56
62
|
```
|
|
57
63
|
|
|
58
|
-
|
|
64
|
+
You can use either the arXiv ID (e.g., `2303.08774`) or the full URL (e.g., `https://arxiv.org/abs/2303.08774`). It will automatically download the latest version of the paper, so you don't need to specify the version.
|
|
59
65
|
|
|
60
66
|
### Python API
|
|
61
67
|
|
|
@@ -75,18 +81,18 @@ latex_source = process_latex_source("2303.08774", remove_appendix_section=True)
|
|
|
75
81
|
|
|
76
82
|
# Combine options (no comments and no appendix)
|
|
77
83
|
latex_source = process_latex_source("2303.08774", keep_comments=False, remove_appendix_section=True)
|
|
84
|
+
|
|
85
|
+
# Process LaTeX sources from a local folder (instead of downloading from arXiv)
|
|
86
|
+
latex_source = process_latex_source(local_folder="/path/to/tex/files")
|
|
78
87
|
```
|
|
79
88
|
|
|
80
89
|
### Projects Using arxiv-to-prompt
|
|
81
90
|
|
|
82
91
|
Here are some projects and use cases that leverage arxiv-to-prompt:
|
|
83
92
|
|
|
84
|
-
- [arxiv-latex-mcp](https://github.com/takashiishida/arxiv-latex-mcp): MCP server that
|
|
85
|
-
- [arxiv-tex-ui](https://github.com/takashiishida/arxiv-tex-ui): chat with an
|
|
93
|
+
- [arxiv-latex-mcp](https://github.com/takashiishida/arxiv-latex-mcp): MCP server that fetches and processes arXiv LaTeX sources for precise interpretation of mathematical expressions in papers.
|
|
94
|
+
- [arxiv-tex-ui](https://github.com/takashiishida/arxiv-tex-ui): chat with an LLM about an arxiv paper by using the latex source.
|
|
95
|
+
- [paper2slides](https://github.com/takashiishida/paper2slides): transform an arXiv paper into slides.
|
|
96
|
+
- [ArXivToPrompt](https://apps.apple.com/jp/app/arxivtoprompt/id6751013390): iOS app that allows users to easily extract LaTeX source from arXiv papers on their iPhone and copy it to the clipboard for use with LLM apps.
|
|
86
97
|
|
|
87
98
|
If you're using arxiv-to-prompt in your project, please submit a pull request to add it to this list!
|
|
88
|
-
|
|
89
|
-
### References
|
|
90
|
-
|
|
91
|
-
- Inspired by [files-to-prompt](https://github.com/simonw/files-to-prompt).
|
|
92
|
-
- Reused some code from [paper2slides](https://github.com/takashiishida/paper2slides).
|
|
@@ -1,9 +1,12 @@
|
|
|
1
|
-
|
|
1
|
+
<div align="center">
|
|
2
|
+
<img src="logo.png#gh-light-mode-only" alt="" width="475"><img src="logo.png#gh-dark-mode-only" alt="" width="475">
|
|
2
3
|
|
|
3
|
-
[](https://pypi.org/project/arxiv-to-prompt/)
|
|
4
5
|
[](https://github.com/takashiishida/arxiv-to-prompt/actions)
|
|
5
6
|
[](https://opensource.org/licenses/MIT)
|
|
6
7
|
[](https://github.com/takashiishida/arxiv-to-prompt/releases)
|
|
8
|
+
[](https://pepy.tech/project/arxiv-to-prompt)
|
|
9
|
+
</div>
|
|
7
10
|
|
|
8
11
|
A command-line tool to transform arXiv papers into a single LaTeX source that can be used as a prompt for asking LLMs questions about the paper. It downloads the source files, automatically finds the main tex file containing `\documentclass`, and flattens multiple files into a single coherent source by resolving `\input` and `\include` commands. The tool also provides options to remove LaTeX comments and appendix sections from the output (which can be useful to shorten the prompt).
|
|
9
12
|
|
|
@@ -29,6 +32,9 @@ arxiv-to-prompt 2303.08774 --no-appendix
|
|
|
29
32
|
# Combine options (no comments and no appendix)
|
|
30
33
|
arxiv-to-prompt 2303.08774 --no-comments --no-appendix
|
|
31
34
|
|
|
35
|
+
# Process a local folder containing TeX files (instead of downloading from arXiv)
|
|
36
|
+
arxiv-to-prompt --local-folder /path/to/tex/files
|
|
37
|
+
|
|
32
38
|
# Copy to clipboard
|
|
33
39
|
arxiv-to-prompt 2303.08774 | pbcopy
|
|
34
40
|
|
|
@@ -36,7 +42,7 @@ arxiv-to-prompt 2303.08774 | pbcopy
|
|
|
36
42
|
arxiv-to-prompt 1706.03762 | llm -s "explain this paper"
|
|
37
43
|
```
|
|
38
44
|
|
|
39
|
-
|
|
45
|
+
You can use either the arXiv ID (e.g., `2303.08774`) or the full URL (e.g., `https://arxiv.org/abs/2303.08774`). It will automatically download the latest version of the paper, so you don't need to specify the version.
|
|
40
46
|
|
|
41
47
|
### Python API
|
|
42
48
|
|
|
@@ -56,18 +62,18 @@ latex_source = process_latex_source("2303.08774", remove_appendix_section=True)
|
|
|
56
62
|
|
|
57
63
|
# Combine options (no comments and no appendix)
|
|
58
64
|
latex_source = process_latex_source("2303.08774", keep_comments=False, remove_appendix_section=True)
|
|
65
|
+
|
|
66
|
+
# Process LaTeX sources from a local folder (instead of downloading from arXiv)
|
|
67
|
+
latex_source = process_latex_source(local_folder="/path/to/tex/files")
|
|
59
68
|
```
|
|
60
69
|
|
|
61
70
|
### Projects Using arxiv-to-prompt
|
|
62
71
|
|
|
63
72
|
Here are some projects and use cases that leverage arxiv-to-prompt:
|
|
64
73
|
|
|
65
|
-
- [arxiv-latex-mcp](https://github.com/takashiishida/arxiv-latex-mcp): MCP server that
|
|
66
|
-
- [arxiv-tex-ui](https://github.com/takashiishida/arxiv-tex-ui): chat with an
|
|
74
|
+
- [arxiv-latex-mcp](https://github.com/takashiishida/arxiv-latex-mcp): MCP server that fetches and processes arXiv LaTeX sources for precise interpretation of mathematical expressions in papers.
|
|
75
|
+
- [arxiv-tex-ui](https://github.com/takashiishida/arxiv-tex-ui): chat with an LLM about an arxiv paper by using the latex source.
|
|
76
|
+
- [paper2slides](https://github.com/takashiishida/paper2slides): transform an arXiv paper into slides.
|
|
77
|
+
- [ArXivToPrompt](https://apps.apple.com/jp/app/arxivtoprompt/id6751013390): iOS app that allows users to easily extract LaTeX source from arXiv papers on their iPhone and copy it to the clipboard for use with LLM apps.
|
|
67
78
|
|
|
68
79
|
If you're using arxiv-to-prompt in your project, please submit a pull request to add it to this list!
|
|
69
|
-
|
|
70
|
-
### References
|
|
71
|
-
|
|
72
|
-
- Inspired by [files-to-prompt](https://github.com/simonw/files-to-prompt).
|
|
73
|
-
- Reused some code from [paper2slides](https://github.com/takashiishida/paper2slides).
|
|
@@ -1,14 +1,18 @@
|
|
|
1
1
|
"""
|
|
2
|
-
arxiv-to-prompt: A tool to download and process LaTeX source from arXiv papers.
|
|
2
|
+
arxiv-to-prompt: A tool to download and process LaTeX source from arXiv papers or local folders.
|
|
3
3
|
|
|
4
4
|
This package provides functionality to:
|
|
5
5
|
- Download source files from any arXiv paper using its ID
|
|
6
|
+
- Process LaTeX source files from a local folder
|
|
6
7
|
- Smart concatenation of multiple LaTeX files into a single coherent source
|
|
7
8
|
- Option to remove LaTeX comments
|
|
8
9
|
|
|
9
10
|
Example:
|
|
10
11
|
>>> from arxiv_to_prompt import process_latex_source
|
|
12
|
+
>>> # From arXiv
|
|
11
13
|
>>> latex_source = process_latex_source("2303.08774")
|
|
14
|
+
>>> # From local folder
|
|
15
|
+
>>> latex_source = process_latex_source(local_folder="/path/to/tex/files")
|
|
12
16
|
"""
|
|
13
17
|
|
|
14
18
|
from .core import process_latex_source, download_arxiv_source, get_default_cache_dir
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import re
|
|
3
|
+
from .core import process_latex_source, get_default_cache_dir
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def extract_arxiv_id(input_str: str) -> str:
|
|
7
|
+
"""Extract arxiv ID from URL or return input as-is if already an ID."""
|
|
8
|
+
if "arxiv.org" in input_str:
|
|
9
|
+
match = re.search(r'arxiv\.org/(?:abs|pdf)/(\d{4}\.\d{4,5})(?:v\d+)?(?:\.pdf)?', input_str)
|
|
10
|
+
if match:
|
|
11
|
+
return match.group(1)
|
|
12
|
+
return input_str
|
|
13
|
+
|
|
14
|
+
def main():
|
|
15
|
+
default_cache = str(get_default_cache_dir())
|
|
16
|
+
|
|
17
|
+
parser = argparse.ArgumentParser(
|
|
18
|
+
description="Download and display LaTeX source from arXiv papers or process local TeX files."
|
|
19
|
+
)
|
|
20
|
+
parser.add_argument(
|
|
21
|
+
"arxiv_id",
|
|
22
|
+
nargs="?",
|
|
23
|
+
default=None,
|
|
24
|
+
help="The arXiv ID (e.g. 2303.08774) or URL (e.g. https://arxiv.org/abs/2303.08774). Not needed if --local-folder is provided."
|
|
25
|
+
)
|
|
26
|
+
parser.add_argument(
|
|
27
|
+
"--no-comments",
|
|
28
|
+
action="store_true",
|
|
29
|
+
help="Remove LaTeX comments from the output"
|
|
30
|
+
)
|
|
31
|
+
parser.add_argument(
|
|
32
|
+
"--cache-dir",
|
|
33
|
+
type=str,
|
|
34
|
+
help=f"Custom directory to store downloaded files (default: {default_cache})",
|
|
35
|
+
default=None
|
|
36
|
+
)
|
|
37
|
+
parser.add_argument(
|
|
38
|
+
"--no-appendix",
|
|
39
|
+
action="store_true",
|
|
40
|
+
help="Remove the appendix section and everything after it"
|
|
41
|
+
)
|
|
42
|
+
parser.add_argument(
|
|
43
|
+
"--local-folder",
|
|
44
|
+
type=str,
|
|
45
|
+
help="Path to a local folder containing TeX files (alternative to arxiv_id)",
|
|
46
|
+
default=None
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
args = parser.parse_args()
|
|
50
|
+
|
|
51
|
+
# Validate that either arxiv_id or local_folder is provided
|
|
52
|
+
if not args.arxiv_id and not args.local_folder:
|
|
53
|
+
parser.error("Either provide an arXiv ID or use --local-folder to specify a local folder")
|
|
54
|
+
|
|
55
|
+
if args.arxiv_id and args.local_folder:
|
|
56
|
+
parser.error("Cannot specify both arXiv ID and --local-folder")
|
|
57
|
+
|
|
58
|
+
arxiv_id = extract_arxiv_id(args.arxiv_id) if args.arxiv_id else None
|
|
59
|
+
|
|
60
|
+
content = process_latex_source(
|
|
61
|
+
arxiv_id=arxiv_id,
|
|
62
|
+
keep_comments=not args.no_comments,
|
|
63
|
+
cache_dir=args.cache_dir,
|
|
64
|
+
remove_appendix_section=args.no_appendix,
|
|
65
|
+
local_folder=args.local_folder
|
|
66
|
+
)
|
|
67
|
+
if content:
|
|
68
|
+
print(content)
|
|
69
|
+
|
|
70
|
+
if __name__ == "__main__":
|
|
71
|
+
main()
|
|
@@ -92,14 +92,28 @@ def download_arxiv_source(arxiv_id: str, cache_dir: Optional[str] = None, use_ca
|
|
|
92
92
|
|
|
93
93
|
def find_main_tex(directory: str) -> Optional[str]:
|
|
94
94
|
"""
|
|
95
|
-
Find the main .tex file containing documentclass.
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
95
|
+
Find the main .tex file containing documentclass.
|
|
96
|
+
First checks for common naming conventions (main.tex, paper.tex, index.tex).
|
|
97
|
+
If none found, returns the filename of the longest .tex file containing documentclass,
|
|
98
|
+
since shorter files are typically conference templates or supplementary documents
|
|
99
|
+
rather than the main manuscript.
|
|
99
100
|
"""
|
|
101
|
+
common_names = ['main.tex', 'paper.tex', 'index.tex']
|
|
100
102
|
main_tex_file = None
|
|
101
103
|
max_line_count = 0
|
|
102
104
|
|
|
105
|
+
# First pass: check for common naming conventions
|
|
106
|
+
for file_name in os.listdir(directory):
|
|
107
|
+
if file_name in common_names:
|
|
108
|
+
try:
|
|
109
|
+
with open(os.path.join(directory, file_name), 'r', encoding='utf-8') as file:
|
|
110
|
+
lines = file.readlines()
|
|
111
|
+
if any('\\documentclass' in line for line in lines):
|
|
112
|
+
return file_name
|
|
113
|
+
except Exception as e:
|
|
114
|
+
logging.warning(f"Could not read file {file_name}: {e}")
|
|
115
|
+
|
|
116
|
+
# Second pass: find the longest .tex file containing documentclass
|
|
103
117
|
for file_name in os.listdir(directory):
|
|
104
118
|
if file_name.endswith('.tex'):
|
|
105
119
|
try:
|
|
@@ -208,37 +222,57 @@ def flatten_tex(directory: str, main_file: str) -> str:
|
|
|
208
222
|
main_file_path = os.path.join(directory, main_file)
|
|
209
223
|
return process_file(main_file_path, set())
|
|
210
224
|
|
|
211
|
-
def process_latex_source(arxiv_id: str, keep_comments: bool = True,
|
|
225
|
+
def process_latex_source(arxiv_id: Optional[str] = None, keep_comments: bool = True,
|
|
212
226
|
cache_dir: Optional[str] = None,
|
|
213
|
-
use_cache: bool = False, remove_appendix_section: bool = False
|
|
227
|
+
use_cache: bool = False, remove_appendix_section: bool = False,
|
|
228
|
+
local_folder: Optional[str] = None) -> Optional[str]:
|
|
214
229
|
"""
|
|
215
|
-
Process LaTeX source files from arXiv and return the combined content.
|
|
230
|
+
Process LaTeX source files from arXiv or a local folder and return the combined content.
|
|
216
231
|
|
|
217
232
|
Args:
|
|
218
|
-
arxiv_id: The arXiv ID of the paper
|
|
233
|
+
arxiv_id: The arXiv ID of the paper (required if local_folder is not provided)
|
|
219
234
|
keep_comments: Whether to keep LaTeX comments in the output
|
|
220
|
-
cache_dir: Custom directory to store downloaded files
|
|
221
|
-
use_cache: Whether to use cached files if they exist (default: False)
|
|
235
|
+
cache_dir: Custom directory to store downloaded files (only used for arXiv)
|
|
236
|
+
use_cache: Whether to use cached files if they exist (default: False, only used for arXiv)
|
|
222
237
|
remove_appendix_section: Whether to remove the appendix section and everything after it
|
|
238
|
+
local_folder: Path to a local folder containing TeX files (alternative to arxiv_id)
|
|
223
239
|
|
|
224
240
|
Returns:
|
|
225
241
|
The processed LaTeX content or None if processing fails
|
|
226
242
|
"""
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
243
|
+
# Determine the directory to process
|
|
244
|
+
if local_folder:
|
|
245
|
+
directory = Path(local_folder).expanduser().resolve()
|
|
246
|
+
|
|
247
|
+
# Validate the folder exists
|
|
248
|
+
if not directory.exists():
|
|
249
|
+
logging.error(f"Local folder does not exist: {directory}")
|
|
250
|
+
return None
|
|
251
|
+
|
|
252
|
+
if not directory.is_dir():
|
|
253
|
+
logging.error(f"Path is not a directory: {directory}")
|
|
254
|
+
return None
|
|
255
|
+
|
|
256
|
+
logging.info(f"Processing local folder: {directory}")
|
|
257
|
+
elif arxiv_id:
|
|
258
|
+
base_dir = Path(cache_dir) if cache_dir else get_default_cache_dir()
|
|
259
|
+
|
|
260
|
+
# Download the latest version
|
|
261
|
+
if not download_arxiv_source(arxiv_id, cache_dir, use_cache):
|
|
262
|
+
return None
|
|
263
|
+
|
|
264
|
+
directory = base_dir / arxiv_id
|
|
265
|
+
else:
|
|
266
|
+
logging.error("Either arxiv_id or local_folder must be provided")
|
|
231
267
|
return None
|
|
232
|
-
|
|
233
|
-
directory = base_dir / arxiv_id
|
|
234
268
|
|
|
235
|
-
main_file = find_main_tex(directory)
|
|
269
|
+
main_file = find_main_tex(str(directory))
|
|
236
270
|
if not main_file:
|
|
237
271
|
logging.error("Main .tex file not found.")
|
|
238
272
|
return None
|
|
239
273
|
|
|
240
274
|
# Get the content
|
|
241
|
-
content = flatten_tex(directory, main_file)
|
|
275
|
+
content = flatten_tex(str(directory), main_file)
|
|
242
276
|
|
|
243
277
|
# Process comments if requested
|
|
244
278
|
if not keep_comments:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: arxiv-to-prompt
|
|
3
|
-
Version: 0.2.2
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: transform arXiv papers into a single latex prompt for LLMs
|
|
5
5
|
Author: Takashi Ishida
|
|
6
6
|
License: MIT
|
|
@@ -17,12 +17,15 @@ Requires-Dist: pytest>=7.0.0; extra == "test"
|
|
|
17
17
|
Requires-Dist: pytest-cov>=4.0.0; extra == "test"
|
|
18
18
|
Dynamic: license-file
|
|
19
19
|
|
|
20
|
-
|
|
20
|
+
<div align="center">
|
|
21
|
+
<img src="logo.png#gh-light-mode-only" alt="" width="475"><img src="logo.png#gh-dark-mode-only" alt="" width="475">
|
|
21
22
|
|
|
22
|
-
[](https://pypi.org/project/arxiv-to-prompt/)
|
|
23
24
|
[](https://github.com/takashiishida/arxiv-to-prompt/actions)
|
|
24
25
|
[](https://opensource.org/licenses/MIT)
|
|
25
26
|
[](https://github.com/takashiishida/arxiv-to-prompt/releases)
|
|
27
|
+
[](https://pepy.tech/project/arxiv-to-prompt)
|
|
28
|
+
</div>
|
|
26
29
|
|
|
27
30
|
A command-line tool to transform arXiv papers into a single LaTeX source that can be used as a prompt for asking LLMs questions about the paper. It downloads the source files, automatically finds the main tex file containing `\documentclass`, and flattens multiple files into a single coherent source by resolving `\input` and `\include` commands. The tool also provides options to remove LaTeX comments and appendix sections from the output (which can be useful to shorten the prompt).
|
|
28
31
|
|
|
@@ -48,6 +51,9 @@ arxiv-to-prompt 2303.08774 --no-appendix
|
|
|
48
51
|
# Combine options (no comments and no appendix)
|
|
49
52
|
arxiv-to-prompt 2303.08774 --no-comments --no-appendix
|
|
50
53
|
|
|
54
|
+
# Process a local folder containing TeX files (instead of downloading from arXiv)
|
|
55
|
+
arxiv-to-prompt --local-folder /path/to/tex/files
|
|
56
|
+
|
|
51
57
|
# Copy to clipboard
|
|
52
58
|
arxiv-to-prompt 2303.08774 | pbcopy
|
|
53
59
|
|
|
@@ -55,7 +61,7 @@ arxiv-to-prompt 2303.08774 | pbcopy
|
|
|
55
61
|
arxiv-to-prompt 1706.03762 | llm -s "explain this paper"
|
|
56
62
|
```
|
|
57
63
|
|
|
58
|
-
|
|
64
|
+
You can use either the arXiv ID (e.g., `2303.08774`) or the full URL (e.g., `https://arxiv.org/abs/2303.08774`). It will automatically download the latest version of the paper, so you don't need to specify the version.
|
|
59
65
|
|
|
60
66
|
### Python API
|
|
61
67
|
|
|
@@ -75,18 +81,18 @@ latex_source = process_latex_source("2303.08774", remove_appendix_section=True)
|
|
|
75
81
|
|
|
76
82
|
# Combine options (no comments and no appendix)
|
|
77
83
|
latex_source = process_latex_source("2303.08774", keep_comments=False, remove_appendix_section=True)
|
|
84
|
+
|
|
85
|
+
# Process LaTeX sources from a local folder (instead of downloading from arXiv)
|
|
86
|
+
latex_source = process_latex_source(local_folder="/path/to/tex/files")
|
|
78
87
|
```
|
|
79
88
|
|
|
80
89
|
### Projects Using arxiv-to-prompt
|
|
81
90
|
|
|
82
91
|
Here are some projects and use cases that leverage arxiv-to-prompt:
|
|
83
92
|
|
|
84
|
-
- [arxiv-latex-mcp](https://github.com/takashiishida/arxiv-latex-mcp): MCP server that
|
|
85
|
-
- [arxiv-tex-ui](https://github.com/takashiishida/arxiv-tex-ui): chat with an
|
|
93
|
+
- [arxiv-latex-mcp](https://github.com/takashiishida/arxiv-latex-mcp): MCP server that fetches and processes arXiv LaTeX sources for precise interpretation of mathematical expressions in papers.
|
|
94
|
+
- [arxiv-tex-ui](https://github.com/takashiishida/arxiv-tex-ui): chat with an LLM about an arxiv paper by using the latex source.
|
|
95
|
+
- [paper2slides](https://github.com/takashiishida/paper2slides): transform an arXiv paper into slides.
|
|
96
|
+
- [ArXivToPrompt](https://apps.apple.com/jp/app/arxivtoprompt/id6751013390): iOS app that allows users to easily extract LaTeX source from arXiv papers on their iPhone and copy it to the clipboard for use with LLM apps.
|
|
86
97
|
|
|
87
98
|
If you're using arxiv-to-prompt in your project, please submit a pull request to add it to this list!
|
|
88
|
-
|
|
89
|
-
### References
|
|
90
|
-
|
|
91
|
-
- Inspired by [files-to-prompt](https://github.com/simonw/files-to-prompt).
|
|
92
|
-
- Reused some code from [paper2slides](https://github.com/takashiishida/paper2slides).
|
|
@@ -11,6 +11,7 @@ from arxiv_to_prompt.core import (
|
|
|
11
11
|
flatten_tex,
|
|
12
12
|
remove_appendix,
|
|
13
13
|
)
|
|
14
|
+
from arxiv_to_prompt.cli import extract_arxiv_id
|
|
14
15
|
|
|
15
16
|
# Test fixtures
|
|
16
17
|
@pytest.fixture
|
|
@@ -271,3 +272,27 @@ def test_input_file_extensions(temp_cache_dir):
|
|
|
271
272
|
assert "Style content" in result
|
|
272
273
|
assert "Class content" in result
|
|
273
274
|
assert "Already tex content" in result
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
def test_extract_arxiv_id():
|
|
278
|
+
"""Test extracting arxiv ID from URLs and plain IDs."""
|
|
279
|
+
# Plain IDs should be returned as-is
|
|
280
|
+
assert extract_arxiv_id("2505.18102") == "2505.18102"
|
|
281
|
+
assert extract_arxiv_id("2401.12345") == "2401.12345"
|
|
282
|
+
|
|
283
|
+
# Extract from abs URLs
|
|
284
|
+
assert extract_arxiv_id("https://arxiv.org/abs/2505.18102") == "2505.18102"
|
|
285
|
+
assert extract_arxiv_id("http://arxiv.org/abs/2505.18102") == "2505.18102"
|
|
286
|
+
|
|
287
|
+
# Extract from pdf URLs
|
|
288
|
+
assert extract_arxiv_id("https://arxiv.org/pdf/2505.18102") == "2505.18102"
|
|
289
|
+
assert extract_arxiv_id("https://arxiv.org/pdf/2505.18102.pdf") == "2505.18102"
|
|
290
|
+
|
|
291
|
+
# Strip version suffixes
|
|
292
|
+
assert extract_arxiv_id("https://arxiv.org/abs/2505.18102v1") == "2505.18102"
|
|
293
|
+
assert extract_arxiv_id("https://arxiv.org/abs/2505.18102v2") == "2505.18102"
|
|
294
|
+
assert extract_arxiv_id("https://arxiv.org/pdf/2505.18102v3.pdf") == "2505.18102"
|
|
295
|
+
|
|
296
|
+
# Non-arxiv input returned as-is
|
|
297
|
+
assert extract_arxiv_id("invalid") == "invalid"
|
|
298
|
+
assert extract_arxiv_id("https://example.com/2505.18102") == "https://example.com/2505.18102"
|
|
@@ -1,43 +0,0 @@
|
|
|
1
|
-
import argparse
|
|
2
|
-
from .core import process_latex_source, get_default_cache_dir
|
|
3
|
-
|
|
4
|
-
def main():
|
|
5
|
-
default_cache = str(get_default_cache_dir())
|
|
6
|
-
|
|
7
|
-
parser = argparse.ArgumentParser(
|
|
8
|
-
description="Download and display LaTeX source from arXiv papers."
|
|
9
|
-
)
|
|
10
|
-
parser.add_argument(
|
|
11
|
-
"arxiv_id",
|
|
12
|
-
help="The arXiv ID of the paper (do not include the version, e.g. v1, v2)"
|
|
13
|
-
)
|
|
14
|
-
parser.add_argument(
|
|
15
|
-
"--no-comments",
|
|
16
|
-
action="store_true",
|
|
17
|
-
help="Remove LaTeX comments from the output"
|
|
18
|
-
)
|
|
19
|
-
parser.add_argument(
|
|
20
|
-
"--cache-dir",
|
|
21
|
-
type=str,
|
|
22
|
-
help=f"Custom directory to store downloaded files (default: {default_cache})",
|
|
23
|
-
default=None
|
|
24
|
-
)
|
|
25
|
-
parser.add_argument(
|
|
26
|
-
"--no-appendix",
|
|
27
|
-
action="store_true",
|
|
28
|
-
help="Remove the appendix section and everything after it"
|
|
29
|
-
)
|
|
30
|
-
|
|
31
|
-
args = parser.parse_args()
|
|
32
|
-
|
|
33
|
-
content = process_latex_source(
|
|
34
|
-
args.arxiv_id,
|
|
35
|
-
keep_comments=not args.no_comments,
|
|
36
|
-
cache_dir=args.cache_dir,
|
|
37
|
-
remove_appendix_section=args.no_appendix
|
|
38
|
-
)
|
|
39
|
-
if content:
|
|
40
|
-
print(content)
|
|
41
|
-
|
|
42
|
-
if __name__ == "__main__":
|
|
43
|
-
main()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{arxiv_to_prompt-0.2.2 → arxiv_to_prompt-0.4.0}/src/arxiv_to_prompt.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{arxiv_to_prompt-0.2.2 → arxiv_to_prompt-0.4.0}/src/arxiv_to_prompt.egg-info/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|