arxiv-to-prompt 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,20 @@
1
+ """
2
+ arxiv-to-prompt: A tool to download and process LaTeX source from arXiv papers.
3
+
4
+ This package provides functionality to:
5
+ - Download source files from any arXiv paper using its ID
6
+ - Concatenate multiple LaTeX files into a single coherent source
7
+ - Optionally remove LaTeX comments
8
+
9
+ Example:
10
+ >>> from arxiv_to_prompt import process_latex_source
11
+ >>> latex_source = process_latex_source("2303.08774")
12
+ """
13
+
14
+ from .core import process_latex_source, download_arxiv_source, get_default_cache_dir
15
+
16
+ __all__ = [
17
+ "process_latex_source",
18
+ "download_arxiv_source",
19
+ "get_default_cache_dir",
20
+ ]
arxiv_to_prompt/cli.py ADDED
@@ -0,0 +1,37 @@
1
+ import argparse
2
+ from .core import process_latex_source, get_default_cache_dir
3
+
4
def main():
    """Command-line entry point: print the flattened LaTeX source of an arXiv paper.

    Parses the command line, fetches and flattens the paper via
    ``process_latex_source`` and writes the result to standard output.

    Returns:
        int: ``0`` on success, ``1`` when ``process_latex_source`` returned
        ``None`` (download or processing failed). The value is meant to be
        used as the process exit status so that shell pipelines can detect
        failure.
    """
    default_cache = str(get_default_cache_dir())

    parser = argparse.ArgumentParser(
        description="Download and display LaTeX source from arXiv papers."
    )
    parser.add_argument(
        "arxiv_id",
        help="The arXiv ID of the paper (do not include the version, e.g. v1, v2)"
    )
    parser.add_argument(
        "--no-comments",
        action="store_true",
        help="Remove LaTeX comments from the output"
    )
    parser.add_argument(
        "--cache-dir",
        type=str,
        help=f"Custom directory to store downloaded files (default: {default_cache})",
        default=None
    )

    args = parser.parse_args()

    content = process_latex_source(
        args.arxiv_id,
        keep_comments=not args.no_comments,
        cache_dir=args.cache_dir
    )
    if content is None:
        # Failure details were already logged by the core module; signal the
        # failure through the exit status instead of silently exiting with 0.
        return 1
    if content:
        print(content)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
@@ -0,0 +1,230 @@
1
+ import logging
2
+ import os
3
+ import tarfile
4
+ import shutil
5
+ from typing import Optional, List
6
+ import re
7
+ from pathlib import Path
8
+ import requests
9
+
10
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
11
+
12
def get_default_cache_dir() -> Path:
    """Return the default cache directory for downloaded files.

    The base directory is taken from ``LOCALAPPDATA`` on Windows
    (``os.name == 'nt'``) and from ``XDG_CACHE_HOME`` everywhere else.
    When the variable is unset *or empty*, the fallback (``~`` on Windows,
    ``~/.cache`` otherwise) is used; an empty value must not be honoured
    because ``Path('')`` is the current working directory.

    Returns:
        Path: ``<base>/arxiv-to-prompt`` with a leading ``~`` expanded.
        The directory is not created here.
    """
    if os.name == 'nt':  # Windows
        base = os.environ.get('LOCALAPPDATA') or '~'
    else:  # Unix/Linux/MacOS
        base = os.environ.get('XDG_CACHE_HOME') or '~/.cache'

    return Path(base).expanduser() / 'arxiv-to-prompt'
22
+
23
+
24
def _safe_extract(tar: tarfile.TarFile, destination: Path) -> None:
    """Extract ``tar`` into ``destination`` without letting entries escape it."""
    if hasattr(tarfile, 'data_filter'):
        # Interpreters that ship extraction filters can do the vetting for us.
        tar.extractall(path=destination, filter='data')
        return

    # Older interpreters: vet every member by hand before extracting.
    root = destination.resolve()
    members = []
    for member in tar.getmembers():
        if not (member.isfile() or member.isdir()):
            # Links and special files are not needed for LaTeX sources.
            logging.warning(f"Skipping non-regular archive member {member.name}")
            continue
        target = (root / member.name).resolve()
        if target != root and root not in target.parents:
            raise tarfile.TarError(f"Unsafe path in archive: {member.name}")
        members.append(member)
    tar.extractall(path=destination, members=members)


def download_arxiv_source(arxiv_id: str, cache_dir: Optional[str] = None, use_cache: bool = False) -> bool:
    """
    Download source files from arXiv.

    The archive is fetched from ``https://arxiv.org/e-print/<arxiv_id>``,
    stored temporarily under ``<cache>/temp`` and extracted into
    ``<cache>/<arxiv_id>``. Any existing copy of that directory is replaced
    unless ``use_cache`` is true.

    Args:
        arxiv_id: The arXiv ID of the paper
        cache_dir: Custom directory to store downloaded files
        use_cache: Whether to use cached files if they exist (default: False).
            When a cached directory exists it is returned without any
            network access.

    Returns:
        bool: True if download successful, False if failed (including when source not available)
    """
    # Bound up front so the error/cleanup paths below never hit an unbound name.
    directory: Optional[Path] = None
    temp_dir: Optional[Path] = None

    try:
        # Use provided cache_dir or default
        base_dir = Path(cache_dir) if cache_dir else get_default_cache_dir()
        directory = base_dir / arxiv_id

        # A cached copy needs no network round trip at all.
        if use_cache and directory.exists():
            logging.info(f"Directory {directory} already exists, using cached version.")
            return True

        # Check that tex source is available before touching the cache
        if not check_source_available(arxiv_id):
            logging.warning(f"TeX source files not available for {arxiv_id}")
            return False

        # Always use latest version by not specifying version in URL
        url = f'https://arxiv.org/e-print/{arxiv_id}'

        # Clean up existing directory if not using cache
        if directory.exists():
            shutil.rmtree(directory)

        # Create temporary directory for tar.gz file. Old-style IDs contain a
        # slash (e.g. "hep-th/9901001"), which must not become a sub-path here.
        temp_dir = base_dir / 'temp'
        temp_dir.mkdir(parents=True, exist_ok=True)
        tar_path = temp_dir / f"{arxiv_id.replace('/', '_')}.tar.gz"

        # Download the file
        logging.info(f"Downloading source from {url}")
        headers = {
            'User-Agent': 'Mozilla/5.0'
        }
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        # Save and extract
        with open(tar_path, 'wb') as file:
            file.write(response.content)

        directory.mkdir(parents=True, exist_ok=True)
        with tarfile.open(tar_path) as tar:
            # The archive is untrusted input: never extract it unchecked.
            _safe_extract(tar, directory)

        logging.info(f"Source files downloaded and extracted to {directory}/")
        return True

    except Exception as e:
        # Boundary of the "returns bool" contract: log and report failure.
        logging.error(f"Error downloading/extracting source: {e}")
        if directory is not None and directory.exists():
            shutil.rmtree(directory, ignore_errors=True)  # Clean up failed download
        return False

    finally:
        # Clean up temporary files on success and on failure alike
        if temp_dir is not None and temp_dir.exists():
            shutil.rmtree(temp_dir, ignore_errors=True)
92
+
93
def find_main_tex(directory: str) -> Optional[str]:
    """
    Find the main .tex file containing documentclass. If there are multiple files,
    returns the filename of the longest .tex file containing documentclass, since shorter
    files are typically conference templates or supplementary documents rather than the
    main manuscript.

    Only files directly inside ``directory`` whose name ends in ``.tex`` are
    considered. A ``\\documentclass`` that appears after a ``%`` on its line
    (i.e. commented out) does not count. Files are visited in sorted name
    order, so among equally long candidates the alphabetically first one is
    returned. Bytes that are not valid UTF-8 are replaced rather than
    causing the file to be skipped.

    Args:
        directory: Directory holding the extracted source files.

    Returns:
        The file name (not the full path) of the main file, or None when no
        candidate exists.
    """
    main_tex_file = None
    max_line_count = 0

    # Sorted so the result does not depend on the file system's listing order.
    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith('.tex'):
            continue
        try:
            with open(os.path.join(directory, file_name), 'r',
                      encoding='utf-8', errors='replace') as file:
                lines = file.readlines()
        except OSError as e:
            logging.warning(f"Could not read file {file_name}: {e}")
            continue

        # Look only at the part of each line that precedes a comment marker.
        has_documentclass = any(
            '\\documentclass' in line.split('%', 1)[0] for line in lines
        )
        if has_documentclass and len(lines) > max_line_count:
            main_tex_file = file_name
            max_line_count = len(lines)

    return main_tex_file
117
+
118
def remove_comments_from_lines(text: str) -> str:
    """Remove LaTeX comments from ``text``.

    Lines whose first non-blank character is ``%`` are dropped entirely
    (including their line break). On every other line, everything from the
    first unescaped ``%`` onwards is removed and trailing whitespace is
    stripped. A ``%`` is escaped only when it directly follows a backslash
    that is not itself escaped, so ``\\%`` is kept while the ``%`` in
    ``\\\\%`` (a line break followed by a comment) starts a comment.

    No environment is special-cased: a ``%`` inside e.g. verbatim material
    is treated like any other.

    Args:
        text: LaTeX source, lines separated by ``\\n``.

    Returns:
        The source with comments removed.
    """
    result = []
    for line in text.split('\n'):
        # Skip pure comment lines
        if line.lstrip().startswith('%'):
            continue

        # Handle inline comments
        kept = []
        escaped = False  # True when the previous char was an unescaped backslash
        for char in line:
            if escaped:
                # This char is consumed by the preceding backslash, whatever
                # it is -- including a second backslash.
                escaped = False
            elif char == '\\':
                escaped = True
            elif char == '%':
                break
            kept.append(char)
        result.append(''.join(kept).rstrip())
    return '\n'.join(result)
142
+
143
def flatten_tex(directory: str, main_file: str) -> str:
    """Combine all tex files into one, resolving inputs.

    Every ``\\input{...}`` / ``\\include{...}`` is replaced, recursively, by
    the content of the referenced file, looked up relative to ``directory``.
    For a name that does not end in ``.tex`` the name with ``.tex`` appended
    is tried first, then the name as written (so ``\\input{fig.tikz}`` or
    ``\\input{refs.bbl}`` are found). Whitespace around the name is ignored.

    A reference is replaced by the empty string (with a logged warning where
    applicable) when the file cannot be read, when it lies outside
    ``directory``, or when the file was already inserted earlier (which also
    breaks inclusion cycles). Bytes that are not valid UTF-8 are replaced
    rather than causing the file to be dropped.

    Args:
        directory: Directory holding the extracted source files.
        main_file: Name of the main file, relative to ``directory``.

    Returns:
        The flattened source, or an empty string if the main file cannot be read.
    """
    input_pattern = re.compile(r'\\(?:input|include)[ \t]*\{([^}]+)\}')
    root = os.path.realpath(directory)

    def is_inside_root(path: str) -> bool:
        # The sources are untrusted: never follow a reference (or a symlink)
        # that leads out of the paper's own directory.
        real = os.path.realpath(path)
        return real == root or real.startswith(root + os.sep)

    def resolve_input(name: str) -> str:
        name = name.strip()
        with_ext = name if name.endswith('.tex') else name + '.tex'
        candidate = os.path.join(directory, with_ext)
        if not os.path.isfile(candidate):
            as_written = os.path.join(directory, name)
            if os.path.isfile(as_written):
                return as_written
        return candidate

    def process_file(file_path: str, processed_files: set) -> str:
        if file_path in processed_files:
            return ""
        processed_files.add(file_path)

        if not is_inside_root(file_path):
            logging.warning(f"Skipping file outside source directory: {file_path}")
            return ""

        try:
            with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                content = f.read()
        except (OSError, ValueError) as e:
            logging.warning(f"Error processing file {file_path}: {e}")
            return ""

        # Process \input and \include commands
        return input_pattern.sub(
            lambda match: process_file(resolve_input(match.group(1)), processed_files),
            content,
        )

    main_file_path = os.path.join(directory, main_file)
    return process_file(main_file_path, set())
171
+
172
def process_latex_source(arxiv_id: str, keep_comments: bool = True,
                         cache_dir: Optional[str] = None,
                         use_cache: bool = False) -> Optional[str]:
    """
    Fetch an arXiv paper's LaTeX source and return it as one flattened string.

    The paper is downloaded with ``download_arxiv_source``, its main file is
    located with ``find_main_tex``, and the files are merged with
    ``flatten_tex``.

    Args:
        arxiv_id: The arXiv ID of the paper
        keep_comments: Keep LaTeX comments in the output when True; strip
            them with ``remove_comments_from_lines`` when False
        cache_dir: Custom directory to store downloaded files
        use_cache: Whether to use cached files if they exist (default: False)

    Returns:
        The processed LaTeX content, or None if the download fails or no
        main .tex file is found
    """
    cache_root = get_default_cache_dir() if not cache_dir else Path(cache_dir)

    # Always goes through the downloader; it decides whether the cache is used.
    fetched = download_arxiv_source(arxiv_id, cache_dir, use_cache)
    if not fetched:
        return None

    paper_dir = cache_root / arxiv_id
    entry_point = find_main_tex(paper_dir)
    if not entry_point:
        logging.error("Main .tex file not found.")
        return None

    flattened = flatten_tex(paper_dir, entry_point)
    if keep_comments:
        return flattened
    return remove_comments_from_lines(flattened)
208
+
209
def check_source_available(arxiv_id: str) -> bool:
    """Report whether arXiv's format page for the paper offers a source download.

    Fetches ``https://arxiv.org/format/<arxiv_id>`` and looks for the text
    ``Download source`` in the response body.

    Args:
        arxiv_id: The arXiv ID of the paper

    Returns:
        True if the page was fetched and contains the marker text; False if
        it does not, or if the request failed (the error is logged).
    """
    format_page = f'https://arxiv.org/format/{arxiv_id}'
    request_headers = {'User-Agent': 'Mozilla/5.0'}

    # The session is closed when the with-block exits, on every path.
    with requests.Session() as http:
        # HTTPS requests go through an adapter configured with max_retries=3.
        http.mount('https://', requests.adapters.HTTPAdapter(max_retries=3))
        try:
            # timeout is (connect timeout, read timeout) in seconds
            page = http.get(format_page, headers=request_headers, timeout=(5, 30))
            page.raise_for_status()
            return 'Download source' in page.text
        except requests.exceptions.RequestException as e:
            logging.error(f"Error checking source availability: {e}")
            return False
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Takashi Ishida
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,95 @@
1
+ Metadata-Version: 2.2
2
+ Name: arxiv-to-prompt
3
+ Version: 0.1.0
4
+ Summary: transform arXiv papers into a single latex prompt for LLMs
5
+ Author: Takashi Ishida
6
+ License: MIT License
7
+
8
+ Copyright (c) 2025 Takashi Ishida
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Project-URL: Homepage, https://github.com/takashiishida/arxiv-to-prompt
29
+ Project-URL: Changelog, https://github.com/takashiishida/arxiv-to-prompt/releases
30
+ Project-URL: Issues, https://github.com/takashiishida/arxiv-to-prompt/issues
31
+ Project-URL: CI, https://github.com/takashiishida/arxiv-to-prompt/actions
32
+ Classifier: License :: OSI Approved :: MIT License
33
+ Classifier: Programming Language :: Python :: 3
34
+ Classifier: Programming Language :: Python :: 3.8
35
+ Classifier: Programming Language :: Python :: 3.9
36
+ Classifier: Programming Language :: Python :: 3.10
37
+ Classifier: Programming Language :: Python :: 3.11
38
+ Classifier: Operating System :: OS Independent
39
+ Requires-Python: >=3.8
40
+ Description-Content-Type: text/markdown
41
+ License-File: LICENSE
42
+ Requires-Dist: requests>=2.25.0
43
+ Provides-Extra: test
44
+ Requires-Dist: pytest>=7.0.0; extra == "test"
45
+ Requires-Dist: pytest-cov>=4.0.0; extra == "test"
46
+
47
+ # arxiv-to-prompt
48
+
49
+ [![PyPI version](https://badge.fury.io/py/arxiv-to-prompt.svg)](https://pypi.org/project/arxiv-to-prompt/)
50
+ [![Tests](https://github.com/takashiishida/arxiv-to-prompt/actions/workflows/tests.yml/badge.svg)](https://github.com/takashiishida/arxiv-to-prompt/actions)
51
+ [![License](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
52
+ [![Changelog](https://img.shields.io/github/v/release/takashiishida/arxiv-to-prompt?label=changelog)](https://github.com/takashiishida/arxiv-to-prompt/releases)
53
+
54
+ A command-line tool to transform arXiv papers into a single LaTeX source that can be used as a prompt for asking LLMs questions about the paper. It downloads the source files, automatically finds the main tex file containing `\documentclass`, and flattens multiple files into a single coherent source by resolving `\input` and `\include` commands. The tool also provides an option to remove LaTeX comments from the output (which can be useful to shorten the prompt).
55
+
56
+ ### Installation
57
+
58
+ ```bash
59
+ pip install arxiv-to-prompt
60
+ ```
61
+
62
+ ### Usage
63
+
64
+ Basic usage:
65
+ ```bash
66
+ # Display LaTeX source with comments
67
+ arxiv-to-prompt 2303.08774
68
+
69
+ # Display LaTeX source without comments
70
+ arxiv-to-prompt 2303.08774 --no-comments
71
+
72
+ # Copy to clipboard
73
+ arxiv-to-prompt 2303.08774 | pbcopy
74
+ ```
75
+
76
+ The arXiv ID can be found in the paper's URL. For example, for `https://arxiv.org/abs/2303.08774`, the ID is `2303.08774`. It will automatically download the latest version of the paper, so you don't need to specify the version.
77
+
78
+ ### Python API
79
+
80
+ You can also use arxiv-to-prompt in your Python code:
81
+
82
+ ```python
83
+ from arxiv_to_prompt import process_latex_source
84
+
85
+ # Get LaTeX source with comments
86
+ latex_source = process_latex_source("2303.08774")
87
+
88
+ # Get LaTeX source without comments
89
+ latex_source = process_latex_source("2303.08774", keep_comments=False)
90
+ ```
91
+
92
+ ### References
93
+
94
+ - Inspired by [files-to-prompt](https://github.com/simonw/files-to-prompt).
95
+ - Reused some code from [paper2slides](https://github.com/takashiishida/paper2slides).
@@ -0,0 +1,9 @@
1
+ arxiv_to_prompt/__init__.py,sha256=oL2bEzZhiFoMqCF-84Xmljqw55lgRkwInBFpExRPCTY,609
2
+ arxiv_to_prompt/cli.py,sha256=WafgKxxpgJrLyeuQ-tnUASoknoNXiaQRWLP-Emsr-ug,977
3
+ arxiv_to_prompt/core.py,sha256=cQcMNQJSrRVQAQsy2ULeLVlQlKIDDdgVLHFKJNMR0Sg,8296
4
+ arxiv_to_prompt-0.1.0.dist-info/LICENSE,sha256=np8L3--VyxwVJa_8D_mfK4RYrtnRMM_eeYN3rM4PMHo,1071
5
+ arxiv_to_prompt-0.1.0.dist-info/METADATA,sha256=H8T6HFkP199SK19Jy66MgrVE2S8kTBr-2yYzC9qpQBs,4338
6
+ arxiv_to_prompt-0.1.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
7
+ arxiv_to_prompt-0.1.0.dist-info/entry_points.txt,sha256=iYEEn8xZ_5OkhNIs5HCyHSQBpDRJkbD5h0tlAb16lL0,61
8
+ arxiv_to_prompt-0.1.0.dist-info/top_level.txt,sha256=JClbu_lGGWu3RaTHZlNqTKB1-DUSbYXQNIYmJ9_F7fY,16
9
+ arxiv_to_prompt-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (75.8.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ arxiv-to-prompt = arxiv_to_prompt.cli:main
@@ -0,0 +1 @@
1
+ arxiv_to_prompt