arxiv-to-prompt 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arxiv_to_prompt/cli.py +7 -1
- arxiv_to_prompt/core.py +45 -2
- {arxiv_to_prompt-0.1.0.dist-info → arxiv_to_prompt-0.2.0.dist-info}/METADATA +30 -33
- arxiv_to_prompt-0.2.0.dist-info/RECORD +9 -0
- {arxiv_to_prompt-0.1.0.dist-info → arxiv_to_prompt-0.2.0.dist-info}/WHEEL +1 -1
- arxiv_to_prompt-0.1.0.dist-info/RECORD +0 -9
- {arxiv_to_prompt-0.1.0.dist-info → arxiv_to_prompt-0.2.0.dist-info}/entry_points.txt +0 -0
- {arxiv_to_prompt-0.1.0.dist-info → arxiv_to_prompt-0.2.0.dist-info/licenses}/LICENSE +0 -0
- {arxiv_to_prompt-0.1.0.dist-info → arxiv_to_prompt-0.2.0.dist-info}/top_level.txt +0 -0
arxiv_to_prompt/cli.py
CHANGED
|
@@ -22,13 +22,19 @@ def main():
|
|
|
22
22
|
help=f"Custom directory to store downloaded files (default: {default_cache})",
|
|
23
23
|
default=None
|
|
24
24
|
)
|
|
25
|
+
parser.add_argument(
|
|
26
|
+
"--no-appendix",
|
|
27
|
+
action="store_true",
|
|
28
|
+
help="Remove the appendix section and everything after it"
|
|
29
|
+
)
|
|
25
30
|
|
|
26
31
|
args = parser.parse_args()
|
|
27
32
|
|
|
28
33
|
content = process_latex_source(
|
|
29
34
|
args.arxiv_id,
|
|
30
35
|
keep_comments=not args.no_comments,
|
|
31
|
-
cache_dir=args.cache_dir
|
|
36
|
+
cache_dir=args.cache_dir,
|
|
37
|
+
remove_appendix_section=args.no_appendix
|
|
32
38
|
)
|
|
33
39
|
if content:
|
|
34
40
|
print(content)
|
arxiv_to_prompt/core.py
CHANGED
|
@@ -140,6 +140,14 @@ def remove_comments_from_lines(text: str) -> str:
|
|
|
140
140
|
result.append(''.join(cleaned_line).rstrip())
|
|
141
141
|
return '\n'.join(result)
|
|
142
142
|
|
|
143
|
+
def remove_appendix(text: str) -> str:
|
|
144
|
+
"""Remove appendix section and everything after it."""
|
|
145
|
+
# Find the position of \appendix command
|
|
146
|
+
appendix_match = re.search(r'\\appendix\b', text)
|
|
147
|
+
if appendix_match:
|
|
148
|
+
return text[:appendix_match.start()].rstrip()
|
|
149
|
+
return text
|
|
150
|
+
|
|
143
151
|
def flatten_tex(directory: str, main_file: str) -> str:
|
|
144
152
|
"""Combine all tex files into one, resolving inputs."""
|
|
145
153
|
def process_file(file_path: str, processed_files: set) -> str:
|
|
@@ -151,8 +159,38 @@ def flatten_tex(directory: str, main_file: str) -> str:
|
|
|
151
159
|
with open(file_path, 'r', encoding='utf-8') as f:
|
|
152
160
|
content = f.read()
|
|
153
161
|
|
|
154
|
-
# Process \input and \include commands
|
|
162
|
+
# Process \input and \include commands that are not commented out
|
|
155
163
|
def replace_input(match):
|
|
164
|
+
# Check if the match is preceded by a comment character
|
|
165
|
+
line_start = content.rfind('\n', 0, match.start()) + 1
|
|
166
|
+
line_prefix = content[line_start:match.start()]
|
|
167
|
+
|
|
168
|
+
# If there's a % character in the line prefix that's not escaped,
|
|
169
|
+
# this command is commented out, so return the original text
|
|
170
|
+
comment_pos = -1
|
|
171
|
+
i = 0
|
|
172
|
+
while i < len(line_prefix):
|
|
173
|
+
if line_prefix[i] == '%':
|
|
174
|
+
# Check if the % is escaped with a backslash
|
|
175
|
+
if i > 0 and line_prefix[i-1] == '\\':
|
|
176
|
+
# Count backslashes before %
|
|
177
|
+
backslash_count = 0
|
|
178
|
+
j = i - 1
|
|
179
|
+
while j >= 0 and line_prefix[j] == '\\':
|
|
180
|
+
backslash_count += 1
|
|
181
|
+
j -= 1
|
|
182
|
+
# If odd number of backslashes, % is escaped
|
|
183
|
+
if backslash_count % 2 == 1:
|
|
184
|
+
i += 1
|
|
185
|
+
continue
|
|
186
|
+
comment_pos = i
|
|
187
|
+
break
|
|
188
|
+
i += 1
|
|
189
|
+
|
|
190
|
+
if comment_pos != -1:
|
|
191
|
+
return match.group(0) # Return the original text without processing
|
|
192
|
+
|
|
193
|
+
# Process the command normally
|
|
156
194
|
input_file = match.group(1)
|
|
157
195
|
if not input_file.endswith('.tex'):
|
|
158
196
|
input_file += '.tex'
|
|
@@ -171,7 +209,7 @@ def flatten_tex(directory: str, main_file: str) -> str:
|
|
|
171
209
|
|
|
172
210
|
def process_latex_source(arxiv_id: str, keep_comments: bool = True,
|
|
173
211
|
cache_dir: Optional[str] = None,
|
|
174
|
-
use_cache: bool = False) -> Optional[str]:
|
|
212
|
+
use_cache: bool = False, remove_appendix_section: bool = False) -> Optional[str]:
|
|
175
213
|
"""
|
|
176
214
|
Process LaTeX source files from arXiv and return the combined content.
|
|
177
215
|
|
|
@@ -180,6 +218,7 @@ def process_latex_source(arxiv_id: str, keep_comments: bool = True,
|
|
|
180
218
|
keep_comments: Whether to keep LaTeX comments in the output
|
|
181
219
|
cache_dir: Custom directory to store downloaded files
|
|
182
220
|
use_cache: Whether to use cached files if they exist (default: False)
|
|
221
|
+
remove_appendix_section: Whether to remove the appendix section and everything after it
|
|
183
222
|
|
|
184
223
|
Returns:
|
|
185
224
|
The processed LaTeX content or None if processing fails
|
|
@@ -204,6 +243,10 @@ def process_latex_source(arxiv_id: str, keep_comments: bool = True,
|
|
|
204
243
|
if not keep_comments:
|
|
205
244
|
content = remove_comments_from_lines(content)
|
|
206
245
|
|
|
246
|
+
# Remove appendix if requested
|
|
247
|
+
if remove_appendix_section:
|
|
248
|
+
content = remove_appendix(content)
|
|
249
|
+
|
|
207
250
|
return content
|
|
208
251
|
|
|
209
252
|
def check_source_available(arxiv_id: str) -> bool:
|
|
@@ -1,41 +1,13 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: arxiv-to-prompt
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: transform arXiv papers into a single latex prompt for LLMs
|
|
5
5
|
Author: Takashi Ishida
|
|
6
|
-
License: MIT
|
|
7
|
-
|
|
8
|
-
Copyright (c) 2025 Takashi Ishida
|
|
9
|
-
|
|
10
|
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
-
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
-
in the Software without restriction, including without limitation the rights
|
|
13
|
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
-
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
-
furnished to do so, subject to the following conditions:
|
|
16
|
-
|
|
17
|
-
The above copyright notice and this permission notice shall be included in all
|
|
18
|
-
copies or substantial portions of the Software.
|
|
19
|
-
|
|
20
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
-
SOFTWARE.
|
|
27
|
-
|
|
6
|
+
License: MIT
|
|
28
7
|
Project-URL: Homepage, https://github.com/takashiishida/arxiv-to-prompt
|
|
29
8
|
Project-URL: Changelog, https://github.com/takashiishida/arxiv-to-prompt/releases
|
|
30
9
|
Project-URL: Issues, https://github.com/takashiishida/arxiv-to-prompt/issues
|
|
31
10
|
Project-URL: CI, https://github.com/takashiishida/arxiv-to-prompt/actions
|
|
32
|
-
Classifier: License :: OSI Approved :: MIT License
|
|
33
|
-
Classifier: Programming Language :: Python :: 3
|
|
34
|
-
Classifier: Programming Language :: Python :: 3.8
|
|
35
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
36
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
37
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
38
|
-
Classifier: Operating System :: OS Independent
|
|
39
11
|
Requires-Python: >=3.8
|
|
40
12
|
Description-Content-Type: text/markdown
|
|
41
13
|
License-File: LICENSE
|
|
@@ -43,15 +15,16 @@ Requires-Dist: requests>=2.25.0
|
|
|
43
15
|
Provides-Extra: test
|
|
44
16
|
Requires-Dist: pytest>=7.0.0; extra == "test"
|
|
45
17
|
Requires-Dist: pytest-cov>=4.0.0; extra == "test"
|
|
18
|
+
Dynamic: license-file
|
|
46
19
|
|
|
47
20
|
# arxiv-to-prompt
|
|
48
21
|
|
|
49
|
-
[](https://pypi.org/project/arxiv-to-prompt/)
|
|
22
|
+
[](https://pypi.org/project/arxiv-to-prompt/)
|
|
50
23
|
[](https://github.com/takashiishida/arxiv-to-prompt/actions)
|
|
51
24
|
[](https://opensource.org/licenses/MIT)
|
|
52
25
|
[](https://github.com/takashiishida/arxiv-to-prompt/releases)
|
|
53
26
|
|
|
54
|
-
A command-line tool to transform arXiv papers into a single LaTeX source that can be used as a prompt for asking LLMs questions about the paper. It downloads the source files, automatically finds the main tex file containing `\documentclass`, and flattens multiple files into a single coherent source by resolving `\input` and `\include` commands. The tool also provides
|
|
27
|
+
A command-line tool to transform arXiv papers into a single LaTeX source that can be used as a prompt for asking LLMs questions about the paper. It downloads the source files, automatically finds the main tex file containing `\documentclass`, and flattens multiple files into a single coherent source by resolving `\input` and `\include` commands. The tool also provides options to remove LaTeX comments and appendix sections from the output (which can be useful to shorten the prompt).
|
|
55
28
|
|
|
56
29
|
### Installation
|
|
57
30
|
|
|
@@ -69,8 +42,17 @@ arxiv-to-prompt 2303.08774
|
|
|
69
42
|
# Display LaTeX source without comments
|
|
70
43
|
arxiv-to-prompt 2303.08774 --no-comments
|
|
71
44
|
|
|
45
|
+
# Display LaTeX source without appendix sections
|
|
46
|
+
arxiv-to-prompt 2303.08774 --no-appendix
|
|
47
|
+
|
|
48
|
+
# Combine options (no comments and no appendix)
|
|
49
|
+
arxiv-to-prompt 2303.08774 --no-comments --no-appendix
|
|
50
|
+
|
|
72
51
|
# Copy to clipboard
|
|
73
52
|
arxiv-to-prompt 2303.08774 | pbcopy
|
|
53
|
+
|
|
54
|
+
# Combine with the `llm` library from https://github.com/simonw/llm to chat about the paper
|
|
55
|
+
arxiv-to-prompt 1706.03762 | llm -s "explain this paper"
|
|
74
56
|
```
|
|
75
57
|
|
|
76
58
|
The arXiv ID can be found in the paper's URL. For example, for `https://arxiv.org/abs/2303.08774`, the ID is `2303.08774`. It will automatically download the latest version of the paper, so you don't need to specify the version.
|
|
@@ -87,8 +69,23 @@ latex_source = process_latex_source("2303.08774")
|
|
|
87
69
|
|
|
88
70
|
# Get LaTeX source without comments
|
|
89
71
|
latex_source = process_latex_source("2303.08774", keep_comments=False)
|
|
72
|
+
|
|
73
|
+
# Get LaTeX source without appendix sections
|
|
74
|
+
latex_source = process_latex_source("2303.08774", remove_appendix_section=True)
|
|
75
|
+
|
|
76
|
+
# Combine options (no comments and no appendix)
|
|
77
|
+
latex_source = process_latex_source("2303.08774", keep_comments=False, remove_appendix_section=True)
|
|
90
78
|
```
|
|
91
79
|
|
|
80
|
+
### Projects Using arxiv-to-prompt
|
|
81
|
+
|
|
82
|
+
Here are some projects and use cases that leverage arxiv-to-prompt:
|
|
83
|
+
|
|
84
|
+
- [arxiv-latex-mcp](https://github.com/takashiishida/arxiv-latex-mcp): MCP server that uses arxiv-to-prompt to fetch and process arXiv LaTeX sources for precise interpretation of mathematical expressions in scientific papers.
|
|
85
|
+
- [arxiv-tex-ui](https://github.com/takashiishida/arxiv-tex-ui): chat with an llm about an arxiv paper by using the latex source.
|
|
86
|
+
|
|
87
|
+
If you're using arxiv-to-prompt in your project, please submit a pull request to add it to this list!
|
|
88
|
+
|
|
92
89
|
### References
|
|
93
90
|
|
|
94
91
|
- Inspired by [files-to-prompt](https://github.com/simonw/files-to-prompt).
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
arxiv_to_prompt/__init__.py,sha256=oL2bEzZhiFoMqCF-84Xmljqw55lgRkwInBFpExRPCTY,609
|
|
2
|
+
arxiv_to_prompt/cli.py,sha256=2ZVmxNcygFpOFROfCo-FtXzcRpLVVRUOkIhASL0iD7o,1179
|
|
3
|
+
arxiv_to_prompt/core.py,sha256=pVsUzpplBBTLBxxjYQ6AbR667XlZ9TMz3RFNS8bX7X8,10343
|
|
4
|
+
arxiv_to_prompt-0.2.0.dist-info/licenses/LICENSE,sha256=np8L3--VyxwVJa_8D_mfK4RYrtnRMM_eeYN3rM4PMHo,1071
|
|
5
|
+
arxiv_to_prompt-0.2.0.dist-info/METADATA,sha256=iD0c2HDslUJzc-1xWN1-0X95TUZtxBpLPc36NIicF_A,3998
|
|
6
|
+
arxiv_to_prompt-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
7
|
+
arxiv_to_prompt-0.2.0.dist-info/entry_points.txt,sha256=iYEEn8xZ_5OkhNIs5HCyHSQBpDRJkbD5h0tlAb16lL0,61
|
|
8
|
+
arxiv_to_prompt-0.2.0.dist-info/top_level.txt,sha256=JClbu_lGGWu3RaTHZlNqTKB1-DUSbYXQNIYmJ9_F7fY,16
|
|
9
|
+
arxiv_to_prompt-0.2.0.dist-info/RECORD,,
|
|
@@ -1,9 +0,0 @@
|
|
|
1
|
-
arxiv_to_prompt/__init__.py,sha256=oL2bEzZhiFoMqCF-84Xmljqw55lgRkwInBFpExRPCTY,609
|
|
2
|
-
arxiv_to_prompt/cli.py,sha256=WafgKxxpgJrLyeuQ-tnUASoknoNXiaQRWLP-Emsr-ug,977
|
|
3
|
-
arxiv_to_prompt/core.py,sha256=cQcMNQJSrRVQAQsy2ULeLVlQlKIDDdgVLHFKJNMR0Sg,8296
|
|
4
|
-
arxiv_to_prompt-0.1.0.dist-info/LICENSE,sha256=np8L3--VyxwVJa_8D_mfK4RYrtnRMM_eeYN3rM4PMHo,1071
|
|
5
|
-
arxiv_to_prompt-0.1.0.dist-info/METADATA,sha256=H8T6HFkP199SK19Jy66MgrVE2S8kTBr-2yYzC9qpQBs,4338
|
|
6
|
-
arxiv_to_prompt-0.1.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
7
|
-
arxiv_to_prompt-0.1.0.dist-info/entry_points.txt,sha256=iYEEn8xZ_5OkhNIs5HCyHSQBpDRJkbD5h0tlAb16lL0,61
|
|
8
|
-
arxiv_to_prompt-0.1.0.dist-info/top_level.txt,sha256=JClbu_lGGWu3RaTHZlNqTKB1-DUSbYXQNIYmJ9_F7fY,16
|
|
9
|
-
arxiv_to_prompt-0.1.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|