arxiv-to-prompt 0.3.0__tar.gz → 0.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {arxiv_to_prompt-0.3.0/src/arxiv_to_prompt.egg-info → arxiv_to_prompt-0.4.1}/PKG-INFO +2 -2
- {arxiv_to_prompt-0.3.0 → arxiv_to_prompt-0.4.1}/README.md +1 -1
- {arxiv_to_prompt-0.3.0 → arxiv_to_prompt-0.4.1}/pyproject.toml +1 -1
- {arxiv_to_prompt-0.3.0 → arxiv_to_prompt-0.4.1}/src/arxiv_to_prompt/cli.py +15 -3
- {arxiv_to_prompt-0.3.0 → arxiv_to_prompt-0.4.1}/src/arxiv_to_prompt/core.py +2 -0
- {arxiv_to_prompt-0.3.0 → arxiv_to_prompt-0.4.1/src/arxiv_to_prompt.egg-info}/PKG-INFO +2 -2
- {arxiv_to_prompt-0.3.0 → arxiv_to_prompt-0.4.1}/tests/test_core.py +44 -1
- {arxiv_to_prompt-0.3.0 → arxiv_to_prompt-0.4.1}/LICENSE +0 -0
- {arxiv_to_prompt-0.3.0 → arxiv_to_prompt-0.4.1}/setup.cfg +0 -0
- {arxiv_to_prompt-0.3.0 → arxiv_to_prompt-0.4.1}/src/arxiv_to_prompt/__init__.py +0 -0
- {arxiv_to_prompt-0.3.0 → arxiv_to_prompt-0.4.1}/src/arxiv_to_prompt.egg-info/SOURCES.txt +0 -0
- {arxiv_to_prompt-0.3.0 → arxiv_to_prompt-0.4.1}/src/arxiv_to_prompt.egg-info/dependency_links.txt +0 -0
- {arxiv_to_prompt-0.3.0 → arxiv_to_prompt-0.4.1}/src/arxiv_to_prompt.egg-info/entry_points.txt +0 -0
- {arxiv_to_prompt-0.3.0 → arxiv_to_prompt-0.4.1}/src/arxiv_to_prompt.egg-info/requires.txt +0 -0
- {arxiv_to_prompt-0.3.0 → arxiv_to_prompt-0.4.1}/src/arxiv_to_prompt.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: arxiv-to-prompt
|
|
3
|
-
Version: 0.3.0
|
|
3
|
+
Version: 0.4.1
|
|
4
4
|
Summary: transform arXiv papers into a single latex prompt for LLMs
|
|
5
5
|
Author: Takashi Ishida
|
|
6
6
|
License: MIT
|
|
@@ -61,7 +61,7 @@ arxiv-to-prompt 2303.08774 | pbcopy
|
|
|
61
61
|
arxiv-to-prompt 1706.03762 | llm -s "explain this paper"
|
|
62
62
|
```
|
|
63
63
|
|
|
64
|
-
|
|
64
|
+
You can use either the arXiv ID (e.g., `2303.08774`) or the full URL (e.g., `https://arxiv.org/abs/2303.08774`). It will automatically download the latest version of the paper, so you don't need to specify the version.
|
|
65
65
|
|
|
66
66
|
### Python API
|
|
67
67
|
|
|
@@ -42,7 +42,7 @@ arxiv-to-prompt 2303.08774 | pbcopy
|
|
|
42
42
|
arxiv-to-prompt 1706.03762 | llm -s "explain this paper"
|
|
43
43
|
```
|
|
44
44
|
|
|
45
|
-
|
|
45
|
+
You can use either the arXiv ID (e.g., `2303.08774`) or the full URL (e.g., `https://arxiv.org/abs/2303.08774`). It will automatically download the latest version of the paper, so you don't need to specify the version.
|
|
46
46
|
|
|
47
47
|
### Python API
|
|
48
48
|
|
|
@@ -1,6 +1,16 @@
|
|
|
1
1
|
import argparse
|
|
2
|
+
import re
|
|
2
3
|
from .core import process_latex_source, get_default_cache_dir
|
|
3
4
|
|
|
5
|
+
|
|
6
|
+
def extract_arxiv_id(input_str: str) -> str:
|
|
7
|
+
"""Extract arxiv ID from URL or return input as-is if already an ID."""
|
|
8
|
+
if "arxiv.org" in input_str:
|
|
9
|
+
match = re.search(r'arxiv\.org/(?:abs|pdf)/(\d{4}\.\d{4,5})(?:v\d+)?(?:\.pdf)?', input_str)
|
|
10
|
+
if match:
|
|
11
|
+
return match.group(1)
|
|
12
|
+
return input_str
|
|
13
|
+
|
|
4
14
|
def main():
|
|
5
15
|
default_cache = str(get_default_cache_dir())
|
|
6
16
|
|
|
@@ -11,7 +21,7 @@ def main():
|
|
|
11
21
|
"arxiv_id",
|
|
12
22
|
nargs="?",
|
|
13
23
|
default=None,
|
|
14
|
-
help="The arXiv ID
|
|
24
|
+
help="The arXiv ID (e.g. 2303.08774) or URL (e.g. https://arxiv.org/abs/2303.08774). Not needed if --local-folder is provided."
|
|
15
25
|
)
|
|
16
26
|
parser.add_argument(
|
|
17
27
|
"--no-comments",
|
|
@@ -44,9 +54,11 @@ def main():
|
|
|
44
54
|
|
|
45
55
|
if args.arxiv_id and args.local_folder:
|
|
46
56
|
parser.error("Cannot specify both arXiv ID and --local-folder")
|
|
47
|
-
|
|
57
|
+
|
|
58
|
+
arxiv_id = extract_arxiv_id(args.arxiv_id) if args.arxiv_id else None
|
|
59
|
+
|
|
48
60
|
content = process_latex_source(
|
|
49
|
-
arxiv_id=args.arxiv_id,
|
|
61
|
+
arxiv_id=arxiv_id,
|
|
50
62
|
keep_comments=not args.no_comments,
|
|
51
63
|
cache_dir=args.cache_dir,
|
|
52
64
|
remove_appendix_section=args.no_appendix,
|
|
@@ -131,6 +131,8 @@ def find_main_tex(directory: str) -> Optional[str]:
|
|
|
131
131
|
|
|
132
132
|
def remove_comments_from_lines(text: str) -> str:
|
|
133
133
|
"""Remove LaTeX comments while preserving newlines."""
|
|
134
|
+
# Remove \iffalse...\fi blocks (commonly used to comment out large sections)
|
|
135
|
+
text = re.sub(r'\\iffalse\b.*?\\fi\b', '', text, flags=re.DOTALL)
|
|
134
136
|
lines = text.split('\n')
|
|
135
137
|
result = []
|
|
136
138
|
for line in lines:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: arxiv-to-prompt
|
|
3
|
-
Version: 0.3.0
|
|
3
|
+
Version: 0.4.1
|
|
4
4
|
Summary: transform arXiv papers into a single latex prompt for LLMs
|
|
5
5
|
Author: Takashi Ishida
|
|
6
6
|
License: MIT
|
|
@@ -61,7 +61,7 @@ arxiv-to-prompt 2303.08774 | pbcopy
|
|
|
61
61
|
arxiv-to-prompt 1706.03762 | llm -s "explain this paper"
|
|
62
62
|
```
|
|
63
63
|
|
|
64
|
-
|
|
64
|
+
You can use either the arXiv ID (e.g., `2303.08774`) or the full URL (e.g., `https://arxiv.org/abs/2303.08774`). It will automatically download the latest version of the paper, so you don't need to specify the version.
|
|
65
65
|
|
|
66
66
|
### Python API
|
|
67
67
|
|
|
@@ -11,6 +11,7 @@ from arxiv_to_prompt.core import (
|
|
|
11
11
|
flatten_tex,
|
|
12
12
|
remove_appendix,
|
|
13
13
|
)
|
|
14
|
+
from arxiv_to_prompt.cli import extract_arxiv_id
|
|
14
15
|
|
|
15
16
|
# Test fixtures
|
|
16
17
|
@pytest.fixture
|
|
@@ -108,11 +109,29 @@ def test_remove_comments_from_lines():
|
|
|
108
109
|
("Multiple % comments % here", "Multiple"),
|
|
109
110
|
("Line with both \\% and % real comment", "Line with both \\% and"),
|
|
110
111
|
]
|
|
111
|
-
|
|
112
|
+
|
|
112
113
|
for input_text, expected in test_cases:
|
|
113
114
|
assert remove_comments_from_lines(input_text).rstrip() == expected
|
|
114
115
|
|
|
115
116
|
|
|
117
|
+
def test_remove_iffalse_blocks():
|
|
118
|
+
"""Test removal of \\iffalse...\\fi blocks."""
|
|
119
|
+
# Single line
|
|
120
|
+
assert remove_comments_from_lines("before \\iffalse hidden \\fi after") == "before after"
|
|
121
|
+
|
|
122
|
+
# Multi-line block
|
|
123
|
+
input_text = "before\n\\iffalse\nhidden\ncontent\n\\fi\nafter"
|
|
124
|
+
result = remove_comments_from_lines(input_text)
|
|
125
|
+
assert "hidden" not in result
|
|
126
|
+
assert "before" in result
|
|
127
|
+
assert "after" in result
|
|
128
|
+
|
|
129
|
+
# Multiple blocks
|
|
130
|
+
input_text = "a \\iffalse x \\fi b \\iffalse y \\fi c"
|
|
131
|
+
result = remove_comments_from_lines(input_text)
|
|
132
|
+
assert result == "a b c"
|
|
133
|
+
|
|
134
|
+
|
|
116
135
|
def test_find_main_tex(temp_cache_dir):
|
|
117
136
|
"""Test finding the main tex file."""
|
|
118
137
|
# Create test files
|
|
@@ -271,3 +290,27 @@ def test_input_file_extensions(temp_cache_dir):
|
|
|
271
290
|
assert "Style content" in result
|
|
272
291
|
assert "Class content" in result
|
|
273
292
|
assert "Already tex content" in result
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
def test_extract_arxiv_id():
|
|
296
|
+
"""Test extracting arxiv ID from URLs and plain IDs."""
|
|
297
|
+
# Plain IDs should be returned as-is
|
|
298
|
+
assert extract_arxiv_id("2505.18102") == "2505.18102"
|
|
299
|
+
assert extract_arxiv_id("2401.12345") == "2401.12345"
|
|
300
|
+
|
|
301
|
+
# Extract from abs URLs
|
|
302
|
+
assert extract_arxiv_id("https://arxiv.org/abs/2505.18102") == "2505.18102"
|
|
303
|
+
assert extract_arxiv_id("http://arxiv.org/abs/2505.18102") == "2505.18102"
|
|
304
|
+
|
|
305
|
+
# Extract from pdf URLs
|
|
306
|
+
assert extract_arxiv_id("https://arxiv.org/pdf/2505.18102") == "2505.18102"
|
|
307
|
+
assert extract_arxiv_id("https://arxiv.org/pdf/2505.18102.pdf") == "2505.18102"
|
|
308
|
+
|
|
309
|
+
# Strip version suffixes
|
|
310
|
+
assert extract_arxiv_id("https://arxiv.org/abs/2505.18102v1") == "2505.18102"
|
|
311
|
+
assert extract_arxiv_id("https://arxiv.org/abs/2505.18102v2") == "2505.18102"
|
|
312
|
+
assert extract_arxiv_id("https://arxiv.org/pdf/2505.18102v3.pdf") == "2505.18102"
|
|
313
|
+
|
|
314
|
+
# Non-arxiv input returned as-is
|
|
315
|
+
assert extract_arxiv_id("invalid") == "invalid"
|
|
316
|
+
assert extract_arxiv_id("https://example.com/2505.18102") == "https://example.com/2505.18102"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{arxiv_to_prompt-0.3.0 → arxiv_to_prompt-0.4.1}/src/arxiv_to_prompt.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{arxiv_to_prompt-0.3.0 → arxiv_to_prompt-0.4.1}/src/arxiv_to_prompt.egg-info/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|