arxiv-to-prompt 0.4.0__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {arxiv_to_prompt-0.4.0/src/arxiv_to_prompt.egg-info → arxiv_to_prompt-0.5.0}/PKG-INFO +7 -1
- {arxiv_to_prompt-0.4.0 → arxiv_to_prompt-0.5.0}/README.md +6 -0
- {arxiv_to_prompt-0.4.0 → arxiv_to_prompt-0.5.0}/pyproject.toml +1 -1
- {arxiv_to_prompt-0.4.0 → arxiv_to_prompt-0.5.0}/src/arxiv_to_prompt/__init__.py +3 -1
- {arxiv_to_prompt-0.4.0 → arxiv_to_prompt-0.5.0}/src/arxiv_to_prompt/cli.py +31 -3
- {arxiv_to_prompt-0.4.0 → arxiv_to_prompt-0.5.0}/src/arxiv_to_prompt/core.py +30 -0
- {arxiv_to_prompt-0.4.0 → arxiv_to_prompt-0.5.0/src/arxiv_to_prompt.egg-info}/PKG-INFO +7 -1
- {arxiv_to_prompt-0.4.0 → arxiv_to_prompt-0.5.0}/tests/test_core.py +66 -1
- {arxiv_to_prompt-0.4.0 → arxiv_to_prompt-0.5.0}/LICENSE +0 -0
- {arxiv_to_prompt-0.4.0 → arxiv_to_prompt-0.5.0}/setup.cfg +0 -0
- {arxiv_to_prompt-0.4.0 → arxiv_to_prompt-0.5.0}/src/arxiv_to_prompt.egg-info/SOURCES.txt +0 -0
- {arxiv_to_prompt-0.4.0 → arxiv_to_prompt-0.5.0}/src/arxiv_to_prompt.egg-info/dependency_links.txt +0 -0
- {arxiv_to_prompt-0.4.0 → arxiv_to_prompt-0.5.0}/src/arxiv_to_prompt.egg-info/entry_points.txt +0 -0
- {arxiv_to_prompt-0.4.0 → arxiv_to_prompt-0.5.0}/src/arxiv_to_prompt.egg-info/requires.txt +0 -0
- {arxiv_to_prompt-0.4.0 → arxiv_to_prompt-0.5.0}/src/arxiv_to_prompt.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: arxiv-to-prompt
|
|
3
|
-
Version: 0.4.0
|
|
3
|
+
Version: 0.5.0
|
|
4
4
|
Summary: transform arXiv papers into a single latex prompt for LLMs
|
|
5
5
|
Author: Takashi Ishida
|
|
6
6
|
License: MIT
|
|
@@ -54,6 +54,12 @@ arxiv-to-prompt 2303.08774 --no-comments --no-appendix
|
|
|
54
54
|
# Process a local folder containing TeX files (instead of downloading from arXiv)
|
|
55
55
|
arxiv-to-prompt --local-folder /path/to/tex/files
|
|
56
56
|
|
|
57
|
+
# List all section names in the paper
|
|
58
|
+
arxiv-to-prompt 2303.08774 --list-sections
|
|
59
|
+
|
|
60
|
+
# Extract only specific sections
|
|
61
|
+
arxiv-to-prompt 2303.08774 --section "Introduction" --section "Methods"
|
|
62
|
+
|
|
57
63
|
# Copy to clipboard
|
|
58
64
|
arxiv-to-prompt 2303.08774 | pbcopy
|
|
59
65
|
|
|
@@ -35,6 +35,12 @@ arxiv-to-prompt 2303.08774 --no-comments --no-appendix
|
|
|
35
35
|
# Process a local folder containing TeX files (instead of downloading from arXiv)
|
|
36
36
|
arxiv-to-prompt --local-folder /path/to/tex/files
|
|
37
37
|
|
|
38
|
+
# List all section names in the paper
|
|
39
|
+
arxiv-to-prompt 2303.08774 --list-sections
|
|
40
|
+
|
|
41
|
+
# Extract only specific sections
|
|
42
|
+
arxiv-to-prompt 2303.08774 --section "Introduction" --section "Methods"
|
|
43
|
+
|
|
38
44
|
# Copy to clipboard
|
|
39
45
|
arxiv-to-prompt 2303.08774 | pbcopy
|
|
40
46
|
|
|
@@ -15,7 +15,7 @@ Example:
|
|
|
15
15
|
>>> latex_source = process_latex_source(local_folder="/path/to/tex/files")
|
|
16
16
|
"""
|
|
17
17
|
|
|
18
|
-
from .core import process_latex_source, download_arxiv_source, get_default_cache_dir
|
|
18
|
+
from .core import process_latex_source, download_arxiv_source, get_default_cache_dir, list_sections, extract_section
|
|
19
19
|
|
|
20
20
|
# Import version from package metadata
|
|
21
21
|
try:
|
|
@@ -32,5 +32,7 @@ __all__ = [
|
|
|
32
32
|
"process_latex_source",
|
|
33
33
|
"download_arxiv_source",
|
|
34
34
|
"get_default_cache_dir",
|
|
35
|
+
"list_sections",
|
|
36
|
+
"extract_section",
|
|
35
37
|
"__version__",
|
|
36
38
|
]
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import argparse
|
|
2
2
|
import re
|
|
3
|
-
from .core import process_latex_source, get_default_cache_dir
|
|
3
|
+
from .core import process_latex_source, get_default_cache_dir, list_sections, extract_section
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
def extract_arxiv_id(input_str: str) -> str:
|
|
@@ -45,7 +45,18 @@ def main():
|
|
|
45
45
|
help="Path to a local folder containing TeX files (alternative to arxiv_id)",
|
|
46
46
|
default=None
|
|
47
47
|
)
|
|
48
|
-
|
|
48
|
+
parser.add_argument(
|
|
49
|
+
"--list-sections",
|
|
50
|
+
action="store_true",
|
|
51
|
+
help="List all section names in the document"
|
|
52
|
+
)
|
|
53
|
+
parser.add_argument(
|
|
54
|
+
"--section",
|
|
55
|
+
type=str,
|
|
56
|
+
action="append",
|
|
57
|
+
help="Extract only the specified section(s). Can be used multiple times."
|
|
58
|
+
)
|
|
59
|
+
|
|
49
60
|
args = parser.parse_args()
|
|
50
61
|
|
|
51
62
|
# Validate that either arxiv_id or local_folder is provided
|
|
@@ -64,7 +75,24 @@ def main():
|
|
|
64
75
|
remove_appendix_section=args.no_appendix,
|
|
65
76
|
local_folder=args.local_folder
|
|
66
77
|
)
|
|
67
|
-
if content:
|
|
78
|
+
if not content:
|
|
79
|
+
return
|
|
80
|
+
|
|
81
|
+
if args.list_sections:
|
|
82
|
+
sections = list_sections(content)
|
|
83
|
+
for section in sections:
|
|
84
|
+
print(section)
|
|
85
|
+
elif args.section:
|
|
86
|
+
extracted = []
|
|
87
|
+
for section_name in args.section:
|
|
88
|
+
section_content = extract_section(content, section_name)
|
|
89
|
+
if section_content:
|
|
90
|
+
extracted.append(section_content)
|
|
91
|
+
else:
|
|
92
|
+
print(f"Warning: Section '{section_name}' not found", file=__import__('sys').stderr)
|
|
93
|
+
if extracted:
|
|
94
|
+
print("\n\n".join(extracted))
|
|
95
|
+
else:
|
|
68
96
|
print(content)
|
|
69
97
|
|
|
70
98
|
if __name__ == "__main__":
|
|
@@ -131,6 +131,8 @@ def find_main_tex(directory: str) -> Optional[str]:
|
|
|
131
131
|
|
|
132
132
|
def remove_comments_from_lines(text: str) -> str:
|
|
133
133
|
"""Remove LaTeX comments while preserving newlines."""
|
|
134
|
+
# Remove \iffalse...\fi blocks (commonly used to comment out large sections)
|
|
135
|
+
text = re.sub(r'\\iffalse\b.*?\\fi\b', '', text, flags=re.DOTALL)
|
|
134
136
|
lines = text.split('\n')
|
|
135
137
|
result = []
|
|
136
138
|
for line in lines:
|
|
@@ -162,6 +164,34 @@ def remove_appendix(text: str) -> str:
|
|
|
162
164
|
return text[:appendix_match.start()].rstrip()
|
|
163
165
|
return text
|
|
164
166
|
|
|
167
|
+
|
|
168
|
+
def list_sections(text: str) -> list:
|
|
169
|
+
"""Extract all section names from LaTeX content."""
|
|
170
|
+
pattern = r'\\section\*?\{([^}]+)\}'
|
|
171
|
+
return re.findall(pattern, text)
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def extract_section(text: str, section_name: str) -> Optional[str]:
|
|
175
|
+
"""Extract content of a specific section (including its subsections)."""
|
|
176
|
+
# Find the start of the requested section
|
|
177
|
+
pattern = rf'\\section\*?\{{{re.escape(section_name)}\}}'
|
|
178
|
+
start_match = re.search(pattern, text)
|
|
179
|
+
if not start_match:
|
|
180
|
+
return None
|
|
181
|
+
|
|
182
|
+
start_pos = start_match.start()
|
|
183
|
+
|
|
184
|
+
# Find the next \section (not subsection) or end of document
|
|
185
|
+
remaining = text[start_match.end():]
|
|
186
|
+
end_match = re.search(r'\\section\*?\{', remaining)
|
|
187
|
+
|
|
188
|
+
if end_match:
|
|
189
|
+
end_pos = start_match.end() + end_match.start()
|
|
190
|
+
return text[start_pos:end_pos].rstrip()
|
|
191
|
+
else:
|
|
192
|
+
return text[start_pos:].rstrip()
|
|
193
|
+
|
|
194
|
+
|
|
165
195
|
def flatten_tex(directory: str, main_file: str) -> str:
|
|
166
196
|
"""Combine all tex files into one, resolving inputs."""
|
|
167
197
|
def process_file(file_path: str, processed_files: set) -> str:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: arxiv-to-prompt
|
|
3
|
-
Version: 0.4.0
|
|
3
|
+
Version: 0.5.0
|
|
4
4
|
Summary: transform arXiv papers into a single latex prompt for LLMs
|
|
5
5
|
Author: Takashi Ishida
|
|
6
6
|
License: MIT
|
|
@@ -54,6 +54,12 @@ arxiv-to-prompt 2303.08774 --no-comments --no-appendix
|
|
|
54
54
|
# Process a local folder containing TeX files (instead of downloading from arXiv)
|
|
55
55
|
arxiv-to-prompt --local-folder /path/to/tex/files
|
|
56
56
|
|
|
57
|
+
# List all section names in the paper
|
|
58
|
+
arxiv-to-prompt 2303.08774 --list-sections
|
|
59
|
+
|
|
60
|
+
# Extract only specific sections
|
|
61
|
+
arxiv-to-prompt 2303.08774 --section "Introduction" --section "Methods"
|
|
62
|
+
|
|
57
63
|
# Copy to clipboard
|
|
58
64
|
arxiv-to-prompt 2303.08774 | pbcopy
|
|
59
65
|
|
|
@@ -10,6 +10,8 @@ from arxiv_to_prompt.core import (
|
|
|
10
10
|
check_source_available,
|
|
11
11
|
flatten_tex,
|
|
12
12
|
remove_appendix,
|
|
13
|
+
list_sections,
|
|
14
|
+
extract_section,
|
|
13
15
|
)
|
|
14
16
|
from arxiv_to_prompt.cli import extract_arxiv_id
|
|
15
17
|
|
|
@@ -109,11 +111,29 @@ def test_remove_comments_from_lines():
|
|
|
109
111
|
("Multiple % comments % here", "Multiple"),
|
|
110
112
|
("Line with both \\% and % real comment", "Line with both \\% and"),
|
|
111
113
|
]
|
|
112
|
-
|
|
114
|
+
|
|
113
115
|
for input_text, expected in test_cases:
|
|
114
116
|
assert remove_comments_from_lines(input_text).rstrip() == expected
|
|
115
117
|
|
|
116
118
|
|
|
119
|
+
def test_remove_iffalse_blocks():
|
|
120
|
+
"""Test removal of \\iffalse...\\fi blocks."""
|
|
121
|
+
# Single line
|
|
122
|
+
assert remove_comments_from_lines("before \\iffalse hidden \\fi after") == "before after"
|
|
123
|
+
|
|
124
|
+
# Multi-line block
|
|
125
|
+
input_text = "before\n\\iffalse\nhidden\ncontent\n\\fi\nafter"
|
|
126
|
+
result = remove_comments_from_lines(input_text)
|
|
127
|
+
assert "hidden" not in result
|
|
128
|
+
assert "before" in result
|
|
129
|
+
assert "after" in result
|
|
130
|
+
|
|
131
|
+
# Multiple blocks
|
|
132
|
+
input_text = "a \\iffalse x \\fi b \\iffalse y \\fi c"
|
|
133
|
+
result = remove_comments_from_lines(input_text)
|
|
134
|
+
assert result == "a b c"
|
|
135
|
+
|
|
136
|
+
|
|
117
137
|
def test_find_main_tex(temp_cache_dir):
|
|
118
138
|
"""Test finding the main tex file."""
|
|
119
139
|
# Create test files
|
|
@@ -296,3 +316,48 @@ def test_extract_arxiv_id():
|
|
|
296
316
|
# Non-arxiv input returned as-is
|
|
297
317
|
assert extract_arxiv_id("invalid") == "invalid"
|
|
298
318
|
assert extract_arxiv_id("https://example.com/2505.18102") == "https://example.com/2505.18102"
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
def test_list_sections():
|
|
322
|
+
"""Test listing section names."""
|
|
323
|
+
text = r"""
|
|
324
|
+
\section{Introduction}
|
|
325
|
+
Some intro text.
|
|
326
|
+
\section{Methods}
|
|
327
|
+
Some methods text.
|
|
328
|
+
\subsection{Data}
|
|
329
|
+
Data description.
|
|
330
|
+
\section*{Acknowledgments}
|
|
331
|
+
Thanks.
|
|
332
|
+
"""
|
|
333
|
+
sections = list_sections(text)
|
|
334
|
+
assert sections == ["Introduction", "Methods", "Acknowledgments"]
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
def test_extract_section():
|
|
338
|
+
"""Test extracting a specific section."""
|
|
339
|
+
text = r"""
|
|
340
|
+
\section{Introduction}
|
|
341
|
+
Intro content here.
|
|
342
|
+
\section{Methods}
|
|
343
|
+
Methods content here.
|
|
344
|
+
\subsection{Data Collection}
|
|
345
|
+
Data info.
|
|
346
|
+
\section{Results}
|
|
347
|
+
Results here.
|
|
348
|
+
"""
|
|
349
|
+
# Extract Methods section (should include subsection)
|
|
350
|
+
methods = extract_section(text, "Methods")
|
|
351
|
+
assert methods is not None
|
|
352
|
+
assert "Methods content here." in methods
|
|
353
|
+
assert "Data Collection" in methods
|
|
354
|
+
assert "Data info." in methods
|
|
355
|
+
assert "Results here." not in methods
|
|
356
|
+
|
|
357
|
+
# Extract non-existent section
|
|
358
|
+
assert extract_section(text, "Discussion") is None
|
|
359
|
+
|
|
360
|
+
# Extract last section
|
|
361
|
+
results = extract_section(text, "Results")
|
|
362
|
+
assert results is not None
|
|
363
|
+
assert "Results here." in results
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{arxiv_to_prompt-0.4.0 → arxiv_to_prompt-0.5.0}/src/arxiv_to_prompt.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{arxiv_to_prompt-0.4.0 → arxiv_to_prompt-0.5.0}/src/arxiv_to_prompt.egg-info/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|