arxiv-to-prompt 0.4.1__tar.gz → 0.5.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {arxiv_to_prompt-0.4.1/src/arxiv_to_prompt.egg-info → arxiv_to_prompt-0.5.1}/PKG-INFO +7 -1
- {arxiv_to_prompt-0.4.1 → arxiv_to_prompt-0.5.1}/README.md +6 -0
- {arxiv_to_prompt-0.4.1 → arxiv_to_prompt-0.5.1}/pyproject.toml +1 -1
- {arxiv_to_prompt-0.4.1 → arxiv_to_prompt-0.5.1}/src/arxiv_to_prompt/__init__.py +3 -1
- {arxiv_to_prompt-0.4.1 → arxiv_to_prompt-0.5.1}/src/arxiv_to_prompt/cli.py +31 -3
- {arxiv_to_prompt-0.4.1 → arxiv_to_prompt-0.5.1}/src/arxiv_to_prompt/core.py +68 -25
- {arxiv_to_prompt-0.4.1 → arxiv_to_prompt-0.5.1/src/arxiv_to_prompt.egg-info}/PKG-INFO +7 -1
- {arxiv_to_prompt-0.4.1 → arxiv_to_prompt-0.5.1}/tests/test_core.py +64 -0
- {arxiv_to_prompt-0.4.1 → arxiv_to_prompt-0.5.1}/LICENSE +0 -0
- {arxiv_to_prompt-0.4.1 → arxiv_to_prompt-0.5.1}/setup.cfg +0 -0
- {arxiv_to_prompt-0.4.1 → arxiv_to_prompt-0.5.1}/src/arxiv_to_prompt.egg-info/SOURCES.txt +0 -0
- {arxiv_to_prompt-0.4.1 → arxiv_to_prompt-0.5.1}/src/arxiv_to_prompt.egg-info/dependency_links.txt +0 -0
- {arxiv_to_prompt-0.4.1 → arxiv_to_prompt-0.5.1}/src/arxiv_to_prompt.egg-info/entry_points.txt +0 -0
- {arxiv_to_prompt-0.4.1 → arxiv_to_prompt-0.5.1}/src/arxiv_to_prompt.egg-info/requires.txt +0 -0
- {arxiv_to_prompt-0.4.1 → arxiv_to_prompt-0.5.1}/src/arxiv_to_prompt.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: arxiv-to-prompt
|
|
3
|
-
Version: 0.4.1
|
|
3
|
+
Version: 0.5.1
|
|
4
4
|
Summary: transform arXiv papers into a single latex prompt for LLMs
|
|
5
5
|
Author: Takashi Ishida
|
|
6
6
|
License: MIT
|
|
@@ -54,6 +54,12 @@ arxiv-to-prompt 2303.08774 --no-comments --no-appendix
|
|
|
54
54
|
# Process a local folder containing TeX files (instead of downloading from arXiv)
|
|
55
55
|
arxiv-to-prompt --local-folder /path/to/tex/files
|
|
56
56
|
|
|
57
|
+
# List all section names in the paper
|
|
58
|
+
arxiv-to-prompt 2303.08774 --list-sections
|
|
59
|
+
|
|
60
|
+
# Extract only specific sections
|
|
61
|
+
arxiv-to-prompt 2303.08774 --section "Introduction" --section "Methods"
|
|
62
|
+
|
|
57
63
|
# Copy to clipboard
|
|
58
64
|
arxiv-to-prompt 2303.08774 | pbcopy
|
|
59
65
|
|
|
@@ -35,6 +35,12 @@ arxiv-to-prompt 2303.08774 --no-comments --no-appendix
|
|
|
35
35
|
# Process a local folder containing TeX files (instead of downloading from arXiv)
|
|
36
36
|
arxiv-to-prompt --local-folder /path/to/tex/files
|
|
37
37
|
|
|
38
|
+
# List all section names in the paper
|
|
39
|
+
arxiv-to-prompt 2303.08774 --list-sections
|
|
40
|
+
|
|
41
|
+
# Extract only specific sections
|
|
42
|
+
arxiv-to-prompt 2303.08774 --section "Introduction" --section "Methods"
|
|
43
|
+
|
|
38
44
|
# Copy to clipboard
|
|
39
45
|
arxiv-to-prompt 2303.08774 | pbcopy
|
|
40
46
|
|
|
@@ -15,7 +15,7 @@ Example:
|
|
|
15
15
|
>>> latex_source = process_latex_source(local_folder="/path/to/tex/files")
|
|
16
16
|
"""
|
|
17
17
|
|
|
18
|
-
from .core import process_latex_source, download_arxiv_source, get_default_cache_dir
|
|
18
|
+
from .core import process_latex_source, download_arxiv_source, get_default_cache_dir, list_sections, extract_section
|
|
19
19
|
|
|
20
20
|
# Import version from package metadata
|
|
21
21
|
try:
|
|
@@ -32,5 +32,7 @@ __all__ = [
|
|
|
32
32
|
"process_latex_source",
|
|
33
33
|
"download_arxiv_source",
|
|
34
34
|
"get_default_cache_dir",
|
|
35
|
+
"list_sections",
|
|
36
|
+
"extract_section",
|
|
35
37
|
"__version__",
|
|
36
38
|
]
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import argparse
|
|
2
2
|
import re
|
|
3
|
-
from .core import process_latex_source, get_default_cache_dir
|
|
3
|
+
from .core import process_latex_source, get_default_cache_dir, list_sections, extract_section
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
def extract_arxiv_id(input_str: str) -> str:
|
|
@@ -45,7 +45,18 @@ def main():
|
|
|
45
45
|
help="Path to a local folder containing TeX files (alternative to arxiv_id)",
|
|
46
46
|
default=None
|
|
47
47
|
)
|
|
48
|
-
|
|
48
|
+
parser.add_argument(
|
|
49
|
+
"--list-sections",
|
|
50
|
+
action="store_true",
|
|
51
|
+
help="List all section names in the document"
|
|
52
|
+
)
|
|
53
|
+
parser.add_argument(
|
|
54
|
+
"--section",
|
|
55
|
+
type=str,
|
|
56
|
+
action="append",
|
|
57
|
+
help="Extract only the specified section(s). Can be used multiple times."
|
|
58
|
+
)
|
|
59
|
+
|
|
49
60
|
args = parser.parse_args()
|
|
50
61
|
|
|
51
62
|
# Validate that either arxiv_id or local_folder is provided
|
|
@@ -64,7 +75,24 @@ def main():
|
|
|
64
75
|
remove_appendix_section=args.no_appendix,
|
|
65
76
|
local_folder=args.local_folder
|
|
66
77
|
)
|
|
67
|
-
if content:
|
|
78
|
+
if not content:
|
|
79
|
+
return
|
|
80
|
+
|
|
81
|
+
if args.list_sections:
|
|
82
|
+
sections = list_sections(content)
|
|
83
|
+
for section in sections:
|
|
84
|
+
print(section)
|
|
85
|
+
elif args.section:
|
|
86
|
+
extracted = []
|
|
87
|
+
for section_name in args.section:
|
|
88
|
+
section_content = extract_section(content, section_name)
|
|
89
|
+
if section_content:
|
|
90
|
+
extracted.append(section_content)
|
|
91
|
+
else:
|
|
92
|
+
print(f"Warning: Section '{section_name}' not found", file=__import__('sys').stderr)
|
|
93
|
+
if extracted:
|
|
94
|
+
print("\n\n".join(extracted))
|
|
95
|
+
else:
|
|
68
96
|
print(content)
|
|
69
97
|
|
|
70
98
|
if __name__ == "__main__":
|
|
@@ -92,40 +92,55 @@ def download_arxiv_source(arxiv_id: str, cache_dir: Optional[str] = None, use_ca
|
|
|
92
92
|
|
|
93
93
|
def find_main_tex(directory: str) -> Optional[str]:
|
|
94
94
|
"""
|
|
95
|
-
Find the main .tex file containing documentclass.
|
|
95
|
+
Find the main .tex file containing documentclass.
|
|
96
|
+
Searches recursively through subdirectories.
|
|
96
97
|
First checks for common naming conventions (main.tex, paper.tex, index.tex).
|
|
97
|
-
If none found, returns the
|
|
98
|
-
since shorter files are typically conference templates or supplementary documents
|
|
98
|
+
If none found, returns the path of the longest .tex file containing documentclass,
|
|
99
|
+
since shorter files are typically conference templates or supplementary documents
|
|
99
100
|
rather than the main manuscript.
|
|
100
101
|
"""
|
|
101
102
|
common_names = ['main.tex', 'paper.tex', 'index.tex']
|
|
102
103
|
main_tex_file = None
|
|
103
104
|
max_line_count = 0
|
|
104
105
|
|
|
105
|
-
#
|
|
106
|
-
for
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
106
|
+
# Walk through directory and subdirectories
|
|
107
|
+
for root, dirs, files in os.walk(directory):
|
|
108
|
+
rel_root = os.path.relpath(root, directory)
|
|
109
|
+
|
|
110
|
+
# First pass: check for common naming conventions
|
|
111
|
+
for file_name in files:
|
|
112
|
+
if file_name in common_names:
|
|
113
|
+
file_path = os.path.join(root, file_name)
|
|
114
|
+
try:
|
|
115
|
+
with open(file_path, 'r', encoding='utf-8') as file:
|
|
116
|
+
lines = file.readlines()
|
|
117
|
+
if any('\\documentclass' in line for line in lines):
|
|
118
|
+
if rel_root == '.':
|
|
119
|
+
return file_name
|
|
120
|
+
return os.path.join(rel_root, file_name)
|
|
121
|
+
except Exception as e:
|
|
122
|
+
logging.warning(f"Could not read file {file_path}: {e}")
|
|
115
123
|
|
|
116
124
|
# Second pass: find the longest .tex file containing documentclass
|
|
117
|
-
for
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
125
|
+
for root, dirs, files in os.walk(directory):
|
|
126
|
+
rel_root = os.path.relpath(root, directory)
|
|
127
|
+
|
|
128
|
+
for file_name in files:
|
|
129
|
+
if file_name.endswith('.tex'):
|
|
130
|
+
file_path = os.path.join(root, file_name)
|
|
131
|
+
try:
|
|
132
|
+
with open(file_path, 'r', encoding='utf-8') as file:
|
|
133
|
+
lines = file.readlines()
|
|
134
|
+
if any('\\documentclass' in line for line in lines):
|
|
135
|
+
line_count = len(lines)
|
|
136
|
+
if line_count > max_line_count:
|
|
137
|
+
if rel_root == '.':
|
|
138
|
+
main_tex_file = file_name
|
|
139
|
+
else:
|
|
140
|
+
main_tex_file = os.path.join(rel_root, file_name)
|
|
141
|
+
max_line_count = line_count
|
|
142
|
+
except Exception as e:
|
|
143
|
+
logging.warning(f"Could not read file {file_path}: {e}")
|
|
129
144
|
|
|
130
145
|
return main_tex_file
|
|
131
146
|
|
|
@@ -164,6 +179,34 @@ def remove_appendix(text: str) -> str:
|
|
|
164
179
|
return text[:appendix_match.start()].rstrip()
|
|
165
180
|
return text
|
|
166
181
|
|
|
182
|
+
|
|
183
|
+
def list_sections(text: str) -> list:
|
|
184
|
+
"""Extract all section names from LaTeX content."""
|
|
185
|
+
pattern = r'\\section\*?\{([^}]+)\}'
|
|
186
|
+
return re.findall(pattern, text)
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def extract_section(text: str, section_name: str) -> Optional[str]:
|
|
190
|
+
"""Extract content of a specific section (including its subsections)."""
|
|
191
|
+
# Find the start of the requested section
|
|
192
|
+
pattern = rf'\\section\*?\{{{re.escape(section_name)}\}}'
|
|
193
|
+
start_match = re.search(pattern, text)
|
|
194
|
+
if not start_match:
|
|
195
|
+
return None
|
|
196
|
+
|
|
197
|
+
start_pos = start_match.start()
|
|
198
|
+
|
|
199
|
+
# Find the next \section (not subsection) or end of document
|
|
200
|
+
remaining = text[start_match.end():]
|
|
201
|
+
end_match = re.search(r'\\section\*?\{', remaining)
|
|
202
|
+
|
|
203
|
+
if end_match:
|
|
204
|
+
end_pos = start_match.end() + end_match.start()
|
|
205
|
+
return text[start_pos:end_pos].rstrip()
|
|
206
|
+
else:
|
|
207
|
+
return text[start_pos:].rstrip()
|
|
208
|
+
|
|
209
|
+
|
|
167
210
|
def flatten_tex(directory: str, main_file: str) -> str:
|
|
168
211
|
"""Combine all tex files into one, resolving inputs."""
|
|
169
212
|
def process_file(file_path: str, processed_files: set) -> str:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: arxiv-to-prompt
|
|
3
|
-
Version: 0.4.1
|
|
3
|
+
Version: 0.5.1
|
|
4
4
|
Summary: transform arXiv papers into a single latex prompt for LLMs
|
|
5
5
|
Author: Takashi Ishida
|
|
6
6
|
License: MIT
|
|
@@ -54,6 +54,12 @@ arxiv-to-prompt 2303.08774 --no-comments --no-appendix
|
|
|
54
54
|
# Process a local folder containing TeX files (instead of downloading from arXiv)
|
|
55
55
|
arxiv-to-prompt --local-folder /path/to/tex/files
|
|
56
56
|
|
|
57
|
+
# List all section names in the paper
|
|
58
|
+
arxiv-to-prompt 2303.08774 --list-sections
|
|
59
|
+
|
|
60
|
+
# Extract only specific sections
|
|
61
|
+
arxiv-to-prompt 2303.08774 --section "Introduction" --section "Methods"
|
|
62
|
+
|
|
57
63
|
# Copy to clipboard
|
|
58
64
|
arxiv-to-prompt 2303.08774 | pbcopy
|
|
59
65
|
|
|
@@ -10,6 +10,8 @@ from arxiv_to_prompt.core import (
|
|
|
10
10
|
check_source_available,
|
|
11
11
|
flatten_tex,
|
|
12
12
|
remove_appendix,
|
|
13
|
+
list_sections,
|
|
14
|
+
extract_section,
|
|
13
15
|
)
|
|
14
16
|
from arxiv_to_prompt.cli import extract_arxiv_id
|
|
15
17
|
|
|
@@ -151,6 +153,23 @@ def test_find_main_tex(temp_cache_dir):
|
|
|
151
153
|
assert found_main == "main.tex"
|
|
152
154
|
|
|
153
155
|
|
|
156
|
+
def test_find_main_tex_in_subdirectory(temp_cache_dir):
|
|
157
|
+
"""Test finding main tex file in a subdirectory."""
|
|
158
|
+
# Create test directory with subdirectory
|
|
159
|
+
tex_dir = temp_cache_dir / "test_tex_subdir"
|
|
160
|
+
tex_dir.mkdir(parents=True)
|
|
161
|
+
subdir = tex_dir / "paper"
|
|
162
|
+
subdir.mkdir()
|
|
163
|
+
|
|
164
|
+
# Create main.tex in subdirectory
|
|
165
|
+
main_file = subdir / "main.tex"
|
|
166
|
+
main_file.write_text("\\documentclass{article}\n\\begin{document}\nHello\n\\end{document}")
|
|
167
|
+
|
|
168
|
+
# Test finding main file in subdirectory
|
|
169
|
+
found_main = find_main_tex(str(tex_dir))
|
|
170
|
+
assert found_main == os.path.join("paper", "main.tex")
|
|
171
|
+
|
|
172
|
+
|
|
154
173
|
def test_commented_input_commands(temp_cache_dir):
|
|
155
174
|
"""Test that commented-out \\include and \\input commands are ignored."""
|
|
156
175
|
# Create test directory and files
|
|
@@ -314,3 +333,48 @@ def test_extract_arxiv_id():
|
|
|
314
333
|
# Non-arxiv input returned as-is
|
|
315
334
|
assert extract_arxiv_id("invalid") == "invalid"
|
|
316
335
|
assert extract_arxiv_id("https://example.com/2505.18102") == "https://example.com/2505.18102"
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def test_list_sections():
|
|
339
|
+
"""Test listing section names."""
|
|
340
|
+
text = r"""
|
|
341
|
+
\section{Introduction}
|
|
342
|
+
Some intro text.
|
|
343
|
+
\section{Methods}
|
|
344
|
+
Some methods text.
|
|
345
|
+
\subsection{Data}
|
|
346
|
+
Data description.
|
|
347
|
+
\section*{Acknowledgments}
|
|
348
|
+
Thanks.
|
|
349
|
+
"""
|
|
350
|
+
sections = list_sections(text)
|
|
351
|
+
assert sections == ["Introduction", "Methods", "Acknowledgments"]
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
def test_extract_section():
|
|
355
|
+
"""Test extracting a specific section."""
|
|
356
|
+
text = r"""
|
|
357
|
+
\section{Introduction}
|
|
358
|
+
Intro content here.
|
|
359
|
+
\section{Methods}
|
|
360
|
+
Methods content here.
|
|
361
|
+
\subsection{Data Collection}
|
|
362
|
+
Data info.
|
|
363
|
+
\section{Results}
|
|
364
|
+
Results here.
|
|
365
|
+
"""
|
|
366
|
+
# Extract Methods section (should include subsection)
|
|
367
|
+
methods = extract_section(text, "Methods")
|
|
368
|
+
assert methods is not None
|
|
369
|
+
assert "Methods content here." in methods
|
|
370
|
+
assert "Data Collection" in methods
|
|
371
|
+
assert "Data info." in methods
|
|
372
|
+
assert "Results here." not in methods
|
|
373
|
+
|
|
374
|
+
# Extract non-existent section
|
|
375
|
+
assert extract_section(text, "Discussion") is None
|
|
376
|
+
|
|
377
|
+
# Extract last section
|
|
378
|
+
results = extract_section(text, "Results")
|
|
379
|
+
assert results is not None
|
|
380
|
+
assert "Results here." in results
|
|
{arxiv_to_prompt-0.4.1 → arxiv_to_prompt-0.5.1}/LICENSE
RENAMED
|
File without changes
|
|
{arxiv_to_prompt-0.4.1 → arxiv_to_prompt-0.5.1}/setup.cfg
RENAMED
|
File without changes
|
|
{arxiv_to_prompt-0.4.1 → arxiv_to_prompt-0.5.1}/src/arxiv_to_prompt.egg-info/SOURCES.txt
RENAMED
|
File without changes
|
{arxiv_to_prompt-0.4.1 → arxiv_to_prompt-0.5.1}/src/arxiv_to_prompt.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{arxiv_to_prompt-0.4.1 → arxiv_to_prompt-0.5.1}/src/arxiv_to_prompt.egg-info/entry_points.txt
RENAMED
|
File without changes
|
|
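{arxiv_to_prompt-0.4.1 → arxiv_to_prompt-0.5.1}/src/arxiv_to_prompt.egg-info/requires.txt
RENAMED
|
File without changes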
|
|
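{arxiv_to_prompt-0.4.1 → arxiv_to_prompt-0.5.1}/src/arxiv_to_prompt.egg-info/top_level.txt
RENAMED
|
File without changes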
|