arxiv-to-prompt 0.4.1__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arxiv_to_prompt/__init__.py +3 -1
- arxiv_to_prompt/cli.py +31 -3
- arxiv_to_prompt/core.py +28 -0
- {arxiv_to_prompt-0.4.1.dist-info → arxiv_to_prompt-0.5.0.dist-info}/METADATA +7 -1
- arxiv_to_prompt-0.5.0.dist-info/RECORD +9 -0
- arxiv_to_prompt-0.4.1.dist-info/RECORD +0 -9
- {arxiv_to_prompt-0.4.1.dist-info → arxiv_to_prompt-0.5.0.dist-info}/WHEEL +0 -0
- {arxiv_to_prompt-0.4.1.dist-info → arxiv_to_prompt-0.5.0.dist-info}/entry_points.txt +0 -0
- {arxiv_to_prompt-0.4.1.dist-info → arxiv_to_prompt-0.5.0.dist-info}/licenses/LICENSE +0 -0
- {arxiv_to_prompt-0.4.1.dist-info → arxiv_to_prompt-0.5.0.dist-info}/top_level.txt +0 -0
arxiv_to_prompt/__init__.py
CHANGED
|
@@ -15,7 +15,7 @@ Example:
|
|
|
15
15
|
>>> latex_source = process_latex_source(local_folder="/path/to/tex/files")
|
|
16
16
|
"""
|
|
17
17
|
|
|
18
|
-
from .core import process_latex_source, download_arxiv_source, get_default_cache_dir
|
|
18
|
+
from .core import process_latex_source, download_arxiv_source, get_default_cache_dir, list_sections, extract_section
|
|
19
19
|
|
|
20
20
|
# Import version from package metadata
|
|
21
21
|
try:
|
|
@@ -32,5 +32,7 @@ __all__ = [
|
|
|
32
32
|
"process_latex_source",
|
|
33
33
|
"download_arxiv_source",
|
|
34
34
|
"get_default_cache_dir",
|
|
35
|
+
"list_sections",
|
|
36
|
+
"extract_section",
|
|
35
37
|
"__version__",
|
|
36
38
|
]
|
arxiv_to_prompt/cli.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import argparse
|
|
2
2
|
import re
|
|
3
|
-
from .core import process_latex_source, get_default_cache_dir
|
|
3
|
+
from .core import process_latex_source, get_default_cache_dir, list_sections, extract_section
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
def extract_arxiv_id(input_str: str) -> str:
|
|
@@ -45,7 +45,18 @@ def main():
|
|
|
45
45
|
help="Path to a local folder containing TeX files (alternative to arxiv_id)",
|
|
46
46
|
default=None
|
|
47
47
|
)
|
|
48
|
-
|
|
48
|
+
parser.add_argument(
|
|
49
|
+
"--list-sections",
|
|
50
|
+
action="store_true",
|
|
51
|
+
help="List all section names in the document"
|
|
52
|
+
)
|
|
53
|
+
parser.add_argument(
|
|
54
|
+
"--section",
|
|
55
|
+
type=str,
|
|
56
|
+
action="append",
|
|
57
|
+
help="Extract only the specified section(s). Can be used multiple times."
|
|
58
|
+
)
|
|
59
|
+
|
|
49
60
|
args = parser.parse_args()
|
|
50
61
|
|
|
51
62
|
# Validate that either arxiv_id or local_folder is provided
|
|
@@ -64,7 +75,24 @@ def main():
|
|
|
64
75
|
remove_appendix_section=args.no_appendix,
|
|
65
76
|
local_folder=args.local_folder
|
|
66
77
|
)
|
|
67
|
-
if content:
|
|
78
|
+
if not content:
|
|
79
|
+
return
|
|
80
|
+
|
|
81
|
+
if args.list_sections:
|
|
82
|
+
sections = list_sections(content)
|
|
83
|
+
for section in sections:
|
|
84
|
+
print(section)
|
|
85
|
+
elif args.section:
|
|
86
|
+
extracted = []
|
|
87
|
+
for section_name in args.section:
|
|
88
|
+
section_content = extract_section(content, section_name)
|
|
89
|
+
if section_content:
|
|
90
|
+
extracted.append(section_content)
|
|
91
|
+
else:
|
|
92
|
+
print(f"Warning: Section '{section_name}' not found", file=__import__('sys').stderr)
|
|
93
|
+
if extracted:
|
|
94
|
+
print("\n\n".join(extracted))
|
|
95
|
+
else:
|
|
68
96
|
print(content)
|
|
69
97
|
|
|
70
98
|
if __name__ == "__main__":
|
arxiv_to_prompt/core.py
CHANGED
|
@@ -164,6 +164,34 @@ def remove_appendix(text: str) -> str:
|
|
|
164
164
|
return text[:appendix_match.start()].rstrip()
|
|
165
165
|
return text
|
|
166
166
|
|
|
167
|
+
|
|
168
|
+
def list_sections(text: str) -> list:
    """Extract all top-level section titles from LaTeX content.

    Recognises ``\\section{...}`` and ``\\section*{...}``, optionally with a
    short-title argument (``\\section[short]{long}``) and whitespace before
    the opening brace. ``\\subsection`` and deeper levels are not reported.

    Args:
        text: LaTeX source to scan.

    Returns:
        The section titles as strings, in order of appearance. Titles keep
        their raw LaTeX markup (for example ``The \\textbf{Big} Idea``).
        Sections with an empty title or an unterminated brace group are
        skipped.
    """
    header = re.compile(r'\\section\*?\s*(?:\[[^\]]*\])?\s*\{')
    titles = []
    scan_from = 0
    text_len = len(text)

    while True:
        match = header.search(text, scan_from)
        if match is None:
            break

        # Walk forward to the brace that closes the title. A plain
        # ``[^}]+`` capture would stop at the first ``}`` and truncate
        # titles containing nested groups such as ``\textbf{...}``.
        depth = 1
        cursor = match.end()
        while cursor < text_len and depth:
            char = text[cursor]
            if char == '\\':
                # Skip the escaped character so ``\{`` / ``\}`` do not
                # affect the nesting depth.
                cursor += 2
                continue
            if char == '{':
                depth += 1
            elif char == '}':
                depth -= 1
            cursor += 1

        if depth == 0:
            title = text[match.end():cursor - 1]
            if title:
                titles.append(title)
            scan_from = cursor
        else:
            # Unbalanced braces: ignore this header and keep scanning.
            scan_from = match.end()

    return titles
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def extract_section(text: str, section_name: str) -> Optional[str]:
    """Extract one top-level section, including its subsections.

    The section is located by an exact match of ``section_name`` against the
    title of a ``\\section{...}`` or ``\\section*{...}`` command. A short-title
    argument (``\\section[short]{title}``) and whitespace before the opening
    brace are tolerated.

    Args:
        text: LaTeX source to search.
        section_name: Exact section title as written in the source.

    Returns:
        The text from the matching ``\\section`` command up to (but not
        including) the next ``\\section`` command, ``\\end{document}``, or the
        end of ``text``, whichever comes first, with trailing whitespace
        removed. Returns ``None`` if no section with that title exists.
    """
    # Shared prefix for any top-level section header. ``\subsection`` does
    # not match because the backslash must directly precede "section".
    header = r'\\section\*?\s*(?:\[[^\]]*\])?\s*\{'

    start_match = re.search(header + re.escape(section_name) + r'\}', text)
    if not start_match:
        return None

    # The section ends at the next header of the same level. Also stop at
    # \end{document} so the last section does not include the document
    # terminator.
    boundary = re.compile(header + r'|\\end\{document\}')
    end_match = boundary.search(text, start_match.end())
    end_pos = end_match.start() if end_match else len(text)

    return text[start_match.start():end_pos].rstrip()
|
|
193
|
+
|
|
194
|
+
|
|
167
195
|
def flatten_tex(directory: str, main_file: str) -> str:
|
|
168
196
|
"""Combine all tex files into one, resolving inputs."""
|
|
169
197
|
def process_file(file_path: str, processed_files: set) -> str:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: arxiv-to-prompt
|
|
3
|
-
Version: 0.4.1
|
|
3
|
+
Version: 0.5.0
|
|
4
4
|
Summary: transform arXiv papers into a single latex prompt for LLMs
|
|
5
5
|
Author: Takashi Ishida
|
|
6
6
|
License: MIT
|
|
@@ -54,6 +54,12 @@ arxiv-to-prompt 2303.08774 --no-comments --no-appendix
|
|
|
54
54
|
# Process a local folder containing TeX files (instead of downloading from arXiv)
|
|
55
55
|
arxiv-to-prompt --local-folder /path/to/tex/files
|
|
56
56
|
|
|
57
|
+
# List all section names in the paper
|
|
58
|
+
arxiv-to-prompt 2303.08774 --list-sections
|
|
59
|
+
|
|
60
|
+
# Extract only specific sections
|
|
61
|
+
arxiv-to-prompt 2303.08774 --section "Introduction" --section "Methods"
|
|
62
|
+
|
|
57
63
|
# Copy to clipboard
|
|
58
64
|
arxiv-to-prompt 2303.08774 | pbcopy
|
|
59
65
|
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
arxiv_to_prompt/__init__.py,sha256=LbfYhirPwhaMpwV4-YgMwW6hA0GOQDHVCPYCPKabjw0,1169
|
|
2
|
+
arxiv_to_prompt/cli.py,sha256=IwT64A-lf5PrxCxs2e1adN09USkf7ji31uzO8YAegpU,3203
|
|
3
|
+
arxiv_to_prompt/core.py,sha256=GafxYeE0dNg70hNG8BrSM7S99dIpHiy1KoNp5oW8niA,13119
|
|
4
|
+
arxiv_to_prompt-0.5.0.dist-info/licenses/LICENSE,sha256=np8L3--VyxwVJa_8D_mfK4RYrtnRMM_eeYN3rM4PMHo,1071
|
|
5
|
+
arxiv_to_prompt-0.5.0.dist-info/METADATA,sha256=4a66cO6DpNdd0dz3U_79QhL60Q1cAhHHyExWUqhL4eo,4786
|
|
6
|
+
arxiv_to_prompt-0.5.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
7
|
+
arxiv_to_prompt-0.5.0.dist-info/entry_points.txt,sha256=iYEEn8xZ_5OkhNIs5HCyHSQBpDRJkbD5h0tlAb16lL0,61
|
|
8
|
+
arxiv_to_prompt-0.5.0.dist-info/top_level.txt,sha256=JClbu_lGGWu3RaTHZlNqTKB1-DUSbYXQNIYmJ9_F7fY,16
|
|
9
|
+
arxiv_to_prompt-0.5.0.dist-info/RECORD,,
|
|
@@ -1,9 +0,0 @@
|
|
|
1
|
-
arxiv_to_prompt/__init__.py,sha256=riK7TcTaKDleP5g5rjf2jkmLtXZu7irNZDujyAVDnKM,1093
|
|
2
|
-
arxiv_to_prompt/cli.py,sha256=np6mv2iCkLiVLawyix1vXP4bVFzRmZlkfjxb07ee89Q,2276
|
|
3
|
-
arxiv_to_prompt/core.py,sha256=6tl6IZh5BlBENKa3QMHG0ekqhhLLmh82oUQpgfYrz2o,12228
|
|
4
|
-
arxiv_to_prompt-0.4.1.dist-info/licenses/LICENSE,sha256=np8L3--VyxwVJa_8D_mfK4RYrtnRMM_eeYN3rM4PMHo,1071
|
|
5
|
-
arxiv_to_prompt-0.4.1.dist-info/METADATA,sha256=p5DIa1t9ik8_Mdn-7qxvfm8j0k--kAMn689-a3WocNM,4598
|
|
6
|
-
arxiv_to_prompt-0.4.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
7
|
-
arxiv_to_prompt-0.4.1.dist-info/entry_points.txt,sha256=iYEEn8xZ_5OkhNIs5HCyHSQBpDRJkbD5h0tlAb16lL0,61
|
|
8
|
-
arxiv_to_prompt-0.4.1.dist-info/top_level.txt,sha256=JClbu_lGGWu3RaTHZlNqTKB1-DUSbYXQNIYmJ9_F7fY,16
|
|
9
|
-
arxiv_to_prompt-0.4.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|