arxiv-to-prompt 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arxiv_to_prompt/cli.py +26 -7
- arxiv_to_prompt/core.py +194 -41
- {arxiv_to_prompt-0.5.0.dist-info → arxiv_to_prompt-0.6.0.dist-info}/METADATA +22 -6
- arxiv_to_prompt-0.6.0.dist-info/RECORD +9 -0
- arxiv_to_prompt-0.5.0.dist-info/RECORD +0 -9
- {arxiv_to_prompt-0.5.0.dist-info → arxiv_to_prompt-0.6.0.dist-info}/WHEEL +0 -0
- {arxiv_to_prompt-0.5.0.dist-info → arxiv_to_prompt-0.6.0.dist-info}/entry_points.txt +0 -0
- {arxiv_to_prompt-0.5.0.dist-info → arxiv_to_prompt-0.6.0.dist-info}/licenses/LICENSE +0 -0
- {arxiv_to_prompt-0.5.0.dist-info → arxiv_to_prompt-0.6.0.dist-info}/top_level.txt +0 -0
arxiv_to_prompt/cli.py
CHANGED
|
@@ -1,6 +1,14 @@
|
|
|
1
1
|
import argparse
|
|
2
2
|
import re
|
|
3
|
-
from .core import
|
|
3
|
+
from .core import (
|
|
4
|
+
process_latex_source,
|
|
5
|
+
get_default_cache_dir,
|
|
6
|
+
list_sections,
|
|
7
|
+
extract_section,
|
|
8
|
+
parse_section_tree,
|
|
9
|
+
format_section_tree,
|
|
10
|
+
find_all_by_name,
|
|
11
|
+
)
|
|
4
12
|
|
|
5
13
|
|
|
6
14
|
def extract_arxiv_id(input_str: str) -> str:
|
|
@@ -79,17 +87,28 @@ def main():
|
|
|
79
87
|
return
|
|
80
88
|
|
|
81
89
|
if args.list_sections:
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
print(section)
|
|
90
|
+
tree = parse_section_tree(content)
|
|
91
|
+
print(format_section_tree(tree))
|
|
85
92
|
elif args.section:
|
|
93
|
+
import sys
|
|
94
|
+
tree = parse_section_tree(content)
|
|
86
95
|
extracted = []
|
|
87
|
-
for
|
|
88
|
-
|
|
96
|
+
for section_path in args.section:
|
|
97
|
+
# Check for ambiguity only if not using path notation
|
|
98
|
+
if " > " not in section_path:
|
|
99
|
+
matching_paths = find_all_by_name(tree, section_path)
|
|
100
|
+
if len(matching_paths) > 1:
|
|
101
|
+
print(f"Warning: '{section_path}' is ambiguous. Found at:", file=sys.stderr)
|
|
102
|
+
for path in matching_paths:
|
|
103
|
+
print(f" - {path}", file=sys.stderr)
|
|
104
|
+
print("Use path notation to disambiguate.", file=sys.stderr)
|
|
105
|
+
continue
|
|
106
|
+
|
|
107
|
+
section_content = extract_section(content, section_path)
|
|
89
108
|
if section_content:
|
|
90
109
|
extracted.append(section_content)
|
|
91
110
|
else:
|
|
92
|
-
print(f"Warning: Section '{
|
|
111
|
+
print(f"Warning: Section '{section_path}' not found", file=sys.stderr)
|
|
93
112
|
if extracted:
|
|
94
113
|
print("\n\n".join(extracted))
|
|
95
114
|
else:
|
arxiv_to_prompt/core.py
CHANGED
|
@@ -3,6 +3,7 @@ import os
|
|
|
3
3
|
import tarfile
|
|
4
4
|
import shutil
|
|
5
5
|
from typing import Optional, List
|
|
6
|
+
from dataclasses import dataclass, field
|
|
6
7
|
import re
|
|
7
8
|
from pathlib import Path
|
|
8
9
|
import requests
|
|
@@ -92,40 +93,55 @@ def download_arxiv_source(arxiv_id: str, cache_dir: Optional[str] = None, use_ca
|
|
|
92
93
|
|
|
93
94
|
def find_main_tex(directory: str) -> Optional[str]:
    """
    Find the main .tex file containing documentclass.

    Searches recursively through subdirectories in a single, deterministic
    pass (directories and file names are visited in sorted order).

    A file named main.tex, paper.tex or index.tex that contains
    ``\\documentclass`` wins as soon as it is encountered. Otherwise the
    longest .tex file containing ``\\documentclass`` is chosen, since shorter
    files are typically conference templates or supplementary documents
    rather than the main manuscript.

    Args:
        directory: Root directory to search.

    Returns:
        The path of the main file relative to ``directory`` (a bare file name
        when it sits directly in ``directory``), or None if no .tex file
        containing ``\\documentclass`` could be read.
    """
    common_names = ['main.tex', 'paper.tex', 'index.tex']
    main_tex_file = None
    max_line_count = 0

    for root, dirs, files in os.walk(directory):
        # Sorting in place makes os.walk descend in a reproducible order.
        dirs.sort()
        rel_root = os.path.relpath(root, directory)

        # Conventional names are examined first, in their listed priority,
        # followed by every other .tex file in alphabetical order.
        present = set(files)
        candidates = [name for name in common_names if name in present]
        candidates += sorted(
            name for name in files
            if name.endswith('.tex') and name not in common_names
        )

        for file_name in candidates:
            file_path = os.path.join(root, file_name)
            try:
                # errors='replace' keeps non-UTF-8 sources (e.g. Latin-1)
                # readable instead of discarding the whole file.
                with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
                    lines = file.readlines()
            except OSError as e:
                logging.warning(f"Could not read file {file_path}: {e}")
                continue

            if not any('\\documentclass' in line for line in lines):
                continue

            if rel_root == '.':
                rel_path = file_name
            else:
                rel_path = os.path.join(rel_root, file_name)

            if file_name in common_names:
                return rel_path

            line_count = len(lines)
            if line_count > max_line_count:
                main_tex_file = rel_path
                max_line_count = line_count

    return main_tex_file
|
|
131
147
|
|
|
@@ -171,25 +187,162 @@ def list_sections(text: str) -> list:
|
|
|
171
187
|
return re.findall(pattern, text)
|
|
172
188
|
|
|
173
189
|
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
190
|
+
@dataclass
class SectionNode:
    """Represents a section/subsection/subsubsection in the LaTeX document tree.

    Nodes form a tree through ``children`` and a back-reference ``parent``.
    The back-reference is excluded from equality and repr: comparing it would
    walk parent -> children -> node -> parent forever and raise
    RecursionError when two structurally equal trees are compared.
    """
    level: int  # 0=section, 1=subsection, 2=subsubsection
    name: str  # title text of the sectioning command
    start_pos: int  # offset in the text where the sectioning command begins
    end_pos: int = -1  # -1 means end of document
    children: List['SectionNode'] = field(default_factory=list)
    # Navigation-only link to the enclosing node; None for top-level nodes.
    parent: Optional['SectionNode'] = field(default=None, repr=False, compare=False)
|
|
181
199
|
|
|
182
|
-
start_pos = start_match.start()
|
|
183
200
|
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
201
|
+
def _find_closing_brace(text: str, start: int) -> int:
    """Return the index of the '}' closing a group whose '{' ends just before ``start``.

    Nested groups are balanced and a backslash escapes the following
    character. Returns -1 when the group is never closed.
    """
    depth = 1
    i = start
    length = len(text)
    while i < length:
        ch = text[i]
        if ch == '\\':
            # Skip the escaped character so "\{" / "\}" do not affect depth.
            i += 2
            continue
        if ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                return i
        i += 1
    return -1


def parse_section_tree(text: str) -> List['SectionNode']:
    """
    Build a hierarchical tree from LaTeX section commands.

    Recognises ``\\section``, ``\\subsection`` and ``\\subsubsection`` (starred
    or not, with an optional ``[short title]`` argument). Titles are read with
    brace balancing, so a title such as ``The \\textbf{Main} Result`` is kept
    whole. Commands with an empty or unterminated title are ignored.

    Each node starts at its sectioning command and ends where the next command
    of the same or a higher level (lower number) starts, or at the end of the
    text when there is none.

    Args:
        text: The LaTeX content.

    Returns:
        A list of top-level section nodes, each containing their subsections
        as children. Empty when no sectioning command is found.
    """
    level_map = {'section': 0, 'subsection': 1, 'subsubsection': 2}
    # Command name, optional star, optional [short title], opening brace.
    header = re.compile(r'\\(section|subsection|subsubsection)\*?(?:\[[^\]]*\])?\{')

    root_nodes: List['SectionNode'] = []
    # Chain of currently open ancestors; levels strictly increase along it.
    open_nodes: List['SectionNode'] = []

    pos = 0
    while True:
        match = header.search(text, pos)
        if match is None:
            break

        title_end = _find_closing_brace(text, match.end())
        if title_end == -1:
            # Unterminated title: not a usable heading, keep scanning after it.
            pos = match.end()
            continue

        name = text[match.end():title_end]
        pos = title_end + 1
        if not name:
            continue

        node = SectionNode(
            level=level_map[match.group(1)],
            name=name,
            start_pos=match.start(),
        )

        # Every open node at the same or a deeper level ends where this one starts.
        while open_nodes and open_nodes[-1].level >= node.level:
            open_nodes.pop().end_pos = node.start_pos

        if open_nodes:
            # This node is a child of the innermost open node.
            node.parent = open_nodes[-1]
            open_nodes[-1].children.append(node)
        else:
            root_nodes.append(node)

        open_nodes.append(node)

    # Nodes still open run to the end of the document.
    for node in open_nodes:
        node.end_pos = len(text)

    return root_nodes
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def format_section_tree(nodes: List['SectionNode'], indent: int = 0, indent_unit: str = " ") -> str:
    """
    Format section tree with indentation for display.

    Args:
        nodes: Nodes to render; children are rendered right after their parent.
        indent: Indentation depth applied to ``nodes`` themselves.
        indent_unit: String repeated once per depth level (one space by default).

    Returns:
        A string with each section name on its own line, indented by level.
        Empty string when ``nodes`` is empty.
    """
    lines = []
    # Explicit stack, pushed in reverse so nodes are emitted in document order.
    pending = [(node, indent) for node in reversed(nodes)]
    while pending:
        node, depth = pending.pop()
        lines.append(indent_unit * depth + node.name)
        pending.extend((child, depth + 1) for child in reversed(node.children))
    return "\n".join(lines)
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def find_all_by_name(nodes: List['SectionNode'], name: str, parent_path: str = "") -> List[str]:
    """
    Find all paths to sections with the given name.

    Surrounding whitespace in ``name`` is ignored, matching how
    find_section_by_path treats path components, so an ambiguity check and
    the subsequent lookup agree on what is being searched for.

    Args:
        nodes: Nodes to search, together with all their descendants.
        name: Exact section name to look for.
        parent_path: Path of the node that owns ``nodes``; empty at the top level.

    Returns:
        A list of full paths (e.g., ["Introduction > Background", "Methods > Background"])
        in document order; empty when nothing matches.
    """
    target = name.strip()
    results: List[str] = []
    for node in nodes:
        current_path = f"{parent_path} > {node.name}" if parent_path else node.name
        if node.name == target:
            results.append(current_path)
        if node.children:
            results.extend(find_all_by_name(node.children, target, current_path))
    return results
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def find_section_by_path(nodes: List['SectionNode'], path: str) -> Optional['SectionNode']:
    """
    Find a section by path notation (e.g., "Methods > Background").

    If path contains no " > ", searches for an exact name match at any level
    and returns the first one in document order.
    If path contains " > ", follows the hierarchy starting from ``nodes``.
    When several siblings share a name, each is tried in turn, so the lookup
    succeeds if any of them leads to the remaining components.

    Whitespace around each path component is ignored.

    Args:
        nodes: Top-level nodes of the tree to search.
        path: Section name or " > "-separated path.

    Returns:
        The matching node, or None if nothing matches.
    """
    parts = [p.strip() for p in path.split(" > ")]

    if len(parts) == 1:
        # Simple name lookup: pre-order walk, first match at any level.
        wanted = parts[0]
        pending = list(reversed(nodes))
        while pending:
            node = pending.pop()
            if node.name == wanted:
                return node
            pending.extend(reversed(node.children))
        return None

    def descend(candidates, remaining):
        # Match remaining[0] among candidates, then the rest below that node.
        head, tail = remaining[0], remaining[1:]
        for node in candidates:
            if node.name != head:
                continue
            if not tail:
                return node
            hit = descend(node.children, tail)
            if hit is not None:
                return hit
        return None

    return descend(nodes, parts)
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def extract_section(text: str, section_path: str) -> Optional[str]:
    """
    Return the source of one section, subsection, or subsubsection.

    Args:
        text: The LaTeX content
        section_path: Section name or path (e.g., "Methods" or "Methods > Background")

    Returns:
        The text from the section's heading up to the end of the section
        (nested subsections included) with trailing whitespace removed,
        or None if the section cannot be located.
    """
    located = find_section_by_path(parse_section_tree(text), section_path)
    if not located:
        return None

    # Slice the node's span out of the original text.
    span = text[located.start_pos:located.end_pos]
    return span.rstrip()
|
|
193
346
|
|
|
194
347
|
|
|
195
348
|
def flatten_tex(directory: str, main_file: str) -> str:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: arxiv-to-prompt
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: transform arXiv papers into a single latex prompt for LLMs
|
|
5
5
|
Author: Takashi Ishida
|
|
6
6
|
License: MIT
|
|
@@ -54,11 +54,27 @@ arxiv-to-prompt 2303.08774 --no-comments --no-appendix
|
|
|
54
54
|
# Process a local folder containing TeX files (instead of downloading from arXiv)
|
|
55
55
|
arxiv-to-prompt --local-folder /path/to/tex/files
|
|
56
56
|
|
|
57
|
-
# List all
|
|
58
|
-
arxiv-to-prompt
|
|
59
|
-
|
|
60
|
-
#
|
|
61
|
-
|
|
57
|
+
# List all sections (with subsections indented)
|
|
58
|
+
arxiv-to-prompt 2307.09288 --list-sections
|
|
59
|
+
# Introduction
|
|
60
|
+
# Pretraining
|
|
61
|
+
# Pretraining Data
|
|
62
|
+
# Training Details
|
|
63
|
+
# Training Hardware \& Carbon Footprint
|
|
64
|
+
# ...
|
|
65
|
+
|
|
66
|
+
# Extract specific sections
|
|
67
|
+
arxiv-to-prompt 2307.09288 --section "Introduction" --section "Pretraining"
|
|
68
|
+
|
|
69
|
+
# Ambiguous names print a warning listing every match, and that section is skipped
|
|
70
|
+
arxiv-to-prompt 2307.09288 --section "Human Evaluation"
|
|
71
|
+
# Warning: 'Human Evaluation' is ambiguous. Found at:
|
|
72
|
+
# - Fine-tuning > RLHF Results > Human Evaluation
|
|
73
|
+
# - Appendix > Additional Details for Fine-tuning > Human Evaluation
|
|
74
|
+
# Use path notation to disambiguate.
|
|
75
|
+
|
|
76
|
+
# Use path notation when the same name appears multiple times
|
|
77
|
+
arxiv-to-prompt 2307.09288 --section "Fine-tuning > RLHF Results > Human Evaluation"
|
|
62
78
|
|
|
63
79
|
# Copy to clipboard
|
|
64
80
|
arxiv-to-prompt 2303.08774 | pbcopy
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
arxiv_to_prompt/__init__.py,sha256=LbfYhirPwhaMpwV4-YgMwW6hA0GOQDHVCPYCPKabjw0,1169
|
|
2
|
+
arxiv_to_prompt/cli.py,sha256=0a0DoOYkKIp8mE_FqzVYmG2gvCtnFiIJtIlfZLkZu5g,3865
|
|
3
|
+
arxiv_to_prompt/core.py,sha256=kI0xKTf1igeOxNACJVOtq6PlCoN6kYuTq9KfD4jzE1M,18352
|
|
4
|
+
arxiv_to_prompt-0.6.0.dist-info/licenses/LICENSE,sha256=np8L3--VyxwVJa_8D_mfK4RYrtnRMM_eeYN3rM4PMHo,1071
|
|
5
|
+
arxiv_to_prompt-0.6.0.dist-info/METADATA,sha256=VRGqZuboa4DCMzQ2xpAN_G7SVFdOm1YyJ6cor62lr5k,5376
|
|
6
|
+
arxiv_to_prompt-0.6.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
7
|
+
arxiv_to_prompt-0.6.0.dist-info/entry_points.txt,sha256=iYEEn8xZ_5OkhNIs5HCyHSQBpDRJkbD5h0tlAb16lL0,61
|
|
8
|
+
arxiv_to_prompt-0.6.0.dist-info/top_level.txt,sha256=JClbu_lGGWu3RaTHZlNqTKB1-DUSbYXQNIYmJ9_F7fY,16
|
|
9
|
+
arxiv_to_prompt-0.6.0.dist-info/RECORD,,
|
|
@@ -1,9 +0,0 @@
|
|
|
1
|
-
arxiv_to_prompt/__init__.py,sha256=LbfYhirPwhaMpwV4-YgMwW6hA0GOQDHVCPYCPKabjw0,1169
|
|
2
|
-
arxiv_to_prompt/cli.py,sha256=IwT64A-lf5PrxCxs2e1adN09USkf7ji31uzO8YAegpU,3203
|
|
3
|
-
arxiv_to_prompt/core.py,sha256=GafxYeE0dNg70hNG8BrSM7S99dIpHiy1KoNp5oW8niA,13119
|
|
4
|
-
arxiv_to_prompt-0.5.0.dist-info/licenses/LICENSE,sha256=np8L3--VyxwVJa_8D_mfK4RYrtnRMM_eeYN3rM4PMHo,1071
|
|
5
|
-
arxiv_to_prompt-0.5.0.dist-info/METADATA,sha256=4a66cO6DpNdd0dz3U_79QhL60Q1cAhHHyExWUqhL4eo,4786
|
|
6
|
-
arxiv_to_prompt-0.5.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
7
|
-
arxiv_to_prompt-0.5.0.dist-info/entry_points.txt,sha256=iYEEn8xZ_5OkhNIs5HCyHSQBpDRJkbD5h0tlAb16lL0,61
|
|
8
|
-
arxiv_to_prompt-0.5.0.dist-info/top_level.txt,sha256=JClbu_lGGWu3RaTHZlNqTKB1-DUSbYXQNIYmJ9_F7fY,16
|
|
9
|
-
arxiv_to_prompt-0.5.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|