arxiv-to-prompt 0.5.1__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arxiv_to_prompt/cli.py +26 -7
- arxiv_to_prompt/core.py +154 -16
- {arxiv_to_prompt-0.5.1.dist-info → arxiv_to_prompt-0.6.0.dist-info}/METADATA +22 -6
- arxiv_to_prompt-0.6.0.dist-info/RECORD +9 -0
- arxiv_to_prompt-0.5.1.dist-info/RECORD +0 -9
- {arxiv_to_prompt-0.5.1.dist-info → arxiv_to_prompt-0.6.0.dist-info}/WHEEL +0 -0
- {arxiv_to_prompt-0.5.1.dist-info → arxiv_to_prompt-0.6.0.dist-info}/entry_points.txt +0 -0
- {arxiv_to_prompt-0.5.1.dist-info → arxiv_to_prompt-0.6.0.dist-info}/licenses/LICENSE +0 -0
- {arxiv_to_prompt-0.5.1.dist-info → arxiv_to_prompt-0.6.0.dist-info}/top_level.txt +0 -0
arxiv_to_prompt/cli.py
CHANGED
|
@@ -1,6 +1,14 @@
|
|
|
1
1
|
import argparse
|
|
2
2
|
import re
|
|
3
|
-
from .core import
|
|
3
|
+
from .core import (
|
|
4
|
+
process_latex_source,
|
|
5
|
+
get_default_cache_dir,
|
|
6
|
+
list_sections,
|
|
7
|
+
extract_section,
|
|
8
|
+
parse_section_tree,
|
|
9
|
+
format_section_tree,
|
|
10
|
+
find_all_by_name,
|
|
11
|
+
)
|
|
4
12
|
|
|
5
13
|
|
|
6
14
|
def extract_arxiv_id(input_str: str) -> str:
|
|
@@ -79,17 +87,28 @@ def main():
|
|
|
79
87
|
return
|
|
80
88
|
|
|
81
89
|
if args.list_sections:
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
print(section)
|
|
90
|
+
tree = parse_section_tree(content)
|
|
91
|
+
print(format_section_tree(tree))
|
|
85
92
|
elif args.section:
|
|
93
|
+
import sys
|
|
94
|
+
tree = parse_section_tree(content)
|
|
86
95
|
extracted = []
|
|
87
|
-
for
|
|
88
|
-
|
|
96
|
+
for section_path in args.section:
|
|
97
|
+
# Check for ambiguity only if not using path notation
|
|
98
|
+
if " > " not in section_path:
|
|
99
|
+
matching_paths = find_all_by_name(tree, section_path)
|
|
100
|
+
if len(matching_paths) > 1:
|
|
101
|
+
print(f"Warning: '{section_path}' is ambiguous. Found at:", file=sys.stderr)
|
|
102
|
+
for path in matching_paths:
|
|
103
|
+
print(f" - {path}", file=sys.stderr)
|
|
104
|
+
print("Use path notation to disambiguate.", file=sys.stderr)
|
|
105
|
+
continue
|
|
106
|
+
|
|
107
|
+
section_content = extract_section(content, section_path)
|
|
89
108
|
if section_content:
|
|
90
109
|
extracted.append(section_content)
|
|
91
110
|
else:
|
|
92
|
-
print(f"Warning: Section '{
|
|
111
|
+
print(f"Warning: Section '{section_path}' not found", file=sys.stderr)
|
|
93
112
|
if extracted:
|
|
94
113
|
print("\n\n".join(extracted))
|
|
95
114
|
else:
|
arxiv_to_prompt/core.py
CHANGED
|
@@ -3,6 +3,7 @@ import os
|
|
|
3
3
|
import tarfile
|
|
4
4
|
import shutil
|
|
5
5
|
from typing import Optional, List
|
|
6
|
+
from dataclasses import dataclass, field
|
|
6
7
|
import re
|
|
7
8
|
from pathlib import Path
|
|
8
9
|
import requests
|
|
@@ -186,25 +187,162 @@ def list_sections(text: str) -> list:
|
|
|
186
187
|
return re.findall(pattern, text)
|
|
187
188
|
|
|
188
189
|
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
190
|
+
@dataclass
|
|
191
|
+
class SectionNode:
|
|
192
|
+
"""Represents a section/subsection/subsubsection in the LaTeX document tree."""
|
|
193
|
+
level: int # 0=section, 1=subsection, 2=subsubsection
|
|
194
|
+
name: str
|
|
195
|
+
start_pos: int
|
|
196
|
+
end_pos: int = -1 # -1 means end of document
|
|
197
|
+
children: List['SectionNode'] = field(default_factory=list)
|
|
198
|
+
parent: Optional['SectionNode'] = None
|
|
196
199
|
|
|
197
|
-
start_pos = start_match.start()
|
|
198
200
|
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
201
|
+
def parse_section_tree(text: str) -> List[SectionNode]:
|
|
202
|
+
"""
|
|
203
|
+
Build a hierarchical tree from LaTeX section commands.
|
|
202
204
|
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
205
|
+
Returns a list of top-level section nodes, each containing their subsections as children.
|
|
206
|
+
"""
|
|
207
|
+
# Match section, subsection, and subsubsection commands
|
|
208
|
+
pattern = r'\\(section|subsection|subsubsection)\*?\{([^}]+)\}'
|
|
209
|
+
|
|
210
|
+
level_map = {'section': 0, 'subsection': 1, 'subsubsection': 2}
|
|
211
|
+
|
|
212
|
+
# Find all section commands with their positions
|
|
213
|
+
matches = list(re.finditer(pattern, text))
|
|
214
|
+
|
|
215
|
+
if not matches:
|
|
216
|
+
return []
|
|
217
|
+
|
|
218
|
+
# Create nodes for all sections
|
|
219
|
+
all_nodes = []
|
|
220
|
+
for match in matches:
|
|
221
|
+
level = level_map[match.group(1)]
|
|
222
|
+
name = match.group(2)
|
|
223
|
+
start_pos = match.start()
|
|
224
|
+
all_nodes.append(SectionNode(level=level, name=name, start_pos=start_pos))
|
|
225
|
+
|
|
226
|
+
# Calculate end positions (each section ends where the next same-or-higher level starts)
|
|
227
|
+
for i, node in enumerate(all_nodes):
|
|
228
|
+
# Find next section at same or higher (lower number) level
|
|
229
|
+
for j in range(i + 1, len(all_nodes)):
|
|
230
|
+
if all_nodes[j].level <= node.level:
|
|
231
|
+
node.end_pos = all_nodes[j].start_pos
|
|
232
|
+
break
|
|
233
|
+
# If no next section found at same/higher level, end at document end
|
|
234
|
+
if node.end_pos == -1:
|
|
235
|
+
node.end_pos = len(text)
|
|
236
|
+
|
|
237
|
+
# Build tree structure
|
|
238
|
+
root_nodes: List[SectionNode] = []
|
|
239
|
+
section_stack: List[SectionNode] = []
|
|
240
|
+
|
|
241
|
+
for node in all_nodes:
|
|
242
|
+
# Pop from stack until we find a parent at a higher level
|
|
243
|
+
while section_stack and section_stack[-1].level >= node.level:
|
|
244
|
+
section_stack.pop()
|
|
245
|
+
|
|
246
|
+
if section_stack:
|
|
247
|
+
# This node is a child of the top of the stack
|
|
248
|
+
node.parent = section_stack[-1]
|
|
249
|
+
section_stack[-1].children.append(node)
|
|
250
|
+
else:
|
|
251
|
+
# This is a root node
|
|
252
|
+
root_nodes.append(node)
|
|
253
|
+
|
|
254
|
+
section_stack.append(node)
|
|
255
|
+
|
|
256
|
+
return root_nodes
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def format_section_tree(nodes: List[SectionNode], indent: int = 0) -> str:
|
|
260
|
+
"""
|
|
261
|
+
Format section tree with indentation for display.
|
|
262
|
+
|
|
263
|
+
Returns a string with each section name on its own line, indented by level.
|
|
264
|
+
"""
|
|
265
|
+
lines = []
|
|
266
|
+
for node in nodes:
|
|
267
|
+
lines.append(" " * indent + node.name)
|
|
268
|
+
if node.children:
|
|
269
|
+
lines.append(format_section_tree(node.children, indent + 1))
|
|
270
|
+
return "\n".join(lines)
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def find_all_by_name(nodes: List[SectionNode], name: str, parent_path: str = "") -> List[str]:
|
|
274
|
+
"""
|
|
275
|
+
Find all paths to sections with the given name.
|
|
276
|
+
|
|
277
|
+
Returns a list of full paths (e.g., ["Introduction > Background", "Methods > Background"])
|
|
278
|
+
"""
|
|
279
|
+
results = []
|
|
280
|
+
for node in nodes:
|
|
281
|
+
current_path = f"{parent_path} > {node.name}" if parent_path else node.name
|
|
282
|
+
if node.name == name:
|
|
283
|
+
results.append(current_path)
|
|
284
|
+
if node.children:
|
|
285
|
+
results.extend(find_all_by_name(node.children, name, current_path))
|
|
286
|
+
return results
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def find_section_by_path(nodes: List[SectionNode], path: str) -> Optional[SectionNode]:
|
|
290
|
+
"""
|
|
291
|
+
Find a section by path notation (e.g., "Methods > Background").
|
|
292
|
+
|
|
293
|
+
If path contains no " > ", searches for an exact name match at any level.
|
|
294
|
+
If path contains " > ", follows the hierarchy.
|
|
295
|
+
"""
|
|
296
|
+
parts = [p.strip() for p in path.split(" > ")]
|
|
297
|
+
|
|
298
|
+
if len(parts) == 1:
|
|
299
|
+
# Simple name lookup - find first match at any level
|
|
300
|
+
def find_first(nodes: List[SectionNode], name: str) -> Optional[SectionNode]:
|
|
301
|
+
for node in nodes:
|
|
302
|
+
if node.name == name:
|
|
303
|
+
return node
|
|
304
|
+
if node.children:
|
|
305
|
+
result = find_first(node.children, name)
|
|
306
|
+
if result:
|
|
307
|
+
return result
|
|
308
|
+
return None
|
|
309
|
+
return find_first(nodes, parts[0])
|
|
310
|
+
|
|
311
|
+
# Path notation - follow the hierarchy
|
|
312
|
+
current_nodes = nodes
|
|
313
|
+
current_node = None
|
|
314
|
+
|
|
315
|
+
for part in parts:
|
|
316
|
+
found = None
|
|
317
|
+
for node in current_nodes:
|
|
318
|
+
if node.name == part:
|
|
319
|
+
found = node
|
|
320
|
+
break
|
|
321
|
+
if not found:
|
|
322
|
+
return None
|
|
323
|
+
current_node = found
|
|
324
|
+
current_nodes = found.children
|
|
325
|
+
|
|
326
|
+
return current_node
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def extract_section(text: str, section_path: str) -> Optional[str]:
|
|
330
|
+
"""
|
|
331
|
+
Extract content of a specific section, subsection, or subsubsection.
|
|
332
|
+
|
|
333
|
+
Args:
|
|
334
|
+
text: The LaTeX content
|
|
335
|
+
section_path: Section name or path (e.g., "Methods" or "Methods > Background")
|
|
336
|
+
|
|
337
|
+
Returns:
|
|
338
|
+
The section content including any subsections, or None if not found.
|
|
339
|
+
"""
|
|
340
|
+
tree = parse_section_tree(text)
|
|
341
|
+
node = find_section_by_path(tree, section_path)
|
|
342
|
+
if not node:
|
|
343
|
+
return None
|
|
344
|
+
|
|
345
|
+
return text[node.start_pos:node.end_pos].rstrip()
|
|
208
346
|
|
|
209
347
|
|
|
210
348
|
def flatten_tex(directory: str, main_file: str) -> str:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: arxiv-to-prompt
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: transform arXiv papers into a single latex prompt for LLMs
|
|
5
5
|
Author: Takashi Ishida
|
|
6
6
|
License: MIT
|
|
@@ -54,11 +54,27 @@ arxiv-to-prompt 2303.08774 --no-comments --no-appendix
|
|
|
54
54
|
# Process a local folder containing TeX files (instead of downloading from arXiv)
|
|
55
55
|
arxiv-to-prompt --local-folder /path/to/tex/files
|
|
56
56
|
|
|
57
|
-
# List all
|
|
58
|
-
arxiv-to-prompt
|
|
59
|
-
|
|
60
|
-
#
|
|
61
|
-
|
|
57
|
+
# List all sections (with subsections indented)
|
|
58
|
+
arxiv-to-prompt 2307.09288 --list-sections
|
|
59
|
+
# Introduction
|
|
60
|
+
# Pretraining
|
|
61
|
+
# Pretraining Data
|
|
62
|
+
# Training Details
|
|
63
|
+
# Training Hardware \& Carbon Footprint
|
|
64
|
+
# ...
|
|
65
|
+
|
|
66
|
+
# Extract specific sections
|
|
67
|
+
arxiv-to-prompt 2307.09288 --section "Introduction" --section "Pretraining"
|
|
68
|
+
|
|
69
|
+
# Ambiguous names show a helpful error
|
|
70
|
+
arxiv-to-prompt 2307.09288 --section "Human Evaluation"
|
|
71
|
+
# Warning: 'Human Evaluation' is ambiguous. Found at:
|
|
72
|
+
# - Fine-tuning > RLHF Results > Human Evaluation
|
|
73
|
+
# - Appendix > Additional Details for Fine-tuning > Human Evaluation
|
|
74
|
+
# Use path notation to disambiguate.
|
|
75
|
+
|
|
76
|
+
# Use path notation when the same name appears multiple times
|
|
77
|
+
arxiv-to-prompt 2307.09288 --section "Fine-tuning > RLHF Results > Human Evaluation"
|
|
62
78
|
|
|
63
79
|
# Copy to clipboard
|
|
64
80
|
arxiv-to-prompt 2303.08774 | pbcopy
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
arxiv_to_prompt/__init__.py,sha256=LbfYhirPwhaMpwV4-YgMwW6hA0GOQDHVCPYCPKabjw0,1169
|
|
2
|
+
arxiv_to_prompt/cli.py,sha256=0a0DoOYkKIp8mE_FqzVYmG2gvCtnFiIJtIlfZLkZu5g,3865
|
|
3
|
+
arxiv_to_prompt/core.py,sha256=kI0xKTf1igeOxNACJVOtq6PlCoN6kYuTq9KfD4jzE1M,18352
|
|
4
|
+
arxiv_to_prompt-0.6.0.dist-info/licenses/LICENSE,sha256=np8L3--VyxwVJa_8D_mfK4RYrtnRMM_eeYN3rM4PMHo,1071
|
|
5
|
+
arxiv_to_prompt-0.6.0.dist-info/METADATA,sha256=VRGqZuboa4DCMzQ2xpAN_G7SVFdOm1YyJ6cor62lr5k,5376
|
|
6
|
+
arxiv_to_prompt-0.6.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
7
|
+
arxiv_to_prompt-0.6.0.dist-info/entry_points.txt,sha256=iYEEn8xZ_5OkhNIs5HCyHSQBpDRJkbD5h0tlAb16lL0,61
|
|
8
|
+
arxiv_to_prompt-0.6.0.dist-info/top_level.txt,sha256=JClbu_lGGWu3RaTHZlNqTKB1-DUSbYXQNIYmJ9_F7fY,16
|
|
9
|
+
arxiv_to_prompt-0.6.0.dist-info/RECORD,,
|
|
@@ -1,9 +0,0 @@
|
|
|
1
|
-
arxiv_to_prompt/__init__.py,sha256=LbfYhirPwhaMpwV4-YgMwW6hA0GOQDHVCPYCPKabjw0,1169
|
|
2
|
-
arxiv_to_prompt/cli.py,sha256=IwT64A-lf5PrxCxs2e1adN09USkf7ji31uzO8YAegpU,3203
|
|
3
|
-
arxiv_to_prompt/core.py,sha256=ln67k1MT-l8PalwGsszU6IwCZ15GAOiX0yfLgyKvySA,13837
|
|
4
|
-
arxiv_to_prompt-0.5.1.dist-info/licenses/LICENSE,sha256=np8L3--VyxwVJa_8D_mfK4RYrtnRMM_eeYN3rM4PMHo,1071
|
|
5
|
-
arxiv_to_prompt-0.5.1.dist-info/METADATA,sha256=VKK7my5pxFuVLTejMV3vS8BLhk_kV62HHPWxC84_80Q,4786
|
|
6
|
-
arxiv_to_prompt-0.5.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
7
|
-
arxiv_to_prompt-0.5.1.dist-info/entry_points.txt,sha256=iYEEn8xZ_5OkhNIs5HCyHSQBpDRJkbD5h0tlAb16lL0,61
|
|
8
|
-
arxiv_to_prompt-0.5.1.dist-info/top_level.txt,sha256=JClbu_lGGWu3RaTHZlNqTKB1-DUSbYXQNIYmJ9_F7fY,16
|
|
9
|
-
arxiv_to_prompt-0.5.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|