arxiv-to-prompt 0.5.1__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
arxiv_to_prompt/cli.py CHANGED
@@ -1,6 +1,14 @@
1
1
  import argparse
2
2
  import re
3
- from .core import process_latex_source, get_default_cache_dir, list_sections, extract_section
3
+ from .core import (
4
+ process_latex_source,
5
+ get_default_cache_dir,
6
+ list_sections,
7
+ extract_section,
8
+ parse_section_tree,
9
+ format_section_tree,
10
+ find_all_by_name,
11
+ )
4
12
 
5
13
 
6
14
  def extract_arxiv_id(input_str: str) -> str:
@@ -79,17 +87,28 @@ def main():
79
87
  return
80
88
 
81
89
  if args.list_sections:
82
- sections = list_sections(content)
83
- for section in sections:
84
- print(section)
90
+ tree = parse_section_tree(content)
91
+ print(format_section_tree(tree))
85
92
  elif args.section:
93
+ import sys
94
+ tree = parse_section_tree(content)
86
95
  extracted = []
87
- for section_name in args.section:
88
- section_content = extract_section(content, section_name)
96
+ for section_path in args.section:
97
+ # Check for ambiguity only if not using path notation
98
+ if " > " not in section_path:
99
+ matching_paths = find_all_by_name(tree, section_path)
100
+ if len(matching_paths) > 1:
101
+ print(f"Warning: '{section_path}' is ambiguous. Found at:", file=sys.stderr)
102
+ for path in matching_paths:
103
+ print(f" - {path}", file=sys.stderr)
104
+ print("Use path notation to disambiguate.", file=sys.stderr)
105
+ continue
106
+
107
+ section_content = extract_section(content, section_path)
89
108
  if section_content:
90
109
  extracted.append(section_content)
91
110
  else:
92
- print(f"Warning: Section '{section_name}' not found", file=__import__('sys').stderr)
111
+ print(f"Warning: Section '{section_path}' not found", file=sys.stderr)
93
112
  if extracted:
94
113
  print("\n\n".join(extracted))
95
114
  else:
arxiv_to_prompt/core.py CHANGED
@@ -3,6 +3,7 @@ import os
3
3
  import tarfile
4
4
  import shutil
5
5
  from typing import Optional, List
6
+ from dataclasses import dataclass, field
6
7
  import re
7
8
  from pathlib import Path
8
9
  import requests
@@ -186,25 +187,162 @@ def list_sections(text: str) -> list:
186
187
  return re.findall(pattern, text)
187
188
 
188
189
 
189
- def extract_section(text: str, section_name: str) -> Optional[str]:
190
- """Extract content of a specific section (including its subsections)."""
191
- # Find the start of the requested section
192
- pattern = rf'\\section\*?\{{{re.escape(section_name)}\}}'
193
- start_match = re.search(pattern, text)
194
- if not start_match:
195
- return None
190
@dataclass
class SectionNode:
    """One heading (section/subsection/subsubsection) in the LaTeX section tree.

    Nodes are linked in both directions: ``children`` holds the nested
    headings and ``parent`` points back at the enclosing heading (``None``
    for a top-level node).
    """

    level: int  # 0=section, 1=subsection, 2=subsubsection
    name: str  # heading title text
    start_pos: int  # offset in the source text where this heading begins
    end_pos: int = -1  # -1 means end of document
    children: List['SectionNode'] = field(default_factory=list)
    # The back-reference is excluded from the generated __eq__ and __repr__.
    # With it included, comparing two distinct-but-equal trees bounces between
    # child.parent and parent.children until RecursionError, and every repr
    # drags the whole ancestor chain along.
    parent: Optional['SectionNode'] = field(default=None, repr=False, compare=False)
196
199
 
197
- start_pos = start_match.start()
198
200
 
199
- # Find the next \section (not subsection) or end of document
200
- remaining = text[start_match.end():]
201
- end_match = re.search(r'\\section\*?\{', remaining)
201
# Matches a sectioning command up to and including the opening brace of its
# title. Accepts the starred form and an optional "[short title]" argument.
_SECTION_COMMAND_RE = re.compile(
    r'\\(section|subsection|subsubsection)\*?(?:\[[^\]]*\])?\{'
)
_SECTION_LEVELS = {'section': 0, 'subsection': 1, 'subsubsection': 2}


def _read_braced_title(text: str, open_pos: int) -> Optional[str]:
    """Return the text between the brace at ``open_pos`` and its matching brace.

    Nested groups (``\\textbf{...}``) and backslash-escaped braces (``\\{``)
    are kept as part of the title. If the braces never balance, fall back to
    the first closing brace; return None when there is no closing brace at all.
    """
    depth = 0
    pos = open_pos
    end = len(text)
    while pos < end:
        char = text[pos]
        if char == '\\':
            # Skip the escaped character so "\{" / "\}" do not affect depth.
            pos += 2
            continue
        if char == '{':
            depth += 1
        elif char == '}':
            depth -= 1
            if depth == 0:
                return text[open_pos + 1:pos]
        pos += 1

    # Malformed input: take everything up to the first closing brace.
    close = text.find('}', open_pos + 1)
    return text[open_pos + 1:close] if close != -1 else None


def parse_section_tree(text: str) -> List[SectionNode]:
    """
    Build a hierarchical tree from LaTeX section commands.

    Recognises ``\\section``, ``\\subsection`` and ``\\subsubsection`` (starred
    or not, with or without a ``[short title]`` argument). Titles may contain
    nested or escaped braces; surrounding whitespace is stripped from the name
    and headings with an empty title are ignored.

    Args:
        text: The LaTeX content.

    Returns:
        A list of top-level nodes, each containing its nested headings as
        children. Every node spans from its own command (``start_pos``) to the
        next heading at the same or a higher level, or to the end of ``text``
        (``end_pos``). An empty list is returned when no heading is found.
    """
    root_nodes: List[SectionNode] = []
    # Headings that are still "open"; levels strictly increase bottom to top.
    open_stack: List[SectionNode] = []

    for match in _SECTION_COMMAND_RE.finditer(text):
        raw_title = _read_braced_title(text, match.end() - 1)
        title = raw_title.strip() if raw_title is not None else ""
        if not title:
            continue

        node = SectionNode(
            level=_SECTION_LEVELS[match.group(1)],
            name=title,
            start_pos=match.start(),
        )

        # A heading closes every open heading at the same or a deeper level,
        # so each popped node ends exactly where this one starts.
        while open_stack and open_stack[-1].level >= node.level:
            open_stack.pop().end_pos = node.start_pos

        if open_stack:
            # The nearest still-open heading is the parent.
            node.parent = open_stack[-1]
            open_stack[-1].children.append(node)
        else:
            root_nodes.append(node)

        open_stack.append(node)

    # Whatever is still open runs to the end of the document.
    for node in open_stack:
        node.end_pos = len(text)

    return root_nodes
257
+
258
+
259
def format_section_tree(nodes: List[SectionNode], indent: int = 0) -> str:
    """
    Render a section tree as indented text, one heading per line.

    Args:
        nodes: Nodes to render, in document order.
        indent: Indentation depth applied to ``nodes``; children are rendered
            one step deeper than their parent.

    Returns:
        The headings joined with newlines (an empty string for no nodes).
    """
    def _walk(level_nodes, depth):
        # Pre-order traversal: a heading first, then everything nested in it.
        for entry in level_nodes:
            yield " " * depth + entry.name
            if entry.children:
                yield from _walk(entry.children, depth + 1)

    return "\n".join(_walk(nodes, indent))
271
+
272
+
273
def find_all_by_name(nodes: List[SectionNode], name: str, parent_path: str = "") -> List[str]:
    """
    Collect the full path of every section whose name equals ``name``.

    Args:
        nodes: Nodes to search, together with all of their descendants.
        name: Exact section name to look for.
        parent_path: Path of the node that owns ``nodes``; empty at the top level.

    Returns:
        Paths in document order, with components joined by " > "
        (e.g., ["Introduction > Background", "Methods > Background"]).
    """
    prefix = f"{parent_path} > " if parent_path else ""
    matches: List[str] = []
    for entry in nodes:
        full_path = prefix + entry.name
        if entry.name == name:
            matches.append(full_path)
        if entry.children:
            # Descendants are reported right after the node that contains them.
            matches += find_all_by_name(entry.children, name, full_path)
    return matches
287
+
288
+
289
def find_section_by_path(nodes: List["SectionNode"], path: str) -> Optional["SectionNode"]:
    """
    Find a section by name or by path notation (e.g., "Methods > Background").

    If ``path`` contains no " > ", the first node with that exact name is
    returned, searching depth-first in document order at any level.
    If ``path`` contains " > ", the components are followed through the
    hierarchy starting at ``nodes``. When several siblings share a name, each
    one is tried in order until the rest of the path resolves, so an earlier
    duplicate that lacks the requested child does not hide a later one.

    Args:
        nodes: Top-level nodes of the section tree.
        path: Section name, or " > "-separated path; whitespace around each
            component is ignored.

    Returns:
        The matching node, or None if nothing matches.
    """
    parts = [p.strip() for p in path.split(" > ")]

    if len(parts) == 1:
        target = parts[0]
        # Iterative pre-order walk: a node is checked before its children,
        # and siblings are visited left to right.
        pending = list(reversed(nodes))
        while pending:
            node = pending.pop()
            if node.name == target:
                return node
            pending.extend(reversed(node.children))
        return None

    def _descend(candidates, remaining):
        head, rest = remaining[0], remaining[1:]
        for node in candidates:
            if node.name != head:
                continue
            if not rest:
                return node
            hit = _descend(node.children, rest)
            if hit is not None:
                return hit
            # Same-named sibling without the requested descendant: keep looking.
        return None

    return _descend(nodes, parts)
327
+
328
+
329
def extract_section(text: str, section_path: str) -> Optional[str]:
    """
    Return the LaTeX source of one section, subsection, or subsubsection.

    Args:
        text: The LaTeX content
        section_path: Section name or path (e.g., "Methods" or "Methods > Background")

    Returns:
        The slice of ``text`` covered by the section, nested subsections
        included and trailing whitespace removed, or None if not found.
    """
    target = find_section_by_path(parse_section_tree(text), section_path)
    return text[target.start_pos:target.end_pos].rstrip() if target else None
208
346
 
209
347
 
210
348
  def flatten_tex(directory: str, main_file: str) -> str:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: arxiv-to-prompt
3
- Version: 0.5.1
3
+ Version: 0.6.0
4
4
  Summary: transform arXiv papers into a single latex prompt for LLMs
5
5
  Author: Takashi Ishida
6
6
  License: MIT
@@ -54,11 +54,27 @@ arxiv-to-prompt 2303.08774 --no-comments --no-appendix
54
54
  # Process a local folder containing TeX files (instead of downloading from arXiv)
55
55
  arxiv-to-prompt --local-folder /path/to/tex/files
56
56
 
57
- # List all section names in the paper
58
- arxiv-to-prompt 2303.08774 --list-sections
59
-
60
- # Extract only specific sections
61
- arxiv-to-prompt 2303.08774 --section "Introduction" --section "Methods"
57
+ # List all sections (with subsections indented)
58
+ arxiv-to-prompt 2307.09288 --list-sections
59
+ # Introduction
60
+ # Pretraining
61
+ # Pretraining Data
62
+ # Training Details
63
+ # Training Hardware \& Carbon Footprint
64
+ # ...
65
+
66
+ # Extract specific sections
67
+ arxiv-to-prompt 2307.09288 --section "Introduction" --section "Pretraining"
68
+
69
+ # Ambiguous names show a helpful error
70
+ arxiv-to-prompt 2307.09288 --section "Human Evaluation"
71
+ # Warning: 'Human Evaluation' is ambiguous. Found at:
72
+ # - Fine-tuning > RLHF Results > Human Evaluation
73
+ # - Appendix > Additional Details for Fine-tuning > Human Evaluation
74
+ # Use path notation to disambiguate.
75
+
76
+ # Use path notation when the same name appears multiple times
77
+ arxiv-to-prompt 2307.09288 --section "Fine-tuning > RLHF Results > Human Evaluation"
62
78
 
63
79
  # Copy to clipboard
64
80
  arxiv-to-prompt 2303.08774 | pbcopy
@@ -0,0 +1,9 @@
1
+ arxiv_to_prompt/__init__.py,sha256=LbfYhirPwhaMpwV4-YgMwW6hA0GOQDHVCPYCPKabjw0,1169
2
+ arxiv_to_prompt/cli.py,sha256=0a0DoOYkKIp8mE_FqzVYmG2gvCtnFiIJtIlfZLkZu5g,3865
3
+ arxiv_to_prompt/core.py,sha256=kI0xKTf1igeOxNACJVOtq6PlCoN6kYuTq9KfD4jzE1M,18352
4
+ arxiv_to_prompt-0.6.0.dist-info/licenses/LICENSE,sha256=np8L3--VyxwVJa_8D_mfK4RYrtnRMM_eeYN3rM4PMHo,1071
5
+ arxiv_to_prompt-0.6.0.dist-info/METADATA,sha256=VRGqZuboa4DCMzQ2xpAN_G7SVFdOm1YyJ6cor62lr5k,5376
6
+ arxiv_to_prompt-0.6.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
7
+ arxiv_to_prompt-0.6.0.dist-info/entry_points.txt,sha256=iYEEn8xZ_5OkhNIs5HCyHSQBpDRJkbD5h0tlAb16lL0,61
8
+ arxiv_to_prompt-0.6.0.dist-info/top_level.txt,sha256=JClbu_lGGWu3RaTHZlNqTKB1-DUSbYXQNIYmJ9_F7fY,16
9
+ arxiv_to_prompt-0.6.0.dist-info/RECORD,,
@@ -1,9 +0,0 @@
1
- arxiv_to_prompt/__init__.py,sha256=LbfYhirPwhaMpwV4-YgMwW6hA0GOQDHVCPYCPKabjw0,1169
2
- arxiv_to_prompt/cli.py,sha256=IwT64A-lf5PrxCxs2e1adN09USkf7ji31uzO8YAegpU,3203
3
- arxiv_to_prompt/core.py,sha256=ln67k1MT-l8PalwGsszU6IwCZ15GAOiX0yfLgyKvySA,13837
4
- arxiv_to_prompt-0.5.1.dist-info/licenses/LICENSE,sha256=np8L3--VyxwVJa_8D_mfK4RYrtnRMM_eeYN3rM4PMHo,1071
5
- arxiv_to_prompt-0.5.1.dist-info/METADATA,sha256=VKK7my5pxFuVLTejMV3vS8BLhk_kV62HHPWxC84_80Q,4786
6
- arxiv_to_prompt-0.5.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
7
- arxiv_to_prompt-0.5.1.dist-info/entry_points.txt,sha256=iYEEn8xZ_5OkhNIs5HCyHSQBpDRJkbD5h0tlAb16lL0,61
8
- arxiv_to_prompt-0.5.1.dist-info/top_level.txt,sha256=JClbu_lGGWu3RaTHZlNqTKB1-DUSbYXQNIYmJ9_F7fY,16
9
- arxiv_to_prompt-0.5.1.dist-info/RECORD,,