arxiv-to-prompt 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
arxiv_to_prompt/cli.py CHANGED
@@ -1,6 +1,14 @@
1
1
  import argparse
2
2
  import re
3
- from .core import process_latex_source, get_default_cache_dir, list_sections, extract_section
3
+ from .core import (
4
+ process_latex_source,
5
+ get_default_cache_dir,
6
+ list_sections,
7
+ extract_section,
8
+ parse_section_tree,
9
+ format_section_tree,
10
+ find_all_by_name,
11
+ )
4
12
 
5
13
 
6
14
  def extract_arxiv_id(input_str: str) -> str:
@@ -79,17 +87,28 @@ def main():
79
87
  return
80
88
 
81
89
  if args.list_sections:
82
- sections = list_sections(content)
83
- for section in sections:
84
- print(section)
90
+ tree = parse_section_tree(content)
91
+ print(format_section_tree(tree))
85
92
  elif args.section:
93
+ import sys
94
+ tree = parse_section_tree(content)
86
95
  extracted = []
87
- for section_name in args.section:
88
- section_content = extract_section(content, section_name)
96
+ for section_path in args.section:
97
+ # Check for ambiguity only if not using path notation
98
+ if " > " not in section_path:
99
+ matching_paths = find_all_by_name(tree, section_path)
100
+ if len(matching_paths) > 1:
101
+ print(f"Warning: '{section_path}' is ambiguous. Found at:", file=sys.stderr)
102
+ for path in matching_paths:
103
+ print(f" - {path}", file=sys.stderr)
104
+ print("Use path notation to disambiguate.", file=sys.stderr)
105
+ continue
106
+
107
+ section_content = extract_section(content, section_path)
89
108
  if section_content:
90
109
  extracted.append(section_content)
91
110
  else:
92
- print(f"Warning: Section '{section_name}' not found", file=__import__('sys').stderr)
111
+ print(f"Warning: Section '{section_path}' not found", file=sys.stderr)
93
112
  if extracted:
94
113
  print("\n\n".join(extracted))
95
114
  else:
arxiv_to_prompt/core.py CHANGED
@@ -3,6 +3,7 @@ import os
3
3
  import tarfile
4
4
  import shutil
5
5
  from typing import Optional, List
6
+ from dataclasses import dataclass, field
6
7
  import re
7
8
  from pathlib import Path
8
9
  import requests
@@ -92,40 +93,55 @@ def download_arxiv_source(arxiv_id: str, cache_dir: Optional[str] = None, use_ca
92
93
 
93
94
  def find_main_tex(directory: str) -> Optional[str]:
94
95
  """
95
- Find the main .tex file containing documentclass.
96
+ Find the main .tex file containing documentclass.
97
+ Searches recursively through subdirectories.
96
98
  First checks for common naming conventions (main.tex, paper.tex, index.tex).
97
- If none found, returns the filename of the longest .tex file containing documentclass,
98
- since shorter files are typically conference templates or supplementary documents
99
+ If none found, returns the path of the longest .tex file containing documentclass,
100
+ since shorter files are typically conference templates or supplementary documents
99
101
  rather than the main manuscript.
100
102
  """
101
103
  common_names = ['main.tex', 'paper.tex', 'index.tex']
102
104
  main_tex_file = None
103
105
  max_line_count = 0
104
106
 
105
- # First pass: check for common naming conventions
106
- for file_name in os.listdir(directory):
107
- if file_name in common_names:
108
- try:
109
- with open(os.path.join(directory, file_name), 'r', encoding='utf-8') as file:
110
- lines = file.readlines()
111
- if any('\\documentclass' in line for line in lines):
112
- return file_name
113
- except Exception as e:
114
- logging.warning(f"Could not read file {file_name}: {e}")
107
+ # Walk through directory and subdirectories
108
+ for root, dirs, files in os.walk(directory):
109
+ rel_root = os.path.relpath(root, directory)
110
+
111
+ # First pass: check for common naming conventions
112
+ for file_name in files:
113
+ if file_name in common_names:
114
+ file_path = os.path.join(root, file_name)
115
+ try:
116
+ with open(file_path, 'r', encoding='utf-8') as file:
117
+ lines = file.readlines()
118
+ if any('\\documentclass' in line for line in lines):
119
+ if rel_root == '.':
120
+ return file_name
121
+ return os.path.join(rel_root, file_name)
122
+ except Exception as e:
123
+ logging.warning(f"Could not read file {file_path}: {e}")
115
124
 
116
125
  # Second pass: find the longest .tex file containing documentclass
117
- for file_name in os.listdir(directory):
118
- if file_name.endswith('.tex'):
119
- try:
120
- with open(os.path.join(directory, file_name), 'r', encoding='utf-8') as file:
121
- lines = file.readlines()
122
- if any('\\documentclass' in line for line in lines):
123
- line_count = len(lines)
124
- if line_count > max_line_count:
125
- main_tex_file = file_name
126
- max_line_count = line_count
127
- except Exception as e:
128
- logging.warning(f"Could not read file {file_name}: {e}")
126
+ for root, dirs, files in os.walk(directory):
127
+ rel_root = os.path.relpath(root, directory)
128
+
129
+ for file_name in files:
130
+ if file_name.endswith('.tex'):
131
+ file_path = os.path.join(root, file_name)
132
+ try:
133
+ with open(file_path, 'r', encoding='utf-8') as file:
134
+ lines = file.readlines()
135
+ if any('\\documentclass' in line for line in lines):
136
+ line_count = len(lines)
137
+ if line_count > max_line_count:
138
+ if rel_root == '.':
139
+ main_tex_file = file_name
140
+ else:
141
+ main_tex_file = os.path.join(rel_root, file_name)
142
+ max_line_count = line_count
143
+ except Exception as e:
144
+ logging.warning(f"Could not read file {file_path}: {e}")
129
145
 
130
146
  return main_tex_file
131
147
 
@@ -171,25 +187,162 @@ def list_sections(text: str) -> list:
171
187
  return re.findall(pattern, text)
172
188
 
173
189
 
174
- def extract_section(text: str, section_name: str) -> Optional[str]:
175
- """Extract content of a specific section (including its subsections)."""
176
- # Find the start of the requested section
177
- pattern = rf'\\section\*?\{{{re.escape(section_name)}\}}'
178
- start_match = re.search(pattern, text)
179
- if not start_match:
180
- return None
190
+ @dataclass
191
+ class SectionNode:
192
+ """Represents a section/subsection/subsubsection in the LaTeX document tree."""
193
+ level: int # 0=section, 1=subsection, 2=subsubsection
194
+ name: str
195
+ start_pos: int
196
+ end_pos: int = -1 # -1 means end of document
197
+ children: List['SectionNode'] = field(default_factory=list)
198
+ parent: Optional['SectionNode'] = None
181
199
 
182
- start_pos = start_match.start()
183
200
 
184
- # Find the next \section (not subsection) or end of document
185
- remaining = text[start_match.end():]
186
- end_match = re.search(r'\\section\*?\{', remaining)
201
+ def parse_section_tree(text: str) -> List[SectionNode]:
202
+ """
203
+ Build a hierarchical tree from LaTeX section commands.
187
204
 
188
- if end_match:
189
- end_pos = start_match.end() + end_match.start()
190
- return text[start_pos:end_pos].rstrip()
191
- else:
192
- return text[start_pos:].rstrip()
205
+ Returns a list of top-level section nodes, each containing their subsections as children.
206
+ """
207
+ # Match section, subsection, and subsubsection commands
208
+ pattern = r'\\(section|subsection|subsubsection)\*?\{([^}]+)\}'
209
+
210
+ level_map = {'section': 0, 'subsection': 1, 'subsubsection': 2}
211
+
212
+ # Find all section commands with their positions
213
+ matches = list(re.finditer(pattern, text))
214
+
215
+ if not matches:
216
+ return []
217
+
218
+ # Create nodes for all sections
219
+ all_nodes = []
220
+ for match in matches:
221
+ level = level_map[match.group(1)]
222
+ name = match.group(2)
223
+ start_pos = match.start()
224
+ all_nodes.append(SectionNode(level=level, name=name, start_pos=start_pos))
225
+
226
+ # Calculate end positions (each section ends where the next same-or-higher level starts)
227
+ for i, node in enumerate(all_nodes):
228
+ # Find next section at same or higher (lower number) level
229
+ for j in range(i + 1, len(all_nodes)):
230
+ if all_nodes[j].level <= node.level:
231
+ node.end_pos = all_nodes[j].start_pos
232
+ break
233
+ # If no next section found at same/higher level, end at document end
234
+ if node.end_pos == -1:
235
+ node.end_pos = len(text)
236
+
237
+ # Build tree structure
238
+ root_nodes: List[SectionNode] = []
239
+ section_stack: List[SectionNode] = []
240
+
241
+ for node in all_nodes:
242
+ # Pop from stack until we find a parent at a higher level
243
+ while section_stack and section_stack[-1].level >= node.level:
244
+ section_stack.pop()
245
+
246
+ if section_stack:
247
+ # This node is a child of the top of the stack
248
+ node.parent = section_stack[-1]
249
+ section_stack[-1].children.append(node)
250
+ else:
251
+ # This is a root node
252
+ root_nodes.append(node)
253
+
254
+ section_stack.append(node)
255
+
256
+ return root_nodes
257
+
258
+
259
+ def format_section_tree(nodes: List[SectionNode], indent: int = 0) -> str:
260
+ """
261
+ Format section tree with indentation for display.
262
+
263
+ Returns a string with each section name on its own line, indented by level.
264
+ """
265
+ lines = []
266
+ for node in nodes:
267
+ lines.append(" " * indent + node.name)
268
+ if node.children:
269
+ lines.append(format_section_tree(node.children, indent + 1))
270
+ return "\n".join(lines)
271
+
272
+
273
+ def find_all_by_name(nodes: List[SectionNode], name: str, parent_path: str = "") -> List[str]:
274
+ """
275
+ Find all paths to sections with the given name.
276
+
277
+ Returns a list of full paths (e.g., ["Introduction > Background", "Methods > Background"])
278
+ """
279
+ results = []
280
+ for node in nodes:
281
+ current_path = f"{parent_path} > {node.name}" if parent_path else node.name
282
+ if node.name == name:
283
+ results.append(current_path)
284
+ if node.children:
285
+ results.extend(find_all_by_name(node.children, name, current_path))
286
+ return results
287
+
288
+
289
+ def find_section_by_path(nodes: List[SectionNode], path: str) -> Optional[SectionNode]:
290
+ """
291
+ Find a section by path notation (e.g., "Methods > Background").
292
+
293
+ If path contains no " > ", searches for an exact name match at any level.
294
+ If path contains " > ", follows the hierarchy.
295
+ """
296
+ parts = [p.strip() for p in path.split(" > ")]
297
+
298
+ if len(parts) == 1:
299
+ # Simple name lookup - find first match at any level
300
+ def find_first(nodes: List[SectionNode], name: str) -> Optional[SectionNode]:
301
+ for node in nodes:
302
+ if node.name == name:
303
+ return node
304
+ if node.children:
305
+ result = find_first(node.children, name)
306
+ if result:
307
+ return result
308
+ return None
309
+ return find_first(nodes, parts[0])
310
+
311
+ # Path notation - follow the hierarchy
312
+ current_nodes = nodes
313
+ current_node = None
314
+
315
+ for part in parts:
316
+ found = None
317
+ for node in current_nodes:
318
+ if node.name == part:
319
+ found = node
320
+ break
321
+ if not found:
322
+ return None
323
+ current_node = found
324
+ current_nodes = found.children
325
+
326
+ return current_node
327
+
328
+
329
+ def extract_section(text: str, section_path: str) -> Optional[str]:
330
+ """
331
+ Extract content of a specific section, subsection, or subsubsection.
332
+
333
+ Args:
334
+ text: The LaTeX content
335
+ section_path: Section name or path (e.g., "Methods" or "Methods > Background")
336
+
337
+ Returns:
338
+ The section content including any subsections, or None if not found.
339
+ """
340
+ tree = parse_section_tree(text)
341
+ node = find_section_by_path(tree, section_path)
342
+ if not node:
343
+ return None
344
+
345
+ return text[node.start_pos:node.end_pos].rstrip()
193
346
 
194
347
 
195
348
  def flatten_tex(directory: str, main_file: str) -> str:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: arxiv-to-prompt
3
- Version: 0.5.0
3
+ Version: 0.6.0
4
4
  Summary: transform arXiv papers into a single latex prompt for LLMs
5
5
  Author: Takashi Ishida
6
6
  License: MIT
@@ -54,11 +54,27 @@ arxiv-to-prompt 2303.08774 --no-comments --no-appendix
54
54
  # Process a local folder containing TeX files (instead of downloading from arXiv)
55
55
  arxiv-to-prompt --local-folder /path/to/tex/files
56
56
 
57
- # List all section names in the paper
58
- arxiv-to-prompt 2303.08774 --list-sections
59
-
60
- # Extract only specific sections
61
- arxiv-to-prompt 2303.08774 --section "Introduction" --section "Methods"
57
+ # List all sections (with subsections indented)
58
+ arxiv-to-prompt 2307.09288 --list-sections
59
+ # Introduction
60
+ # Pretraining
61
+ # Pretraining Data
62
+ # Training Details
63
+ # Training Hardware \& Carbon Footprint
64
+ # ...
65
+
66
+ # Extract specific sections
67
+ arxiv-to-prompt 2307.09288 --section "Introduction" --section "Pretraining"
68
+
69
+ # Ambiguous names show a helpful error
70
+ arxiv-to-prompt 2307.09288 --section "Human Evaluation"
71
+ # Warning: 'Human Evaluation' is ambiguous. Found at:
72
+ # - Fine-tuning > RLHF Results > Human Evaluation
73
+ # - Appendix > Additional Details for Fine-tuning > Human Evaluation
74
+ # Use path notation to disambiguate.
75
+
76
+ # Use path notation when the same name appears multiple times
77
+ arxiv-to-prompt 2307.09288 --section "Fine-tuning > RLHF Results > Human Evaluation"
62
78
 
63
79
  # Copy to clipboard
64
80
  arxiv-to-prompt 2303.08774 | pbcopy
@@ -0,0 +1,9 @@
1
+ arxiv_to_prompt/__init__.py,sha256=LbfYhirPwhaMpwV4-YgMwW6hA0GOQDHVCPYCPKabjw0,1169
2
+ arxiv_to_prompt/cli.py,sha256=0a0DoOYkKIp8mE_FqzVYmG2gvCtnFiIJtIlfZLkZu5g,3865
3
+ arxiv_to_prompt/core.py,sha256=kI0xKTf1igeOxNACJVOtq6PlCoN6kYuTq9KfD4jzE1M,18352
4
+ arxiv_to_prompt-0.6.0.dist-info/licenses/LICENSE,sha256=np8L3--VyxwVJa_8D_mfK4RYrtnRMM_eeYN3rM4PMHo,1071
5
+ arxiv_to_prompt-0.6.0.dist-info/METADATA,sha256=VRGqZuboa4DCMzQ2xpAN_G7SVFdOm1YyJ6cor62lr5k,5376
6
+ arxiv_to_prompt-0.6.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
7
+ arxiv_to_prompt-0.6.0.dist-info/entry_points.txt,sha256=iYEEn8xZ_5OkhNIs5HCyHSQBpDRJkbD5h0tlAb16lL0,61
8
+ arxiv_to_prompt-0.6.0.dist-info/top_level.txt,sha256=JClbu_lGGWu3RaTHZlNqTKB1-DUSbYXQNIYmJ9_F7fY,16
9
+ arxiv_to_prompt-0.6.0.dist-info/RECORD,,
@@ -1,9 +0,0 @@
1
- arxiv_to_prompt/__init__.py,sha256=LbfYhirPwhaMpwV4-YgMwW6hA0GOQDHVCPYCPKabjw0,1169
2
- arxiv_to_prompt/cli.py,sha256=IwT64A-lf5PrxCxs2e1adN09USkf7ji31uzO8YAegpU,3203
3
- arxiv_to_prompt/core.py,sha256=GafxYeE0dNg70hNG8BrSM7S99dIpHiy1KoNp5oW8niA,13119
4
- arxiv_to_prompt-0.5.0.dist-info/licenses/LICENSE,sha256=np8L3--VyxwVJa_8D_mfK4RYrtnRMM_eeYN3rM4PMHo,1071
5
- arxiv_to_prompt-0.5.0.dist-info/METADATA,sha256=4a66cO6DpNdd0dz3U_79QhL60Q1cAhHHyExWUqhL4eo,4786
6
- arxiv_to_prompt-0.5.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
7
- arxiv_to_prompt-0.5.0.dist-info/entry_points.txt,sha256=iYEEn8xZ_5OkhNIs5HCyHSQBpDRJkbD5h0tlAb16lL0,61
8
- arxiv_to_prompt-0.5.0.dist-info/top_level.txt,sha256=JClbu_lGGWu3RaTHZlNqTKB1-DUSbYXQNIYmJ9_F7fY,16
9
- arxiv_to_prompt-0.5.0.dist-info/RECORD,,