arxiv-to-prompt 0.5.0__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: arxiv-to-prompt
3
- Version: 0.5.0
3
+ Version: 0.6.0
4
4
  Summary: transform arXiv papers into a single latex prompt for LLMs
5
5
  Author: Takashi Ishida
6
6
  License: MIT
@@ -54,11 +54,27 @@ arxiv-to-prompt 2303.08774 --no-comments --no-appendix
54
54
  # Process a local folder containing TeX files (instead of downloading from arXiv)
55
55
  arxiv-to-prompt --local-folder /path/to/tex/files
56
56
 
57
- # List all section names in the paper
58
- arxiv-to-prompt 2303.08774 --list-sections
59
-
60
- # Extract only specific sections
61
- arxiv-to-prompt 2303.08774 --section "Introduction" --section "Methods"
57
+ # List all sections (with subsections indented)
58
+ arxiv-to-prompt 2307.09288 --list-sections
59
+ # Introduction
60
+ # Pretraining
61
+ # Pretraining Data
62
+ # Training Details
63
+ # Training Hardware \& Carbon Footprint
64
+ # ...
65
+
66
+ # Extract specific sections
67
+ arxiv-to-prompt 2307.09288 --section "Introduction" --section "Pretraining"
68
+
69
+ # Ambiguous names produce a helpful warning
70
+ arxiv-to-prompt 2307.09288 --section "Human Evaluation"
71
+ # Warning: 'Human Evaluation' is ambiguous. Found at:
72
+ # - Fine-tuning > RLHF Results > Human Evaluation
73
+ # - Appendix > Additional Details for Fine-tuning > Human Evaluation
74
+ # Use path notation to disambiguate.
75
+
76
+ # Use path notation when the same name appears multiple times
77
+ arxiv-to-prompt 2307.09288 --section "Fine-tuning > RLHF Results > Human Evaluation"
62
78
 
63
79
  # Copy to clipboard
64
80
  arxiv-to-prompt 2303.08774 | pbcopy
@@ -35,11 +35,27 @@ arxiv-to-prompt 2303.08774 --no-comments --no-appendix
35
35
  # Process a local folder containing TeX files (instead of downloading from arXiv)
36
36
  arxiv-to-prompt --local-folder /path/to/tex/files
37
37
 
38
- # List all section names in the paper
39
- arxiv-to-prompt 2303.08774 --list-sections
40
-
41
- # Extract only specific sections
42
- arxiv-to-prompt 2303.08774 --section "Introduction" --section "Methods"
38
+ # List all sections (with subsections indented)
39
+ arxiv-to-prompt 2307.09288 --list-sections
40
+ # Introduction
41
+ # Pretraining
42
+ # Pretraining Data
43
+ # Training Details
44
+ # Training Hardware \& Carbon Footprint
45
+ # ...
46
+
47
+ # Extract specific sections
48
+ arxiv-to-prompt 2307.09288 --section "Introduction" --section "Pretraining"
49
+
50
+ # Ambiguous names produce a helpful warning
51
+ arxiv-to-prompt 2307.09288 --section "Human Evaluation"
52
+ # Warning: 'Human Evaluation' is ambiguous. Found at:
53
+ # - Fine-tuning > RLHF Results > Human Evaluation
54
+ # - Appendix > Additional Details for Fine-tuning > Human Evaluation
55
+ # Use path notation to disambiguate.
56
+
57
+ # Use path notation when the same name appears multiple times
58
+ arxiv-to-prompt 2307.09288 --section "Fine-tuning > RLHF Results > Human Evaluation"
43
59
 
44
60
  # Copy to clipboard
45
61
  arxiv-to-prompt 2303.08774 | pbcopy
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "arxiv-to-prompt"
7
- version = "0.5.0"
7
+ version = "0.6.0"
8
8
  description = "transform arXiv papers into a single latex prompt for LLMs"
9
9
  readme = "README.md"
10
10
  authors = [{ name = "Takashi Ishida" }]
@@ -1,6 +1,14 @@
1
1
  import argparse
2
2
  import re
3
- from .core import process_latex_source, get_default_cache_dir, list_sections, extract_section
3
+ from .core import (
4
+ process_latex_source,
5
+ get_default_cache_dir,
6
+ list_sections,
7
+ extract_section,
8
+ parse_section_tree,
9
+ format_section_tree,
10
+ find_all_by_name,
11
+ )
4
12
 
5
13
 
6
14
  def extract_arxiv_id(input_str: str) -> str:
@@ -79,17 +87,28 @@ def main():
79
87
  return
80
88
 
81
89
  if args.list_sections:
82
- sections = list_sections(content)
83
- for section in sections:
84
- print(section)
90
+ tree = parse_section_tree(content)
91
+ print(format_section_tree(tree))
85
92
  elif args.section:
93
+ import sys
94
+ tree = parse_section_tree(content)
86
95
  extracted = []
87
- for section_name in args.section:
88
- section_content = extract_section(content, section_name)
96
+ for section_path in args.section:
97
+ # Check for ambiguity only if not using path notation
98
+ if " > " not in section_path:
99
+ matching_paths = find_all_by_name(tree, section_path)
100
+ if len(matching_paths) > 1:
101
+ print(f"Warning: '{section_path}' is ambiguous. Found at:", file=sys.stderr)
102
+ for path in matching_paths:
103
+ print(f" - {path}", file=sys.stderr)
104
+ print("Use path notation to disambiguate.", file=sys.stderr)
105
+ continue
106
+
107
+ section_content = extract_section(content, section_path)
89
108
  if section_content:
90
109
  extracted.append(section_content)
91
110
  else:
92
- print(f"Warning: Section '{section_name}' not found", file=__import__('sys').stderr)
111
+ print(f"Warning: Section '{section_path}' not found", file=sys.stderr)
93
112
  if extracted:
94
113
  print("\n\n".join(extracted))
95
114
  else:
@@ -3,6 +3,7 @@ import os
3
3
  import tarfile
4
4
  import shutil
5
5
  from typing import Optional, List
6
+ from dataclasses import dataclass, field
6
7
  import re
7
8
  from pathlib import Path
8
9
  import requests
@@ -92,40 +93,55 @@ def download_arxiv_source(arxiv_id: str, cache_dir: Optional[str] = None, use_ca
92
93
 
93
94
  def find_main_tex(directory: str) -> Optional[str]:
94
95
  """
95
- Find the main .tex file containing documentclass.
96
+ Find the main .tex file containing documentclass.
97
+ Searches recursively through subdirectories.
96
98
  First checks for common naming conventions (main.tex, paper.tex, index.tex).
97
- If none found, returns the filename of the longest .tex file containing documentclass,
98
- since shorter files are typically conference templates or supplementary documents
99
+ If none found, returns the path of the longest .tex file containing documentclass,
100
+ since shorter files are typically conference templates or supplementary documents
99
101
  rather than the main manuscript.
100
102
  """
101
103
  common_names = ['main.tex', 'paper.tex', 'index.tex']
102
104
  main_tex_file = None
103
105
  max_line_count = 0
104
106
 
105
- # First pass: check for common naming conventions
106
- for file_name in os.listdir(directory):
107
- if file_name in common_names:
108
- try:
109
- with open(os.path.join(directory, file_name), 'r', encoding='utf-8') as file:
110
- lines = file.readlines()
111
- if any('\\documentclass' in line for line in lines):
112
- return file_name
113
- except Exception as e:
114
- logging.warning(f"Could not read file {file_name}: {e}")
107
+ # Walk through directory and subdirectories
108
+ for root, dirs, files in os.walk(directory):
109
+ rel_root = os.path.relpath(root, directory)
110
+
111
+ # First pass: check for common naming conventions
112
+ for file_name in files:
113
+ if file_name in common_names:
114
+ file_path = os.path.join(root, file_name)
115
+ try:
116
+ with open(file_path, 'r', encoding='utf-8') as file:
117
+ lines = file.readlines()
118
+ if any('\\documentclass' in line for line in lines):
119
+ if rel_root == '.':
120
+ return file_name
121
+ return os.path.join(rel_root, file_name)
122
+ except Exception as e:
123
+ logging.warning(f"Could not read file {file_path}: {e}")
115
124
 
116
125
  # Second pass: find the longest .tex file containing documentclass
117
- for file_name in os.listdir(directory):
118
- if file_name.endswith('.tex'):
119
- try:
120
- with open(os.path.join(directory, file_name), 'r', encoding='utf-8') as file:
121
- lines = file.readlines()
122
- if any('\\documentclass' in line for line in lines):
123
- line_count = len(lines)
124
- if line_count > max_line_count:
125
- main_tex_file = file_name
126
- max_line_count = line_count
127
- except Exception as e:
128
- logging.warning(f"Could not read file {file_name}: {e}")
126
+ for root, dirs, files in os.walk(directory):
127
+ rel_root = os.path.relpath(root, directory)
128
+
129
+ for file_name in files:
130
+ if file_name.endswith('.tex'):
131
+ file_path = os.path.join(root, file_name)
132
+ try:
133
+ with open(file_path, 'r', encoding='utf-8') as file:
134
+ lines = file.readlines()
135
+ if any('\\documentclass' in line for line in lines):
136
+ line_count = len(lines)
137
+ if line_count > max_line_count:
138
+ if rel_root == '.':
139
+ main_tex_file = file_name
140
+ else:
141
+ main_tex_file = os.path.join(rel_root, file_name)
142
+ max_line_count = line_count
143
+ except Exception as e:
144
+ logging.warning(f"Could not read file {file_path}: {e}")
129
145
 
130
146
  return main_tex_file
131
147
 
@@ -171,25 +187,162 @@ def list_sections(text: str) -> list:
171
187
  return re.findall(pattern, text)
172
188
 
173
189
 
174
- def extract_section(text: str, section_name: str) -> Optional[str]:
175
- """Extract content of a specific section (including its subsections)."""
176
- # Find the start of the requested section
177
- pattern = rf'\\section\*?\{{{re.escape(section_name)}\}}'
178
- start_match = re.search(pattern, text)
179
- if not start_match:
180
- return None
190
+ @dataclass
191
+ class SectionNode:
192
+ """Represents a section/subsection/subsubsection in the LaTeX document tree."""
193
+ level: int # 0=section, 1=subsection, 2=subsubsection
194
+ name: str
195
+ start_pos: int
196
+ end_pos: int = -1 # -1 means end of document
197
+ children: List['SectionNode'] = field(default_factory=list)
198
+ parent: Optional['SectionNode'] = None
181
199
 
182
- start_pos = start_match.start()
183
200
 
184
- # Find the next \section (not subsection) or end of document
185
- remaining = text[start_match.end():]
186
- end_match = re.search(r'\\section\*?\{', remaining)
201
+ def parse_section_tree(text: str) -> List[SectionNode]:
202
+ """
203
+ Build a hierarchical tree from LaTeX section commands.
187
204
 
188
- if end_match:
189
- end_pos = start_match.end() + end_match.start()
190
- return text[start_pos:end_pos].rstrip()
191
- else:
192
- return text[start_pos:].rstrip()
205
+ Returns a list of top-level section nodes, each containing their subsections as children.
206
+ """
207
+ # Match section, subsection, and subsubsection commands
208
+ pattern = r'\\(section|subsection|subsubsection)\*?\{([^}]+)\}'
209
+
210
+ level_map = {'section': 0, 'subsection': 1, 'subsubsection': 2}
211
+
212
+ # Find all section commands with their positions
213
+ matches = list(re.finditer(pattern, text))
214
+
215
+ if not matches:
216
+ return []
217
+
218
+ # Create nodes for all sections
219
+ all_nodes = []
220
+ for match in matches:
221
+ level = level_map[match.group(1)]
222
+ name = match.group(2)
223
+ start_pos = match.start()
224
+ all_nodes.append(SectionNode(level=level, name=name, start_pos=start_pos))
225
+
226
+ # Calculate end positions (each section ends where the next same-or-higher level starts)
227
+ for i, node in enumerate(all_nodes):
228
+ # Find next section at same or higher (lower number) level
229
+ for j in range(i + 1, len(all_nodes)):
230
+ if all_nodes[j].level <= node.level:
231
+ node.end_pos = all_nodes[j].start_pos
232
+ break
233
+ # If no next section found at same/higher level, end at document end
234
+ if node.end_pos == -1:
235
+ node.end_pos = len(text)
236
+
237
+ # Build tree structure
238
+ root_nodes: List[SectionNode] = []
239
+ section_stack: List[SectionNode] = []
240
+
241
+ for node in all_nodes:
242
+ # Pop from stack until we find a parent at a higher level
243
+ while section_stack and section_stack[-1].level >= node.level:
244
+ section_stack.pop()
245
+
246
+ if section_stack:
247
+ # This node is a child of the top of the stack
248
+ node.parent = section_stack[-1]
249
+ section_stack[-1].children.append(node)
250
+ else:
251
+ # This is a root node
252
+ root_nodes.append(node)
253
+
254
+ section_stack.append(node)
255
+
256
+ return root_nodes
257
+
258
+
259
+ def format_section_tree(nodes: List[SectionNode], indent: int = 0) -> str:
260
+ """
261
+ Format section tree with indentation for display.
262
+
263
+ Returns a string with each section name on its own line, indented by level.
264
+ """
265
+ lines = []
266
+ for node in nodes:
267
+ lines.append(" " * indent + node.name)
268
+ if node.children:
269
+ lines.append(format_section_tree(node.children, indent + 1))
270
+ return "\n".join(lines)
271
+
272
+
273
+ def find_all_by_name(nodes: List[SectionNode], name: str, parent_path: str = "") -> List[str]:
274
+ """
275
+ Find all paths to sections with the given name.
276
+
277
+ Returns a list of full paths (e.g., ["Introduction > Background", "Methods > Background"])
278
+ """
279
+ results = []
280
+ for node in nodes:
281
+ current_path = f"{parent_path} > {node.name}" if parent_path else node.name
282
+ if node.name == name:
283
+ results.append(current_path)
284
+ if node.children:
285
+ results.extend(find_all_by_name(node.children, name, current_path))
286
+ return results
287
+
288
+
289
+ def find_section_by_path(nodes: List[SectionNode], path: str) -> Optional[SectionNode]:
290
+ """
291
+ Find a section by path notation (e.g., "Methods > Background").
292
+
293
+ If path contains no " > ", searches for an exact name match at any level.
294
+ If path contains " > ", follows the hierarchy.
295
+ """
296
+ parts = [p.strip() for p in path.split(" > ")]
297
+
298
+ if len(parts) == 1:
299
+ # Simple name lookup - find first match at any level
300
+ def find_first(nodes: List[SectionNode], name: str) -> Optional[SectionNode]:
301
+ for node in nodes:
302
+ if node.name == name:
303
+ return node
304
+ if node.children:
305
+ result = find_first(node.children, name)
306
+ if result:
307
+ return result
308
+ return None
309
+ return find_first(nodes, parts[0])
310
+
311
+ # Path notation - follow the hierarchy
312
+ current_nodes = nodes
313
+ current_node = None
314
+
315
+ for part in parts:
316
+ found = None
317
+ for node in current_nodes:
318
+ if node.name == part:
319
+ found = node
320
+ break
321
+ if not found:
322
+ return None
323
+ current_node = found
324
+ current_nodes = found.children
325
+
326
+ return current_node
327
+
328
+
329
+ def extract_section(text: str, section_path: str) -> Optional[str]:
330
+ """
331
+ Extract content of a specific section, subsection, or subsubsection.
332
+
333
+ Args:
334
+ text: The LaTeX content
335
+ section_path: Section name or path (e.g., "Methods" or "Methods > Background")
336
+
337
+ Returns:
338
+ The section content including any subsections, or None if not found.
339
+ """
340
+ tree = parse_section_tree(text)
341
+ node = find_section_by_path(tree, section_path)
342
+ if not node:
343
+ return None
344
+
345
+ return text[node.start_pos:node.end_pos].rstrip()
193
346
 
194
347
 
195
348
  def flatten_tex(directory: str, main_file: str) -> str:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: arxiv-to-prompt
3
- Version: 0.5.0
3
+ Version: 0.6.0
4
4
  Summary: transform arXiv papers into a single latex prompt for LLMs
5
5
  Author: Takashi Ishida
6
6
  License: MIT
@@ -54,11 +54,27 @@ arxiv-to-prompt 2303.08774 --no-comments --no-appendix
54
54
  # Process a local folder containing TeX files (instead of downloading from arXiv)
55
55
  arxiv-to-prompt --local-folder /path/to/tex/files
56
56
 
57
- # List all section names in the paper
58
- arxiv-to-prompt 2303.08774 --list-sections
59
-
60
- # Extract only specific sections
61
- arxiv-to-prompt 2303.08774 --section "Introduction" --section "Methods"
57
+ # List all sections (with subsections indented)
58
+ arxiv-to-prompt 2307.09288 --list-sections
59
+ # Introduction
60
+ # Pretraining
61
+ # Pretraining Data
62
+ # Training Details
63
+ # Training Hardware \& Carbon Footprint
64
+ # ...
65
+
66
+ # Extract specific sections
67
+ arxiv-to-prompt 2307.09288 --section "Introduction" --section "Pretraining"
68
+
69
+ # Ambiguous names produce a helpful warning
70
+ arxiv-to-prompt 2307.09288 --section "Human Evaluation"
71
+ # Warning: 'Human Evaluation' is ambiguous. Found at:
72
+ # - Fine-tuning > RLHF Results > Human Evaluation
73
+ # - Appendix > Additional Details for Fine-tuning > Human Evaluation
74
+ # Use path notation to disambiguate.
75
+
76
+ # Use path notation when the same name appears multiple times
77
+ arxiv-to-prompt 2307.09288 --section "Fine-tuning > RLHF Results > Human Evaluation"
62
78
 
63
79
  # Copy to clipboard
64
80
  arxiv-to-prompt 2303.08774 | pbcopy
@@ -12,6 +12,11 @@ from arxiv_to_prompt.core import (
12
12
  remove_appendix,
13
13
  list_sections,
14
14
  extract_section,
15
+ SectionNode,
16
+ parse_section_tree,
17
+ format_section_tree,
18
+ find_all_by_name,
19
+ find_section_by_path,
15
20
  )
16
21
  from arxiv_to_prompt.cli import extract_arxiv_id
17
22
 
@@ -153,6 +158,23 @@ def test_find_main_tex(temp_cache_dir):
153
158
  assert found_main == "main.tex"
154
159
 
155
160
 
161
+ def test_find_main_tex_in_subdirectory(temp_cache_dir):
162
+ """Test finding main tex file in a subdirectory."""
163
+ # Create test directory with subdirectory
164
+ tex_dir = temp_cache_dir / "test_tex_subdir"
165
+ tex_dir.mkdir(parents=True)
166
+ subdir = tex_dir / "paper"
167
+ subdir.mkdir()
168
+
169
+ # Create main.tex in subdirectory
170
+ main_file = subdir / "main.tex"
171
+ main_file.write_text("\\documentclass{article}\n\\begin{document}\nHello\n\\end{document}")
172
+
173
+ # Test finding main file in subdirectory
174
+ found_main = find_main_tex(str(tex_dir))
175
+ assert found_main == os.path.join("paper", "main.tex")
176
+
177
+
156
178
  def test_commented_input_commands(temp_cache_dir):
157
179
  """Test that commented-out \\include and \\input commands are ignored."""
158
180
  # Create test directory and files
@@ -361,3 +383,256 @@ Results here.
361
383
  results = extract_section(text, "Results")
362
384
  assert results is not None
363
385
  assert "Results here." in results
386
+
387
+
388
+ def test_parse_section_tree():
389
+ """Test parsing LaTeX into a hierarchical section tree."""
390
+ text = r"""
391
+ \section{Introduction}
392
+ Intro text.
393
+ \subsection{Background}
394
+ Background text.
395
+ \subsection{Motivation}
396
+ Motivation text.
397
+ \section{Methods}
398
+ Methods text.
399
+ \subsection{Background}
400
+ Methods background.
401
+ \subsubsection{Details}
402
+ Details text.
403
+ \subsection{Data Collection}
404
+ Data text.
405
+ \section{Results}
406
+ Results text.
407
+ """
408
+ tree = parse_section_tree(text)
409
+
410
+ # Should have 3 top-level sections
411
+ assert len(tree) == 3
412
+ assert tree[0].name == "Introduction"
413
+ assert tree[1].name == "Methods"
414
+ assert tree[2].name == "Results"
415
+
416
+ # Introduction should have 2 subsections
417
+ assert len(tree[0].children) == 2
418
+ assert tree[0].children[0].name == "Background"
419
+ assert tree[0].children[1].name == "Motivation"
420
+
421
+ # Methods should have 2 subsections
422
+ assert len(tree[1].children) == 2
423
+ assert tree[1].children[0].name == "Background"
424
+ assert tree[1].children[1].name == "Data Collection"
425
+
426
+ # Methods > Background should have 1 subsubsection
427
+ assert len(tree[1].children[0].children) == 1
428
+ assert tree[1].children[0].children[0].name == "Details"
429
+
430
+ # Results should have no subsections
431
+ assert len(tree[2].children) == 0
432
+
433
+
434
+ def test_parse_section_tree_levels():
435
+ """Test that section levels are correctly assigned."""
436
+ text = r"""
437
+ \section{Sec}
438
+ \subsection{Subsec}
439
+ \subsubsection{Subsubsec}
440
+ """
441
+ tree = parse_section_tree(text)
442
+
443
+ assert tree[0].level == 0
444
+ assert tree[0].children[0].level == 1
445
+ assert tree[0].children[0].children[0].level == 2
446
+
447
+
448
+ def test_format_section_tree():
449
+ """Test formatting section tree with indentation."""
450
+ text = r"""
451
+ \section{Introduction}
452
+ \subsection{Background}
453
+ \section{Methods}
454
+ \subsection{Data}
455
+ \subsubsection{Collection}
456
+ """
457
+ tree = parse_section_tree(text)
458
+ output = format_section_tree(tree)
459
+
460
+ lines = output.split('\n')
461
+ assert lines[0] == "Introduction"
462
+ assert lines[1] == " Background"
463
+ assert lines[2] == "Methods"
464
+ assert lines[3] == " Data"
465
+ assert lines[4] == " Collection"
466
+
467
+
468
+ def test_find_all_by_name():
469
+ """Test finding all paths to sections with a given name."""
470
+ text = r"""
471
+ \section{Introduction}
472
+ \subsection{Background}
473
+ \section{Methods}
474
+ \subsection{Background}
475
+ \section{Results}
476
+ """
477
+ tree = parse_section_tree(text)
478
+
479
+ # Background appears twice under different parents
480
+ paths = find_all_by_name(tree, "Background")
481
+ assert len(paths) == 2
482
+ assert "Introduction > Background" in paths
483
+ assert "Methods > Background" in paths
484
+
485
+ # Unique name
486
+ paths = find_all_by_name(tree, "Results")
487
+ assert paths == ["Results"]
488
+
489
+ # Non-existent name
490
+ paths = find_all_by_name(tree, "Discussion")
491
+ assert paths == []
492
+
493
+
494
+ def test_find_section_by_path_simple():
495
+ """Test finding section by simple name."""
496
+ text = r"""
497
+ \section{Introduction}
498
+ \section{Methods}
499
+ \subsection{Data}
500
+ """
501
+ tree = parse_section_tree(text)
502
+
503
+ # Find by simple name
504
+ node = find_section_by_path(tree, "Introduction")
505
+ assert node is not None
506
+ assert node.name == "Introduction"
507
+
508
+ # Find subsection by simple name
509
+ node = find_section_by_path(tree, "Data")
510
+ assert node is not None
511
+ assert node.name == "Data"
512
+
513
+
514
+ def test_find_section_by_path_notation():
515
+ """Test finding section by path notation."""
516
+ text = r"""
517
+ \section{Introduction}
518
+ \subsection{Background}
519
+ \section{Methods}
520
+ \subsection{Background}
521
+ """
522
+ tree = parse_section_tree(text)
523
+
524
+ # Find by path notation
525
+ node = find_section_by_path(tree, "Introduction > Background")
526
+ assert node is not None
527
+ assert node.name == "Background"
528
+ assert node.parent.name == "Introduction"
529
+
530
+ node = find_section_by_path(tree, "Methods > Background")
531
+ assert node is not None
532
+ assert node.name == "Background"
533
+ assert node.parent.name == "Methods"
534
+
535
+
536
+ def test_find_section_by_path_not_found():
537
+ """Test that non-existent paths return None."""
538
+ text = r"""
539
+ \section{Introduction}
540
+ \subsection{Background}
541
+ """
542
+ tree = parse_section_tree(text)
543
+
544
+ assert find_section_by_path(tree, "NonExistent") is None
545
+ assert find_section_by_path(tree, "Introduction > NonExistent") is None
546
+ assert find_section_by_path(tree, "NonExistent > Background") is None
547
+
548
+
549
+ def test_extract_section_with_path():
550
+ """Test extracting section using path notation."""
551
+ text = r"""
552
+ \section{Introduction}
553
+ Intro text.
554
+ \subsection{Background}
555
+ Intro background.
556
+ \section{Methods}
557
+ Methods text.
558
+ \subsection{Background}
559
+ Methods background.
560
+ \section{Results}
561
+ Results text.
562
+ """
563
+ # Extract using path notation
564
+ content = extract_section(text, "Introduction > Background")
565
+ assert content is not None
566
+ assert "Intro background." in content
567
+ assert "Methods background." not in content
568
+
569
+ content = extract_section(text, "Methods > Background")
570
+ assert content is not None
571
+ assert "Methods background." in content
572
+ assert "Intro background." not in content
573
+
574
+
575
+ def test_extract_subsection_boundaries():
576
+ """Test that subsection extraction stops at correct boundary."""
577
+ text = r"""
578
+ \section{Methods}
579
+ Methods intro.
580
+ \subsection{First}
581
+ First content.
582
+ \subsection{Second}
583
+ Second content.
584
+ \section{Results}
585
+ Results content.
586
+ """
587
+ # Extract first subsection - should stop at second subsection
588
+ content = extract_section(text, "First")
589
+ assert content is not None
590
+ assert "First content." in content
591
+ assert "Second content." not in content
592
+
593
+ # Extract second subsection - should stop at Results section
594
+ content = extract_section(text, "Second")
595
+ assert content is not None
596
+ assert "Second content." in content
597
+ assert "Results content." not in content
598
+
599
+
600
+ def test_extract_section_includes_subsections():
601
+ """Test that extracting a section includes all its subsections."""
602
+ text = r"""
603
+ \section{Methods}
604
+ Methods intro.
605
+ \subsection{Data}
606
+ Data info.
607
+ \subsubsection{Collection}
608
+ Collection details.
609
+ \subsection{Analysis}
610
+ Analysis info.
611
+ \section{Results}
612
+ Results content.
613
+ """
614
+ content = extract_section(text, "Methods")
615
+ assert content is not None
616
+ assert "Methods intro." in content
617
+ assert "Data info." in content
618
+ assert "Collection details." in content
619
+ assert "Analysis info." in content
620
+ assert "Results content." not in content
621
+
622
+
623
+ def test_section_tree_with_starred_sections():
624
+ """Test that starred sections are correctly parsed."""
625
+ text = r"""
626
+ \section*{Introduction}
627
+ Intro.
628
+ \subsection*{Background}
629
+ Background.
630
+ \section{Methods}
631
+ Methods.
632
+ """
633
+ tree = parse_section_tree(text)
634
+
635
+ assert len(tree) == 2
636
+ assert tree[0].name == "Introduction"
637
+ assert tree[0].children[0].name == "Background"
638
+ assert tree[1].name == "Methods"
File without changes