arxiv-to-prompt 0.5.1__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: arxiv-to-prompt
3
- Version: 0.5.1
3
+ Version: 0.6.0
4
4
  Summary: transform arXiv papers into a single latex prompt for LLMs
5
5
  Author: Takashi Ishida
6
6
  License: MIT
@@ -54,11 +54,27 @@ arxiv-to-prompt 2303.08774 --no-comments --no-appendix
54
54
  # Process a local folder containing TeX files (instead of downloading from arXiv)
55
55
  arxiv-to-prompt --local-folder /path/to/tex/files
56
56
 
57
- # List all section names in the paper
58
- arxiv-to-prompt 2303.08774 --list-sections
59
-
60
- # Extract only specific sections
61
- arxiv-to-prompt 2303.08774 --section "Introduction" --section "Methods"
57
+ # List all sections (with subsections indented)
58
+ arxiv-to-prompt 2307.09288 --list-sections
59
+ # Introduction
60
+ # Pretraining
61
+ # Pretraining Data
62
+ # Training Details
63
+ # Training Hardware \& Carbon Footprint
64
+ # ...
65
+
66
+ # Extract specific sections
67
+ arxiv-to-prompt 2307.09288 --section "Introduction" --section "Pretraining"
68
+
69
+ # Ambiguous names show a helpful error
70
+ arxiv-to-prompt 2307.09288 --section "Human Evaluation"
71
+ # Warning: 'Human Evaluation' is ambiguous. Found at:
72
+ # - Fine-tuning > RLHF Results > Human Evaluation
73
+ # - Appendix > Additional Details for Fine-tuning > Human Evaluation
74
+ # Use path notation to disambiguate.
75
+
76
+ # Use path notation when the same name appears multiple times
77
+ arxiv-to-prompt 2307.09288 --section "Fine-tuning > RLHF Results > Human Evaluation"
62
78
 
63
79
  # Copy to clipboard
64
80
  arxiv-to-prompt 2303.08774 | pbcopy
@@ -35,11 +35,27 @@ arxiv-to-prompt 2303.08774 --no-comments --no-appendix
35
35
  # Process a local folder containing TeX files (instead of downloading from arXiv)
36
36
  arxiv-to-prompt --local-folder /path/to/tex/files
37
37
 
38
- # List all section names in the paper
39
- arxiv-to-prompt 2303.08774 --list-sections
40
-
41
- # Extract only specific sections
42
- arxiv-to-prompt 2303.08774 --section "Introduction" --section "Methods"
38
+ # List all sections (with subsections indented)
39
+ arxiv-to-prompt 2307.09288 --list-sections
40
+ # Introduction
41
+ # Pretraining
42
+ # Pretraining Data
43
+ # Training Details
44
+ # Training Hardware \& Carbon Footprint
45
+ # ...
46
+
47
+ # Extract specific sections
48
+ arxiv-to-prompt 2307.09288 --section "Introduction" --section "Pretraining"
49
+
50
+ # Ambiguous names show a helpful error
51
+ arxiv-to-prompt 2307.09288 --section "Human Evaluation"
52
+ # Warning: 'Human Evaluation' is ambiguous. Found at:
53
+ # - Fine-tuning > RLHF Results > Human Evaluation
54
+ # - Appendix > Additional Details for Fine-tuning > Human Evaluation
55
+ # Use path notation to disambiguate.
56
+
57
+ # Use path notation when the same name appears multiple times
58
+ arxiv-to-prompt 2307.09288 --section "Fine-tuning > RLHF Results > Human Evaluation"
43
59
 
44
60
  # Copy to clipboard
45
61
  arxiv-to-prompt 2303.08774 | pbcopy
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "arxiv-to-prompt"
7
- version = "0.5.1"
7
+ version = "0.6.0"
8
8
  description = "transform arXiv papers into a single latex prompt for LLMs"
9
9
  readme = "README.md"
10
10
  authors = [{ name = "Takashi Ishida" }]
@@ -1,6 +1,14 @@
1
1
  import argparse
2
2
  import re
3
- from .core import process_latex_source, get_default_cache_dir, list_sections, extract_section
3
+ from .core import (
4
+ process_latex_source,
5
+ get_default_cache_dir,
6
+ list_sections,
7
+ extract_section,
8
+ parse_section_tree,
9
+ format_section_tree,
10
+ find_all_by_name,
11
+ )
4
12
 
5
13
 
6
14
  def extract_arxiv_id(input_str: str) -> str:
@@ -79,17 +87,28 @@ def main():
79
87
  return
80
88
 
81
89
  if args.list_sections:
82
- sections = list_sections(content)
83
- for section in sections:
84
- print(section)
90
+ tree = parse_section_tree(content)
91
+ print(format_section_tree(tree))
85
92
  elif args.section:
93
+ import sys
94
+ tree = parse_section_tree(content)
86
95
  extracted = []
87
- for section_name in args.section:
88
- section_content = extract_section(content, section_name)
96
+ for section_path in args.section:
97
+ # Check for ambiguity only if not using path notation
98
+ if " > " not in section_path:
99
+ matching_paths = find_all_by_name(tree, section_path)
100
+ if len(matching_paths) > 1:
101
+ print(f"Warning: '{section_path}' is ambiguous. Found at:", file=sys.stderr)
102
+ for path in matching_paths:
103
+ print(f" - {path}", file=sys.stderr)
104
+ print("Use path notation to disambiguate.", file=sys.stderr)
105
+ continue
106
+
107
+ section_content = extract_section(content, section_path)
89
108
  if section_content:
90
109
  extracted.append(section_content)
91
110
  else:
92
- print(f"Warning: Section '{section_name}' not found", file=__import__('sys').stderr)
111
+ print(f"Warning: Section '{section_path}' not found", file=sys.stderr)
93
112
  if extracted:
94
113
  print("\n\n".join(extracted))
95
114
  else:
@@ -3,6 +3,7 @@ import os
3
3
  import tarfile
4
4
  import shutil
5
5
  from typing import Optional, List
6
+ from dataclasses import dataclass, field
6
7
  import re
7
8
  from pathlib import Path
8
9
  import requests
@@ -186,25 +187,162 @@ def list_sections(text: str) -> list:
186
187
  return re.findall(pattern, text)
187
188
 
188
189
 
189
- def extract_section(text: str, section_name: str) -> Optional[str]:
190
- """Extract content of a specific section (including its subsections)."""
191
- # Find the start of the requested section
192
- pattern = rf'\\section\*?\{{{re.escape(section_name)}\}}'
193
- start_match = re.search(pattern, text)
194
- if not start_match:
195
- return None
190
@dataclass
class SectionNode:
    """One sectioning command (section/subsection/subsubsection) in a LaTeX document.

    Nodes form a tree: ``children`` holds the directly nested, deeper-level
    nodes and ``parent`` points back at the enclosing node (``None`` for a
    root). Because parent and children reference each other, ``parent`` is
    excluded from the generated ``__eq__`` and ``__repr__``; otherwise
    comparing two structurally equal trees recurses parent -> children ->
    parent until ``RecursionError``.
    """
    level: int  # 0=section, 1=subsection, 2=subsubsection
    name: str  # title text taken from inside the command's braces
    start_pos: int  # offset in the source text where the command begins
    end_pos: int = -1  # exclusive end offset; -1 means "not resolved yet / end of document"
    children: List['SectionNode'] = field(default_factory=list)
    # Back-reference only: kept out of comparison and repr to break the cycle.
    parent: Optional['SectionNode'] = field(default=None, repr=False, compare=False)
196
199
 
197
- start_pos = start_match.start()
198
200
 
199
- # Find the next \section (not subsection) or end of document
200
- remaining = text[start_match.end():]
201
- end_match = re.search(r'\\section\*?\{', remaining)
201
def _find_closing_brace(text: str, open_pos: int) -> Optional[int]:
    """Return the index of the ``}`` that balances the ``{`` at ``open_pos``, or None."""
    depth = 0
    pos = open_pos
    length = len(text)
    while pos < length:
        char = text[pos]
        if char == "\\":
            # Skip the escaped character so that \{ and \} do not affect nesting.
            pos += 2
            continue
        if char == "{":
            depth += 1
        elif char == "}":
            depth -= 1
            if depth == 0:
                return pos
        pos += 1
    return None


def parse_section_tree(text: str) -> List["SectionNode"]:
    """
    Build a hierarchical tree from LaTeX section commands.

    Recognises ``\\section``, ``\\subsection`` and ``\\subsubsection`` (starred
    or not, with an optional ``[short title]``). The title is read up to the
    *balanced* closing brace, so titles containing nested groups such as
    ``\\section{Results on \\textbf{X}}`` keep their full text.

    Args:
        text: The LaTeX source to scan.

    Returns:
        The top-level nodes in document order; nested commands are attached
        as ``children`` of the closest preceding shallower node. A node spans
        from its own command to the next command of the same or a shallower
        level (or to the end of ``text``). Returns ``[]`` when no sectioning
        command is found.
    """
    level_map = {'section': 0, 'subsection': 1, 'subsubsection': 2}
    # Command head only; the title itself is read with a brace-balancing scan.
    heading = re.compile(r'\\(section|subsection|subsubsection)\*?(?:\[[^\]]*\])?\{')

    root_nodes: List["SectionNode"] = []
    # Sections that are still "open"; levels strictly increase bottom -> top.
    open_sections: List["SectionNode"] = []

    search_from = 0
    while True:
        match = heading.search(text, search_from)
        if match is None:
            break

        close_pos = _find_closing_brace(text, match.end() - 1)
        if close_pos is None:
            # Unbalanced title: ignore this command and keep scanning after it.
            search_from = match.end()
            continue
        search_from = close_pos + 1

        name = text[match.end():close_pos]
        if not name:
            # Empty titles were never treated as sections; keep ignoring them.
            continue

        node = SectionNode(
            level=level_map[match.group(1)],
            name=name,
            start_pos=match.start(),
        )

        # Every open section at the same or a deeper level ends where this one starts.
        while open_sections and open_sections[-1].level >= node.level:
            open_sections.pop().end_pos = node.start_pos

        if open_sections:
            node.parent = open_sections[-1]
            open_sections[-1].children.append(node)
        else:
            root_nodes.append(node)
        open_sections.append(node)

    # Whatever is still open runs to the end of the document.
    for node in open_sections:
        node.end_pos = len(text)

    return root_nodes
257
+
258
+
259
def format_section_tree(nodes: List["SectionNode"], indent: int = 0) -> str:
    """
    Render a section tree as text, one section name per line.

    Args:
        nodes: Sibling nodes to render, in order.
        indent: Nesting depth of ``nodes``; each level adds one indent unit.

    Returns:
        The names joined by newlines, children listed directly under their
        parent and indented one level deeper. An empty list gives ``""``.
    """
    # NOTE(review): the indent unit is reproduced as it appears in this diff
    # rendering, which may have collapsed runs of spaces -- confirm against
    # the repository before relying on the exact width.
    prefix = " " * indent
    rendered = []
    for section in nodes:
        rendered.append(prefix + section.name)
        if section.children:
            # The nested call returns an already-joined multi-line string.
            rendered.append(format_section_tree(section.children, indent + 1))
    return "\n".join(rendered)
271
+
272
+
273
def find_all_by_name(nodes: List["SectionNode"], name: str, parent_path: str = "") -> List[str]:
    """
    Collect the full path of every section whose name equals ``name``.

    Args:
        nodes: Sibling nodes to search (their descendants are searched too).
        name: Exact section name to look for.
        parent_path: Path of the node that owns ``nodes``; empty at the root.

    Returns:
        Paths in document order, components joined by " > "
        (e.g. ["Introduction > Background", "Methods > Background"]).
    """
    prefix = f"{parent_path} > " if parent_path else ""
    matches: List[str] = []
    for section in nodes:
        full_path = prefix + section.name
        if section.name == name:
            matches.append(full_path)
        if section.children:
            matches += find_all_by_name(section.children, name, full_path)
    return matches
287
+
288
+
289
def find_section_by_path(nodes: List["SectionNode"], path: str) -> Optional["SectionNode"]:
    """
    Find a section by name or by path notation (e.g., "Methods > Background").

    Args:
        nodes: Top-level nodes of the section tree.
        path: Either a bare section name, or components separated by " > "
            that are followed from the top level downwards.

    Returns:
        For a bare name, the first node with that exact name in document
        (pre-order) order, at any depth. For a path, the first node reached by
        matching every component in turn; if several siblings share a
        component's name, each of them is tried, so a path that exists under
        a later same-named sibling is still found. ``None`` when nothing
        matches.
    """
    parts = [p.strip() for p in path.split(" > ")]

    if len(parts) == 1:
        # Bare name: iterative pre-order walk, first exact match wins.
        target = parts[0]
        pending = list(reversed(nodes))
        while pending:
            node = pending.pop()
            if node.name == target:
                return node
            pending.extend(reversed(node.children))
        return None

    last_index = len(parts) - 1

    def resolve(candidates: List["SectionNode"], depth: int) -> Optional["SectionNode"]:
        """Match parts[depth:] starting among ``candidates``, backtracking on dead ends."""
        for node in candidates:
            if node.name != parts[depth]:
                continue
            if depth == last_index:
                return node
            found = resolve(node.children, depth + 1)
            if found is not None:
                return found
            # This same-named sibling led nowhere; try the next one.
        return None

    return resolve(nodes, 0)
327
+
328
+
329
def extract_section(text: str, section_path: str) -> Optional[str]:
    """
    Return the source of one section, subsection, or subsubsection.

    Args:
        text: The LaTeX content
        section_path: Section name or path (e.g., "Methods" or "Methods > Background")

    Returns:
        The slice of ``text`` from the section's command up to the section's
        end (so nested subsections are included), with trailing whitespace
        removed; None if no section matches ``section_path``.
    """
    target = find_section_by_path(parse_section_tree(text), section_path)
    return text[target.start_pos:target.end_pos].rstrip() if target else None
208
346
 
209
347
 
210
348
  def flatten_tex(directory: str, main_file: str) -> str:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: arxiv-to-prompt
3
- Version: 0.5.1
3
+ Version: 0.6.0
4
4
  Summary: transform arXiv papers into a single latex prompt for LLMs
5
5
  Author: Takashi Ishida
6
6
  License: MIT
@@ -54,11 +54,27 @@ arxiv-to-prompt 2303.08774 --no-comments --no-appendix
54
54
  # Process a local folder containing TeX files (instead of downloading from arXiv)
55
55
  arxiv-to-prompt --local-folder /path/to/tex/files
56
56
 
57
- # List all section names in the paper
58
- arxiv-to-prompt 2303.08774 --list-sections
59
-
60
- # Extract only specific sections
61
- arxiv-to-prompt 2303.08774 --section "Introduction" --section "Methods"
57
+ # List all sections (with subsections indented)
58
+ arxiv-to-prompt 2307.09288 --list-sections
59
+ # Introduction
60
+ # Pretraining
61
+ # Pretraining Data
62
+ # Training Details
63
+ # Training Hardware \& Carbon Footprint
64
+ # ...
65
+
66
+ # Extract specific sections
67
+ arxiv-to-prompt 2307.09288 --section "Introduction" --section "Pretraining"
68
+
69
+ # Ambiguous names show a helpful error
70
+ arxiv-to-prompt 2307.09288 --section "Human Evaluation"
71
+ # Warning: 'Human Evaluation' is ambiguous. Found at:
72
+ # - Fine-tuning > RLHF Results > Human Evaluation
73
+ # - Appendix > Additional Details for Fine-tuning > Human Evaluation
74
+ # Use path notation to disambiguate.
75
+
76
+ # Use path notation when the same name appears multiple times
77
+ arxiv-to-prompt 2307.09288 --section "Fine-tuning > RLHF Results > Human Evaluation"
62
78
 
63
79
  # Copy to clipboard
64
80
  arxiv-to-prompt 2303.08774 | pbcopy
@@ -12,6 +12,11 @@ from arxiv_to_prompt.core import (
12
12
  remove_appendix,
13
13
  list_sections,
14
14
  extract_section,
15
+ SectionNode,
16
+ parse_section_tree,
17
+ format_section_tree,
18
+ find_all_by_name,
19
+ find_section_by_path,
15
20
  )
16
21
  from arxiv_to_prompt.cli import extract_arxiv_id
17
22
 
@@ -378,3 +383,256 @@ Results here.
378
383
  results = extract_section(text, "Results")
379
384
  assert results is not None
380
385
  assert "Results here." in results
386
+
387
+
388
def test_parse_section_tree():
    """Nested sectioning commands are parsed into a correctly shaped tree."""
    text = r"""
\section{Introduction}
Intro text.
\subsection{Background}
Background text.
\subsection{Motivation}
Motivation text.
\section{Methods}
Methods text.
\subsection{Background}
Methods background.
\subsubsection{Details}
Details text.
\subsection{Data Collection}
Data text.
\section{Results}
Results text.
"""
    roots = parse_section_tree(text)

    # Exactly three top-level sections, in document order.
    assert [node.name for node in roots] == ["Introduction", "Methods", "Results"]
    intro, methods, results = roots

    # Each section owns exactly its own subsections.
    assert [child.name for child in intro.children] == ["Background", "Motivation"]
    assert [child.name for child in methods.children] == ["Background", "Data Collection"]

    # Only "Methods > Background" carries a subsubsection.
    nested = methods.children[0].children
    assert [child.name for child in nested] == ["Details"]

    # "Results" is a leaf.
    assert len(results.children) == 0
432
+
433
+
434
def test_parse_section_tree_levels():
    """Each nesting depth gets the matching numeric level (0, 1, 2)."""
    text = r"""
\section{Sec}
\subsection{Subsec}
\subsubsection{Subsubsec}
"""
    section = parse_section_tree(text)[0]
    subsection = section.children[0]
    subsubsection = subsection.children[0]

    assert section.level == 0
    assert subsection.level == 1
    assert subsubsection.level == 2
446
+
447
+
448
def test_format_section_tree():
    """Formatted output lists every section once, indented by nesting depth."""
    text = r"""
\section{Introduction}
\subsection{Background}
\section{Methods}
\subsection{Data}
\subsubsection{Collection}
"""
    tree = parse_section_tree(text)
    lines = format_section_tree(tree).split('\n')

    # Exactly these five entries, in document order.
    assert [line.lstrip() for line in lines] == [
        "Introduction",
        "Background",
        "Methods",
        "Data",
        "Collection",
    ]

    # Top-level sections are not indented.
    assert lines[0] == "Introduction"
    assert lines[2] == "Methods"

    # Derive the indent unit from a depth-1 entry instead of hard-coding it,
    # then require depth 2 to use exactly twice that unit; the two depths
    # must not collapse to the same indentation.
    unit = lines[1][: len(lines[1]) - len("Background")]
    assert unit and unit.isspace()
    assert lines[3] == unit + "Data"
    assert lines[4] == unit * 2 + "Collection"
466
+
467
+
468
def test_find_all_by_name():
    """Every occurrence of a name is reported with its full path."""
    text = r"""
\section{Introduction}
\subsection{Background}
\section{Methods}
\subsection{Background}
\section{Results}
"""
    tree = parse_section_tree(text)

    # "Background" lives under two different parents.
    duplicated = find_all_by_name(tree, "Background")
    assert sorted(duplicated) == ["Introduction > Background", "Methods > Background"]

    # A name that occurs once yields just that path.
    assert find_all_by_name(tree, "Results") == ["Results"]

    # A name that never occurs yields nothing.
    assert find_all_by_name(tree, "Discussion") == []
492
+
493
+
494
def test_find_section_by_path_simple():
    """A bare name resolves at any depth of the tree."""
    text = r"""
\section{Introduction}
\section{Methods}
\subsection{Data}
"""
    tree = parse_section_tree(text)

    # "Introduction" is a top-level section, "Data" is a subsection.
    for wanted in ("Introduction", "Data"):
        node = find_section_by_path(tree, wanted)
        assert node is not None
        assert node.name == wanted
512
+
513
+
514
def test_find_section_by_path_notation():
    """Path notation selects the occurrence under the named parent."""
    text = r"""
\section{Introduction}
\subsection{Background}
\section{Methods}
\subsection{Background}
"""
    tree = parse_section_tree(text)

    cases = (
        ("Introduction > Background", "Introduction"),
        ("Methods > Background", "Methods"),
    )
    for path, expected_parent in cases:
        node = find_section_by_path(tree, path)
        assert node is not None
        assert node.name == "Background"
        assert node.parent.name == expected_parent
534
+
535
+
536
def test_find_section_by_path_not_found():
    """Unknown names and broken paths resolve to None."""
    text = r"""
\section{Introduction}
\subsection{Background}
"""
    tree = parse_section_tree(text)

    missing_paths = (
        "NonExistent",
        "Introduction > NonExistent",
        "NonExistent > Background",
    )
    for path in missing_paths:
        assert find_section_by_path(tree, path) is None
547
+
548
+
549
def test_extract_section_with_path():
    """Path notation extracts the right one of two same-named subsections."""
    text = r"""
\section{Introduction}
Intro text.
\subsection{Background}
Intro background.
\section{Methods}
Methods text.
\subsection{Background}
Methods background.
\section{Results}
Results text.
"""
    cases = (
        ("Introduction > Background", "Intro background.", "Methods background."),
        ("Methods > Background", "Methods background.", "Intro background."),
    )
    for path, expected, unexpected in cases:
        content = extract_section(text, path)
        assert content is not None
        assert expected in content
        assert unexpected not in content
573
+
574
+
575
def test_extract_subsection_boundaries():
    """A subsection ends at the next subsection or at the next section."""
    text = r"""
\section{Methods}
Methods intro.
\subsection{First}
First content.
\subsection{Second}
Second content.
\section{Results}
Results content.
"""
    # "First" must stop where "Second" begins.
    first = extract_section(text, "First")
    assert first is not None
    assert "First content." in first
    assert "Second content." not in first

    # "Second" must stop where the "Results" section begins.
    second = extract_section(text, "Second")
    assert second is not None
    assert "Second content." in second
    assert "Results content." not in second
598
+
599
+
600
def test_extract_section_includes_subsections():
    """Extracting a section returns its whole subtree and nothing after it."""
    text = r"""
\section{Methods}
Methods intro.
\subsection{Data}
Data info.
\subsubsection{Collection}
Collection details.
\subsection{Analysis}
Analysis info.
\section{Results}
Results content.
"""
    methods = extract_section(text, "Methods")
    assert methods is not None

    # Everything nested under "Methods" is part of the extract...
    for fragment in ("Methods intro.", "Data info.", "Collection details.", "Analysis info."):
        assert fragment in methods

    # ...but the following top-level section is not.
    assert "Results content." not in methods
621
+
622
+
623
def test_section_tree_with_starred_sections():
    """Starred (unnumbered) commands are parsed like their plain forms."""
    text = r"""
\section*{Introduction}
Intro.
\subsection*{Background}
Background.
\section{Methods}
Methods.
"""
    roots = parse_section_tree(text)

    assert [node.name for node in roots] == ["Introduction", "Methods"]
    assert roots[0].children[0].name == "Background"
File without changes