PyPI - arxiv-to-prompt - Versions diffs - 0.5.0__tar.gz → 0.6.0__tar.gz - Mend

arxiv-to-prompt 0.5.0.tar.gz → 0.6.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

{arxiv_to_prompt-0.5.0/src/arxiv_to_prompt.egg-info → arxiv_to_prompt-0.6.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: arxiv-to-prompt
-Version: 0.5.0
+Version: 0.6.0
 Summary: transform arXiv papers into a single latex prompt for LLMs
 Author: Takashi Ishida
 License: MIT
@@ -54,11 +54,27 @@ arxiv-to-prompt 2303.08774 --no-comments --no-appendix
 # Process a local folder containing TeX files (instead of downloading from arXiv)
 arxiv-to-prompt --local-folder /path/to/tex/files
-# List all section names in the paper
-arxiv-to-prompt 2303.08774 --list-sections
-# Extract only specific sections
-arxiv-to-prompt 2303.08774 --section "Introduction" --section "Methods"
+# List all sections (with subsections indented)
+arxiv-to-prompt 2307.09288 --list-sections
+# Introduction
+# Pretraining
+#   Pretraining Data
+#   Training Details
+#     Training Hardware \& Carbon Footprint
+#   ...
+# Extract specific sections
+arxiv-to-prompt 2307.09288 --section "Introduction" --section "Pretraining"
+# Ambiguous names show a helpful error
+arxiv-to-prompt 2307.09288 --section "Human Evaluation"
+# Warning: 'Human Evaluation' is ambiguous. Found at:
+#   - Fine-tuning > RLHF Results > Human Evaluation
+#   - Appendix > Additional Details for Fine-tuning > Human Evaluation
+# Use path notation to disambiguate.
+# Use path notation when the same name appears multiple times
+arxiv-to-prompt 2307.09288 --section "Fine-tuning > RLHF Results > Human Evaluation"
 # Copy to clipboard
 arxiv-to-prompt 2303.08774 | pbcopy

{arxiv_to_prompt-0.5.0 → arxiv_to_prompt-0.6.0}/README.md RENAMED Viewed

@@ -35,11 +35,27 @@ arxiv-to-prompt 2303.08774 --no-comments --no-appendix
 # Process a local folder containing TeX files (instead of downloading from arXiv)
 arxiv-to-prompt --local-folder /path/to/tex/files
-# List all section names in the paper
-arxiv-to-prompt 2303.08774 --list-sections
-# Extract only specific sections
-arxiv-to-prompt 2303.08774 --section "Introduction" --section "Methods"
+# List all sections (with subsections indented)
+arxiv-to-prompt 2307.09288 --list-sections
+# Introduction
+# Pretraining
+#   Pretraining Data
+#   Training Details
+#     Training Hardware \& Carbon Footprint
+#   ...
+# Extract specific sections
+arxiv-to-prompt 2307.09288 --section "Introduction" --section "Pretraining"
+# Ambiguous names show a helpful error
+arxiv-to-prompt 2307.09288 --section "Human Evaluation"
+# Warning: 'Human Evaluation' is ambiguous. Found at:
+#   - Fine-tuning > RLHF Results > Human Evaluation
+#   - Appendix > Additional Details for Fine-tuning > Human Evaluation
+# Use path notation to disambiguate.
+# Use path notation when the same name appears multiple times
+arxiv-to-prompt 2307.09288 --section "Fine-tuning > RLHF Results > Human Evaluation"
 # Copy to clipboard
 arxiv-to-prompt 2303.08774 | pbcopy

{arxiv_to_prompt-0.5.0 → arxiv_to_prompt-0.6.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "arxiv-to-prompt"
-version = "0.5.0"
+version = "0.6.0"
 description = "transform arXiv papers into a single latex prompt for LLMs"
 readme = "README.md"
 authors = [{ name = "Takashi Ishida" }]

{arxiv_to_prompt-0.5.0 → arxiv_to_prompt-0.6.0}/src/arxiv_to_prompt/cli.py RENAMED Viewed

@@ -1,6 +1,14 @@
 import argparse
 import re
-from .core import process_latex_source, get_default_cache_dir, list_sections, extract_section
+from .core import (
+    process_latex_source,
+    get_default_cache_dir,
+    list_sections,
+    extract_section,
+    parse_section_tree,
+    format_section_tree,
+    find_all_by_name,
+)
 def extract_arxiv_id(input_str: str) -> str:
@@ -79,17 +87,28 @@ def main():
         return
     if args.list_sections:
-        sections = list_sections(content)
-        for section in sections:
-            print(section)
+        tree = parse_section_tree(content)
+        print(format_section_tree(tree))
     elif args.section:
+        import sys
+        tree = parse_section_tree(content)
         extracted = []
-        for section_name in args.section:
-            section_content = extract_section(content, section_name)
+        for section_path in args.section:
+            # Check for ambiguity only if not using path notation
+            if " > " not in section_path:
+                matching_paths = find_all_by_name(tree, section_path)
+                if len(matching_paths) > 1:
+                    print(f"Warning: '{section_path}' is ambiguous. Found at:", file=sys.stderr)
+                    for path in matching_paths:
+                        print(f"  - {path}", file=sys.stderr)
+                    print("Use path notation to disambiguate.", file=sys.stderr)
+                    continue
+            section_content = extract_section(content, section_path)
             if section_content:
                 extracted.append(section_content)
             else:
-                print(f"Warning: Section '{section_name}' not found", file=__import__('sys').stderr)
+                print(f"Warning: Section '{section_path}' not found", file=sys.stderr)
         if extracted:
             print("\n\n".join(extracted))
     else:

{arxiv_to_prompt-0.5.0 → arxiv_to_prompt-0.6.0}/src/arxiv_to_prompt/core.py RENAMED Viewed

@@ -3,6 +3,7 @@ import os
 import tarfile
 import shutil
 from typing import Optional, List
+from dataclasses import dataclass, field
 import re
 from pathlib import Path
 import requests
@@ -92,40 +93,55 @@ def download_arxiv_source(arxiv_id: str, cache_dir: Optional[str] = None, use_ca
 def find_main_tex(directory: str) -> Optional[str]:
     """
-    Find the main .tex file containing documentclass.
+    Find the main .tex file containing documentclass.
+    Searches recursively through subdirectories.
     First checks for common naming conventions (main.tex, paper.tex, index.tex).
-    If none found, returns the filename of the longest .tex file containing documentclass,
-    since shorter files are typically conference templates or supplementary documents
+    If none found, returns the path of the longest .tex file containing documentclass,
+    since shorter files are typically conference templates or supplementary documents
     rather than the main manuscript.
     """
     common_names = ['main.tex', 'paper.tex', 'index.tex']
     main_tex_file = None
     max_line_count = 0
-    # First pass: check for common naming conventions
-    for file_name in os.listdir(directory):
-        if file_name in common_names:
-            try:
-                with open(os.path.join(directory, file_name), 'r', encoding='utf-8') as file:
-                    lines = file.readlines()
-                    if any('\\documentclass' in line for line in lines):
-                        return file_name
-            except Exception as e:
-                logging.warning(f"Could not read file {file_name}: {e}")
+    # Walk through directory and subdirectories
+    for root, dirs, files in os.walk(directory):
+        rel_root = os.path.relpath(root, directory)
+        # First pass: check for common naming conventions
+        for file_name in files:
+            if file_name in common_names:
+                file_path = os.path.join(root, file_name)
+                try:
+                    with open(file_path, 'r', encoding='utf-8') as file:
+                        lines = file.readlines()
+                        if any('\\documentclass' in line for line in lines):
+                            if rel_root == '.':
+                                return file_name
+                            return os.path.join(rel_root, file_name)
+                except Exception as e:
+                    logging.warning(f"Could not read file {file_path}: {e}")
     # Second pass: find the longest .tex file containing documentclass
-    for file_name in os.listdir(directory):
-        if file_name.endswith('.tex'):
-            try:
-                with open(os.path.join(directory, file_name), 'r', encoding='utf-8') as file:
-                    lines = file.readlines()
-                    if any('\\documentclass' in line for line in lines):
-                        line_count = len(lines)
-                        if line_count > max_line_count:
-                            main_tex_file = file_name
-                            max_line_count = line_count
-            except Exception as e:
-                logging.warning(f"Could not read file {file_name}: {e}")
+    for root, dirs, files in os.walk(directory):
+        rel_root = os.path.relpath(root, directory)
+        for file_name in files:
+            if file_name.endswith('.tex'):
+                file_path = os.path.join(root, file_name)
+                try:
+                    with open(file_path, 'r', encoding='utf-8') as file:
+                        lines = file.readlines()
+                        if any('\\documentclass' in line for line in lines):
+                            line_count = len(lines)
+                            if line_count > max_line_count:
+                                if rel_root == '.':
+                                    main_tex_file = file_name
+                                else:
+                                    main_tex_file = os.path.join(rel_root, file_name)
+                                max_line_count = line_count
+                except Exception as e:
+                    logging.warning(f"Could not read file {file_path}: {e}")
     return main_tex_file
@@ -171,25 +187,162 @@ def list_sections(text: str) -> list:
     return re.findall(pattern, text)
-def extract_section(text: str, section_name: str) -> Optional[str]:
-    """Extract content of a specific section (including its subsections)."""
-    # Find the start of the requested section
-    pattern = rf'\\section\*?\{{{re.escape(section_name)}\}}'
-    start_match = re.search(pattern, text)
-    if not start_match:
-        return None
+@dataclass
+class SectionNode:
+    """Represents a section/subsection/subsubsection in the LaTeX document tree."""
+    level: int  # 0=section, 1=subsection, 2=subsubsection
+    name: str
+    start_pos: int
+    end_pos: int = -1  # -1 means end of document
+    children: List['SectionNode'] = field(default_factory=list)
+    parent: Optional['SectionNode'] = None
-    start_pos = start_match.start()
-    # Find the next \section (not subsection) or end of document
-    remaining = text[start_match.end():]
-    end_match = re.search(r'\\section\*?\{', remaining)
+def parse_section_tree(text: str) -> List[SectionNode]:
+    """
+    Build a hierarchical tree from LaTeX section commands.
-    if end_match:
-        end_pos = start_match.end() + end_match.start()
-        return text[start_pos:end_pos].rstrip()
-    else:
-        return text[start_pos:].rstrip()
+    Returns a list of top-level section nodes, each containing their subsections as children.
+    """
+    # Match section, subsection, and subsubsection commands
+    pattern = r'\\(section|subsection|subsubsection)\*?\{([^}]+)\}'
+    level_map = {'section': 0, 'subsection': 1, 'subsubsection': 2}
+    # Find all section commands with their positions
+    matches = list(re.finditer(pattern, text))
+    if not matches:
+        return []
+    # Create nodes for all sections
+    all_nodes = []
+    for match in matches:
+        level = level_map[match.group(1)]
+        name = match.group(2)
+        start_pos = match.start()
+        all_nodes.append(SectionNode(level=level, name=name, start_pos=start_pos))
+    # Calculate end positions (each section ends where the next same-or-higher level starts)
+    for i, node in enumerate(all_nodes):
+        # Find next section at same or higher (lower number) level
+        for j in range(i + 1, len(all_nodes)):
+            if all_nodes[j].level <= node.level:
+                node.end_pos = all_nodes[j].start_pos
+                break
+        # If no next section found at same/higher level, end at document end
+        if node.end_pos == -1:
+            node.end_pos = len(text)
+    # Build tree structure
+    root_nodes: List[SectionNode] = []
+    section_stack: List[SectionNode] = []
+    for node in all_nodes:
+        # Pop from stack until we find a parent at a higher level
+        while section_stack and section_stack[-1].level >= node.level:
+            section_stack.pop()
+        if section_stack:
+            # This node is a child of the top of the stack
+            node.parent = section_stack[-1]
+            section_stack[-1].children.append(node)
+        else:
+            # This is a root node
+            root_nodes.append(node)
+        section_stack.append(node)
+    return root_nodes
+def format_section_tree(nodes: List[SectionNode], indent: int = 0) -> str:
+    """
+    Format section tree with indentation for display.
+    Returns a string with each section name on its own line, indented by level.
+    """
+    lines = []
+    for node in nodes:
+        lines.append("  " * indent + node.name)
+        if node.children:
+            lines.append(format_section_tree(node.children, indent + 1))
+    return "\n".join(lines)
+def find_all_by_name(nodes: List[SectionNode], name: str, parent_path: str = "") -> List[str]:
+    """
+    Find all paths to sections with the given name.
+    Returns a list of full paths (e.g., ["Introduction > Background", "Methods > Background"])
+    """
+    results = []
+    for node in nodes:
+        current_path = f"{parent_path} > {node.name}" if parent_path else node.name
+        if node.name == name:
+            results.append(current_path)
+        if node.children:
+            results.extend(find_all_by_name(node.children, name, current_path))
+    return results
+def find_section_by_path(nodes: List[SectionNode], path: str) -> Optional[SectionNode]:
+    """
+    Find a section by path notation (e.g., "Methods > Background").
+    If path contains no " > ", searches for an exact name match at any level.
+    If path contains " > ", follows the hierarchy.
+    """
+    parts = [p.strip() for p in path.split(" > ")]
+    if len(parts) == 1:
+        # Simple name lookup - find first match at any level
+        def find_first(nodes: List[SectionNode], name: str) -> Optional[SectionNode]:
+            for node in nodes:
+                if node.name == name:
+                    return node
+                if node.children:
+                    result = find_first(node.children, name)
+                    if result:
+                        return result
+            return None
+        return find_first(nodes, parts[0])
+    # Path notation - follow the hierarchy
+    current_nodes = nodes
+    current_node = None
+    for part in parts:
+        found = None
+        for node in current_nodes:
+            if node.name == part:
+                found = node
+                break
+        if not found:
+            return None
+        current_node = found
+        current_nodes = found.children
+    return current_node
+def extract_section(text: str, section_path: str) -> Optional[str]:
+    """
+    Extract content of a specific section, subsection, or subsubsection.
+    Args:
+        text: The LaTeX content
+        section_path: Section name or path (e.g., "Methods" or "Methods > Background")
+    Returns:
+        The section content including any subsections, or None if not found.
+    """
+    tree = parse_section_tree(text)
+    node = find_section_by_path(tree, section_path)
+    if not node:
+        return None
+    return text[node.start_pos:node.end_pos].rstrip()
 def flatten_tex(directory: str, main_file: str) -> str:

{arxiv_to_prompt-0.5.0 → arxiv_to_prompt-0.6.0/src/arxiv_to_prompt.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: arxiv-to-prompt
-Version: 0.5.0
+Version: 0.6.0
 Summary: transform arXiv papers into a single latex prompt for LLMs
 Author: Takashi Ishida
 License: MIT
@@ -54,11 +54,27 @@ arxiv-to-prompt 2303.08774 --no-comments --no-appendix
 # Process a local folder containing TeX files (instead of downloading from arXiv)
 arxiv-to-prompt --local-folder /path/to/tex/files
-# List all section names in the paper
-arxiv-to-prompt 2303.08774 --list-sections
-# Extract only specific sections
-arxiv-to-prompt 2303.08774 --section "Introduction" --section "Methods"
+# List all sections (with subsections indented)
+arxiv-to-prompt 2307.09288 --list-sections
+# Introduction
+# Pretraining
+#   Pretraining Data
+#   Training Details
+#     Training Hardware \& Carbon Footprint
+#   ...
+# Extract specific sections
+arxiv-to-prompt 2307.09288 --section "Introduction" --section "Pretraining"
+# Ambiguous names show a helpful error
+arxiv-to-prompt 2307.09288 --section "Human Evaluation"
+# Warning: 'Human Evaluation' is ambiguous. Found at:
+#   - Fine-tuning > RLHF Results > Human Evaluation
+#   - Appendix > Additional Details for Fine-tuning > Human Evaluation
+# Use path notation to disambiguate.
+# Use path notation when the same name appears multiple times
+arxiv-to-prompt 2307.09288 --section "Fine-tuning > RLHF Results > Human Evaluation"
 # Copy to clipboard
 arxiv-to-prompt 2303.08774 | pbcopy

{arxiv_to_prompt-0.5.0 → arxiv_to_prompt-0.6.0}/tests/test_core.py RENAMED Viewed

@@ -12,6 +12,11 @@ from arxiv_to_prompt.core import (
     remove_appendix,
     list_sections,
     extract_section,
+    SectionNode,
+    parse_section_tree,
+    format_section_tree,
+    find_all_by_name,
+    find_section_by_path,
 )
 from arxiv_to_prompt.cli import extract_arxiv_id
@@ -153,6 +158,23 @@ def test_find_main_tex(temp_cache_dir):
     assert found_main == "main.tex"
+def test_find_main_tex_in_subdirectory(temp_cache_dir):
+    """Test finding main tex file in a subdirectory."""
+    # Create test directory with subdirectory
+    tex_dir = temp_cache_dir / "test_tex_subdir"
+    tex_dir.mkdir(parents=True)
+    subdir = tex_dir / "paper"
+    subdir.mkdir()
+    # Create main.tex in subdirectory
+    main_file = subdir / "main.tex"
+    main_file.write_text("\\documentclass{article}\n\\begin{document}\nHello\n\\end{document}")
+    # Test finding main file in subdirectory
+    found_main = find_main_tex(str(tex_dir))
+    assert found_main == os.path.join("paper", "main.tex")
 def test_commented_input_commands(temp_cache_dir):
     """Test that commented-out \\include and \\input commands are ignored."""
     # Create test directory and files
@@ -361,3 +383,256 @@ Results here.
     results = extract_section(text, "Results")
     assert results is not None
     assert "Results here." in results
+def test_parse_section_tree():
+    """Test parsing LaTeX into a hierarchical section tree."""
+    text = r"""
+\section{Introduction}
+Intro text.
+\subsection{Background}
+Background text.
+\subsection{Motivation}
+Motivation text.
+\section{Methods}
+Methods text.
+\subsection{Background}
+Methods background.
+\subsubsection{Details}
+Details text.
+\subsection{Data Collection}
+Data text.
+\section{Results}
+Results text.
+"""
+    tree = parse_section_tree(text)
+    # Should have 3 top-level sections
+    assert len(tree) == 3
+    assert tree[0].name == "Introduction"
+    assert tree[1].name == "Methods"
+    assert tree[2].name == "Results"
+    # Introduction should have 2 subsections
+    assert len(tree[0].children) == 2
+    assert tree[0].children[0].name == "Background"
+    assert tree[0].children[1].name == "Motivation"
+    # Methods should have 2 subsections
+    assert len(tree[1].children) == 2
+    assert tree[1].children[0].name == "Background"
+    assert tree[1].children[1].name == "Data Collection"
+    # Methods > Background should have 1 subsubsection
+    assert len(tree[1].children[0].children) == 1
+    assert tree[1].children[0].children[0].name == "Details"
+    # Results should have no subsections
+    assert len(tree[2].children) == 0
+def test_parse_section_tree_levels():
+    """Test that section levels are correctly assigned."""
+    text = r"""
+\section{Sec}
+\subsection{Subsec}
+\subsubsection{Subsubsec}
+"""
+    tree = parse_section_tree(text)
+    assert tree[0].level == 0
+    assert tree[0].children[0].level == 1
+    assert tree[0].children[0].children[0].level == 2
+def test_format_section_tree():
+    """Test formatting section tree with indentation."""
+    text = r"""
+\section{Introduction}
+\subsection{Background}
+\section{Methods}
+\subsection{Data}
+\subsubsection{Collection}
+"""
+    tree = parse_section_tree(text)
+    output = format_section_tree(tree)
+    lines = output.split('\n')
+    assert lines[0] == "Introduction"
+    assert lines[1] == "  Background"
+    assert lines[2] == "Methods"
+    assert lines[3] == "  Data"
+    assert lines[4] == "    Collection"
+def test_find_all_by_name():
+    """Test finding all paths to sections with a given name."""
+    text = r"""
+\section{Introduction}
+\subsection{Background}
+\section{Methods}
+\subsection{Background}
+\section{Results}
+"""
+    tree = parse_section_tree(text)
+    # Background appears twice under different parents
+    paths = find_all_by_name(tree, "Background")
+    assert len(paths) == 2
+    assert "Introduction > Background" in paths
+    assert "Methods > Background" in paths
+    # Unique name
+    paths = find_all_by_name(tree, "Results")
+    assert paths == ["Results"]
+    # Non-existent name
+    paths = find_all_by_name(tree, "Discussion")
+    assert paths == []
+def test_find_section_by_path_simple():
+    """Test finding section by simple name."""
+    text = r"""
+\section{Introduction}
+\section{Methods}
+\subsection{Data}
+"""
+    tree = parse_section_tree(text)
+    # Find by simple name
+    node = find_section_by_path(tree, "Introduction")
+    assert node is not None
+    assert node.name == "Introduction"
+    # Find subsection by simple name
+    node = find_section_by_path(tree, "Data")
+    assert node is not None
+    assert node.name == "Data"
+def test_find_section_by_path_notation():
+    """Test finding section by path notation."""
+    text = r"""
+\section{Introduction}
+\subsection{Background}
+\section{Methods}
+\subsection{Background}
+"""
+    tree = parse_section_tree(text)
+    # Find by path notation
+    node = find_section_by_path(tree, "Introduction > Background")
+    assert node is not None
+    assert node.name == "Background"
+    assert node.parent.name == "Introduction"
+    node = find_section_by_path(tree, "Methods > Background")
+    assert node is not None
+    assert node.name == "Background"
+    assert node.parent.name == "Methods"
+def test_find_section_by_path_not_found():
+    """Test that non-existent paths return None."""
+    text = r"""
+\section{Introduction}
+\subsection{Background}
+"""
+    tree = parse_section_tree(text)
+    assert find_section_by_path(tree, "NonExistent") is None
+    assert find_section_by_path(tree, "Introduction > NonExistent") is None
+    assert find_section_by_path(tree, "NonExistent > Background") is None
+def test_extract_section_with_path():
+    """Test extracting section using path notation."""
+    text = r"""
+\section{Introduction}
+Intro text.
+\subsection{Background}
+Intro background.
+\section{Methods}
+Methods text.
+\subsection{Background}
+Methods background.
+\section{Results}
+Results text.
+"""
+    # Extract using path notation
+    content = extract_section(text, "Introduction > Background")
+    assert content is not None
+    assert "Intro background." in content
+    assert "Methods background." not in content
+    content = extract_section(text, "Methods > Background")
+    assert content is not None
+    assert "Methods background." in content
+    assert "Intro background." not in content
+def test_extract_subsection_boundaries():
+    """Test that subsection extraction stops at correct boundary."""
+    text = r"""
+\section{Methods}
+Methods intro.
+\subsection{First}
+First content.
+\subsection{Second}
+Second content.
+\section{Results}
+Results content.
+"""
+    # Extract first subsection - should stop at second subsection
+    content = extract_section(text, "First")
+    assert content is not None
+    assert "First content." in content
+    assert "Second content." not in content
+    # Extract second subsection - should stop at Results section
+    content = extract_section(text, "Second")
+    assert content is not None
+    assert "Second content." in content
+    assert "Results content." not in content
+def test_extract_section_includes_subsections():
+    """Test that extracting a section includes all its subsections."""
+    text = r"""
+\section{Methods}
+Methods intro.
+\subsection{Data}
+Data info.
+\subsubsection{Collection}
+Collection details.
+\subsection{Analysis}
+Analysis info.
+\section{Results}
+Results content.
+"""
+    content = extract_section(text, "Methods")
+    assert content is not None
+    assert "Methods intro." in content
+    assert "Data info." in content
+    assert "Collection details." in content
+    assert "Analysis info." in content
+    assert "Results content." not in content
+def test_section_tree_with_starred_sections():
+    """Test that starred sections are correctly parsed."""
+    text = r"""
+\section*{Introduction}
+Intro.
+\subsection*{Background}
+Background.
+\section{Methods}
+Methods.
+"""
+    tree = parse_section_tree(text)
+    assert len(tree) == 2
+    assert tree[0].name == "Introduction"
+    assert tree[0].children[0].name == "Background"
+    assert tree[1].name == "Methods"