PyPI - arxiv-to-prompt - Versions diffs - 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl - Mend

arxiv-to-prompt 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

arxiv_to_prompt/cli.py CHANGED Viewed

@@ -1,6 +1,14 @@
 import argparse
 import re
-from .core import process_latex_source, get_default_cache_dir, list_sections, extract_section
+from .core import (
+    process_latex_source,
+    get_default_cache_dir,
+    list_sections,
+    extract_section,
+    parse_section_tree,
+    format_section_tree,
+    find_all_by_name,
+)
 def extract_arxiv_id(input_str: str) -> str:
@@ -79,17 +87,28 @@ def main():
         return
     if args.list_sections:
-        sections = list_sections(content)
-        for section in sections:
-            print(section)
+        tree = parse_section_tree(content)
+        print(format_section_tree(tree))
     elif args.section:
+        import sys
+        tree = parse_section_tree(content)
         extracted = []
-        for section_name in args.section:
-            section_content = extract_section(content, section_name)
+        for section_path in args.section:
+            # Check for ambiguity only if not using path notation
+            if " > " not in section_path:
+                matching_paths = find_all_by_name(tree, section_path)
+                if len(matching_paths) > 1:
+                    print(f"Warning: '{section_path}' is ambiguous. Found at:", file=sys.stderr)
+                    for path in matching_paths:
+                        print(f"  - {path}", file=sys.stderr)
+                    print("Use path notation to disambiguate.", file=sys.stderr)
+                    continue
+            section_content = extract_section(content, section_path)
             if section_content:
                 extracted.append(section_content)
             else:
-                print(f"Warning: Section '{section_name}' not found", file=__import__('sys').stderr)
+                print(f"Warning: Section '{section_path}' not found", file=sys.stderr)
         if extracted:
             print("\n\n".join(extracted))
     else:

arxiv_to_prompt/core.py CHANGED Viewed

@@ -3,6 +3,7 @@ import os
 import tarfile
 import shutil
 from typing import Optional, List
+from dataclasses import dataclass, field
 import re
 from pathlib import Path
 import requests
@@ -92,40 +93,55 @@ def download_arxiv_source(arxiv_id: str, cache_dir: Optional[str] = None, use_ca
 def find_main_tex(directory: str) -> Optional[str]:
     """
-    Find the main .tex file containing documentclass.
+    Find the main .tex file containing documentclass.
+    Searches recursively through subdirectories.
     First checks for common naming conventions (main.tex, paper.tex, index.tex).
-    If none found, returns the filename of the longest .tex file containing documentclass,
-    since shorter files are typically conference templates or supplementary documents
+    If none found, returns the path of the longest .tex file containing documentclass,
+    since shorter files are typically conference templates or supplementary documents
     rather than the main manuscript.
     """
     common_names = ['main.tex', 'paper.tex', 'index.tex']
     main_tex_file = None
     max_line_count = 0
-    # First pass: check for common naming conventions
-    for file_name in os.listdir(directory):
-        if file_name in common_names:
-            try:
-                with open(os.path.join(directory, file_name), 'r', encoding='utf-8') as file:
-                    lines = file.readlines()
-                    if any('\\documentclass' in line for line in lines):
-                        return file_name
-            except Exception as e:
-                logging.warning(f"Could not read file {file_name}: {e}")
+    # Walk through directory and subdirectories
+    for root, dirs, files in os.walk(directory):
+        rel_root = os.path.relpath(root, directory)
+        # First pass: check for common naming conventions
+        for file_name in files:
+            if file_name in common_names:
+                file_path = os.path.join(root, file_name)
+                try:
+                    with open(file_path, 'r', encoding='utf-8') as file:
+                        lines = file.readlines()
+                        if any('\\documentclass' in line for line in lines):
+                            if rel_root == '.':
+                                return file_name
+                            return os.path.join(rel_root, file_name)
+                except Exception as e:
+                    logging.warning(f"Could not read file {file_path}: {e}")
     # Second pass: find the longest .tex file containing documentclass
-    for file_name in os.listdir(directory):
-        if file_name.endswith('.tex'):
-            try:
-                with open(os.path.join(directory, file_name), 'r', encoding='utf-8') as file:
-                    lines = file.readlines()
-                    if any('\\documentclass' in line for line in lines):
-                        line_count = len(lines)
-                        if line_count > max_line_count:
-                            main_tex_file = file_name
-                            max_line_count = line_count
-            except Exception as e:
-                logging.warning(f"Could not read file {file_name}: {e}")
+    for root, dirs, files in os.walk(directory):
+        rel_root = os.path.relpath(root, directory)
+        for file_name in files:
+            if file_name.endswith('.tex'):
+                file_path = os.path.join(root, file_name)
+                try:
+                    with open(file_path, 'r', encoding='utf-8') as file:
+                        lines = file.readlines()
+                        if any('\\documentclass' in line for line in lines):
+                            line_count = len(lines)
+                            if line_count > max_line_count:
+                                if rel_root == '.':
+                                    main_tex_file = file_name
+                                else:
+                                    main_tex_file = os.path.join(rel_root, file_name)
+                                max_line_count = line_count
+                except Exception as e:
+                    logging.warning(f"Could not read file {file_path}: {e}")
     return main_tex_file
@@ -171,25 +187,162 @@ def list_sections(text: str) -> list:
     return re.findall(pattern, text)
-def extract_section(text: str, section_name: str) -> Optional[str]:
-    """Extract content of a specific section (including its subsections)."""
-    # Find the start of the requested section
-    pattern = rf'\\section\*?\{{{re.escape(section_name)}\}}'
-    start_match = re.search(pattern, text)
-    if not start_match:
-        return None
+@dataclass
+class SectionNode:
+    """Represents a section/subsection/subsubsection in the LaTeX document tree."""
+    level: int  # 0=section, 1=subsection, 2=subsubsection
+    name: str
+    start_pos: int
+    end_pos: int = -1  # -1 means end of document
+    children: List['SectionNode'] = field(default_factory=list)
+    parent: Optional['SectionNode'] = None
-    start_pos = start_match.start()
-    # Find the next \section (not subsection) or end of document
-    remaining = text[start_match.end():]
-    end_match = re.search(r'\\section\*?\{', remaining)
+def parse_section_tree(text: str) -> List[SectionNode]:
+    """
+    Build a hierarchical tree from LaTeX section commands.
-    if end_match:
-        end_pos = start_match.end() + end_match.start()
-        return text[start_pos:end_pos].rstrip()
-    else:
-        return text[start_pos:].rstrip()
+    Returns a list of top-level section nodes, each containing their subsections as children.
+    """
+    # Match section, subsection, and subsubsection commands
+    pattern = r'\\(section|subsection|subsubsection)\*?\{([^}]+)\}'
+    level_map = {'section': 0, 'subsection': 1, 'subsubsection': 2}
+    # Find all section commands with their positions
+    matches = list(re.finditer(pattern, text))
+    if not matches:
+        return []
+    # Create nodes for all sections
+    all_nodes = []
+    for match in matches:
+        level = level_map[match.group(1)]
+        name = match.group(2)
+        start_pos = match.start()
+        all_nodes.append(SectionNode(level=level, name=name, start_pos=start_pos))
+    # Calculate end positions (each section ends where the next same-or-higher level starts)
+    for i, node in enumerate(all_nodes):
+        # Find next section at same or higher (lower number) level
+        for j in range(i + 1, len(all_nodes)):
+            if all_nodes[j].level <= node.level:
+                node.end_pos = all_nodes[j].start_pos
+                break
+        # If no next section found at same/higher level, end at document end
+        if node.end_pos == -1:
+            node.end_pos = len(text)
+    # Build tree structure
+    root_nodes: List[SectionNode] = []
+    section_stack: List[SectionNode] = []
+    for node in all_nodes:
+        # Pop from stack until we find a parent at a higher level
+        while section_stack and section_stack[-1].level >= node.level:
+            section_stack.pop()
+        if section_stack:
+            # This node is a child of the top of the stack
+            node.parent = section_stack[-1]
+            section_stack[-1].children.append(node)
+        else:
+            # This is a root node
+            root_nodes.append(node)
+        section_stack.append(node)
+    return root_nodes
+def format_section_tree(nodes: List[SectionNode], indent: int = 0) -> str:
+    """
+    Format section tree with indentation for display.
+    Returns a string with each section name on its own line, indented by level.
+    """
+    lines = []
+    for node in nodes:
+        lines.append("  " * indent + node.name)
+        if node.children:
+            lines.append(format_section_tree(node.children, indent + 1))
+    return "\n".join(lines)
+def find_all_by_name(nodes: List[SectionNode], name: str, parent_path: str = "") -> List[str]:
+    """
+    Find all paths to sections with the given name.
+    Returns a list of full paths (e.g., ["Introduction > Background", "Methods > Background"])
+    """
+    results = []
+    for node in nodes:
+        current_path = f"{parent_path} > {node.name}" if parent_path else node.name
+        if node.name == name:
+            results.append(current_path)
+        if node.children:
+            results.extend(find_all_by_name(node.children, name, current_path))
+    return results
+def find_section_by_path(nodes: List[SectionNode], path: str) -> Optional[SectionNode]:
+    """
+    Find a section by path notation (e.g., "Methods > Background").
+    If path contains no " > ", searches for an exact name match at any level.
+    If path contains " > ", follows the hierarchy.
+    """
+    parts = [p.strip() for p in path.split(" > ")]
+    if len(parts) == 1:
+        # Simple name lookup - find first match at any level
+        def find_first(nodes: List[SectionNode], name: str) -> Optional[SectionNode]:
+            for node in nodes:
+                if node.name == name:
+                    return node
+                if node.children:
+                    result = find_first(node.children, name)
+                    if result:
+                        return result
+            return None
+        return find_first(nodes, parts[0])
+    # Path notation - follow the hierarchy
+    current_nodes = nodes
+    current_node = None
+    for part in parts:
+        found = None
+        for node in current_nodes:
+            if node.name == part:
+                found = node
+                break
+        if not found:
+            return None
+        current_node = found
+        current_nodes = found.children
+    return current_node
+def extract_section(text: str, section_path: str) -> Optional[str]:
+    """
+    Extract content of a specific section, subsection, or subsubsection.
+    Args:
+        text: The LaTeX content
+        section_path: Section name or path (e.g., "Methods" or "Methods > Background")
+    Returns:
+        The section content including any subsections, or None if not found.
+    """
+    tree = parse_section_tree(text)
+    node = find_section_by_path(tree, section_path)
+    if not node:
+        return None
+    return text[node.start_pos:node.end_pos].rstrip()
 def flatten_tex(directory: str, main_file: str) -> str:

{arxiv_to_prompt-0.5.0.dist-info → arxiv_to_prompt-0.6.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: arxiv-to-prompt
-Version: 0.5.0
+Version: 0.6.0
 Summary: transform arXiv papers into a single latex prompt for LLMs
 Author: Takashi Ishida
 License: MIT
@@ -54,11 +54,27 @@ arxiv-to-prompt 2303.08774 --no-comments --no-appendix
 # Process a local folder containing TeX files (instead of downloading from arXiv)
 arxiv-to-prompt --local-folder /path/to/tex/files
-# List all section names in the paper
-arxiv-to-prompt 2303.08774 --list-sections
-# Extract only specific sections
-arxiv-to-prompt 2303.08774 --section "Introduction" --section "Methods"
+# List all sections (with subsections indented)
+arxiv-to-prompt 2307.09288 --list-sections
+# Introduction
+# Pretraining
+#   Pretraining Data
+#   Training Details
+#     Training Hardware \& Carbon Footprint
+#   ...
+# Extract specific sections
+arxiv-to-prompt 2307.09288 --section "Introduction" --section "Pretraining"
+# Ambiguous names show a helpful error
+arxiv-to-prompt 2307.09288 --section "Human Evaluation"
+# Warning: 'Human Evaluation' is ambiguous. Found at:
+#   - Fine-tuning > RLHF Results > Human Evaluation
+#   - Appendix > Additional Details for Fine-tuning > Human Evaluation
+# Use path notation to disambiguate.
+# Use path notation when the same name appears multiple times
+arxiv-to-prompt 2307.09288 --section "Fine-tuning > RLHF Results > Human Evaluation"
 # Copy to clipboard
 arxiv-to-prompt 2303.08774 | pbcopy

arxiv_to_prompt-0.6.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,9 @@
+arxiv_to_prompt/__init__.py,sha256=LbfYhirPwhaMpwV4-YgMwW6hA0GOQDHVCPYCPKabjw0,1169
+arxiv_to_prompt/cli.py,sha256=0a0DoOYkKIp8mE_FqzVYmG2gvCtnFiIJtIlfZLkZu5g,3865
+arxiv_to_prompt/core.py,sha256=kI0xKTf1igeOxNACJVOtq6PlCoN6kYuTq9KfD4jzE1M,18352
+arxiv_to_prompt-0.6.0.dist-info/licenses/LICENSE,sha256=np8L3--VyxwVJa_8D_mfK4RYrtnRMM_eeYN3rM4PMHo,1071
+arxiv_to_prompt-0.6.0.dist-info/METADATA,sha256=VRGqZuboa4DCMzQ2xpAN_G7SVFdOm1YyJ6cor62lr5k,5376
+arxiv_to_prompt-0.6.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+arxiv_to_prompt-0.6.0.dist-info/entry_points.txt,sha256=iYEEn8xZ_5OkhNIs5HCyHSQBpDRJkbD5h0tlAb16lL0,61
+arxiv_to_prompt-0.6.0.dist-info/top_level.txt,sha256=JClbu_lGGWu3RaTHZlNqTKB1-DUSbYXQNIYmJ9_F7fY,16
+arxiv_to_prompt-0.6.0.dist-info/RECORD,,

arxiv_to_prompt-0.5.0.dist-info/RECORD DELETED Viewed

@@ -1,9 +0,0 @@
-arxiv_to_prompt/__init__.py,sha256=LbfYhirPwhaMpwV4-YgMwW6hA0GOQDHVCPYCPKabjw0,1169
-arxiv_to_prompt/cli.py,sha256=IwT64A-lf5PrxCxs2e1adN09USkf7ji31uzO8YAegpU,3203
-arxiv_to_prompt/core.py,sha256=GafxYeE0dNg70hNG8BrSM7S99dIpHiy1KoNp5oW8niA,13119
-arxiv_to_prompt-0.5.0.dist-info/licenses/LICENSE,sha256=np8L3--VyxwVJa_8D_mfK4RYrtnRMM_eeYN3rM4PMHo,1071
-arxiv_to_prompt-0.5.0.dist-info/METADATA,sha256=4a66cO6DpNdd0dz3U_79QhL60Q1cAhHHyExWUqhL4eo,4786
-arxiv_to_prompt-0.5.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-arxiv_to_prompt-0.5.0.dist-info/entry_points.txt,sha256=iYEEn8xZ_5OkhNIs5HCyHSQBpDRJkbD5h0tlAb16lL0,61
-arxiv_to_prompt-0.5.0.dist-info/top_level.txt,sha256=JClbu_lGGWu3RaTHZlNqTKB1-DUSbYXQNIYmJ9_F7fY,16
-arxiv_to_prompt-0.5.0.dist-info/RECORD,,

{arxiv_to_prompt-0.5.0.dist-info → arxiv_to_prompt-0.6.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{arxiv_to_prompt-0.5.0.dist-info → arxiv_to_prompt-0.6.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{arxiv_to_prompt-0.5.0.dist-info → arxiv_to_prompt-0.6.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{arxiv_to_prompt-0.5.0.dist-info → arxiv_to_prompt-0.6.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

arxiv-to-prompt 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

arxiv-to-prompt 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl