PyPI - arxiv-to-prompt - Versions diffs - 0.4.1__py3-none-any.whl → 0.5.1__py3-none-any.whl - Mend

arxiv-to-prompt 0.4.1__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

arxiv_to_prompt/__init__.py CHANGED Viewed

@@ -15,7 +15,7 @@ Example:
     >>> latex_source = process_latex_source(local_folder="/path/to/tex/files")
 """
-from .core import process_latex_source, download_arxiv_source, get_default_cache_dir
+from .core import process_latex_source, download_arxiv_source, get_default_cache_dir, list_sections, extract_section
 # Import version from package metadata
 try:
@@ -32,5 +32,7 @@ __all__ = [
     "process_latex_source",
     "download_arxiv_source",
     "get_default_cache_dir",
+    "list_sections",
+    "extract_section",
     "__version__",
 ]

arxiv_to_prompt/cli.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import argparse
 import re
-from .core import process_latex_source, get_default_cache_dir
+from .core import process_latex_source, get_default_cache_dir, list_sections, extract_section
 def extract_arxiv_id(input_str: str) -> str:
@@ -45,7 +45,18 @@ def main():
         help="Path to a local folder containing TeX files (alternative to arxiv_id)",
         default=None
     )
+    parser.add_argument(
+        "--list-sections",
+        action="store_true",
+        help="List all section names in the document"
+    )
+    parser.add_argument(
+        "--section",
+        type=str,
+        action="append",
+        help="Extract only the specified section(s). Can be used multiple times."
+    )
     args = parser.parse_args()
     # Validate that either arxiv_id or local_folder is provided
@@ -64,7 +75,24 @@ def main():
         remove_appendix_section=args.no_appendix,
         local_folder=args.local_folder
     )
-    if content:
+    if not content:
+        return
+    if args.list_sections:
+        sections = list_sections(content)
+        for section in sections:
+            print(section)
+    elif args.section:
+        extracted = []
+        for section_name in args.section:
+            section_content = extract_section(content, section_name)
+            if section_content:
+                extracted.append(section_content)
+            else:
+                print(f"Warning: Section '{section_name}' not found", file=__import__('sys').stderr)
+        if extracted:
+            print("\n\n".join(extracted))
+    else:
         print(content)
 if __name__ == "__main__":

arxiv_to_prompt/core.py CHANGED Viewed

@@ -92,40 +92,55 @@ def download_arxiv_source(arxiv_id: str, cache_dir: Optional[str] = None, use_ca
 def find_main_tex(directory: str) -> Optional[str]:
     """
-    Find the main .tex file containing documentclass.
+    Find the main .tex file containing documentclass.
+    Searches recursively through subdirectories.
     First checks for common naming conventions (main.tex, paper.tex, index.tex).
-    If none found, returns the filename of the longest .tex file containing documentclass,
-    since shorter files are typically conference templates or supplementary documents
+    If none found, returns the path of the longest .tex file containing documentclass,
+    since shorter files are typically conference templates or supplementary documents
     rather than the main manuscript.
     """
     common_names = ['main.tex', 'paper.tex', 'index.tex']
     main_tex_file = None
     max_line_count = 0
-    # First pass: check for common naming conventions
-    for file_name in os.listdir(directory):
-        if file_name in common_names:
-            try:
-                with open(os.path.join(directory, file_name), 'r', encoding='utf-8') as file:
-                    lines = file.readlines()
-                    if any('\\documentclass' in line for line in lines):
-                        return file_name
-            except Exception as e:
-                logging.warning(f"Could not read file {file_name}: {e}")
+    # Walk through directory and subdirectories
+    for root, dirs, files in os.walk(directory):
+        rel_root = os.path.relpath(root, directory)
+        # First pass: check for common naming conventions
+        for file_name in files:
+            if file_name in common_names:
+                file_path = os.path.join(root, file_name)
+                try:
+                    with open(file_path, 'r', encoding='utf-8') as file:
+                        lines = file.readlines()
+                        if any('\\documentclass' in line for line in lines):
+                            if rel_root == '.':
+                                return file_name
+                            return os.path.join(rel_root, file_name)
+                except Exception as e:
+                    logging.warning(f"Could not read file {file_path}: {e}")
     # Second pass: find the longest .tex file containing documentclass
-    for file_name in os.listdir(directory):
-        if file_name.endswith('.tex'):
-            try:
-                with open(os.path.join(directory, file_name), 'r', encoding='utf-8') as file:
-                    lines = file.readlines()
-                    if any('\\documentclass' in line for line in lines):
-                        line_count = len(lines)
-                        if line_count > max_line_count:
-                            main_tex_file = file_name
-                            max_line_count = line_count
-            except Exception as e:
-                logging.warning(f"Could not read file {file_name}: {e}")
+    for root, dirs, files in os.walk(directory):
+        rel_root = os.path.relpath(root, directory)
+        for file_name in files:
+            if file_name.endswith('.tex'):
+                file_path = os.path.join(root, file_name)
+                try:
+                    with open(file_path, 'r', encoding='utf-8') as file:
+                        lines = file.readlines()
+                        if any('\\documentclass' in line for line in lines):
+                            line_count = len(lines)
+                            if line_count > max_line_count:
+                                if rel_root == '.':
+                                    main_tex_file = file_name
+                                else:
+                                    main_tex_file = os.path.join(rel_root, file_name)
+                                max_line_count = line_count
+                except Exception as e:
+                    logging.warning(f"Could not read file {file_path}: {e}")
     return main_tex_file
@@ -164,6 +179,34 @@ def remove_appendix(text: str) -> str:
         return text[:appendix_match.start()].rstrip()
     return text
+def list_sections(text: str) -> list:
+    """Extract all section names from LaTeX content."""
+    pattern = r'\\section\*?\{([^}]+)\}'
+    return re.findall(pattern, text)
+def extract_section(text: str, section_name: str) -> Optional[str]:
+    """Extract content of a specific section (including its subsections)."""
+    # Find the start of the requested section
+    pattern = rf'\\section\*?\{{{re.escape(section_name)}\}}'
+    start_match = re.search(pattern, text)
+    if not start_match:
+        return None
+    start_pos = start_match.start()
+    # Find the next \section (not subsection) or end of document
+    remaining = text[start_match.end():]
+    end_match = re.search(r'\\section\*?\{', remaining)
+    if end_match:
+        end_pos = start_match.end() + end_match.start()
+        return text[start_pos:end_pos].rstrip()
+    else:
+        return text[start_pos:].rstrip()
 def flatten_tex(directory: str, main_file: str) -> str:
     """Combine all tex files into one, resolving inputs."""
     def process_file(file_path: str, processed_files: set) -> str:

{arxiv_to_prompt-0.4.1.dist-info → arxiv_to_prompt-0.5.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: arxiv-to-prompt
-Version: 0.4.1
+Version: 0.5.1
 Summary: transform arXiv papers into a single latex prompt for LLMs
 Author: Takashi Ishida
 License: MIT
@@ -54,6 +54,12 @@ arxiv-to-prompt 2303.08774 --no-comments --no-appendix
 # Process a local folder containing TeX files (instead of downloading from arXiv)
 arxiv-to-prompt --local-folder /path/to/tex/files
+# List all section names in the paper
+arxiv-to-prompt 2303.08774 --list-sections
+# Extract only specific sections
+arxiv-to-prompt 2303.08774 --section "Introduction" --section "Methods"
 # Copy to clipboard
 arxiv-to-prompt 2303.08774 | pbcopy

arxiv_to_prompt-0.5.1.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,9 @@
+arxiv_to_prompt/__init__.py,sha256=LbfYhirPwhaMpwV4-YgMwW6hA0GOQDHVCPYCPKabjw0,1169
+arxiv_to_prompt/cli.py,sha256=IwT64A-lf5PrxCxs2e1adN09USkf7ji31uzO8YAegpU,3203
+arxiv_to_prompt/core.py,sha256=ln67k1MT-l8PalwGsszU6IwCZ15GAOiX0yfLgyKvySA,13837
+arxiv_to_prompt-0.5.1.dist-info/licenses/LICENSE,sha256=np8L3--VyxwVJa_8D_mfK4RYrtnRMM_eeYN3rM4PMHo,1071
+arxiv_to_prompt-0.5.1.dist-info/METADATA,sha256=VKK7my5pxFuVLTejMV3vS8BLhk_kV62HHPWxC84_80Q,4786
+arxiv_to_prompt-0.5.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+arxiv_to_prompt-0.5.1.dist-info/entry_points.txt,sha256=iYEEn8xZ_5OkhNIs5HCyHSQBpDRJkbD5h0tlAb16lL0,61
+arxiv_to_prompt-0.5.1.dist-info/top_level.txt,sha256=JClbu_lGGWu3RaTHZlNqTKB1-DUSbYXQNIYmJ9_F7fY,16
+arxiv_to_prompt-0.5.1.dist-info/RECORD,,

arxiv_to_prompt-0.4.1.dist-info/RECORD DELETED Viewed

@@ -1,9 +0,0 @@
-arxiv_to_prompt/__init__.py,sha256=riK7TcTaKDleP5g5rjf2jkmLtXZu7irNZDujyAVDnKM,1093
-arxiv_to_prompt/cli.py,sha256=np6mv2iCkLiVLawyix1vXP4bVFzRmZlkfjxb07ee89Q,2276
-arxiv_to_prompt/core.py,sha256=6tl6IZh5BlBENKa3QMHG0ekqhhLLmh82oUQpgfYrz2o,12228
-arxiv_to_prompt-0.4.1.dist-info/licenses/LICENSE,sha256=np8L3--VyxwVJa_8D_mfK4RYrtnRMM_eeYN3rM4PMHo,1071
-arxiv_to_prompt-0.4.1.dist-info/METADATA,sha256=p5DIa1t9ik8_Mdn-7qxvfm8j0k--kAMn689-a3WocNM,4598
-arxiv_to_prompt-0.4.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-arxiv_to_prompt-0.4.1.dist-info/entry_points.txt,sha256=iYEEn8xZ_5OkhNIs5HCyHSQBpDRJkbD5h0tlAb16lL0,61
-arxiv_to_prompt-0.4.1.dist-info/top_level.txt,sha256=JClbu_lGGWu3RaTHZlNqTKB1-DUSbYXQNIYmJ9_F7fY,16
-arxiv_to_prompt-0.4.1.dist-info/RECORD,,

{arxiv_to_prompt-0.4.1.dist-info → arxiv_to_prompt-0.5.1.dist-info}/WHEEL RENAMED Viewed

File without changes

{arxiv_to_prompt-0.4.1.dist-info → arxiv_to_prompt-0.5.1.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{arxiv_to_prompt-0.4.1.dist-info → arxiv_to_prompt-0.5.1.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{arxiv_to_prompt-0.4.1.dist-info → arxiv_to_prompt-0.5.1.dist-info}/top_level.txt RENAMED Viewed

File without changes

arxiv-to-prompt 0.4.1__py3-none-any.whl → 0.5.1__py3-none-any.whl

arxiv-to-prompt 0.4.1__py3-none-any.whl → 0.5.1__py3-none-any.whl