PyPI - arxiv-to-prompt - Versions diffs - 0.1.0__py3-none-any.whl - Mend

arxiv-to-prompt 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

arxiv_to_prompt/__init__.py +20 -0
arxiv_to_prompt/cli.py +37 -0
arxiv_to_prompt/core.py +230 -0
arxiv_to_prompt-0.1.0.dist-info/LICENSE +21 -0
arxiv_to_prompt-0.1.0.dist-info/METADATA +95 -0
arxiv_to_prompt-0.1.0.dist-info/RECORD +9 -0
arxiv_to_prompt-0.1.0.dist-info/WHEEL +5 -0
arxiv_to_prompt-0.1.0.dist-info/entry_points.txt +2 -0
arxiv_to_prompt-0.1.0.dist-info/top_level.txt +1 -0

arxiv_to_prompt/__init__.py ADDED Viewed

@@ -0,0 +1,20 @@
+"""
+arxiv-to-prompt: A tool to download and process LaTeX source from arXiv papers.
+This package provides functionality to:
+- Download the source files of an arXiv paper using its ID
+- Concatenate multiple LaTeX files into a single coherent source
+- Optionally remove LaTeX comments
+Example:
+    >>> from arxiv_to_prompt import process_latex_source
+    >>> latex_source = process_latex_source("2303.08774")
+"""
+from .core import process_latex_source, download_arxiv_source, get_default_cache_dir
+__all__ = [
+    "process_latex_source",
+    "download_arxiv_source",
+    "get_default_cache_dir",
+]

arxiv_to_prompt/cli.py ADDED Viewed

@@ -0,0 +1,37 @@
+import argparse
+from .core import process_latex_source, get_default_cache_dir
+def main(argv=None):
+    """
+    Command-line entry point: print the flattened LaTeX source of an arXiv paper.
+    Args:
+        argv: Argument list to parse; argparse falls back to sys.argv[1:] when None.
+    Returns:
+        int: 0 if LaTeX content was printed, 1 if no content could be produced.
+    """
+    default_cache = str(get_default_cache_dir())
+    parser = argparse.ArgumentParser(
+        description="Download and display LaTeX source from arXiv papers."
+    )
+    parser.add_argument(
+        "arxiv_id",
+        help="The arXiv ID of the paper (do not include the version, e.g. v1, v2)"
+    )
+    parser.add_argument(
+        "--no-comments",
+        action="store_true",
+        help="Remove LaTeX comments from the output"
+    )
+    parser.add_argument(
+        "--cache-dir",
+        type=str,
+        help=f"Custom directory to store downloaded files (default: {default_cache})",
+        default=None
+    )
+    args = parser.parse_args(argv)
+    content = process_latex_source(
+        args.arxiv_id,
+        keep_comments=not args.no_comments,
+        cache_dir=args.cache_dir
+    )
+    if not content:
+        # The core module has already logged the reason; report the failure
+        # through the exit status so shell pipelines can detect it.
+        return 1
+    print(content)
+    return 0
+if __name__ == "__main__":
+    raise SystemExit(main())

arxiv_to_prompt/core.py ADDED Viewed

@@ -0,0 +1,230 @@
+import logging
+import os
+import tarfile
+import shutil
+from typing import Optional, List
+import re
+from pathlib import Path
+import requests
+# NOTE: this configures the root logger as a side effect of importing the module.
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+def get_default_cache_dir() -> Path:
+    """
+    Get the default cache directory for downloaded files.
+    Uses %LOCALAPPDATA% on Windows and $XDG_CACHE_HOME (default ~/.cache)
+    elsewhere. An unset or empty variable falls back to the default instead
+    of producing a path relative to the current working directory.
+    Returns:
+        Path: <base cache directory>/arxiv-to-prompt (the directory is not created).
+    """
+    if os.name == 'nt':  # Windows
+        base = os.environ.get('LOCALAPPDATA') or '~'
+    else:  # Unix/Linux/macOS
+        # XDG Base Directory spec: an empty XDG_CACHE_HOME means "use the default".
+        base = os.environ.get('XDG_CACHE_HOME') or '~/.cache'
+    return Path(base).expanduser() / 'arxiv-to-prompt'
+def _extract_archive_safely(tar: tarfile.TarFile, destination: Path) -> None:
+    """Extract tar into destination without letting any member escape it."""
+    if hasattr(tarfile, 'data_filter'):
+        # Extraction filters (PEP 706) reject absolute paths, '..' traversal,
+        # links pointing outside the destination and special files.
+        tar.extractall(path=destination, filter='data')
+        return
+    # Interpreters without extraction filters: keep only regular files and
+    # directories whose resolved location stays inside the destination.
+    root = os.path.realpath(destination)
+    safe_members = []
+    for member in tar.getmembers():
+        target = os.path.realpath(os.path.join(root, member.name))
+        inside = target == root or target.startswith(root + os.sep)
+        if inside and (member.isfile() or member.isdir()):
+            safe_members.append(member)
+        else:
+            logging.warning(f"Skipping unsafe archive member: {member.name}")
+    tar.extractall(path=destination, members=safe_members)
+def download_arxiv_source(arxiv_id: str, cache_dir: Optional[str] = None, use_cache: bool = False) -> bool:
+    """
+    Download source files from arXiv.
+    The archive is extracted into <cache_dir>/<arxiv_id>. Any existing copy of
+    that directory is replaced unless use_cache is True, in which case it is
+    reused without contacting arXiv.
+    Args:
+        arxiv_id: The arXiv ID of the paper
+        cache_dir: Custom directory to store downloaded files
+        use_cache: Whether to use cached files if they exist (default: False)
+    Returns:
+        bool: True if download successful, False if failed (including when source not available)
+    """
+    import io  # local import: only needed to unpack the download from memory
+    # Bound before the try block so the error handler can always reference it.
+    directory = None
+    try:
+        # Use provided cache_dir or default
+        base_dir = Path(cache_dir) if cache_dir else get_default_cache_dir()
+        directory = base_dir / arxiv_id
+        # Check the cache first so a cached paper does not need a network round trip
+        if use_cache and directory.exists():
+            logging.info(f"Directory {directory} already exists, using cached version.")
+            return True
+        # Check that tex source is available before touching anything on disk
+        if not check_source_available(arxiv_id):
+            logging.warning(f"TeX source files not available for {arxiv_id}")
+            return False
+        # Always use latest version by not specifying version in URL
+        url = f'https://arxiv.org/e-print/{arxiv_id}'
+        # Clean up existing directory if not using cache
+        if directory.exists():
+            shutil.rmtree(directory)
+        # Download the file
+        logging.info(f"Downloading source from {url}")
+        headers = {
+            'User-Agent': 'Mozilla/5.0'
+        }
+        response = requests.get(url, headers=headers, timeout=30)
+        response.raise_for_status()
+        directory.mkdir(parents=True, exist_ok=True)
+        # Unpack straight from memory: the payload is already fully loaded, and
+        # this avoids a shared temp directory and temp-file names derived from
+        # the ID (old-style IDs such as "hep-th/9901001" contain a slash).
+        with tarfile.open(fileobj=io.BytesIO(response.content)) as tar:
+            _extract_archive_safely(tar, directory)
+        logging.info(f"Source files downloaded and extracted to {directory}/")
+        return True
+    except Exception as e:
+        logging.error(f"Error downloading/extracting source: {e}")
+        if directory is not None and directory.exists():
+            shutil.rmtree(directory)  # Clean up failed download
+        return False
+def find_main_tex(directory: str) -> Optional[str]:
+    """
+    Find the main .tex file containing documentclass. If there are multiple files,
+    returns the filename of the longest .tex file containing documentclass, since shorter
+    files are typically conference templates or supplementary documents rather than the
+    main manuscript.
+    Only the top level of the directory is searched. Lines that start with a
+    comment marker are ignored, so a commented-out documentclass does not count.
+    Args:
+        directory: Directory holding the extracted source files
+    Returns:
+        The file name (not the full path) of the main file, or None if no
+        candidate was found. Ties go to the alphabetically first name.
+    """
+    best_name = None
+    best_length = 0
+    # Sorted so that ties are resolved the same way on every platform.
+    for file_name in sorted(os.listdir(directory)):
+        if not file_name.endswith('.tex'):
+            continue
+        try:
+            # errors='replace': a source that is not valid UTF-8 must still be
+            # considered; only the ASCII documentclass marker matters here.
+            with open(os.path.join(directory, file_name), 'r', encoding='utf-8', errors='replace') as file:
+                lines = file.readlines()
+        except OSError as e:
+            logging.warning(f"Could not read file {file_name}: {e}")
+            continue
+        has_documentclass = any(
+            '\\documentclass' in line and not line.lstrip().startswith('%')
+            for line in lines
+        )
+        if has_documentclass and len(lines) > best_length:
+            best_name = file_name
+            best_length = len(lines)
+    return best_name
+def remove_comments_from_lines(text: str) -> str:
+    """
+    Remove LaTeX comments from text.
+    Lines consisting only of a comment are dropped entirely. On other lines
+    everything from the first unescaped % is removed and trailing whitespace
+    is stripped. An escaped percent sign (backslash + %) is kept, while a
+    double backslash followed by % is a line break followed by a comment.
+    Args:
+        text: LaTeX source, possibly spanning several lines
+    Returns:
+        The source with comments removed
+    """
+    kept = []
+    for line in text.split('\n'):
+        # Skip pure comment lines
+        if line.lstrip().startswith('%'):
+            continue
+        cut = len(line)
+        escaped = False
+        for index, char in enumerate(line):
+            if escaped:
+                # Tested first so the second backslash of a pair does not
+                # escape the character that follows it.
+                escaped = False
+            elif char == '\\':
+                escaped = True
+            elif char == '%':
+                cut = index
+                break
+        kept.append(line[:cut].rstrip())
+    return '\n'.join(kept)
+def flatten_tex(directory: str, main_file: str) -> str:
+    """
+    Combine all tex files into one, resolving input and include commands.
+    Included names are looked up relative to directory: first with a .tex
+    suffix appended, then literally (e.g. a .bbl file). Files outside directory
+    are never read, commented-out commands are left untouched, and a file that
+    includes itself (directly or indirectly) is expanded only once per chain.
+    Args:
+        directory: Directory holding the extracted source files
+        main_file: File name of the main document, relative to directory
+    Returns:
+        The flattened source; includes that cannot be resolved or read are
+        replaced by an empty string (a warning is logged).
+    """
+    root = os.path.realpath(directory)
+    def is_commented_out(text: str, position: int) -> bool:
+        # True if an unescaped % precedes position on the same line.
+        line_start = text.rfind('\n', 0, position) + 1
+        escaped = False
+        for char in text[line_start:position]:
+            if escaped:
+                escaped = False
+            elif char == '\\':
+                escaped = True
+            elif char == '%':
+                return True
+        return False
+    def resolve(name: str) -> Optional[str]:
+        # Map an include argument to an existing file inside the source tree.
+        candidates = [name] if name.endswith('.tex') else [name + '.tex', name]
+        for candidate in candidates:
+            path = os.path.realpath(os.path.join(root, candidate))
+            # The sources are untrusted: refuse anything that escapes root.
+            if path.startswith(root + os.sep) and os.path.isfile(path):
+                return path
+        return None
+    def process_file(file_path: str, active_files: set) -> str:
+        # active_files holds only the current inclusion chain, so it breaks
+        # cycles without suppressing a file that is legitimately used twice.
+        if file_path in active_files:
+            return ""
+        active_files.add(file_path)
+        try:
+            with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
+                content = f.read()
+            # Process \input and \include commands
+            def replace_input(match):
+                if is_commented_out(match.string, match.start()):
+                    return match.group(0)
+                input_path = resolve(match.group(1).strip())
+                if input_path is None:
+                    logging.warning(f"Could not resolve included file {match.group(1)}")
+                    return ""
+                return process_file(input_path, active_files)
+            return re.sub(r'\\(?:input|include){([^}]+)}', replace_input, content)
+        except Exception as e:
+            logging.warning(f"Error processing file {file_path}: {e}")
+            return ""
+        finally:
+            active_files.discard(file_path)
+    main_file_path = os.path.realpath(os.path.join(root, main_file))
+    return process_file(main_file_path, set())
+def process_latex_source(arxiv_id: str, keep_comments: bool = True,
+                        cache_dir: Optional[str] = None,
+                        use_cache: bool = False) -> Optional[str]:
+    """
+    Process LaTeX source files from arXiv and return the combined content.
+    Downloads the source, picks the main .tex file, flattens its includes and
+    optionally strips comments.
+    Args:
+        arxiv_id: The arXiv ID of the paper
+        keep_comments: Whether to keep LaTeX comments in the output
+        cache_dir: Custom directory to store downloaded files
+        use_cache: Whether to use cached files if they exist (default: False)
+    Returns:
+        The processed LaTeX content or None if processing fails
+    """
+    # Download the latest version
+    if not download_arxiv_source(arxiv_id, cache_dir, use_cache):
+        return None
+    base_dir = Path(cache_dir) if cache_dir else get_default_cache_dir()
+    # The helpers below are annotated to take str paths.
+    directory = str(base_dir / arxiv_id)
+    main_file = find_main_tex(directory)
+    if not main_file:
+        logging.error("Main .tex file not found.")
+        return None
+    # Get the content
+    content = flatten_tex(directory, main_file)
+    if not content:
+        # flatten_tex returns "" when the main file could not be read; report
+        # that as the documented None instead of an empty string.
+        logging.error("Could not read LaTeX content from the main .tex file.")
+        return None
+    # Process comments if requested
+    if not keep_comments:
+        content = remove_comments_from_lines(content)
+    return content
+def check_source_available(arxiv_id: str) -> bool:
+    """
+    Report whether arXiv offers TeX source for a paper.
+    Fetches the paper's format page and looks for the 'Download source' text.
+    Args:
+        arxiv_id: The arXiv ID of the paper
+    Returns:
+        bool: True if the format page mentions a source download; False if it
+        does not or if the request fails (the error is logged).
+    """
+    format_url = f'https://arxiv.org/format/{arxiv_id}'
+    request_headers = {'User-Agent': 'Mozilla/5.0'}
+    # The context manager closes the session on every exit path.
+    with requests.Session() as http:
+        http.mount('https://', requests.adapters.HTTPAdapter(max_retries=3))
+        try:
+            # timeout is a (connect, read) pair, in seconds
+            reply = http.get(format_url, headers=request_headers, timeout=(5, 30))
+            reply.raise_for_status()
+            return 'Download source' in reply.text
+        except requests.exceptions.RequestException as e:
+            logging.error(f"Error checking source availability: {e}")
+            return False

arxiv_to_prompt-0.1.0.dist-info/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2025 Takashi Ishida
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

arxiv_to_prompt-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,95 @@
+Metadata-Version: 2.2
+Name: arxiv-to-prompt
+Version: 0.1.0
+Summary: Transform arXiv papers into a single LaTeX prompt for LLMs
+Author: Takashi Ishida
+License: MIT License
+        Copyright (c) 2025 Takashi Ishida
+        Permission is hereby granted, free of charge, to any person obtaining a copy
+        of this software and associated documentation files (the "Software"), to deal
+        in the Software without restriction, including without limitation the rights
+        to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+        copies of the Software, and to permit persons to whom the Software is
+        furnished to do so, subject to the following conditions:
+        The above copyright notice and this permission notice shall be included in all
+        copies or substantial portions of the Software.
+        THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+        IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+        FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+        AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+        LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+        OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+        SOFTWARE.
+Project-URL: Homepage, https://github.com/takashiishida/arxiv-to-prompt
+Project-URL: Changelog, https://github.com/takashiishida/arxiv-to-prompt/releases
+Project-URL: Issues, https://github.com/takashiishida/arxiv-to-prompt/issues
+Project-URL: CI, https://github.com/takashiishida/arxiv-to-prompt/actions
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: requests>=2.25.0
+Provides-Extra: test
+Requires-Dist: pytest>=7.0.0; extra == "test"
+Requires-Dist: pytest-cov>=4.0.0; extra == "test"
+# arxiv-to-prompt
+[![PyPI version](https://badge.fury.io/py/arxiv-to-prompt.svg)](https://pypi.org/project/arxiv-to-prompt/)
+[![Tests](https://github.com/takashiishida/arxiv-to-prompt/actions/workflows/tests.yml/badge.svg)](https://github.com/takashiishida/arxiv-to-prompt/actions)
+[![License](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+[![Changelog](https://img.shields.io/github/v/release/takashiishida/arxiv-to-prompt?label=changelog)](https://github.com/takashiishida/arxiv-to-prompt/releases)
+A command-line tool to transform arXiv papers into a single LaTeX source that can be used as a prompt for asking LLMs questions about the paper. It downloads the source files, automatically finds the main tex file containing `\documentclass`, and flattens multiple files into a single coherent source by resolving `\input` and `\include` commands. The tool also provides an option to remove LaTeX comments from the output (which can be useful to shorten the prompt).
+### Installation
+```bash
+pip install arxiv-to-prompt
+```
+### Usage
+Basic usage:
+```bash
+# Display LaTeX source with comments
+arxiv-to-prompt 2303.08774
+# Display LaTeX source without comments
+arxiv-to-prompt 2303.08774 --no-comments
+# Copy to clipboard
+arxiv-to-prompt 2303.08774 | pbcopy
+```
+The arXiv ID can be found in the paper's URL. For example, for `https://arxiv.org/abs/2303.08774`, the ID is `2303.08774`. It will automatically download the latest version of the paper, so you don't need to specify the version.
+### Python API
+You can also use arxiv-to-prompt in your Python code:
+```python
+from arxiv_to_prompt import process_latex_source
+# Get LaTeX source with comments
+latex_source = process_latex_source("2303.08774")
+# Get LaTeX source without comments
+latex_source = process_latex_source("2303.08774", keep_comments=False)
+```
+### References
+- Inspired by [files-to-prompt](https://github.com/simonw/files-to-prompt).
+- Reused some code from [paper2slides](https://github.com/takashiishida/paper2slides).

arxiv_to_prompt-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,9 @@
+arxiv_to_prompt/__init__.py,sha256=oL2bEzZhiFoMqCF-84Xmljqw55lgRkwInBFpExRPCTY,609
+arxiv_to_prompt/cli.py,sha256=WafgKxxpgJrLyeuQ-tnUASoknoNXiaQRWLP-Emsr-ug,977
+arxiv_to_prompt/core.py,sha256=cQcMNQJSrRVQAQsy2ULeLVlQlKIDDdgVLHFKJNMR0Sg,8296
+arxiv_to_prompt-0.1.0.dist-info/LICENSE,sha256=np8L3--VyxwVJa_8D_mfK4RYrtnRMM_eeYN3rM4PMHo,1071
+arxiv_to_prompt-0.1.0.dist-info/METADATA,sha256=H8T6HFkP199SK19Jy66MgrVE2S8kTBr-2yYzC9qpQBs,4338
+arxiv_to_prompt-0.1.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+arxiv_to_prompt-0.1.0.dist-info/entry_points.txt,sha256=iYEEn8xZ_5OkhNIs5HCyHSQBpDRJkbD5h0tlAb16lL0,61
+arxiv_to_prompt-0.1.0.dist-info/top_level.txt,sha256=JClbu_lGGWu3RaTHZlNqTKB1-DUSbYXQNIYmJ9_F7fY,16
+arxiv_to_prompt-0.1.0.dist-info/RECORD,,

arxiv_to_prompt-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (75.8.0)
+Root-Is-Purelib: true
+Tag: py3-none-any

arxiv_to_prompt-0.1.0.dist-info/entry_points.txt ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ [console_scripts]
2	+ arxiv-to-prompt = arxiv_to_prompt.cli:main

arxiv_to_prompt-0.1.0.dist-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ arxiv_to_prompt