arxiv-to-prompt 0.2.2__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,14 +1,18 @@
1
1
  """
2
- arxiv-to-prompt: A tool to download and process LaTeX source from arXiv papers.
2
+ arxiv-to-prompt: A tool to download and process LaTeX source from arXiv papers or local folders.
3
3
 
4
4
  This package provides functionality to:
5
5
  - Download source files from any arXiv paper using its ID
6
+ - Process LaTeX source files from a local folder
6
7
  - Smart concatenation of multiple LaTeX files into a single coherent source
7
8
  - Option to remove LaTeX comments
8
9
 
9
10
  Example:
10
11
  >>> from arxiv_to_prompt import process_latex_source
12
+ >>> # From arXiv
11
13
  >>> latex_source = process_latex_source("2303.08774")
14
+ >>> # From local folder
15
+ >>> latex_source = process_latex_source(local_folder="/path/to/tex/files")
12
16
  """
13
17
 
14
18
  from .core import process_latex_source, download_arxiv_source, get_default_cache_dir
arxiv_to_prompt/cli.py CHANGED
@@ -5,11 +5,13 @@ def main():
5
5
  default_cache = str(get_default_cache_dir())
6
6
 
7
7
  parser = argparse.ArgumentParser(
8
- description="Download and display LaTeX source from arXiv papers."
8
+ description="Download and display LaTeX source from arXiv papers or process local TeX files."
9
9
  )
10
10
  parser.add_argument(
11
11
  "arxiv_id",
12
- help="The arXiv ID of the paper (do not include the version, e.g. v1, v2)"
12
+ nargs="?",
13
+ default=None,
14
+ help="The arXiv ID of the paper (do not include the version, e.g. v1, v2). Not needed if --local-folder is provided."
13
15
  )
14
16
  parser.add_argument(
15
17
  "--no-comments",
@@ -27,14 +29,28 @@ def main():
27
29
  action="store_true",
28
30
  help="Remove the appendix section and everything after it"
29
31
  )
32
+ parser.add_argument(
33
+ "--local-folder",
34
+ type=str,
35
+ help="Path to a local folder containing TeX files (alternative to arxiv_id)",
36
+ default=None
37
+ )
30
38
 
31
39
  args = parser.parse_args()
32
40
 
41
+ # Validate that either arxiv_id or local_folder is provided
42
+ if not args.arxiv_id and not args.local_folder:
43
+ parser.error("Either provide an arXiv ID or use --local-folder to specify a local folder")
44
+
45
+ if args.arxiv_id and args.local_folder:
46
+ parser.error("Cannot specify both arXiv ID and --local-folder")
47
+
33
48
  content = process_latex_source(
34
- args.arxiv_id,
49
+ arxiv_id=args.arxiv_id,
35
50
  keep_comments=not args.no_comments,
36
51
  cache_dir=args.cache_dir,
37
- remove_appendix_section=args.no_appendix
52
+ remove_appendix_section=args.no_appendix,
53
+ local_folder=args.local_folder
38
54
  )
39
55
  if content:
40
56
  print(content)
arxiv_to_prompt/core.py CHANGED
@@ -92,14 +92,28 @@ def download_arxiv_source(arxiv_id: str, cache_dir: Optional[str] = None, use_ca
92
92
 
93
93
  def find_main_tex(directory: str) -> Optional[str]:
94
94
  """
95
- Find the main .tex file containing documentclass. If there are multiple files,
96
- returns the filename of the longest .tex file containing documentclass, since shorter
97
- files are typically conference templates or supplementary documents rather than the
98
- main manuscript.
95
+ Find the main .tex file containing documentclass.
96
+ First checks for common naming conventions (main.tex, paper.tex, index.tex).
97
+ If none found, returns the filename of the longest .tex file containing documentclass,
98
+ since shorter files are typically conference templates or supplementary documents
99
+ rather than the main manuscript.
99
100
  """
101
+ common_names = ['main.tex', 'paper.tex', 'index.tex']
100
102
  main_tex_file = None
101
103
  max_line_count = 0
102
104
 
105
+ # First pass: check for common naming conventions
106
+ for file_name in os.listdir(directory):
107
+ if file_name in common_names:
108
+ try:
109
+ with open(os.path.join(directory, file_name), 'r', encoding='utf-8') as file:
110
+ lines = file.readlines()
111
+ if any('\\documentclass' in line for line in lines):
112
+ return file_name
113
+ except Exception as e:
114
+ logging.warning(f"Could not read file {file_name}: {e}")
115
+
116
+ # Second pass: find the longest .tex file containing documentclass
103
117
  for file_name in os.listdir(directory):
104
118
  if file_name.endswith('.tex'):
105
119
  try:
@@ -208,37 +222,57 @@ def flatten_tex(directory: str, main_file: str) -> str:
208
222
  main_file_path = os.path.join(directory, main_file)
209
223
  return process_file(main_file_path, set())
210
224
 
211
- def process_latex_source(arxiv_id: str, keep_comments: bool = True,
225
+ def process_latex_source(arxiv_id: Optional[str] = None, keep_comments: bool = True,
212
226
  cache_dir: Optional[str] = None,
213
- use_cache: bool = False, remove_appendix_section: bool = False) -> Optional[str]:
227
+ use_cache: bool = False, remove_appendix_section: bool = False,
228
+ local_folder: Optional[str] = None) -> Optional[str]:
214
229
  """
215
- Process LaTeX source files from arXiv and return the combined content.
230
+ Process LaTeX source files from arXiv or a local folder and return the combined content.
216
231
 
217
232
  Args:
218
- arxiv_id: The arXiv ID of the paper
233
+ arxiv_id: The arXiv ID of the paper (required if local_folder is not provided)
219
234
  keep_comments: Whether to keep LaTeX comments in the output
220
- cache_dir: Custom directory to store downloaded files
221
- use_cache: Whether to use cached files if they exist (default: False)
235
+ cache_dir: Custom directory to store downloaded files (only used for arXiv)
236
+ use_cache: Whether to use cached files if they exist (default: False, only used for arXiv)
222
237
  remove_appendix_section: Whether to remove the appendix section and everything after it
238
+ local_folder: Path to a local folder containing TeX files (alternative to arxiv_id)
223
239
 
224
240
  Returns:
225
241
  The processed LaTeX content or None if processing fails
226
242
  """
227
- base_dir = Path(cache_dir) if cache_dir else get_default_cache_dir()
228
-
229
- # Download the latest version
230
- if not download_arxiv_source(arxiv_id, cache_dir, use_cache):
243
+ # Determine the directory to process
244
+ if local_folder:
245
+ directory = Path(local_folder).expanduser().resolve()
246
+
247
+ # Validate the folder exists
248
+ if not directory.exists():
249
+ logging.error(f"Local folder does not exist: {directory}")
250
+ return None
251
+
252
+ if not directory.is_dir():
253
+ logging.error(f"Path is not a directory: {directory}")
254
+ return None
255
+
256
+ logging.info(f"Processing local folder: {directory}")
257
+ elif arxiv_id:
258
+ base_dir = Path(cache_dir) if cache_dir else get_default_cache_dir()
259
+
260
+ # Download the latest version
261
+ if not download_arxiv_source(arxiv_id, cache_dir, use_cache):
262
+ return None
263
+
264
+ directory = base_dir / arxiv_id
265
+ else:
266
+ logging.error("Either arxiv_id or local_folder must be provided")
231
267
  return None
232
-
233
- directory = base_dir / arxiv_id
234
268
 
235
- main_file = find_main_tex(directory)
269
+ main_file = find_main_tex(str(directory))
236
270
  if not main_file:
237
271
  logging.error("Main .tex file not found.")
238
272
  return None
239
273
 
240
274
  # Get the content
241
- content = flatten_tex(directory, main_file)
275
+ content = flatten_tex(str(directory), main_file)
242
276
 
243
277
  # Process comments if requested
244
278
  if not keep_comments:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: arxiv-to-prompt
3
- Version: 0.2.2
3
+ Version: 0.3.0
4
4
  Summary: transform arXiv papers into a single latex prompt for LLMs
5
5
  Author: Takashi Ishida
6
6
  License: MIT
@@ -17,12 +17,15 @@ Requires-Dist: pytest>=7.0.0; extra == "test"
17
17
  Requires-Dist: pytest-cov>=4.0.0; extra == "test"
18
18
  Dynamic: license-file
19
19
 
20
- # arxiv-to-prompt
20
+ <div align="center">
21
+ <img src="logo.png#gh-light-mode-only" alt="" width="475"><img src="logo.png#gh-dark-mode-only" alt="" width="475">
21
22
 
22
- [![PyPI version](https://badge.fury.io/py/arxiv-to-prompt.svg?update=20250307)](https://pypi.org/project/arxiv-to-prompt/)
23
+ [![PyPI version](https://badge.fury.io/py/arxiv-to-prompt.svg)](https://pypi.org/project/arxiv-to-prompt/)
23
24
  [![Tests](https://github.com/takashiishida/arxiv-to-prompt/actions/workflows/tests.yml/badge.svg)](https://github.com/takashiishida/arxiv-to-prompt/actions)
24
25
  [![License](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
25
26
  [![Changelog](https://img.shields.io/github/v/release/takashiishida/arxiv-to-prompt?label=changelog)](https://github.com/takashiishida/arxiv-to-prompt/releases)
27
+ [![Downloads](https://static.pepy.tech/badge/arxiv-to-prompt)](https://pepy.tech/project/arxiv-to-prompt)
28
+ </div>
26
29
 
27
30
  A command-line tool to transform arXiv papers into a single LaTeX source that can be used as a prompt for asking LLMs questions about the paper. It downloads the source files, automatically finds the main tex file containing `\documentclass`, and flattens multiple files into a single coherent source by resolving `\input` and `\include` commands. The tool also provides options to remove LaTeX comments and appendix sections from the output (which can be useful to shorten the prompt).
28
31
 
@@ -48,6 +51,9 @@ arxiv-to-prompt 2303.08774 --no-appendix
48
51
  # Combine options (no comments and no appendix)
49
52
  arxiv-to-prompt 2303.08774 --no-comments --no-appendix
50
53
 
54
+ # Process a local folder containing TeX files (instead of downloading from arXiv)
55
+ arxiv-to-prompt --local-folder /path/to/tex/files
56
+
51
57
  # Copy to clipboard
52
58
  arxiv-to-prompt 2303.08774 | pbcopy
53
59
 
@@ -75,18 +81,18 @@ latex_source = process_latex_source("2303.08774", remove_appendix_section=True)
75
81
 
76
82
  # Combine options (no comments and no appendix)
77
83
  latex_source = process_latex_source("2303.08774", keep_comments=False, remove_appendix_section=True)
84
+
85
+ # Process LaTeX sources from a local folder (instead of downloading from arXiv)
86
+ latex_source = process_latex_source(local_folder="/path/to/tex/files")
78
87
  ```
79
88
 
80
89
  ### Projects Using arxiv-to-prompt
81
90
 
82
91
  Here are some projects and use cases that leverage arxiv-to-prompt:
83
92
 
84
- - [arxiv-latex-mcp](https://github.com/takashiishida/arxiv-latex-mcp): MCP server that uses arxiv-to-prompt to fetch and process arXiv LaTeX sources for precise interpretation of mathematical expressions in scientific papers.
85
- - [arxiv-tex-ui](https://github.com/takashiishida/arxiv-tex-ui): chat with an llm about an arxiv paper by using the latex source.
93
+ - [arxiv-latex-mcp](https://github.com/takashiishida/arxiv-latex-mcp): MCP server that fetches and processes arXiv LaTeX sources for precise interpretation of mathematical expressions in papers.
94
+ - [arxiv-tex-ui](https://github.com/takashiishida/arxiv-tex-ui): chat with an LLM about an arXiv paper by using the LaTeX source.
95
+ - [paper2slides](https://github.com/takashiishida/paper2slides): transform an arXiv paper into slides.
96
+ - [ArXivToPrompt](https://apps.apple.com/jp/app/arxivtoprompt/id6751013390): iOS app that allows users to easily extract LaTeX source from arXiv papers on their iPhone and copy it to the clipboard for use with LLM apps.
86
97
 
87
98
  If you're using arxiv-to-prompt in your project, please submit a pull request to add it to this list!
88
-
89
- ### References
90
-
91
- - Inspired by [files-to-prompt](https://github.com/simonw/files-to-prompt).
92
- - Reused some code from [paper2slides](https://github.com/takashiishida/paper2slides).
@@ -0,0 +1,9 @@
1
+ arxiv_to_prompt/__init__.py,sha256=riK7TcTaKDleP5g5rjf2jkmLtXZu7irNZDujyAVDnKM,1093
2
+ arxiv_to_prompt/cli.py,sha256=TUnHsGolF5zhiexW5RXBPhNL0HODmfppEkXzC8z65NE,1861
3
+ arxiv_to_prompt/core.py,sha256=pgb8PGiOqgbPTW5rIJwLlmS9n3nnlYa5UVQ5YSvCIuo,12077
4
+ arxiv_to_prompt-0.3.0.dist-info/licenses/LICENSE,sha256=np8L3--VyxwVJa_8D_mfK4RYrtnRMM_eeYN3rM4PMHo,1071
5
+ arxiv_to_prompt-0.3.0.dist-info/METADATA,sha256=CNOBI0du7Yj9Hyr6JElWhohvLgd4jLUvlZEn9c8NexU,4608
6
+ arxiv_to_prompt-0.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
7
+ arxiv_to_prompt-0.3.0.dist-info/entry_points.txt,sha256=iYEEn8xZ_5OkhNIs5HCyHSQBpDRJkbD5h0tlAb16lL0,61
8
+ arxiv_to_prompt-0.3.0.dist-info/top_level.txt,sha256=JClbu_lGGWu3RaTHZlNqTKB1-DUSbYXQNIYmJ9_F7fY,16
9
+ arxiv_to_prompt-0.3.0.dist-info/RECORD,,
@@ -1,9 +0,0 @@
1
- arxiv_to_prompt/__init__.py,sha256=UjbXdsTGX7eT6O1RvqGB1-wMv_Kj-pM-7M5FZUUzVIQ,899
2
- arxiv_to_prompt/cli.py,sha256=2ZVmxNcygFpOFROfCo-FtXzcRpLVVRUOkIhASL0iD7o,1179
3
- arxiv_to_prompt/core.py,sha256=0XwG9hqljQ3FHDOmmR7C8CX4ge1CJJAqSosVzTXhkes,10425
4
- arxiv_to_prompt-0.2.2.dist-info/licenses/LICENSE,sha256=np8L3--VyxwVJa_8D_mfK4RYrtnRMM_eeYN3rM4PMHo,1071
5
- arxiv_to_prompt-0.2.2.dist-info/METADATA,sha256=MwmVCm5oLxKS8L-gQeFHK5W-YCidjepwebVesWHskco,3998
6
- arxiv_to_prompt-0.2.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
7
- arxiv_to_prompt-0.2.2.dist-info/entry_points.txt,sha256=iYEEn8xZ_5OkhNIs5HCyHSQBpDRJkbD5h0tlAb16lL0,61
8
- arxiv_to_prompt-0.2.2.dist-info/top_level.txt,sha256=JClbu_lGGWu3RaTHZlNqTKB1-DUSbYXQNIYmJ9_F7fY,16
9
- arxiv_to_prompt-0.2.2.dist-info/RECORD,,