arxiv-to-prompt 0.4.1__tar.gz → 0.5.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: arxiv-to-prompt
3
- Version: 0.4.1
3
+ Version: 0.5.1
4
4
  Summary: transform arXiv papers into a single latex prompt for LLMs
5
5
  Author: Takashi Ishida
6
6
  License: MIT
@@ -54,6 +54,12 @@ arxiv-to-prompt 2303.08774 --no-comments --no-appendix
54
54
  # Process a local folder containing TeX files (instead of downloading from arXiv)
55
55
  arxiv-to-prompt --local-folder /path/to/tex/files
56
56
 
57
+ # List all section names in the paper
58
+ arxiv-to-prompt 2303.08774 --list-sections
59
+
60
+ # Extract only specific sections
61
+ arxiv-to-prompt 2303.08774 --section "Introduction" --section "Methods"
62
+
57
63
  # Copy to clipboard
58
64
  arxiv-to-prompt 2303.08774 | pbcopy
59
65
 
@@ -35,6 +35,12 @@ arxiv-to-prompt 2303.08774 --no-comments --no-appendix
35
35
  # Process a local folder containing TeX files (instead of downloading from arXiv)
36
36
  arxiv-to-prompt --local-folder /path/to/tex/files
37
37
 
38
+ # List all section names in the paper
39
+ arxiv-to-prompt 2303.08774 --list-sections
40
+
41
+ # Extract only specific sections
42
+ arxiv-to-prompt 2303.08774 --section "Introduction" --section "Methods"
43
+
38
44
  # Copy to clipboard
39
45
  arxiv-to-prompt 2303.08774 | pbcopy
40
46
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "arxiv-to-prompt"
7
- version = "0.4.1"
7
+ version = "0.5.1"
8
8
  description = "transform arXiv papers into a single latex prompt for LLMs"
9
9
  readme = "README.md"
10
10
  authors = [{ name = "Takashi Ishida" }]
@@ -15,7 +15,7 @@ Example:
15
15
  >>> latex_source = process_latex_source(local_folder="/path/to/tex/files")
16
16
  """
17
17
 
18
- from .core import process_latex_source, download_arxiv_source, get_default_cache_dir
18
+ from .core import process_latex_source, download_arxiv_source, get_default_cache_dir, list_sections, extract_section
19
19
 
20
20
  # Import version from package metadata
21
21
  try:
@@ -32,5 +32,7 @@ __all__ = [
32
32
  "process_latex_source",
33
33
  "download_arxiv_source",
34
34
  "get_default_cache_dir",
35
+ "list_sections",
36
+ "extract_section",
35
37
  "__version__",
36
38
  ]
@@ -1,6 +1,6 @@
1
1
  import argparse
2
2
  import re
3
- from .core import process_latex_source, get_default_cache_dir
3
+ from .core import process_latex_source, get_default_cache_dir, list_sections, extract_section
4
4
 
5
5
 
6
6
  def extract_arxiv_id(input_str: str) -> str:
@@ -45,7 +45,18 @@ def main():
45
45
  help="Path to a local folder containing TeX files (alternative to arxiv_id)",
46
46
  default=None
47
47
  )
48
-
48
+ parser.add_argument(
49
+ "--list-sections",
50
+ action="store_true",
51
+ help="List all section names in the document"
52
+ )
53
+ parser.add_argument(
54
+ "--section",
55
+ type=str,
56
+ action="append",
57
+ help="Extract only the specified section(s). Can be used multiple times."
58
+ )
59
+
49
60
  args = parser.parse_args()
50
61
 
51
62
  # Validate that either arxiv_id or local_folder is provided
@@ -64,7 +75,24 @@ def main():
64
75
  remove_appendix_section=args.no_appendix,
65
76
  local_folder=args.local_folder
66
77
  )
67
- if content:
78
+ if not content:
79
+ return
80
+
81
+ if args.list_sections:
82
+ sections = list_sections(content)
83
+ for section in sections:
84
+ print(section)
85
+ elif args.section:
86
+ extracted = []
87
+ for section_name in args.section:
88
+ section_content = extract_section(content, section_name)
89
+ if section_content:
90
+ extracted.append(section_content)
91
+ else:
92
+ print(f"Warning: Section '{section_name}' not found", file=__import__('sys').stderr)
93
+ if extracted:
94
+ print("\n\n".join(extracted))
95
+ else:
68
96
  print(content)
69
97
 
70
98
  if __name__ == "__main__":
@@ -92,40 +92,55 @@ def download_arxiv_source(arxiv_id: str, cache_dir: Optional[str] = None, use_ca
92
92
 
93
93
  def find_main_tex(directory: str) -> Optional[str]:
94
94
  """
95
- Find the main .tex file containing documentclass.
95
+ Find the main .tex file containing documentclass.
96
+ Searches recursively through subdirectories.
96
97
  First checks for common naming conventions (main.tex, paper.tex, index.tex).
97
- If none found, returns the filename of the longest .tex file containing documentclass,
98
- since shorter files are typically conference templates or supplementary documents
98
+ If none found, returns the path of the longest .tex file containing documentclass,
99
+ since shorter files are typically conference templates or supplementary documents
99
100
  rather than the main manuscript.
100
101
  """
101
102
  common_names = ['main.tex', 'paper.tex', 'index.tex']
102
103
  main_tex_file = None
103
104
  max_line_count = 0
104
105
 
105
- # First pass: check for common naming conventions
106
- for file_name in os.listdir(directory):
107
- if file_name in common_names:
108
- try:
109
- with open(os.path.join(directory, file_name), 'r', encoding='utf-8') as file:
110
- lines = file.readlines()
111
- if any('\\documentclass' in line for line in lines):
112
- return file_name
113
- except Exception as e:
114
- logging.warning(f"Could not read file {file_name}: {e}")
106
+ # Walk through directory and subdirectories
107
+ for root, dirs, files in os.walk(directory):
108
+ rel_root = os.path.relpath(root, directory)
109
+
110
+ # First pass: check for common naming conventions
111
+ for file_name in files:
112
+ if file_name in common_names:
113
+ file_path = os.path.join(root, file_name)
114
+ try:
115
+ with open(file_path, 'r', encoding='utf-8') as file:
116
+ lines = file.readlines()
117
+ if any('\\documentclass' in line for line in lines):
118
+ if rel_root == '.':
119
+ return file_name
120
+ return os.path.join(rel_root, file_name)
121
+ except Exception as e:
122
+ logging.warning(f"Could not read file {file_path}: {e}")
115
123
 
116
124
  # Second pass: find the longest .tex file containing documentclass
117
- for file_name in os.listdir(directory):
118
- if file_name.endswith('.tex'):
119
- try:
120
- with open(os.path.join(directory, file_name), 'r', encoding='utf-8') as file:
121
- lines = file.readlines()
122
- if any('\\documentclass' in line for line in lines):
123
- line_count = len(lines)
124
- if line_count > max_line_count:
125
- main_tex_file = file_name
126
- max_line_count = line_count
127
- except Exception as e:
128
- logging.warning(f"Could not read file {file_name}: {e}")
125
+ for root, dirs, files in os.walk(directory):
126
+ rel_root = os.path.relpath(root, directory)
127
+
128
+ for file_name in files:
129
+ if file_name.endswith('.tex'):
130
+ file_path = os.path.join(root, file_name)
131
+ try:
132
+ with open(file_path, 'r', encoding='utf-8') as file:
133
+ lines = file.readlines()
134
+ if any('\\documentclass' in line for line in lines):
135
+ line_count = len(lines)
136
+ if line_count > max_line_count:
137
+ if rel_root == '.':
138
+ main_tex_file = file_name
139
+ else:
140
+ main_tex_file = os.path.join(rel_root, file_name)
141
+ max_line_count = line_count
142
+ except Exception as e:
143
+ logging.warning(f"Could not read file {file_path}: {e}")
129
144
 
130
145
  return main_tex_file
131
146
 
@@ -164,6 +179,34 @@ def remove_appendix(text: str) -> str:
164
179
  return text[:appendix_match.start()].rstrip()
165
180
  return text
166
181
 
182
+
183
def list_sections(text: str) -> list:
    r"""Return the titles of all top-level sections found in LaTeX source.

    Recognizes ``\section{...}`` and ``\section*{...}`` headers, optionally
    carrying a short table-of-contents title (``\section[Short]{Long}``).
    Sub-level headers such as ``\subsection`` are not reported.

    The title is read with a brace-balanced scan rather than a ``[^}]+``
    regex, so titles containing nested groups such as
    ``\section{Results on \textsc{ImageNet}}`` are returned whole instead of
    being cut at the first closing brace.

    Args:
        text: LaTeX source to scan.

    Returns:
        A list of section titles (str) in document order. Headers whose
        title is empty or whose braces never close are skipped.
    """
    titles = []
    # Match the header up to and including the brace that opens the title.
    for header in re.finditer(r'\\section\*?(?:\[[^\]]*\])?\{', text):
        title_start = header.end()
        depth = 1
        pos = title_start
        while pos < len(text) and depth > 0:
            char = text[pos]
            if char == '\\':
                # Skip the escaped character so that \{ and \} inside a
                # title do not affect the brace depth.
                pos += 2
                continue
            if char == '{':
                depth += 1
            elif char == '}':
                depth -= 1
            pos += 1
        # pos is one past the closing brace when the group was balanced.
        if depth == 0 and pos - 1 > title_start:
            titles.append(text[title_start:pos - 1])
    return titles
187
+
188
+
189
def extract_section(text: str, section_name: str) -> Optional[str]:
    r"""Extract one top-level section, including its subsections.

    The section starts at the first ``\section{name}`` or ``\section*{name}``
    header whose title equals ``section_name`` exactly, and runs up to (but
    not including) the next top-level ``\section`` header, or to the end of
    ``text`` if there is none.

    Headers that carry an optional short title (``\section[Short]{Long}``)
    are recognized both as the requested section and as the boundary that
    ends it; otherwise such a header would be skipped and the following
    section would be swallowed into the result.

    Args:
        text: LaTeX source to search.
        section_name: Exact title of the section to extract.

    Returns:
        The section text starting at its ``\section`` header with trailing
        whitespace stripped, or None if no matching header is found.
    """
    # Optional "[short title]" argument allowed between \section and "{".
    optional_arg = r'(?:\[[^\]]*\])?'

    header = re.search(
        rf'\\section\*?{optional_arg}\{{{re.escape(section_name)}\}}',
        text,
    )
    if header is None:
        return None

    # Only \section ends the span; \subsection does not match this pattern
    # because the backslash must be followed directly by "section".
    next_header = re.compile(rf'\\section\*?{optional_arg}\{{').search(
        text, header.end()
    )
    end_pos = next_header.start() if next_header else len(text)
    return text[header.start():end_pos].rstrip()
208
+
209
+
167
210
  def flatten_tex(directory: str, main_file: str) -> str:
168
211
  """Combine all tex files into one, resolving inputs."""
169
212
  def process_file(file_path: str, processed_files: set) -> str:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: arxiv-to-prompt
3
- Version: 0.4.1
3
+ Version: 0.5.1
4
4
  Summary: transform arXiv papers into a single latex prompt for LLMs
5
5
  Author: Takashi Ishida
6
6
  License: MIT
@@ -54,6 +54,12 @@ arxiv-to-prompt 2303.08774 --no-comments --no-appendix
54
54
  # Process a local folder containing TeX files (instead of downloading from arXiv)
55
55
  arxiv-to-prompt --local-folder /path/to/tex/files
56
56
 
57
+ # List all section names in the paper
58
+ arxiv-to-prompt 2303.08774 --list-sections
59
+
60
+ # Extract only specific sections
61
+ arxiv-to-prompt 2303.08774 --section "Introduction" --section "Methods"
62
+
57
63
  # Copy to clipboard
58
64
  arxiv-to-prompt 2303.08774 | pbcopy
59
65
 
@@ -10,6 +10,8 @@ from arxiv_to_prompt.core import (
10
10
  check_source_available,
11
11
  flatten_tex,
12
12
  remove_appendix,
13
+ list_sections,
14
+ extract_section,
13
15
  )
14
16
  from arxiv_to_prompt.cli import extract_arxiv_id
15
17
 
@@ -151,6 +153,23 @@ def test_find_main_tex(temp_cache_dir):
151
153
  assert found_main == "main.tex"
152
154
 
153
155
 
156
def test_find_main_tex_in_subdirectory(temp_cache_dir):
    """A main file nested one directory down is reported relative to the root."""
    # Lay out <root>/paper/ with nothing at the top level.
    project_root = temp_cache_dir / "test_tex_subdir"
    project_root.mkdir(parents=True)
    nested_dir = project_root / "paper"
    nested_dir.mkdir()

    # The only candidate is paper/main.tex, and it contains \documentclass.
    (nested_dir / "main.tex").write_text(
        "\\documentclass{article}\n\\begin{document}\nHello\n\\end{document}"
    )

    # The result is a path relative to the searched directory.
    expected = os.path.join("paper", "main.tex")
    assert find_main_tex(str(project_root)) == expected
171
+
172
+
154
173
  def test_commented_input_commands(temp_cache_dir):
155
174
  """Test that commented-out \\include and \\input commands are ignored."""
156
175
  # Create test directory and files
@@ -314,3 +333,48 @@ def test_extract_arxiv_id():
314
333
  # Non-arxiv input returned as-is
315
334
  assert extract_arxiv_id("invalid") == "invalid"
316
335
  assert extract_arxiv_id("https://example.com/2505.18102") == "https://example.com/2505.18102"
336
+
337
+
338
+ def test_list_sections():
339
+ """Test listing section names."""
340
+ text = r"""
341
+ \section{Introduction}
342
+ Some intro text.
343
+ \section{Methods}
344
+ Some methods text.
345
+ \subsection{Data}
346
+ Data description.
347
+ \section*{Acknowledgments}
348
+ Thanks.
349
+ """
350
+ sections = list_sections(text)
351
+ assert sections == ["Introduction", "Methods", "Acknowledgments"]
352
+
353
+
354
+ def test_extract_section():
355
+ """Test extracting a specific section."""
356
+ text = r"""
357
+ \section{Introduction}
358
+ Intro content here.
359
+ \section{Methods}
360
+ Methods content here.
361
+ \subsection{Data Collection}
362
+ Data info.
363
+ \section{Results}
364
+ Results here.
365
+ """
366
+ # Extract Methods section (should include subsection)
367
+ methods = extract_section(text, "Methods")
368
+ assert methods is not None
369
+ assert "Methods content here." in methods
370
+ assert "Data Collection" in methods
371
+ assert "Data info." in methods
372
+ assert "Results here." not in methods
373
+
374
+ # Extract non-existent section
375
+ assert extract_section(text, "Discussion") is None
376
+
377
+ # Extract last section
378
+ results = extract_section(text, "Results")
379
+ assert results is not None
380
+ assert "Results here." in results
File without changes