arxiv-to-prompt 0.5.0__tar.gz → 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {arxiv_to_prompt-0.5.0/src/arxiv_to_prompt.egg-info → arxiv_to_prompt-0.6.0}/PKG-INFO +22 -6
- {arxiv_to_prompt-0.5.0 → arxiv_to_prompt-0.6.0}/README.md +21 -5
- {arxiv_to_prompt-0.5.0 → arxiv_to_prompt-0.6.0}/pyproject.toml +1 -1
- {arxiv_to_prompt-0.5.0 → arxiv_to_prompt-0.6.0}/src/arxiv_to_prompt/cli.py +26 -7
- {arxiv_to_prompt-0.5.0 → arxiv_to_prompt-0.6.0}/src/arxiv_to_prompt/core.py +194 -41
- {arxiv_to_prompt-0.5.0 → arxiv_to_prompt-0.6.0/src/arxiv_to_prompt.egg-info}/PKG-INFO +22 -6
- {arxiv_to_prompt-0.5.0 → arxiv_to_prompt-0.6.0}/tests/test_core.py +275 -0
- {arxiv_to_prompt-0.5.0 → arxiv_to_prompt-0.6.0}/LICENSE +0 -0
- {arxiv_to_prompt-0.5.0 → arxiv_to_prompt-0.6.0}/setup.cfg +0 -0
- {arxiv_to_prompt-0.5.0 → arxiv_to_prompt-0.6.0}/src/arxiv_to_prompt/__init__.py +0 -0
- {arxiv_to_prompt-0.5.0 → arxiv_to_prompt-0.6.0}/src/arxiv_to_prompt.egg-info/SOURCES.txt +0 -0
- {arxiv_to_prompt-0.5.0 → arxiv_to_prompt-0.6.0}/src/arxiv_to_prompt.egg-info/dependency_links.txt +0 -0
- {arxiv_to_prompt-0.5.0 → arxiv_to_prompt-0.6.0}/src/arxiv_to_prompt.egg-info/entry_points.txt +0 -0
- {arxiv_to_prompt-0.5.0 → arxiv_to_prompt-0.6.0}/src/arxiv_to_prompt.egg-info/requires.txt +0 -0
- {arxiv_to_prompt-0.5.0 → arxiv_to_prompt-0.6.0}/src/arxiv_to_prompt.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: arxiv-to-prompt
|
|
3
|
-
Version: 0.5.0
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: transform arXiv papers into a single latex prompt for LLMs
|
|
5
5
|
Author: Takashi Ishida
|
|
6
6
|
License: MIT
|
|
@@ -54,11 +54,27 @@ arxiv-to-prompt 2303.08774 --no-comments --no-appendix
|
|
|
54
54
|
# Process a local folder containing TeX files (instead of downloading from arXiv)
|
|
55
55
|
arxiv-to-prompt --local-folder /path/to/tex/files
|
|
56
56
|
|
|
57
|
-
# List all
|
|
58
|
-
arxiv-to-prompt
|
|
59
|
-
|
|
60
|
-
#
|
|
61
|
-
|
|
57
|
+
# List all sections (with subsections indented)
|
|
58
|
+
arxiv-to-prompt 2307.09288 --list-sections
|
|
59
|
+
# Introduction
|
|
60
|
+
# Pretraining
|
|
61
|
+
# Pretraining Data
|
|
62
|
+
# Training Details
|
|
63
|
+
# Training Hardware \& Carbon Footprint
|
|
64
|
+
# ...
|
|
65
|
+
|
|
66
|
+
# Extract specific sections
|
|
67
|
+
arxiv-to-prompt 2307.09288 --section "Introduction" --section "Pretraining"
|
|
68
|
+
|
|
69
|
+
# Ambiguous names show a helpful warning
|
|
70
|
+
arxiv-to-prompt 2307.09288 --section "Human Evaluation"
|
|
71
|
+
# Warning: 'Human Evaluation' is ambiguous. Found at:
|
|
72
|
+
# - Fine-tuning > RLHF Results > Human Evaluation
|
|
73
|
+
# - Appendix > Additional Details for Fine-tuning > Human Evaluation
|
|
74
|
+
# Use path notation to disambiguate.
|
|
75
|
+
|
|
76
|
+
# Use path notation when the same name appears multiple times
|
|
77
|
+
arxiv-to-prompt 2307.09288 --section "Fine-tuning > RLHF Results > Human Evaluation"
|
|
62
78
|
|
|
63
79
|
# Copy to clipboard
|
|
64
80
|
arxiv-to-prompt 2303.08774 | pbcopy
|
|
@@ -35,11 +35,27 @@ arxiv-to-prompt 2303.08774 --no-comments --no-appendix
|
|
|
35
35
|
# Process a local folder containing TeX files (instead of downloading from arXiv)
|
|
36
36
|
arxiv-to-prompt --local-folder /path/to/tex/files
|
|
37
37
|
|
|
38
|
-
# List all
|
|
39
|
-
arxiv-to-prompt
|
|
40
|
-
|
|
41
|
-
#
|
|
42
|
-
|
|
38
|
+
# List all sections (with subsections indented)
|
|
39
|
+
arxiv-to-prompt 2307.09288 --list-sections
|
|
40
|
+
# Introduction
|
|
41
|
+
# Pretraining
|
|
42
|
+
# Pretraining Data
|
|
43
|
+
# Training Details
|
|
44
|
+
# Training Hardware \& Carbon Footprint
|
|
45
|
+
# ...
|
|
46
|
+
|
|
47
|
+
# Extract specific sections
|
|
48
|
+
arxiv-to-prompt 2307.09288 --section "Introduction" --section "Pretraining"
|
|
49
|
+
|
|
50
|
+
# Ambiguous names show a helpful warning
|
|
51
|
+
arxiv-to-prompt 2307.09288 --section "Human Evaluation"
|
|
52
|
+
# Warning: 'Human Evaluation' is ambiguous. Found at:
|
|
53
|
+
# - Fine-tuning > RLHF Results > Human Evaluation
|
|
54
|
+
# - Appendix > Additional Details for Fine-tuning > Human Evaluation
|
|
55
|
+
# Use path notation to disambiguate.
|
|
56
|
+
|
|
57
|
+
# Use path notation when the same name appears multiple times
|
|
58
|
+
arxiv-to-prompt 2307.09288 --section "Fine-tuning > RLHF Results > Human Evaluation"
|
|
43
59
|
|
|
44
60
|
# Copy to clipboard
|
|
45
61
|
arxiv-to-prompt 2303.08774 | pbcopy
|
|
@@ -1,6 +1,14 @@
|
|
|
1
1
|
import argparse
|
|
2
2
|
import re
|
|
3
|
-
from .core import
|
|
3
|
+
from .core import (
|
|
4
|
+
process_latex_source,
|
|
5
|
+
get_default_cache_dir,
|
|
6
|
+
list_sections,
|
|
7
|
+
extract_section,
|
|
8
|
+
parse_section_tree,
|
|
9
|
+
format_section_tree,
|
|
10
|
+
find_all_by_name,
|
|
11
|
+
)
|
|
4
12
|
|
|
5
13
|
|
|
6
14
|
def extract_arxiv_id(input_str: str) -> str:
|
|
@@ -79,17 +87,28 @@ def main():
|
|
|
79
87
|
return
|
|
80
88
|
|
|
81
89
|
if args.list_sections:
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
print(section)
|
|
90
|
+
tree = parse_section_tree(content)
|
|
91
|
+
print(format_section_tree(tree))
|
|
85
92
|
elif args.section:
|
|
93
|
+
import sys
|
|
94
|
+
tree = parse_section_tree(content)
|
|
86
95
|
extracted = []
|
|
87
|
-
for
|
|
88
|
-
|
|
96
|
+
for section_path in args.section:
|
|
97
|
+
# Check for ambiguity only if not using path notation
|
|
98
|
+
if " > " not in section_path:
|
|
99
|
+
matching_paths = find_all_by_name(tree, section_path)
|
|
100
|
+
if len(matching_paths) > 1:
|
|
101
|
+
print(f"Warning: '{section_path}' is ambiguous. Found at:", file=sys.stderr)
|
|
102
|
+
for path in matching_paths:
|
|
103
|
+
print(f" - {path}", file=sys.stderr)
|
|
104
|
+
print("Use path notation to disambiguate.", file=sys.stderr)
|
|
105
|
+
continue
|
|
106
|
+
|
|
107
|
+
section_content = extract_section(content, section_path)
|
|
89
108
|
if section_content:
|
|
90
109
|
extracted.append(section_content)
|
|
91
110
|
else:
|
|
92
|
-
print(f"Warning: Section '{
|
|
111
|
+
print(f"Warning: Section '{section_path}' not found", file=sys.stderr)
|
|
93
112
|
if extracted:
|
|
94
113
|
print("\n\n".join(extracted))
|
|
95
114
|
else:
|
|
@@ -3,6 +3,7 @@ import os
|
|
|
3
3
|
import tarfile
|
|
4
4
|
import shutil
|
|
5
5
|
from typing import Optional, List
|
|
6
|
+
from dataclasses import dataclass, field
|
|
6
7
|
import re
|
|
7
8
|
from pathlib import Path
|
|
8
9
|
import requests
|
|
@@ -92,40 +93,55 @@ def download_arxiv_source(arxiv_id: str, cache_dir: Optional[str] = None, use_ca
|
|
|
92
93
|
|
|
93
94
|
def _documentclass_line_count(file_path: str) -> Optional[int]:
    """
    Return the number of lines in file_path if it contains a documentclass command.

    Returns None when the file has no documentclass command or cannot be read
    (a warning is logged in the latter case).
    """
    try:
        # errors='replace' keeps sources in legacy encodings (e.g. Latin-1) readable;
        # only the ASCII marker and the line count matter here, so replaced
        # characters cannot change the result.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
            lines = file.readlines()
    except OSError as e:
        logging.warning(f"Could not read file {file_path}: {e}")
        return None

    if any('\\documentclass' in line for line in lines):
        return len(lines)
    return None


def find_main_tex(directory: str) -> Optional[str]:
    """
    Find the main .tex file containing documentclass.

    Searches recursively through subdirectories.
    Within each directory, the common names main.tex, paper.tex and index.tex
    are checked first, in that order; the first one that contains documentclass
    is returned immediately.
    If none is found anywhere, returns the path of the longest .tex file
    containing documentclass, since shorter files are typically conference
    templates or supplementary documents rather than the main manuscript.

    Args:
        directory: Root directory to search.

    Returns:
        The path of the main file relative to ``directory`` (just the file name
        when it sits directly in ``directory``), or None if no .tex file with a
        documentclass command was found.
    """
    common_names = ['main.tex', 'paper.tex', 'index.tex']
    main_tex_file = None
    max_line_count = 0

    for root, dirs, files in os.walk(directory):
        # Sorting in place makes os.walk descend in a reproducible order, so the
        # result no longer depends on the file system's listing order.
        dirs.sort()
        rel_root = os.path.relpath(root, directory)

        present = set(files)
        # Common names first (in priority order), then every other .tex file.
        preferred = [name for name in common_names if name in present]
        others = sorted(
            name for name in present
            if name.endswith('.tex') and name not in common_names
        )

        for file_name in preferred + others:
            line_count = _documentclass_line_count(os.path.join(root, file_name))
            if line_count is None:
                continue

            if rel_root == '.':
                rel_path = file_name
            else:
                rel_path = os.path.join(rel_root, file_name)

            # A conventionally named main file beats any "longest file" candidate.
            if file_name in common_names:
                return rel_path

            if line_count > max_line_count:
                main_tex_file = rel_path
                max_line_count = line_count

    return main_tex_file
|
|
131
147
|
|
|
@@ -171,25 +187,162 @@ def list_sections(text: str) -> list:
|
|
|
171
187
|
return re.findall(pattern, text)
|
|
172
188
|
|
|
173
189
|
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
190
|
+
@dataclass
class SectionNode:
    """
    Represents a section/subsection/subsubsection in the LaTeX document tree.

    Nodes link to each other in both directions (``children`` and ``parent``).
    The ``parent`` back-reference is excluded from the generated ``__eq__`` and
    ``__repr__``: comparing it would bounce between parent and children until a
    RecursionError when two equal but distinct trees are compared.
    """
    level: int  # 0=section, 1=subsection, 2=subsubsection
    name: str  # title text of the sectioning command
    start_pos: int  # offset in the source text where this section begins
    end_pos: int = -1  # -1 means end of document
    children: List['SectionNode'] = field(default_factory=list)
    # Back-reference to the enclosing section; None for a top-level section.
    parent: Optional['SectionNode'] = field(default=None, repr=False, compare=False)
|
|
181
199
|
|
|
182
|
-
start_pos = start_match.start()
|
|
183
200
|
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
201
|
+
def _find_closing_brace(text: str, open_pos: int) -> int:
    """Return the index of the brace that closes the one at open_pos, or -1 if unbalanced."""
    depth = 0
    i = open_pos
    length = len(text)
    while i < length:
        ch = text[i]
        if ch == '\\':
            # Skip the escaped character so that \{ and \} do not affect nesting.
            i += 2
            continue
        if ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                return i
        i += 1
    return -1


def parse_section_tree(text: str) -> List[SectionNode]:
    """
    Build a hierarchical tree from LaTeX section commands.

    Recognizes section, subsection and subsubsection commands, with or without
    a star and with an optional short-title argument in square brackets. The
    title is read up to the matching closing brace, so titles containing nested
    groups (e.g. a formatting command with its own braces) are kept whole.

    Each node starts at its sectioning command and ends where the next command
    of the same or a higher level starts, or at the end of the text.

    Args:
        text: The LaTeX content.

    Returns:
        A list of top-level section nodes, each containing their subsections as
        children. Empty if no sectioning command is found.
    """
    # Match the command up to and including the opening brace of the title
    pattern = r'\\(section|subsection|subsubsection)\*?(?:\[[^\]]*\])?\{'

    level_map = {'section': 0, 'subsection': 1, 'subsubsection': 2}

    root_nodes: List[SectionNode] = []
    # Chain of currently open sections, outermost first (levels strictly increase)
    section_stack: List[SectionNode] = []
    consumed_until = 0

    for match in re.finditer(pattern, text):
        # Ignore anything that lies inside a title that was already consumed
        if match.start() < consumed_until:
            continue

        open_pos = match.end() - 1
        close_pos = _find_closing_brace(text, open_pos)
        if close_pos == -1:
            # Unbalanced braces: fall back to the first closing brace
            close_pos = text.find('}', open_pos)
            if close_pos == -1:
                continue

        name = text[open_pos + 1:close_pos]
        if not name:
            # Commands with an empty title are not treated as sections
            continue
        consumed_until = close_pos + 1

        node = SectionNode(
            level=level_map[match.group(1)],
            name=name,
            start_pos=match.start(),
        )

        # Every open section at the same or a deeper level ends where this one starts
        while section_stack and section_stack[-1].level >= node.level:
            section_stack.pop().end_pos = node.start_pos

        if section_stack:
            # This node is a child of the top of the stack
            node.parent = section_stack[-1]
            section_stack[-1].children.append(node)
        else:
            # This is a root node
            root_nodes.append(node)

        section_stack.append(node)

    # Sections still open at the end run to the end of the document
    for node in section_stack:
        node.end_pos = len(text)

    return root_nodes
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def format_section_tree(nodes: List[SectionNode], indent: int = 0) -> str:
|
|
260
|
+
"""
|
|
261
|
+
Format section tree with indentation for display.
|
|
262
|
+
|
|
263
|
+
Returns a string with each section name on its own line, indented by level.
|
|
264
|
+
"""
|
|
265
|
+
lines = []
|
|
266
|
+
for node in nodes:
|
|
267
|
+
lines.append(" " * indent + node.name)
|
|
268
|
+
if node.children:
|
|
269
|
+
lines.append(format_section_tree(node.children, indent + 1))
|
|
270
|
+
return "\n".join(lines)
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def find_all_by_name(nodes: List[SectionNode], name: str, parent_path: str = "") -> List[str]:
    """
    Collect the full path of every section whose name equals ``name``.

    The tree is visited in document order (a node before its children, children
    before the next sibling). Path components are joined with " > " and
    prefixed with ``parent_path`` when it is non-empty.

    Returns a list of full paths (e.g., ["Introduction > Background", "Methods > Background"])
    """
    matches: List[str] = []
    # Explicit stack instead of recursion; entries are pushed in reverse so they
    # pop in their original order.
    pending = [(section, parent_path) for section in reversed(nodes)]

    while pending:
        section, prefix = pending.pop()
        if prefix:
            full_path = f"{prefix} > {section.name}"
        else:
            full_path = section.name

        if section.name == name:
            matches.append(full_path)

        pending.extend((child, full_path) for child in reversed(section.children))

    return matches
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def _find_first_by_name(nodes: List[SectionNode], name: str) -> Optional[SectionNode]:
    """Return the first node named ``name`` in depth-first document order, or None."""
    for node in nodes:
        if node.name == name:
            return node
        found = _find_first_by_name(node.children, name)
        if found is not None:
            return found
    return None


def _follow_path(nodes: List[SectionNode], parts: List[str]) -> Optional[SectionNode]:
    """Resolve ``parts`` level by level starting at ``nodes``, trying every same-named sibling."""
    head, rest = parts[0], parts[1:]
    for node in nodes:
        if node.name != head:
            continue
        if not rest:
            return node
        # Do not commit to the first sibling with this name: if the remainder
        # of the path is not below it, try the next sibling with the same name.
        found = _follow_path(node.children, rest)
        if found is not None:
            return found
    return None


def find_section_by_path(nodes: List[SectionNode], path: str) -> Optional[SectionNode]:
    """
    Find a section by path notation (e.g., "Methods > Background").

    If path contains no " > ", searches for an exact name match at any level
    and returns the first one in document order.
    If path contains " > ", follows the hierarchy starting from the top-level
    nodes; each component must exactly match a section name at that level.
    When several siblings share a name, each is tried in order until the rest
    of the path resolves.

    Args:
        nodes: Top-level nodes of the section tree.
        path: Section name or " > "-separated path; whitespace around each
            component is ignored.

    Returns:
        The matching node, or None if nothing matches.
    """
    parts = [p.strip() for p in path.split(" > ")]

    if len(parts) == 1:
        # Simple name lookup - find first match at any level
        return _find_first_by_name(nodes, parts[0])

    # Path notation - follow the hierarchy
    return _follow_path(nodes, parts)
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def extract_section(text: str, section_path: str) -> Optional[str]:
    """
    Return the source of one section, subsection, or subsubsection.

    Args:
        text: The LaTeX content
        section_path: Section name or path (e.g., "Methods" or "Methods > Background")

    Returns:
        The text from the section's command up to the section's end position,
        with trailing whitespace removed (nested subsections are included),
        or None if the section is not found.
    """
    target = find_section_by_path(parse_section_tree(text), section_path)
    if not target:
        return None

    section_source = text[target.start_pos:target.end_pos]
    return section_source.rstrip()
|
|
193
346
|
|
|
194
347
|
|
|
195
348
|
def flatten_tex(directory: str, main_file: str) -> str:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: arxiv-to-prompt
|
|
3
|
-
Version: 0.5.0
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: transform arXiv papers into a single latex prompt for LLMs
|
|
5
5
|
Author: Takashi Ishida
|
|
6
6
|
License: MIT
|
|
@@ -54,11 +54,27 @@ arxiv-to-prompt 2303.08774 --no-comments --no-appendix
|
|
|
54
54
|
# Process a local folder containing TeX files (instead of downloading from arXiv)
|
|
55
55
|
arxiv-to-prompt --local-folder /path/to/tex/files
|
|
56
56
|
|
|
57
|
-
# List all
|
|
58
|
-
arxiv-to-prompt
|
|
59
|
-
|
|
60
|
-
#
|
|
61
|
-
|
|
57
|
+
# List all sections (with subsections indented)
|
|
58
|
+
arxiv-to-prompt 2307.09288 --list-sections
|
|
59
|
+
# Introduction
|
|
60
|
+
# Pretraining
|
|
61
|
+
# Pretraining Data
|
|
62
|
+
# Training Details
|
|
63
|
+
# Training Hardware \& Carbon Footprint
|
|
64
|
+
# ...
|
|
65
|
+
|
|
66
|
+
# Extract specific sections
|
|
67
|
+
arxiv-to-prompt 2307.09288 --section "Introduction" --section "Pretraining"
|
|
68
|
+
|
|
69
|
+
# Ambiguous names show a helpful warning
|
|
70
|
+
arxiv-to-prompt 2307.09288 --section "Human Evaluation"
|
|
71
|
+
# Warning: 'Human Evaluation' is ambiguous. Found at:
|
|
72
|
+
# - Fine-tuning > RLHF Results > Human Evaluation
|
|
73
|
+
# - Appendix > Additional Details for Fine-tuning > Human Evaluation
|
|
74
|
+
# Use path notation to disambiguate.
|
|
75
|
+
|
|
76
|
+
# Use path notation when the same name appears multiple times
|
|
77
|
+
arxiv-to-prompt 2307.09288 --section "Fine-tuning > RLHF Results > Human Evaluation"
|
|
62
78
|
|
|
63
79
|
# Copy to clipboard
|
|
64
80
|
arxiv-to-prompt 2303.08774 | pbcopy
|
|
@@ -12,6 +12,11 @@ from arxiv_to_prompt.core import (
|
|
|
12
12
|
remove_appendix,
|
|
13
13
|
list_sections,
|
|
14
14
|
extract_section,
|
|
15
|
+
SectionNode,
|
|
16
|
+
parse_section_tree,
|
|
17
|
+
format_section_tree,
|
|
18
|
+
find_all_by_name,
|
|
19
|
+
find_section_by_path,
|
|
15
20
|
)
|
|
16
21
|
from arxiv_to_prompt.cli import extract_arxiv_id
|
|
17
22
|
|
|
@@ -153,6 +158,23 @@ def test_find_main_tex(temp_cache_dir):
|
|
|
153
158
|
assert found_main == "main.tex"
|
|
154
159
|
|
|
155
160
|
|
|
161
|
+
def test_find_main_tex_in_subdirectory(temp_cache_dir):
    """A main.tex one directory below the search root is reported relative to that root."""
    # Lay out <root>/paper/main.tex
    project_root = temp_cache_dir / "test_tex_subdir"
    project_root.mkdir(parents=True)
    nested_dir = project_root / "paper"
    nested_dir.mkdir()

    tex_source = "\\documentclass{article}\n\\begin{document}\nHello\n\\end{document}"
    (nested_dir / "main.tex").write_text(tex_source)

    # The result is relative to the directory that was searched
    expected = os.path.join("paper", "main.tex")
    assert find_main_tex(str(project_root)) == expected
|
|
176
|
+
|
|
177
|
+
|
|
156
178
|
def test_commented_input_commands(temp_cache_dir):
|
|
157
179
|
"""Test that commented-out \\include and \\input commands are ignored."""
|
|
158
180
|
# Create test directory and files
|
|
@@ -361,3 +383,256 @@ Results here.
|
|
|
361
383
|
results = extract_section(text, "Results")
|
|
362
384
|
assert results is not None
|
|
363
385
|
assert "Results here." in results
|
|
386
|
+
|
|
387
|
+
|
|
388
|
+
def test_parse_section_tree():
    """The parser nests subsections and subsubsections under their enclosing section."""
    document = r"""
\section{Introduction}
Intro text.
\subsection{Background}
Background text.
\subsection{Motivation}
Motivation text.
\section{Methods}
Methods text.
\subsection{Background}
Methods background.
\subsubsection{Details}
Details text.
\subsection{Data Collection}
Data text.
\section{Results}
Results text.
"""
    sections = parse_section_tree(document)

    # Three top-level sections, in document order
    assert [s.name for s in sections] == ["Introduction", "Methods", "Results"]
    intro, methods, results = sections

    # Introduction and Methods each hold two subsections
    assert [c.name for c in intro.children] == ["Background", "Motivation"]
    assert [c.name for c in methods.children] == ["Background", "Data Collection"]

    # Methods > Background holds a single subsubsection
    assert [g.name for g in methods.children[0].children] == ["Details"]

    # Results has nothing below it
    assert len(results.children) == 0
|
|
432
|
+
|
|
433
|
+
|
|
434
|
+
def test_parse_section_tree_levels():
    """Section, subsection and subsubsection get levels 0, 1 and 2."""
    document = r"""
\section{Sec}
\subsection{Subsec}
\subsubsection{Subsubsec}
"""
    top = parse_section_tree(document)[0]
    middle = top.children[0]
    leaf = middle.children[0]

    assert top.level == 0
    assert middle.level == 1
    assert leaf.level == 2
|
|
446
|
+
|
|
447
|
+
|
|
448
|
+
def test_format_section_tree():
|
|
449
|
+
"""Test formatting section tree with indentation."""
|
|
450
|
+
text = r"""
|
|
451
|
+
\section{Introduction}
|
|
452
|
+
\subsection{Background}
|
|
453
|
+
\section{Methods}
|
|
454
|
+
\subsection{Data}
|
|
455
|
+
\subsubsection{Collection}
|
|
456
|
+
"""
|
|
457
|
+
tree = parse_section_tree(text)
|
|
458
|
+
output = format_section_tree(tree)
|
|
459
|
+
|
|
460
|
+
lines = output.split('\n')
|
|
461
|
+
assert lines[0] == "Introduction"
|
|
462
|
+
assert lines[1] == " Background"
|
|
463
|
+
assert lines[2] == "Methods"
|
|
464
|
+
assert lines[3] == " Data"
|
|
465
|
+
assert lines[4] == " Collection"
|
|
466
|
+
|
|
467
|
+
|
|
468
|
+
def test_find_all_by_name():
    """find_all_by_name returns one full path per section carrying the requested name."""
    document = r"""
\section{Introduction}
\subsection{Background}
\section{Methods}
\subsection{Background}
\section{Results}
"""
    sections = parse_section_tree(document)

    # "Background" exists under two different parents
    duplicated = find_all_by_name(sections, "Background")
    assert sorted(duplicated) == ["Introduction > Background", "Methods > Background"]

    # A name used once yields exactly its own path
    assert find_all_by_name(sections, "Results") == ["Results"]

    # An unknown name yields nothing
    assert find_all_by_name(sections, "Discussion") == []
|
|
492
|
+
|
|
493
|
+
|
|
494
|
+
def test_find_section_by_path_simple():
    """A bare name resolves to a section at any depth."""
    document = r"""
\section{Introduction}
\section{Methods}
\subsection{Data}
"""
    sections = parse_section_tree(document)

    # "Introduction" is a top-level section, "Data" a subsection
    for wanted in ("Introduction", "Data"):
        located = find_section_by_path(sections, wanted)
        assert located is not None
        assert located.name == wanted
|
|
512
|
+
|
|
513
|
+
|
|
514
|
+
def test_find_section_by_path_notation():
    """Path notation selects the subsection under the named parent."""
    document = r"""
\section{Introduction}
\subsection{Background}
\section{Methods}
\subsection{Background}
"""
    sections = parse_section_tree(document)

    # Same subsection name under two parents; the path picks the parent
    for parent_name in ("Introduction", "Methods"):
        located = find_section_by_path(sections, f"{parent_name} > Background")
        assert located is not None
        assert located.name == "Background"
        assert located.parent.name == parent_name
|
|
534
|
+
|
|
535
|
+
|
|
536
|
+
def test_find_section_by_path_not_found():
    """Lookups that match nothing yield None."""
    document = r"""
\section{Introduction}
\subsection{Background}
"""
    sections = parse_section_tree(document)

    missing_paths = (
        "NonExistent",
        "Introduction > NonExistent",
        "NonExistent > Background",
    )
    for missing in missing_paths:
        assert find_section_by_path(sections, missing) is None
|
|
547
|
+
|
|
548
|
+
|
|
549
|
+
def test_extract_section_with_path():
    """Path notation extracts the subsection under the requested parent only."""
    document = r"""
\section{Introduction}
Intro text.
\subsection{Background}
Intro background.
\section{Methods}
Methods text.
\subsection{Background}
Methods background.
\section{Results}
Results text.
"""
    # (path, text that must be present, text that must be absent)
    cases = (
        ("Introduction > Background", "Intro background.", "Methods background."),
        ("Methods > Background", "Methods background.", "Intro background."),
    )
    for path, expected, unexpected in cases:
        extracted = extract_section(document, path)
        assert extracted is not None
        assert expected in extracted
        assert unexpected not in extracted
|
|
573
|
+
|
|
574
|
+
|
|
575
|
+
def test_extract_subsection_boundaries():
    """A subsection ends at the next subsection or at the next section."""
    document = r"""
\section{Methods}
Methods intro.
\subsection{First}
First content.
\subsection{Second}
Second content.
\section{Results}
Results content.
"""
    # "First" must stop at the following subsection
    first = extract_section(document, "First")
    assert first is not None
    assert "First content." in first
    assert "Second content." not in first

    # "Second" must stop at the following section
    second = extract_section(document, "Second")
    assert second is not None
    assert "Second content." in second
    assert "Results content." not in second
|
|
598
|
+
|
|
599
|
+
|
|
600
|
+
def test_extract_section_includes_subsections():
    """Extracting a section returns its nested subsections too, but not the next section."""
    document = r"""
\section{Methods}
Methods intro.
\subsection{Data}
Data info.
\subsubsection{Collection}
Collection details.
\subsection{Analysis}
Analysis info.
\section{Results}
Results content.
"""
    extracted = extract_section(document, "Methods")
    assert extracted is not None

    # Everything down to the subsubsection level belongs to Methods
    for fragment in ("Methods intro.", "Data info.", "Collection details.", "Analysis info."):
        assert fragment in extracted

    # The following top-level section does not
    assert "Results content." not in extracted
|
|
621
|
+
|
|
622
|
+
|
|
623
|
+
def test_section_tree_with_starred_sections():
    """Starred (unnumbered) commands are parsed like their plain counterparts."""
    document = r"""
\section*{Introduction}
Intro.
\subsection*{Background}
Background.
\section{Methods}
Methods.
"""
    sections = parse_section_tree(document)

    assert [s.name for s in sections] == ["Introduction", "Methods"]
    assert sections[0].children[0].name == "Background"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{arxiv_to_prompt-0.5.0 → arxiv_to_prompt-0.6.0}/src/arxiv_to_prompt.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{arxiv_to_prompt-0.5.0 → arxiv_to_prompt-0.6.0}/src/arxiv_to_prompt.egg-info/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|