arxiv-to-prompt 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -15,7 +15,7 @@ Example:
15
15
  >>> latex_source = process_latex_source(local_folder="/path/to/tex/files")
16
16
  """
17
17
 
18
- from .core import process_latex_source, download_arxiv_source, get_default_cache_dir
18
+ from .core import process_latex_source, download_arxiv_source, get_default_cache_dir, list_sections, extract_section
19
19
 
20
20
  # Import version from package metadata
21
21
  try:
@@ -32,5 +32,7 @@ __all__ = [
32
32
  "process_latex_source",
33
33
  "download_arxiv_source",
34
34
  "get_default_cache_dir",
35
+ "list_sections",
36
+ "extract_section",
35
37
  "__version__",
36
38
  ]
arxiv_to_prompt/cli.py CHANGED
@@ -1,6 +1,6 @@
1
1
  import argparse
2
2
  import re
3
- from .core import process_latex_source, get_default_cache_dir
3
+ from .core import process_latex_source, get_default_cache_dir, list_sections, extract_section
4
4
 
5
5
 
6
6
  def extract_arxiv_id(input_str: str) -> str:
@@ -45,7 +45,18 @@ def main():
45
45
  help="Path to a local folder containing TeX files (alternative to arxiv_id)",
46
46
  default=None
47
47
  )
48
-
48
+ parser.add_argument(
49
+ "--list-sections",
50
+ action="store_true",
51
+ help="List all section names in the document"
52
+ )
53
+ parser.add_argument(
54
+ "--section",
55
+ type=str,
56
+ action="append",
57
+ help="Extract only the specified section(s). Can be used multiple times."
58
+ )
59
+
49
60
  args = parser.parse_args()
50
61
 
51
62
  # Validate that either arxiv_id or local_folder is provided
@@ -64,7 +75,24 @@ def main():
64
75
  remove_appendix_section=args.no_appendix,
65
76
  local_folder=args.local_folder
66
77
  )
67
- if content:
78
+ if not content:
79
+ return
80
+
81
+ if args.list_sections:
82
+ sections = list_sections(content)
83
+ for section in sections:
84
+ print(section)
85
+ elif args.section:
86
+ extracted = []
87
+ for section_name in args.section:
88
+ section_content = extract_section(content, section_name)
89
+ if section_content:
90
+ extracted.append(section_content)
91
+ else:
92
+ print(f"Warning: Section '{section_name}' not found", file=__import__('sys').stderr)
93
+ if extracted:
94
+ print("\n\n".join(extracted))
95
+ else:
68
96
  print(content)
69
97
 
70
98
  if __name__ == "__main__":
arxiv_to_prompt/core.py CHANGED
@@ -131,6 +131,8 @@ def find_main_tex(directory: str) -> Optional[str]:
131
131
 
132
132
  def remove_comments_from_lines(text: str) -> str:
133
133
  """Remove LaTeX comments while preserving newlines."""
134
+ # Remove \iffalse...\fi blocks (commonly used to comment out large sections)
135
+ text = re.sub(r'\\iffalse\b.*?\\fi\b', '', text, flags=re.DOTALL)
134
136
  lines = text.split('\n')
135
137
  result = []
136
138
  for line in lines:
@@ -162,6 +164,34 @@ def remove_appendix(text: str) -> str:
162
164
  return text[:appendix_match.start()].rstrip()
163
165
  return text
164
166
 
167
+
168
+ def list_sections(text: str) -> list:
169
+ """Extract all section names from LaTeX content."""
170
+ pattern = r'\\section\*?\{([^}]+)\}'
171
+ return re.findall(pattern, text)
172
+
173
+
174
+ def extract_section(text: str, section_name: str) -> Optional[str]:
175
+ """Extract content of a specific section (including its subsections)."""
176
+ # Find the start of the requested section
177
+ pattern = rf'\\section\*?\{{{re.escape(section_name)}\}}'
178
+ start_match = re.search(pattern, text)
179
+ if not start_match:
180
+ return None
181
+
182
+ start_pos = start_match.start()
183
+
184
+ # Find the next \section (not subsection) or end of document
185
+ remaining = text[start_match.end():]
186
+ end_match = re.search(r'\\section\*?\{', remaining)
187
+
188
+ if end_match:
189
+ end_pos = start_match.end() + end_match.start()
190
+ return text[start_pos:end_pos].rstrip()
191
+ else:
192
+ return text[start_pos:].rstrip()
193
+
194
+
165
195
  def flatten_tex(directory: str, main_file: str) -> str:
166
196
  """Combine all tex files into one, resolving inputs."""
167
197
  def process_file(file_path: str, processed_files: set) -> str:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: arxiv-to-prompt
3
- Version: 0.4.0
3
+ Version: 0.5.0
4
4
  Summary: transform arXiv papers into a single latex prompt for LLMs
5
5
  Author: Takashi Ishida
6
6
  License: MIT
@@ -54,6 +54,12 @@ arxiv-to-prompt 2303.08774 --no-comments --no-appendix
54
54
  # Process a local folder containing TeX files (instead of downloading from arXiv)
55
55
  arxiv-to-prompt --local-folder /path/to/tex/files
56
56
 
57
+ # List all section names in the paper
58
+ arxiv-to-prompt 2303.08774 --list-sections
59
+
60
+ # Extract only specific sections
61
+ arxiv-to-prompt 2303.08774 --section "Introduction" --section "Methods"
62
+
57
63
  # Copy to clipboard
58
64
  arxiv-to-prompt 2303.08774 | pbcopy
59
65
 
@@ -0,0 +1,9 @@
1
+ arxiv_to_prompt/__init__.py,sha256=LbfYhirPwhaMpwV4-YgMwW6hA0GOQDHVCPYCPKabjw0,1169
2
+ arxiv_to_prompt/cli.py,sha256=IwT64A-lf5PrxCxs2e1adN09USkf7ji31uzO8YAegpU,3203
3
+ arxiv_to_prompt/core.py,sha256=GafxYeE0dNg70hNG8BrSM7S99dIpHiy1KoNp5oW8niA,13119
4
+ arxiv_to_prompt-0.5.0.dist-info/licenses/LICENSE,sha256=np8L3--VyxwVJa_8D_mfK4RYrtnRMM_eeYN3rM4PMHo,1071
5
+ arxiv_to_prompt-0.5.0.dist-info/METADATA,sha256=4a66cO6DpNdd0dz3U_79QhL60Q1cAhHHyExWUqhL4eo,4786
6
+ arxiv_to_prompt-0.5.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
7
+ arxiv_to_prompt-0.5.0.dist-info/entry_points.txt,sha256=iYEEn8xZ_5OkhNIs5HCyHSQBpDRJkbD5h0tlAb16lL0,61
8
+ arxiv_to_prompt-0.5.0.dist-info/top_level.txt,sha256=JClbu_lGGWu3RaTHZlNqTKB1-DUSbYXQNIYmJ9_F7fY,16
9
+ arxiv_to_prompt-0.5.0.dist-info/RECORD,,
@@ -1,9 +0,0 @@
1
- arxiv_to_prompt/__init__.py,sha256=riK7TcTaKDleP5g5rjf2jkmLtXZu7irNZDujyAVDnKM,1093
2
- arxiv_to_prompt/cli.py,sha256=np6mv2iCkLiVLawyix1vXP4bVFzRmZlkfjxb07ee89Q,2276
3
- arxiv_to_prompt/core.py,sha256=pgb8PGiOqgbPTW5rIJwLlmS9n3nnlYa5UVQ5YSvCIuo,12077
4
- arxiv_to_prompt-0.4.0.dist-info/licenses/LICENSE,sha256=np8L3--VyxwVJa_8D_mfK4RYrtnRMM_eeYN3rM4PMHo,1071
5
- arxiv_to_prompt-0.4.0.dist-info/METADATA,sha256=_G2nwz5NSnKPMpS6UowWqCe2WSpJBjHqOMshmtGOdak,4598
6
- arxiv_to_prompt-0.4.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
7
- arxiv_to_prompt-0.4.0.dist-info/entry_points.txt,sha256=iYEEn8xZ_5OkhNIs5HCyHSQBpDRJkbD5h0tlAb16lL0,61
8
- arxiv_to_prompt-0.4.0.dist-info/top_level.txt,sha256=JClbu_lGGWu3RaTHZlNqTKB1-DUSbYXQNIYmJ9_F7fY,16
9
- arxiv_to_prompt-0.4.0.dist-info/RECORD,,