chatterer 0.1.24__py3-none-any.whl → 0.1.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. chatterer/__init__.py +97 -93
  2. chatterer/common_types/__init__.py +21 -21
  3. chatterer/common_types/io.py +19 -19
  4. chatterer/examples/__main__.py +75 -75
  5. chatterer/examples/any2md.py +85 -85
  6. chatterer/examples/pdf2md.py +338 -338
  7. chatterer/examples/pdf2txt.py +54 -54
  8. chatterer/examples/ppt.py +486 -486
  9. chatterer/examples/pw.py +143 -137
  10. chatterer/examples/snippet.py +56 -55
  11. chatterer/examples/transcribe.py +192 -112
  12. chatterer/examples/upstage.py +89 -89
  13. chatterer/examples/web2md.py +80 -66
  14. chatterer/interactive.py +354 -354
  15. chatterer/language_model.py +536 -536
  16. chatterer/messages.py +21 -21
  17. chatterer/strategies/__init__.py +13 -13
  18. chatterer/strategies/atom_of_thoughts.py +975 -975
  19. chatterer/strategies/base.py +14 -14
  20. chatterer/tools/__init__.py +46 -46
  21. chatterer/tools/caption_markdown_images.py +384 -384
  22. chatterer/tools/citation_chunking/__init__.py +3 -3
  23. chatterer/tools/citation_chunking/chunks.py +53 -53
  24. chatterer/tools/citation_chunking/citation_chunker.py +118 -118
  25. chatterer/tools/citation_chunking/citations.py +285 -285
  26. chatterer/tools/citation_chunking/prompt.py +157 -157
  27. chatterer/tools/citation_chunking/reference.py +26 -26
  28. chatterer/tools/citation_chunking/utils.py +138 -138
  29. chatterer/tools/convert_pdf_to_markdown.py +645 -625
  30. chatterer/tools/convert_to_text.py +446 -446
  31. chatterer/tools/upstage_document_parser.py +705 -705
  32. chatterer/tools/webpage_to_markdown.py +739 -739
  33. chatterer/tools/youtube.py +146 -146
  34. chatterer/utils/__init__.py +15 -15
  35. chatterer/utils/base64_image.py +293 -285
  36. chatterer/utils/bytesio.py +59 -59
  37. chatterer/utils/code_agent.py +237 -237
  38. chatterer/utils/imghdr.py +148 -148
  39. {chatterer-0.1.24.dist-info → chatterer-0.1.25.dist-info}/METADATA +390 -389
  40. chatterer-0.1.25.dist-info/RECORD +45 -0
  41. chatterer-0.1.24.dist-info/RECORD +0 -45
  42. {chatterer-0.1.24.dist-info → chatterer-0.1.25.dist-info}/WHEEL +0 -0
  43. {chatterer-0.1.24.dist-info → chatterer-0.1.25.dist-info}/entry_points.txt +0 -0
  44. {chatterer-0.1.24.dist-info → chatterer-0.1.25.dist-info}/top_level.txt +0 -0
@@ -1,338 +1,338 @@
1
- #!/usr/bin/env python3
2
- """
3
- PDF to Markdown Converter CLI
4
-
5
- A command-line tool for converting PDF documents to Markdown using multimodal LLMs.
6
- Supports both sequential and parallel processing modes with async capabilities.
7
- """
8
-
9
- import asyncio
10
- import logging
11
- import sys
12
- import time
13
- from pathlib import Path
14
- from typing import List, Literal, Optional, TypedDict
15
-
16
- from spargear import ArgumentSpec, RunnableArguments
17
-
18
- from chatterer import Chatterer
19
- from chatterer.tools.convert_pdf_to_markdown import PdfToMarkdown
20
-
21
-
22
- class ConversionResult(TypedDict, total=False):
23
- """Type definition for conversion results."""
24
-
25
- input: str
26
- output: str
27
- result: str
28
- processing_time: float
29
- characters: int
30
- error: str
31
-
32
-
33
- # Setup enhanced logging
34
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", datefmt="%H:%M:%S")
35
- logger = logging.getLogger(__name__)
36
-
37
-
38
- class Arguments(RunnableArguments[List[ConversionResult]]):
39
- """Command-line arguments for PDF to Markdown conversion."""
40
-
41
- PDF_OR_DIRECTORY_PATH: str
42
- """Input PDF file or directory containing PDF files to convert to markdown."""
43
-
44
- output: Optional[str] = None
45
- """Output path. For a file, path to the output markdown file. For a directory, output directory for .md files."""
46
-
47
- page: Optional[str] = None
48
- """Zero-based page indices to convert (e.g., '0,2,4-8'). If None, converts all pages."""
49
-
50
- recursive: bool = False
51
- """If input is a directory, search for PDFs recursively."""
52
-
53
- mode: Literal["sequential", "parallel"] = "parallel"
54
- """Processing mode: 'sequential' for strict continuity, 'parallel' for faster processing."""
55
-
56
- sync: bool = False
57
- """Enable synchronous processing for sequential mode. If set to True, will run in sync mode."""
58
-
59
- max_concurrent: int = 10
60
- """Maximum number of concurrent LLM requests when using async mode."""
61
-
62
- image_zoom: float = 2.0
63
- """Zoom factor for rendering PDF pages as images (higher zoom = higher resolution)."""
64
-
65
- image_format: Literal["png", "jpg", "jpeg"] = "png"
66
- """Image format for PDF page rendering."""
67
-
68
- image_quality: int = 95
69
- """JPEG quality when using jpg/jpeg format (1-100)."""
70
-
71
- context_tail_lines: int = 10
72
- """Number of lines from previous page's markdown to use as context (sequential mode only)."""
73
-
74
- verbose: bool = False
75
- """Enable verbose logging output."""
76
-
77
- chatterer: ArgumentSpec[Chatterer] = ArgumentSpec(
78
- ["--chatterer"],
79
- default_factory=lambda: Chatterer.from_provider("google:gemini-2.5-flash-preview-05-20"),
80
- help="Chatterer instance configuration (e.g., 'google:gemini-2.5-flash-preview-05-20').",
81
- type=Chatterer.from_provider,
82
- )
83
-
84
- def __post_init__(self) -> None:
85
- """Validate and adjust arguments after initialization."""
86
- if self.verbose:
87
- logging.getLogger().setLevel(logging.DEBUG)
88
-
89
- if not self.sync and self.mode == "sequential":
90
- logger.warning("Async mode is only available with parallel mode. Switching to parallel mode.")
91
- self.mode = "parallel"
92
-
93
- if self.max_concurrent < 1:
94
- logger.warning("max_concurrent must be >= 1. Setting to 1.")
95
- self.max_concurrent = 1
96
- elif self.max_concurrent > 10:
97
- logger.warning("max_concurrent > 10 may cause rate limiting. Consider reducing.")
98
-
99
- def run(self) -> List[ConversionResult]:
100
- """Execute the PDF to Markdown conversion."""
101
- if not self.sync:
102
- return asyncio.run(self._run_async())
103
- else:
104
- return self._run_sync()
105
-
106
- def _run_sync(self) -> List[ConversionResult]:
107
- """Execute synchronous conversion."""
108
- pdf_files, output_base, is_dir = self._prepare_files()
109
-
110
- converter = PdfToMarkdown(
111
- chatterer=self.chatterer.unwrap(),
112
- image_zoom=self.image_zoom,
113
- image_format=self.image_format,
114
- image_jpg_quality=self.image_quality,
115
- context_tail_lines=self.context_tail_lines,
116
- )
117
-
118
- results: List[ConversionResult] = []
119
- total_start_time = time.time()
120
-
121
- logger.info(f"🚀 Starting {self.mode} conversion of {len(pdf_files)} PDF(s)...")
122
-
123
- for i, pdf in enumerate(pdf_files, 1):
124
- output_path = (output_base / f"{pdf.stem}.md") if is_dir else output_base
125
-
126
- logger.info(f"📄 Processing {i}/{len(pdf_files)}: {pdf.name}")
127
- start_time = time.time()
128
-
129
- # Progress callback for individual PDF
130
- def progress_callback(current: int, total: int) -> None:
131
- progress = (current / total) * 100
132
- logger.info(f" └─ Progress: {current}/{total} pages ({progress:.1f}%)")
133
-
134
- try:
135
- markdown = converter.convert(
136
- pdf_input=str(pdf),
137
- page_indices=self.page,
138
- mode=self.mode,
139
- progress_callback=progress_callback,
140
- )
141
-
142
- # Save result
143
- output_path.parent.mkdir(parents=True, exist_ok=True)
144
- output_path.write_text(markdown, encoding="utf-8")
145
-
146
- elapsed = time.time() - start_time
147
- chars_per_sec = len(markdown) / elapsed if elapsed > 0 else 0
148
-
149
- logger.info(f" ✅ Completed in {elapsed:.1f}s ({chars_per_sec:.0f} chars/s)")
150
- logger.info(f" 📝 Generated {len(markdown):,} characters → {output_path}")
151
-
152
- results.append({
153
- "input": pdf.as_posix(),
154
- "output": output_path.as_posix(),
155
- "result": markdown,
156
- "processing_time": elapsed,
157
- "characters": len(markdown),
158
- })
159
-
160
- except Exception as e:
161
- logger.error(f" ❌ Failed to process {pdf.name}: {e}")
162
- results.append({
163
- "input": pdf.as_posix(),
164
- "output": "",
165
- "result": "",
166
- "error": str(e),
167
- })
168
-
169
- total_elapsed = time.time() - total_start_time
170
- total_chars = sum(len(r.get("result", "")) for r in results)
171
- successful_conversions = sum(1 for r in results if "error" not in r)
172
-
173
- logger.info("🎉 Conversion complete!")
174
- logger.info(f" 📊 Total time: {total_elapsed:.1f}s")
175
- logger.info(f" 📈 Success rate: {successful_conversions}/{len(pdf_files)} ({(successful_conversions / len(pdf_files) * 100):.1f}%)")
176
- logger.info(f" 📝 Total output: {total_chars:,} characters")
177
- logger.info(f" ⚡ Average speed: {total_chars / total_elapsed:.0f} chars/s")
178
-
179
- return results
180
-
181
- async def _run_async(self) -> List[ConversionResult]:
182
- """Execute asynchronous conversion with parallel processing."""
183
- pdf_files, output_base, is_dir = self._prepare_files()
184
-
185
- converter = PdfToMarkdown(
186
- chatterer=self.chatterer.unwrap(),
187
- image_zoom=self.image_zoom,
188
- image_format=self.image_format,
189
- image_jpg_quality=self.image_quality,
190
- context_tail_lines=self.context_tail_lines,
191
- )
192
-
193
- total_start_time = time.time()
194
-
195
- logger.info(f"🚀 Starting ASYNC parallel conversion of {len(pdf_files)} PDF(s)...")
196
- logger.info(f"⚡ Max concurrent: {self.max_concurrent} LLM requests")
197
-
198
- # Process PDFs concurrently
199
- semaphore = asyncio.Semaphore(self.max_concurrent)
200
-
201
- async def process_pdf(pdf: Path, index: int) -> ConversionResult:
202
- async with semaphore:
203
- output_path = (output_base / f"{pdf.stem}.md") if is_dir else output_base
204
-
205
- logger.info(f"📄 Processing {index}/{len(pdf_files)}: {pdf.name}")
206
- start_time = time.time()
207
-
208
- # Progress callback for individual PDF
209
- def progress_callback(current: int, total: int) -> None:
210
- progress = (current / total) * 100
211
- logger.info(f" └─ {pdf.name}: {current}/{total} pages ({progress:.1f}%)")
212
-
213
- try:
214
- markdown = await converter.aconvert(
215
- pdf_input=str(pdf),
216
- page_indices=self.page,
217
- progress_callback=progress_callback,
218
- max_concurrent=self.max_concurrent, # Limit per-PDF concurrency
219
- )
220
-
221
- # Save result
222
- output_path.parent.mkdir(parents=True, exist_ok=True)
223
- output_path.write_text(markdown, encoding="utf-8")
224
-
225
- elapsed = time.time() - start_time
226
- chars_per_sec = len(markdown) / elapsed if elapsed > 0 else 0
227
-
228
- logger.info(f" ✅ {pdf.name} completed in {elapsed:.1f}s ({chars_per_sec:.0f} chars/s)")
229
- logger.info(f" 📝 Generated {len(markdown):,} characters → {output_path}")
230
-
231
- return {
232
- "input": pdf.as_posix(),
233
- "output": output_path.as_posix(),
234
- "result": markdown,
235
- "processing_time": elapsed,
236
- "characters": len(markdown),
237
- }
238
-
239
- except Exception as e:
240
- logger.error(f" ❌ Failed to process {pdf.name}: {e}")
241
- return {
242
- "input": pdf.as_posix(),
243
- "output": "",
244
- "result": "",
245
- "error": str(e),
246
- }
247
-
248
- # Execute all PDF processing tasks
249
- tasks = [process_pdf(pdf, i) for i, pdf in enumerate(pdf_files, 1)]
250
- raw_results = await asyncio.gather(*tasks, return_exceptions=True)
251
-
252
- # Handle exceptions in results
253
- final_results: List[ConversionResult] = []
254
- for result in raw_results:
255
- if isinstance(result, Exception):
256
- logger.error(f"Task failed with exception: {result}")
257
- final_results.append(ConversionResult(input="", output="", result="", error=str(result)))
258
- else:
259
- # Type narrowing: result is ConversionResult after isinstance check
260
- final_results.append(result) # type: ignore[arg-type]
261
-
262
- total_elapsed = time.time() - total_start_time
263
- total_chars = sum(len(r.get("result", "")) for r in final_results)
264
- successful_conversions = sum(1 for r in final_results if "error" not in r)
265
-
266
- logger.info("🎉 ASYNC conversion complete!")
267
- logger.info(f" 📊 Total time: {total_elapsed:.1f}s")
268
- logger.info(f" 📈 Success rate: {successful_conversions}/{len(pdf_files)} ({(successful_conversions / len(pdf_files) * 100):.1f}%)")
269
- logger.info(f" 📝 Total output: {total_chars:,} characters")
270
- logger.info(f" ⚡ Average speed: {total_chars / total_elapsed:.0f} chars/s")
271
- logger.info(f" 🚀 Speedup: ~{len(pdf_files) / max(1, total_elapsed / 60):.1f}x faster than sequential")
272
-
273
- return final_results
274
-
275
- def _prepare_files(self) -> tuple[List[Path], Path, bool]:
276
- """Prepare input and output file paths."""
277
- input_path = Path(self.PDF_OR_DIRECTORY_PATH).resolve()
278
- pdf_files: List[Path] = []
279
- is_dir = False
280
-
281
- # Determine input files
282
- if input_path.is_file():
283
- if input_path.suffix.lower() != ".pdf":
284
- logger.error(f"❌ Input file must be a PDF: {input_path}")
285
- sys.exit(1)
286
- pdf_files.append(input_path)
287
- elif input_path.is_dir():
288
- is_dir = True
289
- pattern = "**/*.pdf" if self.recursive else "*.pdf"
290
- pdf_files = sorted([f for f in input_path.glob(pattern) if f.is_file()])
291
- if not pdf_files:
292
- logger.warning(f"⚠️ No PDF files found in {input_path}")
293
- sys.exit(0)
294
- else:
295
- logger.error(f"❌ Input path does not exist: {input_path}")
296
- sys.exit(1)
297
-
298
- # Determine output path
299
- if self.output:
300
- output_base = Path(self.output).resolve()
301
- elif is_dir:
302
- output_base = input_path
303
- else:
304
- output_base = input_path.with_suffix(".md")
305
-
306
- # Create output directories
307
- if is_dir:
308
- output_base.mkdir(parents=True, exist_ok=True)
309
- else:
310
- output_base.parent.mkdir(parents=True, exist_ok=True)
311
-
312
- logger.info(f"📂 Input: {input_path}")
313
- logger.info(f"📁 Output: {output_base}")
314
- logger.info(f"📄 Found {len(pdf_files)} PDF file(s)")
315
-
316
- return pdf_files, output_base, is_dir
317
-
318
-
319
- def main() -> None:
320
- """Main entry point for the CLI application."""
321
- args = None
322
- try:
323
- args = Arguments()
324
- args.run()
325
- except KeyboardInterrupt:
326
- logger.info("🛑 Conversion interrupted by user")
327
- sys.exit(130)
328
- except Exception as e:
329
- logger.error(f"❌ Unexpected error: {e}")
330
- if args and hasattr(args, "verbose") and args.verbose:
331
- import traceback
332
-
333
- traceback.print_exc()
334
- sys.exit(1)
335
-
336
-
337
- if __name__ == "__main__":
338
- main()
1
+ #!/usr/bin/env python3
2
+ """
3
+ PDF to Markdown Converter CLI
4
+
5
+ A command-line tool for converting PDF documents to Markdown using multimodal LLMs.
6
+ Supports both sequential and parallel processing modes with async capabilities.
7
+ """
8
+
9
+ import asyncio
10
+ import logging
11
+ import sys
12
+ import time
13
+ from pathlib import Path
14
+ from typing import List, Literal, Optional, TypedDict
15
+
16
+ from spargear import ArgumentSpec, RunnableArguments
17
+
18
+ from chatterer import Chatterer
19
+ from chatterer.tools.convert_pdf_to_markdown import PdfToMarkdown
20
+
21
+
22
class ConversionResult(TypedDict, total=False):
    """Outcome record for a single PDF conversion attempt.

    The dict is declared with ``total=False``, so a type checker treats every
    key as optional; which keys are actually present depends on whether the
    conversion succeeded or failed.
    """

    # POSIX-style path of the source PDF.
    input: str
    # POSIX-style path of the written markdown file ("" when the conversion failed).
    output: str
    # Generated markdown text ("" when the conversion failed).
    result: str
    # Seconds spent converting and saving this PDF (set on success).
    processing_time: float
    # Length of the generated markdown in characters (set on success).
    characters: int
    # Stringified exception; its presence marks the record as a failure.
    error: str
31
+
32
+
33
# Configure the root logger at import time: INFO level, records stamped with
# a wall-clock time (hours:minutes:seconds), logger name and level.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    datefmt="%H:%M:%S",
)

# Module-level logger used by everything in this file.
logger = logging.getLogger(__name__)
36
+
37
+
38
class Arguments(RunnableArguments[List[ConversionResult]]):
    """Command-line arguments for PDF to Markdown conversion."""

    PDF_OR_DIRECTORY_PATH: str
    """Input PDF file or directory containing PDF files to convert to markdown."""

    output: Optional[str] = None
    """Output path. For a file, path to the output markdown file. For a directory, output directory for .md files."""

    page: Optional[str] = None
    """Zero-based page indices to convert (e.g., '0,2,4-8'). If None, converts all pages."""

    recursive: bool = False
    """If input is a directory, search for PDFs recursively."""

    mode: Literal["sequential", "parallel"] = "parallel"
    """Processing mode: 'sequential' for strict continuity, 'parallel' for faster processing."""

    sync: bool = False
    """Enable synchronous processing for sequential mode. If set to True, will run in sync mode."""

    max_concurrent: int = 10
    """Maximum number of concurrent LLM requests when using async mode."""

    image_zoom: float = 2.0
    """Zoom factor for rendering PDF pages as images (higher zoom = higher resolution)."""

    image_format: Literal["png", "jpg", "jpeg"] = "png"
    """Image format for PDF page rendering."""

    image_quality: int = 95
    """JPEG quality when using jpg/jpeg format (1-100)."""

    context_tail_lines: int = 10
    """Number of lines from previous page's markdown to use as context (sequential mode only)."""

    verbose: bool = False
    """Enable verbose logging output."""

    chatterer: ArgumentSpec[Chatterer] = ArgumentSpec(
        ["--chatterer"],
        default_factory=lambda: Chatterer.from_provider("google:gemini-2.5-flash-preview-05-20"),
        help="Chatterer instance configuration (e.g., 'google:gemini-2.5-flash-preview-05-20').",
        type=Chatterer.from_provider,
    )

    def __post_init__(self) -> None:
        """Validate and adjust arguments after initialization."""
        if self.verbose:
            logging.getLogger().setLevel(logging.DEBUG)

        # The async path only runs in parallel mode, so a sequential request
        # without --sync is downgraded (with a warning) instead of failing.
        if not self.sync and self.mode == "sequential":
            logger.warning("Async mode is only available with parallel mode. Switching to parallel mode.")
            self.mode = "parallel"

        if self.max_concurrent < 1:
            logger.warning("max_concurrent must be >= 1. Setting to 1.")
            self.max_concurrent = 1
        elif self.max_concurrent > 10:
            logger.warning("max_concurrent > 10 may cause rate limiting. Consider reducing.")

    def run(self) -> List[ConversionResult]:
        """Execute the PDF to Markdown conversion.

        Returns:
            One ``ConversionResult`` per input PDF, in input order.
        """
        if self.sync:
            return self._run_sync()
        return asyncio.run(self._run_async())

    def _run_sync(self) -> List[ConversionResult]:
        """Convert every PDF one after another on the calling thread.

        A failure on one PDF is logged and recorded; the remaining PDFs are
        still processed.
        """
        pdf_files, output_base, is_dir = self._prepare_files()
        converter = self._build_converter()
        output_paths = self._plan_output_paths(pdf_files, output_base, is_dir)

        results: List[ConversionResult] = []
        total_start_time = time.perf_counter()

        logger.info(f"🚀 Starting {self.mode} conversion of {len(pdf_files)} PDF(s)...")

        # The callback does not depend on the current PDF, so define it once.
        def progress_callback(current: int, total: int) -> None:
            progress = (current / total) * 100 if total else 0.0
            logger.info(f" └─ Progress: {current}/{total} pages ({progress:.1f}%)")

        for i, (pdf, output_path) in enumerate(zip(pdf_files, output_paths), 1):
            logger.info(f"📄 Processing {i}/{len(pdf_files)}: {pdf.name}")
            start_time = time.perf_counter()

            try:
                markdown = converter.convert(
                    pdf_input=str(pdf),
                    page_indices=self.page,
                    mode=self.mode,
                    progress_callback=progress_callback,
                )
                results.append(self._finish_conversion(pdf, output_path, markdown, start_time, "Completed"))
            except Exception as e:
                results.append(self._failure_record(pdf, e))

        self._log_summary(
            results,
            len(pdf_files),
            time.perf_counter() - total_start_time,
            "🎉 Conversion complete!",
        )
        return results

    async def _run_async(self) -> List[ConversionResult]:
        """Convert all PDFs concurrently on the running event loop.

        A failure on one PDF is logged and recorded; the remaining PDFs are
        still processed. Results are returned in input order.
        """
        pdf_files, output_base, is_dir = self._prepare_files()
        converter = self._build_converter()
        output_paths = self._plan_output_paths(pdf_files, output_base, is_dir)

        total_start_time = time.perf_counter()

        logger.info(f"🚀 Starting ASYNC parallel conversion of {len(pdf_files)} PDF(s)...")
        logger.info(f"⚡ Max concurrent: {self.max_concurrent} LLM requests")

        # Bounds how many PDFs are in flight at once.
        # NOTE(review): the same value is also forwarded to ``aconvert`` below,
        # so total in-flight LLM requests may reach max_concurrent ** 2
        # depending on how ``aconvert`` applies its limit -- confirm.
        semaphore = asyncio.Semaphore(self.max_concurrent)

        async def process_pdf(pdf: Path, output_path: Path, index: int) -> ConversionResult:
            async with semaphore:
                logger.info(f"📄 Processing {index}/{len(pdf_files)}: {pdf.name}")
                start_time = time.perf_counter()

                def progress_callback(current: int, total: int) -> None:
                    progress = (current / total) * 100 if total else 0.0
                    logger.info(f" └─ {pdf.name}: {current}/{total} pages ({progress:.1f}%)")

                try:
                    markdown = await converter.aconvert(
                        pdf_input=str(pdf),
                        page_indices=self.page,
                        progress_callback=progress_callback,
                        max_concurrent=self.max_concurrent,  # Limit per-PDF concurrency
                    )
                    return self._finish_conversion(
                        pdf, output_path, markdown, start_time, f"{pdf.name} completed"
                    )
                except Exception as e:
                    return self._failure_record(pdf, e)

        tasks = [
            process_pdf(pdf, output_path, i)
            for i, (pdf, output_path) in enumerate(zip(pdf_files, output_paths), 1)
        ]
        raw_results = await asyncio.gather(*tasks, return_exceptions=True)

        # ``gather(return_exceptions=True)`` hands back anything a task raised,
        # including BaseException subclasses such as CancelledError that the
        # ``except Exception`` inside ``process_pdf`` does not catch. Turn each
        # of those into a failure record tied to the PDF it belongs to.
        final_results: List[ConversionResult] = []
        for pdf, outcome in zip(pdf_files, raw_results):
            if isinstance(outcome, BaseException):
                logger.error(f"Task failed with exception: {outcome}")
                final_results.append(
                    ConversionResult(
                        input=pdf.as_posix(),
                        output="",
                        result="",
                        # Some exceptions (e.g. CancelledError) have an empty message.
                        error=str(outcome) or type(outcome).__name__,
                    )
                )
            else:
                final_results.append(outcome)

        # No "speedup" figure is reported: no sequential baseline is measured,
        # so any such number would be invented.
        self._log_summary(
            final_results,
            len(pdf_files),
            time.perf_counter() - total_start_time,
            "🎉 ASYNC conversion complete!",
        )
        return final_results

    def _build_converter(self) -> PdfToMarkdown:
        """Create the converter configured from the image/context arguments."""
        return PdfToMarkdown(
            chatterer=self.chatterer.unwrap(),
            image_zoom=self.image_zoom,
            image_format=self.image_format,
            image_jpg_quality=self.image_quality,
            context_tail_lines=self.context_tail_lines,
        )

    def _plan_output_paths(self, pdf_files: List[Path], output_base: Path, is_dir: bool) -> List[Path]:
        """Return the markdown path for each PDF, in the same order as ``pdf_files``.

        For a single-file input every entry is ``output_base``. For a directory
        input a PDF normally maps to ``output_base/<stem>.md``. When several
        PDFs share a stem (possible with a recursive search), those PDFs keep
        their sub-directory relative to the input directory so they do not
        overwrite each other's output.
        """
        if not is_dir:
            return [output_base for _ in pdf_files]

        input_root = Path(self.PDF_OR_DIRECTORY_PATH).resolve()

        stem_counts: dict[str, int] = {}
        for pdf in pdf_files:
            stem_counts[pdf.stem] = stem_counts.get(pdf.stem, 0) + 1

        planned: List[Path] = []
        for pdf in pdf_files:
            if stem_counts[pdf.stem] == 1:
                planned.append(output_base / f"{pdf.stem}.md")
            else:
                planned.append(output_base / pdf.relative_to(input_root).with_suffix(".md"))
        return planned

    def _finish_conversion(
        self,
        pdf: Path,
        output_path: Path,
        markdown: str,
        started_at: float,
        done_label: str,
    ) -> ConversionResult:
        """Write ``markdown`` to ``output_path``, log timing, and build the success record.

        Args:
            pdf: Source PDF that was converted.
            output_path: Destination markdown file; parent directories are created.
            markdown: Converted text to save.
            started_at: ``time.perf_counter()`` reading taken when processing began.
            done_label: Text placed before "in <elapsed>s" in the completion log line.

        Raises:
            OSError: If the output file cannot be written.
        """
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_text(markdown, encoding="utf-8")

        elapsed = time.perf_counter() - started_at
        chars_per_sec = len(markdown) / elapsed if elapsed > 0 else 0

        logger.info(f" ✅ {done_label} in {elapsed:.1f}s ({chars_per_sec:.0f} chars/s)")
        logger.info(f" 📝 Generated {len(markdown):,} characters → {output_path}")

        return {
            "input": pdf.as_posix(),
            "output": output_path.as_posix(),
            "result": markdown,
            "processing_time": elapsed,
            "characters": len(markdown),
        }

    def _failure_record(self, pdf: Path, error: BaseException) -> ConversionResult:
        """Log a failed conversion of ``pdf`` and build its failure record."""
        logger.error(f" ❌ Failed to process {pdf.name}: {error}")
        return {
            "input": pdf.as_posix(),
            "output": "",
            "result": "",
            "error": str(error),
        }

    def _log_summary(
        self,
        results: List[ConversionResult],
        total_files: int,
        total_elapsed: float,
        headline: str,
    ) -> None:
        """Log totals for a finished run; a record counts as success when it has no "error" key."""
        total_chars = sum(len(r.get("result", "")) for r in results)
        successful_conversions = sum(1 for r in results if "error" not in r)
        # Guard both ratios: the elapsed time can be zero on coarse clocks.
        success_rate = (successful_conversions / total_files * 100) if total_files else 0.0
        average_speed = (total_chars / total_elapsed) if total_elapsed > 0 else 0.0

        logger.info(headline)
        logger.info(f" 📊 Total time: {total_elapsed:.1f}s")
        logger.info(f" 📈 Success rate: {successful_conversions}/{total_files} ({success_rate:.1f}%)")
        logger.info(f" 📝 Total output: {total_chars:,} characters")
        logger.info(f" ⚡ Average speed: {average_speed:.0f} chars/s")

    def _prepare_files(self) -> tuple[List[Path], Path, bool]:
        """Resolve the input PDFs and the output location.

        Returns:
            ``(pdf_files, output_base, is_dir)`` where ``is_dir`` tells whether
            the input was a directory. ``output_base`` is the output directory
            in that case and the output markdown file otherwise.

        Raises:
            SystemExit: Status 1 when the input is missing or is a non-PDF
                file; status 0 when a directory contains no PDFs.
        """
        input_path = Path(self.PDF_OR_DIRECTORY_PATH).resolve()
        pdf_files: List[Path] = []
        is_dir = False

        # Determine input files
        if input_path.is_file():
            if input_path.suffix.lower() != ".pdf":
                logger.error(f"❌ Input file must be a PDF: {input_path}")
                sys.exit(1)
            pdf_files.append(input_path)
        elif input_path.is_dir():
            is_dir = True
            pattern = "**/*.pdf" if self.recursive else "*.pdf"
            pdf_files = sorted(f for f in input_path.glob(pattern) if f.is_file())
            if not pdf_files:
                logger.warning(f"⚠️ No PDF files found in {input_path}")
                sys.exit(0)
        else:
            logger.error(f"❌ Input path does not exist: {input_path}")
            sys.exit(1)

        # Determine output path
        if self.output:
            output_base = Path(self.output).resolve()
            # A single PDF pointed at an existing directory: write
            # <dir>/<stem>.md instead of failing when the directory itself is
            # opened as a file.
            if not is_dir and output_base.is_dir():
                output_base = output_base / f"{input_path.stem}.md"
        elif is_dir:
            output_base = input_path
        else:
            output_base = input_path.with_suffix(".md")

        # Create output directories
        if is_dir:
            output_base.mkdir(parents=True, exist_ok=True)
        else:
            output_base.parent.mkdir(parents=True, exist_ok=True)

        logger.info(f"📂 Input: {input_path}")
        logger.info(f"📁 Output: {output_base}")
        logger.info(f"📄 Found {len(pdf_files)} PDF file(s)")

        return pdf_files, output_base, is_dir
317
+
318
+
319
def main() -> None:
    """Parse the command line, run the conversion, and map failures to exit codes.

    Exits with status 130 when interrupted from the keyboard and status 1 on
    any other exception. The traceback is printed only when the arguments were
    parsed successfully and ``verbose`` is set.
    """
    args = None
    try:
        args = Arguments()
        args.run()
    except KeyboardInterrupt:
        logger.info("🛑 Conversion interrupted by user")
        sys.exit(130)
    except Exception as exc:
        logger.error(f"❌ Unexpected error: {exc}")
        # ``args`` is still None when argument parsing itself raised.
        show_traceback = bool(args) and getattr(args, "verbose", False)
        if show_traceback:
            import traceback

            traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()