chatterer 0.1.22__tar.gz → 0.1.24__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {chatterer-0.1.22 → chatterer-0.1.24}/PKG-INFO +6 -9
- chatterer-0.1.24/chatterer/examples/__main__.py +75 -0
- chatterer-0.1.22/chatterer/examples/anything_to_markdown.py → chatterer-0.1.24/chatterer/examples/any2md.py +9 -9
- chatterer-0.1.24/chatterer/examples/pdf2md.py +338 -0
- chatterer-0.1.22/chatterer/examples/pdf_to_text.py → chatterer-0.1.24/chatterer/examples/pdf2txt.py +5 -5
- chatterer-0.1.22/chatterer/examples/make_ppt.py → chatterer-0.1.24/chatterer/examples/ppt.py +5 -7
- chatterer-0.1.24/chatterer/examples/pw.py +137 -0
- chatterer-0.1.22/chatterer/examples/get_code_snippets.py → chatterer-0.1.24/chatterer/examples/snippet.py +7 -7
- chatterer-0.1.22/chatterer/examples/transcription_api.py → chatterer-0.1.24/chatterer/examples/transcribe.py +6 -6
- chatterer-0.1.22/chatterer/examples/upstage_parser.py → chatterer-0.1.24/chatterer/examples/upstage.py +17 -17
- chatterer-0.1.22/chatterer/examples/webpage_to_markdown.py → chatterer-0.1.24/chatterer/examples/web2md.py +8 -12
- {chatterer-0.1.22 → chatterer-0.1.24}/chatterer/strategies/atom_of_thoughts.py +161 -161
- chatterer-0.1.24/chatterer/tools/convert_pdf_to_markdown.py +625 -0
- {chatterer-0.1.22 → chatterer-0.1.24}/chatterer.egg-info/PKG-INFO +6 -9
- {chatterer-0.1.22 → chatterer-0.1.24}/chatterer.egg-info/SOURCES.txt +10 -9
- chatterer-0.1.24/chatterer.egg-info/entry_points.txt +2 -0
- {chatterer-0.1.22 → chatterer-0.1.24}/chatterer.egg-info/requires.txt +3 -7
- {chatterer-0.1.22 → chatterer-0.1.24}/pyproject.toml +58 -66
- chatterer-0.1.22/chatterer/examples/login_with_playwright.py +0 -156
- chatterer-0.1.22/chatterer/examples/pdf_to_markdown.py +0 -77
- chatterer-0.1.22/chatterer/tools/convert_pdf_to_markdown.py +0 -393
- chatterer-0.1.22/chatterer.egg-info/entry_points.txt +0 -10
- {chatterer-0.1.22 → chatterer-0.1.24}/README.md +0 -0
- {chatterer-0.1.22 → chatterer-0.1.24}/chatterer/__init__.py +0 -0
- {chatterer-0.1.22 → chatterer-0.1.24}/chatterer/common_types/__init__.py +0 -0
- {chatterer-0.1.22 → chatterer-0.1.24}/chatterer/common_types/io.py +0 -0
- {chatterer-0.1.22 → chatterer-0.1.24}/chatterer/examples/__init__.py +0 -0
- {chatterer-0.1.22 → chatterer-0.1.24}/chatterer/interactive.py +0 -0
- {chatterer-0.1.22 → chatterer-0.1.24}/chatterer/language_model.py +0 -0
- {chatterer-0.1.22 → chatterer-0.1.24}/chatterer/messages.py +0 -0
- {chatterer-0.1.22 → chatterer-0.1.24}/chatterer/py.typed +0 -0
- {chatterer-0.1.22 → chatterer-0.1.24}/chatterer/strategies/__init__.py +0 -0
- {chatterer-0.1.22 → chatterer-0.1.24}/chatterer/strategies/base.py +0 -0
- {chatterer-0.1.22 → chatterer-0.1.24}/chatterer/tools/__init__.py +0 -0
- {chatterer-0.1.22 → chatterer-0.1.24}/chatterer/tools/caption_markdown_images.py +0 -0
- {chatterer-0.1.22 → chatterer-0.1.24}/chatterer/tools/citation_chunking/__init__.py +0 -0
- {chatterer-0.1.22 → chatterer-0.1.24}/chatterer/tools/citation_chunking/chunks.py +0 -0
- {chatterer-0.1.22 → chatterer-0.1.24}/chatterer/tools/citation_chunking/citation_chunker.py +0 -0
- {chatterer-0.1.22 → chatterer-0.1.24}/chatterer/tools/citation_chunking/citations.py +0 -0
- {chatterer-0.1.22 → chatterer-0.1.24}/chatterer/tools/citation_chunking/prompt.py +0 -0
- {chatterer-0.1.22 → chatterer-0.1.24}/chatterer/tools/citation_chunking/reference.py +0 -0
- {chatterer-0.1.22 → chatterer-0.1.24}/chatterer/tools/citation_chunking/utils.py +0 -0
- {chatterer-0.1.22 → chatterer-0.1.24}/chatterer/tools/convert_to_text.py +0 -0
- {chatterer-0.1.22 → chatterer-0.1.24}/chatterer/tools/upstage_document_parser.py +0 -0
- {chatterer-0.1.22 → chatterer-0.1.24}/chatterer/tools/webpage_to_markdown.py +0 -0
- {chatterer-0.1.22 → chatterer-0.1.24}/chatterer/tools/youtube.py +0 -0
- {chatterer-0.1.22 → chatterer-0.1.24}/chatterer/utils/__init__.py +0 -0
- {chatterer-0.1.22 → chatterer-0.1.24}/chatterer/utils/base64_image.py +0 -0
- {chatterer-0.1.22 → chatterer-0.1.24}/chatterer/utils/bytesio.py +0 -0
- {chatterer-0.1.22 → chatterer-0.1.24}/chatterer/utils/code_agent.py +0 -0
- {chatterer-0.1.22 → chatterer-0.1.24}/chatterer/utils/imghdr.py +0 -0
- {chatterer-0.1.22 → chatterer-0.1.24}/chatterer.egg-info/dependency_links.txt +0 -0
- {chatterer-0.1.22 → chatterer-0.1.24}/chatterer.egg-info/top_level.txt +0 -0
- {chatterer-0.1.22 → chatterer-0.1.24}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: chatterer
|
3
|
-
Version: 0.1.22
|
3
|
+
Version: 0.1.24
|
4
4
|
Summary: The highest-level interface for various LLM APIs.
|
5
5
|
Requires-Python: >=3.12
|
6
6
|
Description-Content-Type: text/markdown
|
@@ -11,10 +11,9 @@ Requires-Dist: pillow>=11.1.0
|
|
11
11
|
Requires-Dist: regex>=2024.11.6
|
12
12
|
Requires-Dist: rich>=13.9.4
|
13
13
|
Requires-Dist: colorama>=0.4.6
|
14
|
-
Requires-Dist: spargear>=0.2.
|
14
|
+
Requires-Dist: spargear>=0.2.7
|
15
15
|
Provides-Extra: dev
|
16
|
-
Requires-Dist:
|
17
|
-
Requires-Dist: ipykernel>=6.29.5; extra == "dev"
|
16
|
+
Requires-Dist: pyright>=1.1.401; extra == "dev"
|
18
17
|
Provides-Extra: conversion
|
19
18
|
Requires-Dist: youtube-transcript-api>=1.0.3; extra == "conversion"
|
20
19
|
Requires-Dist: chatterer[browser]; extra == "conversion"
|
@@ -34,12 +33,10 @@ Requires-Dist: mistune>=3.1.3; extra == "markdown"
|
|
34
33
|
Provides-Extra: video
|
35
34
|
Requires-Dist: pydub>=0.25.1; extra == "video"
|
36
35
|
Provides-Extra: langchain
|
37
|
-
Requires-Dist:
|
36
|
+
Requires-Dist: langchain-anthropic>=0.3.10; extra == "langchain"
|
37
|
+
Requires-Dist: langchain-google-genai>=2.1.1; extra == "langchain"
|
38
|
+
Requires-Dist: langchain-ollama>=0.3.0; extra == "langchain"
|
38
39
|
Requires-Dist: langchain-experimental>=0.3.4; extra == "langchain"
|
39
|
-
Provides-Extra: langchain-providers
|
40
|
-
Requires-Dist: langchain-anthropic>=0.3.10; extra == "langchain-providers"
|
41
|
-
Requires-Dist: langchain-google-genai>=2.1.1; extra == "langchain-providers"
|
42
|
-
Requires-Dist: langchain-ollama>=0.3.0; extra == "langchain-providers"
|
43
40
|
Provides-Extra: all
|
44
41
|
Requires-Dist: chatterer[dev]; extra == "all"
|
45
42
|
Requires-Dist: chatterer[langchain]; extra == "all"
|
@@ -0,0 +1,75 @@
|
|
1
|
+
from spargear import SubcommandArguments, SubcommandSpec
|
2
|
+
|
3
|
+
|
4
|
+
def any2md():
    # Deferred import: the `any2md` example module is only loaded when this
    # factory is actually called, not when the CLI module itself is imported.
    from .any2md import Arguments as Any2MdArguments

    return Any2MdArguments
|
8
|
+
|
9
|
+
|
10
|
+
def pdf2md():
    # Deferred import: the `pdf2md` example module is only loaded when this
    # factory is actually called, not when the CLI module itself is imported.
    from .pdf2md import Arguments as Pdf2MdArguments

    return Pdf2MdArguments
|
14
|
+
|
15
|
+
|
16
|
+
def pdf2txt():
    # Deferred import: the `pdf2txt` example module is only loaded when this
    # factory is actually called, not when the CLI module itself is imported.
    from .pdf2txt import Arguments as Pdf2TxtArguments

    return Pdf2TxtArguments
|
20
|
+
|
21
|
+
|
22
|
+
def ppt():
    # Deferred import: the `ppt` example module is only loaded when this
    # factory is actually called, not when the CLI module itself is imported.
    from .ppt import Arguments as PptArguments

    return PptArguments
|
26
|
+
|
27
|
+
|
28
|
+
def pw():
    # Deferred import: the `pw` example module is only loaded when this
    # factory is actually called, not when the CLI module itself is imported.
    from .pw import Arguments as PwArguments

    return PwArguments
|
32
|
+
|
33
|
+
|
34
|
+
def snippet():
    # Deferred import: the `snippet` example module is only loaded when this
    # factory is actually called, not when the CLI module itself is imported.
    from .snippet import Arguments as SnippetArguments

    return SnippetArguments
|
38
|
+
|
39
|
+
|
40
|
+
def transcribe():
    # Deferred import: the `transcribe` example module is only loaded when this
    # factory is actually called, not when the CLI module itself is imported.
    from .transcribe import Arguments as TranscribeArguments

    return TranscribeArguments
|
44
|
+
|
45
|
+
|
46
|
+
def upstage():
    # Deferred import: the `upstage` example module is only loaded when this
    # factory is actually called, not when the CLI module itself is imported.
    from .upstage import Arguments as UpstageArguments

    return UpstageArguments
|
50
|
+
|
51
|
+
|
52
|
+
def web2md():
    # Deferred import: the `web2md` example module is only loaded when this
    # factory is actually called, not when the CLI module itself is imported.
    from .web2md import Arguments as Web2MdArguments

    return Web2MdArguments
|
56
|
+
|
57
|
+
|
58
|
+
class Arguments(SubcommandArguments):
    # Top-level CLI: one SubcommandSpec per example tool. Each spec is given a
    # factory (the module-level function of the same name) instead of the
    # argument class itself, so the tool's module is imported lazily.
    #
    # NOTE: on the right-hand side of each assignment the name still refers to
    # the module-level factory function, because the class attribute of the
    # same name has not been bound yet at that point.
    any2md = SubcommandSpec(
        name="any2md",
        argument_class_factory=any2md,
    )
    pdf2md = SubcommandSpec(
        name="pdf2md",
        argument_class_factory=pdf2md,
    )
    pdf2txt = SubcommandSpec(
        name="pdf2txt",
        argument_class_factory=pdf2txt,
    )
    ppt = SubcommandSpec(
        name="ppt",
        argument_class_factory=ppt,
    )
    pw = SubcommandSpec(
        name="pw",
        argument_class_factory=pw,
    )
    snippet = SubcommandSpec(
        name="snippet",
        argument_class_factory=snippet,
    )
    transcribe = SubcommandSpec(
        name="transcribe",
        argument_class_factory=transcribe,
    )
    upstage = SubcommandSpec(
        name="upstage",
        argument_class_factory=upstage,
    )
    web2md = SubcommandSpec(
        name="web2md",
        argument_class_factory=web2md,
    )
|
68
|
+
|
69
|
+
|
70
|
+
def main():
    # Instantiate the top-level argument object, then hand control to its
    # execute() method.
    cli = Arguments()
    cli.execute()
|
72
|
+
|
73
|
+
|
74
|
+
if __name__ == "__main__":
|
75
|
+
main()
|
@@ -3,7 +3,7 @@ from pathlib import Path
|
|
3
3
|
from typing import Optional, TypedDict
|
4
4
|
|
5
5
|
import openai
|
6
|
-
from spargear import BaseArguments
|
6
|
+
from spargear import RunnableArguments
|
7
7
|
|
8
8
|
from chatterer import anything_to_markdown
|
9
9
|
|
@@ -16,10 +16,10 @@ class AnythingToMarkdownReturns(TypedDict):
|
|
16
16
|
out_text: str
|
17
17
|
|
18
18
|
|
19
|
-
class AnythingToMarkdownArguments(BaseArguments):
|
19
|
+
class Arguments(RunnableArguments[AnythingToMarkdownReturns]):
|
20
20
|
"""Command line arguments for converting various file types to markdown."""
|
21
21
|
|
22
|
-
|
22
|
+
SOURCE: str
|
23
23
|
"""Input file to convert to markdown. Can be a file path or a URL."""
|
24
24
|
output: Optional[str] = None
|
25
25
|
"""Output path for the converted markdown file. If not provided, the input file's suffix is replaced with .md"""
|
@@ -43,7 +43,7 @@ class AnythingToMarkdownArguments(BaseArguments):
|
|
43
43
|
def run(self) -> AnythingToMarkdownReturns:
|
44
44
|
if not self.prevent_save_file:
|
45
45
|
if not self.output:
|
46
|
-
output = Path(self.
|
46
|
+
output = Path(self.SOURCE).with_suffix(".md")
|
47
47
|
else:
|
48
48
|
output = Path(self.output)
|
49
49
|
else:
|
@@ -57,7 +57,7 @@ class AnythingToMarkdownArguments(BaseArguments):
|
|
57
57
|
llm_model = None
|
58
58
|
|
59
59
|
text: str = anything_to_markdown(
|
60
|
-
self.
|
60
|
+
self.SOURCE,
|
61
61
|
llm_client=llm_client,
|
62
62
|
llm_model=llm_model,
|
63
63
|
style_map=self.style_map,
|
@@ -67,18 +67,18 @@ class AnythingToMarkdownArguments(BaseArguments):
|
|
67
67
|
if output:
|
68
68
|
output.parent.mkdir(parents=True, exist_ok=True)
|
69
69
|
output.write_text(text, encoding=self.encoding)
|
70
|
-
logger.info(f"Converted `{self.
|
70
|
+
logger.info(f"Converted `{self.SOURCE}` to markdown and saved to `{output}`.")
|
71
71
|
else:
|
72
|
-
logger.info(f"Converted `{self.
|
72
|
+
logger.info(f"Converted `{self.SOURCE}` to markdown.")
|
73
73
|
return {
|
74
|
-
"input": self.
|
74
|
+
"input": self.SOURCE,
|
75
75
|
"output": str(output) if output is not None else None,
|
76
76
|
"out_text": text,
|
77
77
|
}
|
78
78
|
|
79
79
|
|
80
80
|
def main() -> None:
|
81
|
-
|
81
|
+
Arguments().run()
|
82
82
|
|
83
83
|
|
84
84
|
if __name__ == "__main__":
|
@@ -0,0 +1,338 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
PDF to Markdown Converter CLI
|
4
|
+
|
5
|
+
A command-line tool for converting PDF documents to Markdown using multimodal LLMs.
|
6
|
+
Supports both sequential and parallel processing modes with async capabilities.
|
7
|
+
"""
|
8
|
+
|
9
|
+
import asyncio
|
10
|
+
import logging
|
11
|
+
import sys
|
12
|
+
import time
|
13
|
+
from pathlib import Path
|
14
|
+
from typing import List, Literal, Optional, TypedDict
|
15
|
+
|
16
|
+
from spargear import ArgumentSpec, RunnableArguments
|
17
|
+
|
18
|
+
from chatterer import Chatterer
|
19
|
+
from chatterer.tools.convert_pdf_to_markdown import PdfToMarkdown
|
20
|
+
|
21
|
+
|
22
|
+
class ConversionResult(TypedDict, total=False):
    """Record describing the outcome of converting one PDF.

    Declared with ``total=False``: every key is optional, so failure records
    may omit the timing/size keys and success records omit ``error``.
    """

    # POSIX-style path of the source PDF.
    input: str
    # POSIX-style path of the written markdown file ("" when conversion failed).
    output: str
    # Generated markdown text ("" when conversion failed).
    result: str
    # Wall-clock seconds spent on this PDF (present on success only).
    processing_time: float
    # Length of the generated markdown (present on success only).
    characters: int
    # Stringified exception (present on failure only).
    error: str
|
31
|
+
|
32
|
+
|
33
|
+
# Setup enhanced logging
|
34
|
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", datefmt="%H:%M:%S")
|
35
|
+
logger = logging.getLogger(__name__)
|
36
|
+
|
37
|
+
|
38
|
+
class Arguments(RunnableArguments[List[ConversionResult]]):
    """Command-line arguments for PDF to Markdown conversion."""

    PDF_OR_DIRECTORY_PATH: str
    """Input PDF file or directory containing PDF files to convert to markdown."""

    output: Optional[str] = None
    """Output path. For a file, path to the output markdown file. For a directory, output directory for .md files."""

    page: Optional[str] = None
    """Zero-based page indices to convert (e.g., '0,2,4-8'). If None, converts all pages."""

    recursive: bool = False
    """If input is a directory, search for PDFs recursively."""

    mode: Literal["sequential", "parallel"] = "parallel"
    """Processing mode: 'sequential' for strict continuity, 'parallel' for faster processing."""

    sync: bool = False
    """Enable synchronous processing for sequential mode. If set to True, will run in sync mode."""

    max_concurrent: int = 10
    """Maximum number of concurrent LLM requests when using async mode."""

    image_zoom: float = 2.0
    """Zoom factor for rendering PDF pages as images (higher zoom = higher resolution)."""

    image_format: Literal["png", "jpg", "jpeg"] = "png"
    """Image format for PDF page rendering."""

    image_quality: int = 95
    """JPEG quality when using jpg/jpeg format (1-100)."""

    context_tail_lines: int = 10
    """Number of lines from previous page's markdown to use as context (sequential mode only)."""

    verbose: bool = False
    """Enable verbose logging output."""

    chatterer: ArgumentSpec[Chatterer] = ArgumentSpec(
        ["--chatterer"],
        default_factory=lambda: Chatterer.from_provider("google:gemini-2.5-flash-preview-05-20"),
        help="Chatterer instance configuration (e.g., 'google:gemini-2.5-flash-preview-05-20').",
        type=Chatterer.from_provider,
    )

    def __post_init__(self) -> None:
        """Validate and adjust arguments after initialization."""
        if self.verbose:
            logging.getLogger().setLevel(logging.DEBUG)

        # The async path does not take a `mode`; sequential mode is only
        # honoured when --sync is set, so normalise the value here.
        if not self.sync and self.mode == "sequential":
            logger.warning("Async mode is only available with parallel mode. Switching to parallel mode.")
            self.mode = "parallel"

        if self.max_concurrent < 1:
            logger.warning("max_concurrent must be >= 1. Setting to 1.")
            self.max_concurrent = 1
        elif self.max_concurrent > 10:
            logger.warning("max_concurrent > 10 may cause rate limiting. Consider reducing.")

    def run(self) -> List[ConversionResult]:
        """Execute the PDF to Markdown conversion.

        Returns:
            One ``ConversionResult`` per discovered PDF, in discovery order.
            Failed conversions carry an ``error`` key instead of raising.
        """
        if self.sync:
            return self._run_sync()
        return asyncio.run(self._run_async())

    def _build_converter(self) -> PdfToMarkdown:
        """Create the converter configured from the parsed arguments."""
        return PdfToMarkdown(
            chatterer=self.chatterer.unwrap(),
            image_zoom=self.image_zoom,
            image_format=self.image_format,
            image_jpg_quality=self.image_quality,
            context_tail_lines=self.context_tail_lines,
        )

    def _output_path_for(self, pdf: Path, output_base: Path, is_dir: bool) -> Path:
        """Return the markdown path for ``pdf``.

        For single-file input ``output_base`` already is the target file. For
        directory input the PDF's path relative to the input directory is
        mirrored under ``output_base``; flattening everything to
        ``<stem>.md`` would let same-named PDFs from different subfolders
        overwrite each other in recursive mode.
        """
        if not is_dir:
            return output_base

        input_root = Path(self.PDF_OR_DIRECTORY_PATH).resolve()
        try:
            relative = pdf.relative_to(input_root)
        except ValueError:
            # Defensive fallback: should not happen for paths found under input_root.
            relative = Path(pdf.name)
        return output_base / relative.with_name(f"{pdf.stem}.md")

    def _log_summary(self, headline: str, results: List[ConversionResult], total_elapsed: float) -> None:
        """Log aggregate statistics for a finished run."""
        total_files = len(results)
        total_chars = sum(len(r.get("result", "")) for r in results)
        successful_conversions = sum(1 for r in results if "error" not in r)
        # Guard both ratios: a zero elapsed time (coarse clocks) or an empty
        # result list must not turn the summary into a ZeroDivisionError.
        success_rate = (successful_conversions / total_files * 100) if total_files else 0.0
        average_speed = (total_chars / total_elapsed) if total_elapsed > 0 else 0.0

        logger.info(headline)
        logger.info(f" 📊 Total time: {total_elapsed:.1f}s")
        logger.info(f" 📈 Success rate: {successful_conversions}/{total_files} ({success_rate:.1f}%)")
        logger.info(f" 📝 Total output: {total_chars:,} characters")
        logger.info(f" ⚡ Average speed: {average_speed:.0f} chars/s")

    def _run_sync(self) -> List[ConversionResult]:
        """Execute synchronous conversion, one PDF after another."""
        pdf_files, output_base, is_dir = self._prepare_files()
        converter = self._build_converter()

        results: List[ConversionResult] = []
        total_start_time = time.perf_counter()

        logger.info(f"🚀 Starting {self.mode} conversion of {len(pdf_files)} PDF(s)...")

        # Progress callback for individual PDF (does not depend on the loop
        # variable, so it is defined once).
        def progress_callback(current: int, total: int) -> None:
            progress = (current / total) * 100 if total else 0.0
            logger.info(f" └─ Progress: {current}/{total} pages ({progress:.1f}%)")

        for i, pdf in enumerate(pdf_files, 1):
            output_path = self._output_path_for(pdf, output_base, is_dir)

            logger.info(f"📄 Processing {i}/{len(pdf_files)}: {pdf.name}")
            start_time = time.perf_counter()

            try:
                markdown = converter.convert(
                    pdf_input=str(pdf),
                    page_indices=self.page,
                    mode=self.mode,
                    progress_callback=progress_callback,
                )

                # Save result
                output_path.parent.mkdir(parents=True, exist_ok=True)
                output_path.write_text(markdown, encoding="utf-8")

                elapsed = time.perf_counter() - start_time
                chars_per_sec = len(markdown) / elapsed if elapsed > 0 else 0

                logger.info(f" ✅ Completed in {elapsed:.1f}s ({chars_per_sec:.0f} chars/s)")
                logger.info(f" 📝 Generated {len(markdown):,} characters → {output_path}")

                results.append({
                    "input": pdf.as_posix(),
                    "output": output_path.as_posix(),
                    "result": markdown,
                    "processing_time": elapsed,
                    "characters": len(markdown),
                })

            except Exception as e:
                # Best effort: record the failure and continue with the next PDF.
                logger.error(f" ❌ Failed to process {pdf.name}: {e}")
                results.append({
                    "input": pdf.as_posix(),
                    "output": "",
                    "result": "",
                    "error": str(e),
                })

        total_elapsed = time.perf_counter() - total_start_time
        self._log_summary("🎉 Conversion complete!", results, total_elapsed)

        return results

    async def _run_async(self) -> List[ConversionResult]:
        """Execute asynchronous conversion with parallel processing."""
        pdf_files, output_base, is_dir = self._prepare_files()
        converter = self._build_converter()

        total_start_time = time.perf_counter()

        logger.info(f"🚀 Starting ASYNC parallel conversion of {len(pdf_files)} PDF(s)...")
        logger.info(f"⚡ Max concurrent: {self.max_concurrent} LLM requests")

        # Process PDFs concurrently.
        # NOTE(review): this semaphore caps the number of PDFs in flight, and
        # `max_concurrent` is passed to aconvert() again as a per-PDF limit, so
        # the total number of simultaneous LLM requests may reach
        # max_concurrent ** 2 — confirm against aconvert()'s implementation.
        semaphore = asyncio.Semaphore(self.max_concurrent)

        async def process_pdf(pdf: Path, index: int) -> ConversionResult:
            async with semaphore:
                output_path = self._output_path_for(pdf, output_base, is_dir)

                logger.info(f"📄 Processing {index}/{len(pdf_files)}: {pdf.name}")
                start_time = time.perf_counter()

                # Progress callback for individual PDF
                def progress_callback(current: int, total: int) -> None:
                    progress = (current / total) * 100 if total else 0.0
                    logger.info(f" └─ {pdf.name}: {current}/{total} pages ({progress:.1f}%)")

                try:
                    markdown = await converter.aconvert(
                        pdf_input=str(pdf),
                        page_indices=self.page,
                        progress_callback=progress_callback,
                        max_concurrent=self.max_concurrent,  # Limit per-PDF concurrency
                    )

                    # Save result
                    output_path.parent.mkdir(parents=True, exist_ok=True)
                    output_path.write_text(markdown, encoding="utf-8")

                    elapsed = time.perf_counter() - start_time
                    chars_per_sec = len(markdown) / elapsed if elapsed > 0 else 0

                    logger.info(f" ✅ {pdf.name} completed in {elapsed:.1f}s ({chars_per_sec:.0f} chars/s)")
                    logger.info(f" 📝 Generated {len(markdown):,} characters → {output_path}")

                    return {
                        "input": pdf.as_posix(),
                        "output": output_path.as_posix(),
                        "result": markdown,
                        "processing_time": elapsed,
                        "characters": len(markdown),
                    }

                except Exception as e:
                    logger.error(f" ❌ Failed to process {pdf.name}: {e}")
                    return {
                        "input": pdf.as_posix(),
                        "output": "",
                        "result": "",
                        "error": str(e),
                    }

        # Execute all PDF processing tasks; gather() keeps input order, which
        # lets each outcome be paired with its PDF below.
        tasks = [process_pdf(pdf, i) for i, pdf in enumerate(pdf_files, 1)]
        raw_results = await asyncio.gather(*tasks, return_exceptions=True)

        # Handle exceptions in results. With return_exceptions=True gather()
        # may hand back any BaseException (e.g. CancelledError), not only
        # Exception subclasses, so test for the base class.
        final_results: List[ConversionResult] = []
        for pdf, outcome in zip(pdf_files, raw_results):
            if isinstance(outcome, BaseException):
                logger.error(f"Task failed with exception: {outcome}")
                final_results.append(
                    ConversionResult(input=pdf.as_posix(), output="", result="", error=str(outcome))
                )
            else:
                final_results.append(outcome)

        total_elapsed = time.perf_counter() - total_start_time
        # The former "Speedup" line was dropped: it divided the file count by
        # elapsed minutes, which is not a measured comparison with a
        # sequential run.
        self._log_summary("🎉 ASYNC conversion complete!", final_results, total_elapsed)

        return final_results

    def _prepare_files(self) -> tuple[List[Path], Path, bool]:
        """Prepare input and output file paths.

        Returns:
            ``(pdf_files, output_base, is_dir)`` where ``output_base`` is the
            target markdown file for single-file input, or the target
            directory when the input is a directory (``is_dir`` is True).

        Exits the process (``sys.exit``) when the input is missing, is not a
        PDF, or the directory contains no PDFs.
        """
        input_path = Path(self.PDF_OR_DIRECTORY_PATH).resolve()
        pdf_files: List[Path] = []
        is_dir = False

        # Determine input files
        if input_path.is_file():
            if input_path.suffix.lower() != ".pdf":
                logger.error(f"❌ Input file must be a PDF: {input_path}")
                sys.exit(1)
            pdf_files.append(input_path)
        elif input_path.is_dir():
            is_dir = True
            # Match the extension case-insensitively, consistent with the
            # single-file check above ("*.pdf" would skip "REPORT.PDF" on
            # case-sensitive filesystems).
            candidates = input_path.rglob("*") if self.recursive else input_path.iterdir()
            pdf_files = sorted(f for f in candidates if f.name.lower().endswith(".pdf") and f.is_file())
            if not pdf_files:
                logger.warning(f"⚠️ No PDF files found in {input_path}")
                sys.exit(0)
        else:
            logger.error(f"❌ Input path does not exist: {input_path}")
            sys.exit(1)

        # Determine output path
        if self.output:
            output_base = Path(self.output).resolve()
        elif is_dir:
            output_base = input_path
        else:
            output_base = input_path.with_suffix(".md")

        # Create output directories
        if is_dir:
            output_base.mkdir(parents=True, exist_ok=True)
        else:
            output_base.parent.mkdir(parents=True, exist_ok=True)

        logger.info(f"📂 Input: {input_path}")
        logger.info(f"📁 Output: {output_base}")
        logger.info(f"📄 Found {len(pdf_files)} PDF file(s)")

        return pdf_files, output_base, is_dir
|
317
|
+
|
318
|
+
|
319
|
+
def main() -> None:
    """Main entry point for the CLI application.

    Exit status: 130 when interrupted by the user, 1 on an unexpected error
    or when at least one PDF failed to convert, and a normal return otherwise.
    """
    args = None
    try:
        args = Arguments()
        results = args.run()
    except KeyboardInterrupt:
        logger.info("🛑 Conversion interrupted by user")
        sys.exit(130)
    except Exception as e:
        logger.error(f"❌ Unexpected error: {e}")
        # `args` stays None when argument parsing itself failed; compare
        # against None explicitly instead of relying on the object's truthiness.
        if args is not None and getattr(args, "verbose", False):
            import traceback

            traceback.print_exc()
        sys.exit(1)

    # Per-file failures are recorded in the results rather than raised; surface
    # them through the exit status so calling scripts can detect them.
    if any("error" in result for result in results):
        sys.exit(1)
|
335
|
+
|
336
|
+
|
337
|
+
if __name__ == "__main__":
|
338
|
+
main()
|
chatterer-0.1.22/chatterer/examples/pdf_to_text.py → chatterer-0.1.24/chatterer/examples/pdf2txt.py
RENAMED
@@ -3,15 +3,15 @@ import sys
|
|
3
3
|
from pathlib import Path
|
4
4
|
from typing import Optional
|
5
5
|
|
6
|
-
from spargear import BaseArguments
|
6
|
+
from spargear import RunnableArguments
|
7
7
|
|
8
8
|
from chatterer.tools.convert_to_text import pdf_to_text
|
9
9
|
|
10
10
|
logger = logging.getLogger(__name__)
|
11
11
|
|
12
12
|
|
13
|
-
class PdfToTextArgs(BaseArguments):
|
14
|
-
|
13
|
+
class Arguments(RunnableArguments[None]):
|
14
|
+
PDF_PATH: Path
|
15
15
|
"""Path to the PDF file to convert to text."""
|
16
16
|
output: Optional[Path]
|
17
17
|
"""Path to the output text file. If not provided, defaults to the input file with a .txt suffix."""
|
@@ -19,7 +19,7 @@ class PdfToTextArgs(BaseArguments):
|
|
19
19
|
"""Comma-separated list of zero-based page indices to extract from the PDF. Supports ranges, e.g., '0,2,4-8'."""
|
20
20
|
|
21
21
|
def run(self) -> None:
|
22
|
-
input = self.
|
22
|
+
input = self.PDF_PATH.resolve()
|
23
23
|
out = self.output or input.with_suffix(".txt")
|
24
24
|
if not input.is_file():
|
25
25
|
sys.exit(1)
|
@@ -47,7 +47,7 @@ def parse_page_indices(pages_str: str) -> list[int]:
|
|
47
47
|
|
48
48
|
|
49
49
|
def main() -> None:
|
50
|
-
|
50
|
+
Arguments().run()
|
51
51
|
|
52
52
|
|
53
53
|
if __name__ == "__main__":
|
chatterer-0.1.22/chatterer/examples/make_ppt.py → chatterer-0.1.24/chatterer/examples/ppt.py
RENAMED
@@ -3,7 +3,7 @@ import sys
|
|
3
3
|
from pathlib import Path
|
4
4
|
from typing import NotRequired, TypedDict
|
5
5
|
|
6
|
-
from spargear import BaseArguments
|
6
|
+
from spargear import RunnableArguments
|
7
7
|
|
8
8
|
from chatterer import BaseMessage, Chatterer, HumanMessage, SystemMessage
|
9
9
|
|
@@ -155,7 +155,7 @@ Now, generate the final `presentation.html` file using impress.js and the provid
|
|
155
155
|
# --- Argument Parsing ---
|
156
156
|
|
157
157
|
|
158
|
-
class MakePptArguments(BaseArguments):
|
158
|
+
class Arguments(RunnableArguments[None]):
|
159
159
|
"""
|
160
160
|
Arguments for the presentation generation process.
|
161
161
|
"""
|
@@ -179,9 +179,7 @@ class MakePptArguments(BaseArguments):
|
|
179
179
|
"""Prompt for organizing slides into a presentation script"""
|
180
180
|
|
181
181
|
# LLM Settings
|
182
|
-
provider: str =
|
183
|
-
"openai:gpt-4.1" # Example: "openai:gpt-4o", "anthropic:claude-3-haiku-20240307", "google:gemini-1.5-flash"
|
184
|
-
)
|
182
|
+
provider: str = "openai:gpt-4.1" # Example: "openai:gpt-4o", "anthropic:claude-3-haiku-20240307", "google:gemini-1.5-flash"
|
185
183
|
"""Name of the language model to use (provider:model_name)"""
|
186
184
|
|
187
185
|
# Other settings
|
@@ -293,7 +291,7 @@ class GeneratedSlide(TypedDict):
|
|
293
291
|
script: NotRequired[str]
|
294
292
|
|
295
293
|
|
296
|
-
def run_presentation_agent(args: MakePptArguments):
|
294
|
+
def run_presentation_agent(args: Arguments):
|
297
295
|
"""Executes the presentation generation agent loop."""
|
298
296
|
|
299
297
|
if args.verbose:
|
@@ -481,7 +479,7 @@ Remember to follow all instructions in the role prompt, especially regarding HTM
|
|
481
479
|
|
482
480
|
|
483
481
|
def main() -> None:
|
484
|
-
|
482
|
+
Arguments().run()
|
485
483
|
|
486
484
|
|
487
485
|
if __name__ == "__main__":
|