markitai-0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. markitai/__init__.py +3 -0
  2. markitai/batch.py +1316 -0
  3. markitai/cli.py +3979 -0
  4. markitai/config.py +602 -0
  5. markitai/config.schema.json +748 -0
  6. markitai/constants.py +222 -0
  7. markitai/converter/__init__.py +49 -0
  8. markitai/converter/_patches.py +98 -0
  9. markitai/converter/base.py +164 -0
  10. markitai/converter/image.py +181 -0
  11. markitai/converter/legacy.py +606 -0
  12. markitai/converter/office.py +526 -0
  13. markitai/converter/pdf.py +679 -0
  14. markitai/converter/text.py +63 -0
  15. markitai/fetch.py +1725 -0
  16. markitai/image.py +1335 -0
  17. markitai/json_order.py +550 -0
  18. markitai/llm.py +4339 -0
  19. markitai/ocr.py +347 -0
  20. markitai/prompts/__init__.py +159 -0
  21. markitai/prompts/cleaner.md +93 -0
  22. markitai/prompts/document_enhance.md +77 -0
  23. markitai/prompts/document_enhance_complete.md +65 -0
  24. markitai/prompts/document_process.md +60 -0
  25. markitai/prompts/frontmatter.md +28 -0
  26. markitai/prompts/image_analysis.md +21 -0
  27. markitai/prompts/image_caption.md +8 -0
  28. markitai/prompts/image_description.md +13 -0
  29. markitai/prompts/page_content.md +17 -0
  30. markitai/prompts/url_enhance.md +78 -0
  31. markitai/security.py +286 -0
  32. markitai/types.py +30 -0
  33. markitai/urls.py +187 -0
  34. markitai/utils/__init__.py +33 -0
  35. markitai/utils/executor.py +69 -0
  36. markitai/utils/mime.py +85 -0
  37. markitai/utils/office.py +262 -0
  38. markitai/utils/output.py +53 -0
  39. markitai/utils/paths.py +81 -0
  40. markitai/utils/text.py +359 -0
  41. markitai/workflow/__init__.py +37 -0
  42. markitai/workflow/core.py +760 -0
  43. markitai/workflow/helpers.py +509 -0
  44. markitai/workflow/single.py +369 -0
  45. markitai-0.3.0.dist-info/METADATA +159 -0
  46. markitai-0.3.0.dist-info/RECORD +48 -0
  47. markitai-0.3.0.dist-info/WHEEL +4 -0
  48. markitai-0.3.0.dist-info/entry_points.txt +2 -0
markitai/llm.py ADDED
@@ -0,0 +1,4339 @@
1
+ """LLM integration module using LiteLLM Router."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import base64
7
+ import copy
8
+ import hashlib
9
+ import json
10
+ import threading
11
+ import time
12
+ from collections import defaultdict
13
+ from dataclasses import dataclass, field
14
+ from pathlib import Path
15
+ from typing import TYPE_CHECKING, Any, cast
16
+
17
+ import instructor
18
+ import litellm
19
+ from litellm import completion_cost
20
+ from litellm.exceptions import (
21
+ APIConnectionError,
22
+ RateLimitError,
23
+ ServiceUnavailableError,
24
+ Timeout,
25
+ )
26
+ from litellm.integrations.custom_logger import CustomLogger
27
+ from litellm.router import Router
28
+ from litellm.types.llms.openai import AllMessageValues
29
+ from litellm.types.utils import Choices
30
+ from loguru import logger
31
+ from openai.types.chat import ChatCompletionMessageParam
32
+ from pydantic import BaseModel, Field
33
+
34
+ if TYPE_CHECKING:
35
+ from markitai.config import LLMConfig, PromptsConfig
36
+ from markitai.types import LLMUsageByModel, ModelUsageStats
37
+
38
+
39
+ from markitai.constants import (
40
+ DEFAULT_CACHE_DB_FILENAME,
41
+ DEFAULT_CACHE_MAXSIZE,
42
+ DEFAULT_CACHE_SIZE_LIMIT,
43
+ DEFAULT_CACHE_TTL_SECONDS,
44
+ DEFAULT_GLOBAL_CACHE_DIR,
45
+ DEFAULT_INSTRUCTOR_MAX_RETRIES,
46
+ DEFAULT_IO_CONCURRENCY,
47
+ DEFAULT_MAX_CONTENT_CHARS,
48
+ DEFAULT_MAX_IMAGES_PER_BATCH,
49
+ DEFAULT_MAX_OUTPUT_TOKENS,
50
+ DEFAULT_MAX_PAGES_PER_BATCH,
51
+ DEFAULT_MAX_RETRIES,
52
+ DEFAULT_PROJECT_CACHE_DIR,
53
+ DEFAULT_RETRY_BASE_DELAY,
54
+ DEFAULT_RETRY_MAX_DELAY,
55
+ )
56
+ from markitai.prompts import PromptManager
57
+ from markitai.utils.mime import get_mime_type, is_llm_supported_image
58
+ from markitai.workflow.helpers import detect_language, get_language_name
59
+
60
+ # Retryable exceptions (kept here as they depend on litellm types)
61
+ RETRYABLE_ERRORS = (
62
+ RateLimitError,
63
+ APIConnectionError,
64
+ Timeout,
65
+ ServiceUnavailableError,
66
+ )
67
+
68
+
69
+ # Cache for model info to avoid repeated litellm queries
70
+ _model_info_cache: dict[str, dict[str, Any]] = {}
71
+
72
+
73
+ def get_model_info_cached(model: str) -> dict[str, Any]:
74
+ """Get model info from litellm with caching.
75
+
76
+ Args:
77
+ model: Model identifier (e.g., "deepseek/deepseek-chat", "gemini/gemini-2.5-flash")
78
+
79
+ Returns:
80
+ Dict with keys:
81
+ - max_input_tokens: int (context window size)
82
+ - max_output_tokens: int (max output tokens)
83
+ - supports_vision: bool (whether model supports images)
84
+ Returns defaults if litellm info unavailable.
85
+ """
86
+ if model in _model_info_cache:
87
+ return _model_info_cache[model]
88
+
89
+ # Defaults
90
+ result = {
91
+ "max_input_tokens": 128000, # Conservative default
92
+ "max_output_tokens": DEFAULT_MAX_OUTPUT_TOKENS,
93
+ "supports_vision": False,
94
+ }
95
+
96
+ try:
97
+ info = litellm.get_model_info(model)
98
+ if info.get("max_input_tokens"):
99
+ result["max_input_tokens"] = info["max_input_tokens"]
100
+ if info.get("max_output_tokens"):
101
+ result["max_output_tokens"] = info["max_output_tokens"]
102
+ supports_vision = info.get("supports_vision")
103
+ if supports_vision is not None:
104
+ result["supports_vision"] = bool(supports_vision)
105
+ except Exception:
106
+ logger.debug(f"[ModelInfo] Could not get info for {model}, using defaults")
107
+
108
+ _model_info_cache[model] = result
109
+ return result
110
+
111
+
112
+ def get_model_max_output_tokens(model: str) -> int:
113
+ """Get max_output_tokens for a model using litellm.get_model_info().
114
+
115
+ Args:
116
+ model: Model identifier (e.g., "deepseek/deepseek-chat", "gemini/gemini-2.5-flash")
117
+
118
+ Returns:
119
+ max_output_tokens value, or DEFAULT_MAX_OUTPUT_TOKENS if unavailable
120
+ """
121
+ return get_model_info_cached(model)["max_output_tokens"]
122
+
123
+
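For reference, `get_model_info_cached()` memoizes `litellm.get_model_info()` in the module-level `_model_info_cache` and falls back to the conservative defaults above when litellm has no entry. A minimal sketch of the resulting behavior (illustrative only, not lines from the wheel; the unknown model id is a placeholder):

    info = get_model_info_cached("gemini/gemini-2.5-flash")
    again = get_model_info_cached("gemini/gemini-2.5-flash")
    assert info is again                           # second lookup served from the dict
    info["max_input_tokens"]                       # litellm value, or 128000 default
    get_model_max_output_tokens("unknown/model")   # DEFAULT_MAX_OUTPUT_TOKENS fallback
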
124
+ def _context_display_name(context: str) -> str:
125
+ """Extract display name from context for logging.
126
+
127
+ Converts full paths to filenames while preserving suffixes like ':images'.
128
+ Examples:
129
+ 'C:/path/to/file.pdf:images' -> 'file.pdf:images'
130
+ 'file.pdf' -> 'file.pdf'
131
+ '' -> ''
132
+ """
133
+ if not context:
134
+ return context
135
+ # Split context into path part and suffix (e.g., ':images')
136
+ if (
137
+ ":" in context and context[1:3] != ":\\"
138
+ ): # Avoid splitting Windows drive letters
139
+ # Find the last colon that's not part of a Windows path
140
+ parts = context.rsplit(":", 1)
141
+ if len(parts) == 2 and not parts[1].startswith("\\"):
142
+ path_part, suffix = parts
143
+ return f"{Path(path_part).name}:{suffix}"
144
+ return Path(context).name
145
+
146
+
147
+ class MarkitaiLLMLogger(CustomLogger):
148
+ """Custom LiteLLM callback logger for capturing additional call details."""
149
+
150
+ def __init__(self) -> None:
151
+ self.last_call_details: dict[str, Any] = {}
152
+
153
+ def log_success_event(
154
+ self,
155
+ kwargs: dict,
156
+ response_obj: Any,
157
+ start_time: Any,
158
+ end_time: Any,
159
+ ) -> None:
160
+ """Capture details from successful LLM calls."""
161
+ slo = kwargs.get("standard_logging_object", {})
162
+ self.last_call_details = {
163
+ "api_base": slo.get("api_base"),
164
+ "response_time": slo.get("response_time"),
165
+ "cache_hit": kwargs.get("cache_hit", False),
166
+ "model_id": slo.get("model_id"),
167
+ }
168
+
169
+ async def async_log_success_event(
170
+ self,
171
+ kwargs: dict,
172
+ response_obj: Any,
173
+ start_time: Any,
174
+ end_time: Any,
175
+ ) -> None:
176
+ """Async version of success event logging."""
177
+ self.log_success_event(kwargs, response_obj, start_time, end_time)
178
+
179
+ def log_failure_event(
180
+ self,
181
+ kwargs: dict,
182
+ response_obj: Any,
183
+ start_time: Any,
184
+ end_time: Any,
185
+ ) -> None:
186
+ """Capture details from failed LLM calls."""
187
+ slo = kwargs.get("standard_logging_object", {})
188
+ self.last_call_details = {
189
+ "api_base": slo.get("api_base"),
190
+ "error_code": slo.get("error_code"),
191
+ "error_class": slo.get("error_class"),
192
+ }
193
+
194
+ async def async_log_failure_event(
195
+ self,
196
+ kwargs: dict,
197
+ response_obj: Any,
198
+ start_time: Any,
199
+ end_time: Any,
200
+ ) -> None:
201
+ """Async version of failure event logging."""
202
+ self.log_failure_event(kwargs, response_obj, start_time, end_time)
203
+
204
+
205
+ # Global callback instance
206
+ _markitai_llm_logger = MarkitaiLLMLogger()
207
+
208
+
209
+ @dataclass
210
+ class LLMRuntime:
211
+ """Global LLM runtime with shared concurrency control.
212
+
213
+ This allows multiple LLMProcessor instances to share semaphores
214
+ for rate limiting across the entire application.
215
+
216
+ Supports separate concurrency limits for:
217
+ - LLM API calls (rate-limited by provider)
218
+ - I/O operations (disk reads, can be higher)
219
+
220
+ Usage:
221
+ runtime = LLMRuntime(concurrency=10, io_concurrency=20)
222
+ processor1 = LLMProcessor(config, runtime=runtime)
223
+ processor2 = LLMProcessor(config, runtime=runtime)
224
+ # Both processors share the same semaphores
225
+ """
226
+
227
+ concurrency: int
228
+ io_concurrency: int = DEFAULT_IO_CONCURRENCY
229
+ _semaphore: asyncio.Semaphore | None = field(default=None, init=False, repr=False)
230
+ _io_semaphore: asyncio.Semaphore | None = field(
231
+ default=None, init=False, repr=False
232
+ )
233
+
234
+ @property
235
+ def semaphore(self) -> asyncio.Semaphore:
236
+ """Get or create the shared LLM concurrency semaphore."""
237
+ if self._semaphore is None:
238
+ self._semaphore = asyncio.Semaphore(self.concurrency)
239
+ return self._semaphore
240
+
241
+ @property
242
+ def io_semaphore(self) -> asyncio.Semaphore:
243
+ """Get or create the shared I/O concurrency semaphore."""
244
+ if self._io_semaphore is None:
245
+ self._io_semaphore = asyncio.Semaphore(self.io_concurrency)
246
+ return self._io_semaphore
247
+
248
+
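Because both semaphores are created lazily and cached on the instance, every processor that shares a runtime awaits the same `asyncio.Semaphore` objects. A hedged sketch of the intended pattern (the coroutine names are placeholders):

    runtime = LLMRuntime(concurrency=4, io_concurrency=16)

    async def bounded_llm_call() -> None:
        async with runtime.semaphore:       # at most 4 LLM calls in flight overall
            ...

    async def bounded_file_read() -> None:
        async with runtime.io_semaphore:    # disk I/O gets the higher limit (16)
            ...
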
249
+ @dataclass
250
+ class LLMResponse:
251
+ """Response from LLM call."""
252
+
253
+ content: str
254
+ model: str
255
+ input_tokens: int
256
+ output_tokens: int
257
+ cost_usd: float
258
+
259
+
260
+ @dataclass
261
+ class ImageAnalysis:
262
+ """Result of image analysis.
263
+
264
+ Attributes:
265
+ caption: Short alt text for accessibility
266
+ description: Detailed markdown description
267
+ extracted_text: Text extracted from image (OCR)
268
+ llm_usage: LLM usage statistics in format:
269
+ {"<model-name>": {"requests": N, "input_tokens": N,
270
+ "output_tokens": N, "cost_usd": N}}
271
+ """
272
+
273
+ caption: str # Short alt text
274
+ description: str # Detailed description
275
+ extracted_text: str | None = None # Text extracted from image
276
+ llm_usage: LLMUsageByModel | None = None # LLM usage stats
277
+
278
+
279
+ class ImageAnalysisResult(BaseModel):
280
+ """Pydantic model for structured image analysis output."""
281
+
282
+ caption: str = Field(description="Short alt text for the image (10-30 characters)")
283
+ description: str = Field(description="Detailed markdown description of the image")
284
+ extracted_text: str | None = Field(
285
+ default=None,
286
+ description="Text extracted from the image, preserving original layout",
287
+ )
288
+
289
+
290
+ class SingleImageResult(BaseModel):
291
+ """Result for a single image in batch analysis."""
292
+
293
+ image_index: int = Field(description="Index of the image (1-based)")
294
+ caption: str = Field(description="Short alt text for the image (10-30 characters)")
295
+ description: str = Field(description="Detailed markdown description of the image")
296
+ extracted_text: str | None = Field(
297
+ default=None,
298
+ description="Text extracted from the image, preserving original layout",
299
+ )
300
+
301
+
302
+ class BatchImageAnalysisResult(BaseModel):
303
+ """Result for batch image analysis."""
304
+
305
+ images: list[SingleImageResult] = Field(
306
+ description="Analysis results for each image"
307
+ )
308
+
309
+
310
+ class Frontmatter(BaseModel):
311
+ """Pydantic model for document frontmatter."""
312
+
313
+ title: str = Field(description="Document title extracted from content")
314
+ description: str = Field(
315
+ description="Brief summary of the document (100 chars max)"
316
+ )
317
+ tags: list[str] = Field(description="Related tags (3-5 items)")
318
+
319
+
320
+ class DocumentProcessResult(BaseModel):
321
+ """Pydantic model for combined cleaner + frontmatter output."""
322
+
323
+ cleaned_markdown: str = Field(description="Cleaned and formatted markdown content")
324
+ frontmatter: Frontmatter = Field(description="Document metadata")
325
+
326
+
327
+ class EnhancedDocumentResult(BaseModel):
328
+ """Pydantic model for complete document enhancement output (Vision+LLM combined)."""
329
+
330
+ cleaned_markdown: str = Field(description="Enhanced and cleaned markdown content")
331
+ frontmatter: Frontmatter = Field(description="Document metadata")
332
+
333
+
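The Pydantic models above describe the structured outputs the processor asks the LLM for, presumably through the `instructor` integration imported at the top of the module. A hand-written instance, only to show the expected shape (all values are made up):

    fm = Frontmatter(
        title="Quarterly Report",
        description="Summary of Q3 revenue, costs, and outlook",  # under 100 chars
        tags=["finance", "quarterly", "report"],
    )
    result = DocumentProcessResult(
        cleaned_markdown="# Quarterly Report\n...",
        frontmatter=fm,
    )
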
334
+ class SQLiteCache:
335
+ """SQLite-based persistent LRU cache with size limit.
336
+
337
+ Thread-safe via SQLite's built-in locking mechanism.
338
+ Uses WAL mode for better concurrent read performance.
339
+ """
340
+
341
+ def __init__(
342
+ self,
343
+ db_path: Path,
344
+ max_size_bytes: int = DEFAULT_CACHE_SIZE_LIMIT,
345
+ ) -> None:
346
+ """Initialize SQLite cache.
347
+
348
+ Args:
349
+ db_path: Path to the SQLite database file
350
+ max_size_bytes: Maximum total cache size in bytes (default 1GB)
351
+ """
352
+ import hashlib
353
+
354
+ self._db_path = Path(db_path)
355
+ self._max_size_bytes = max_size_bytes
356
+ self._hashlib = hashlib
357
+
358
+ # Ensure parent directory exists
359
+ self._db_path.parent.mkdir(parents=True, exist_ok=True)
360
+
361
+ # Initialize database
362
+ self._init_db()
363
+
364
+ def _get_connection(self) -> Any:
365
+ """Get a new database connection (thread-local)."""
366
+ import sqlite3
367
+
368
+ conn = sqlite3.connect(str(self._db_path), timeout=30.0)
369
+ conn.execute("PRAGMA journal_mode=WAL")
370
+ conn.execute("PRAGMA synchronous=NORMAL")
371
+ conn.row_factory = sqlite3.Row
372
+ return conn
373
+
374
+ def _init_db(self) -> None:
375
+ """Initialize database schema."""
376
+ with self._get_connection() as conn:
377
+ conn.execute("""
378
+ CREATE TABLE IF NOT EXISTS cache (
379
+ key TEXT PRIMARY KEY,
380
+ value TEXT NOT NULL,
381
+ model TEXT DEFAULT '',
382
+ created_at INTEGER NOT NULL,
383
+ accessed_at INTEGER NOT NULL,
384
+ size_bytes INTEGER NOT NULL
385
+ )
386
+ """)
387
+ conn.execute(
388
+ "CREATE INDEX IF NOT EXISTS idx_accessed ON cache(accessed_at)"
389
+ )
390
+ conn.execute("CREATE INDEX IF NOT EXISTS idx_created ON cache(created_at)")
391
+ conn.commit()
392
+
393
+ def _compute_hash(self, prompt: str, content: str) -> str:
394
+ """Compute hash key from prompt and content.
395
+
396
+ Uses head + tail + length strategy to detect changes anywhere in content:
397
+ - Head: first 25000 chars (catches changes at the beginning)
398
+ - Tail: last 25000 chars (catches changes at the end)
399
+ - Length: total content length (catches changes that alter length)
400
+
401
+ This avoids the problem where only using head truncation would miss
402
+ changes at the end of large documents.
403
+ """
404
+ length = len(content)
405
+ head = content[:25000]
406
+ tail = content[-25000:] if length > 25000 else ""
407
+ combined = f"{prompt}|{length}|{head}|{tail}"
408
+ return self._hashlib.sha256(combined.encode()).hexdigest()[:32]
409
+
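Concretely, the key mixes the prompt, the total length, and the two 25,000-character windows, so an edit near either end of a large document, or any edit that changes its length, yields a new key. A worked sketch for an 80,000-character document (the prompt string is a placeholder):

    import hashlib

    doc = "x" * 80_000
    combined = f"clean|{len(doc)}|{doc[:25000]}|{doc[-25000:]}"
    key = hashlib.sha256(combined.encode()).hexdigest()[:32]
    # equal to SQLiteCache._compute_hash("clean", doc)
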
410
+ def get(self, prompt: str, content: str) -> str | None:
411
+ """Get cached value if exists, update accessed_at for LRU.
412
+
413
+ Args:
414
+ prompt: Prompt template used
415
+ content: Content being processed
416
+
417
+ Returns:
418
+ Cached JSON string or None if not found
419
+ """
420
+ key = self._compute_hash(prompt, content)
421
+ now = int(time.time())
422
+
423
+ with self._get_connection() as conn:
424
+ row = conn.execute(
425
+ "SELECT value FROM cache WHERE key = ?", (key,)
426
+ ).fetchone()
427
+
428
+ if row:
429
+ # Update accessed_at for LRU tracking
430
+ conn.execute(
431
+ "UPDATE cache SET accessed_at = ? WHERE key = ?", (now, key)
432
+ )
433
+ conn.commit()
434
+ return row["value"]
435
+
436
+ return None
437
+
438
+ def set(self, prompt: str, content: str, value: str, model: str = "") -> None:
439
+ """Set cache value, evict LRU entries if size exceeded.
440
+
441
+ Args:
442
+ prompt: Prompt template used
443
+ content: Content being processed
444
+ value: JSON string to cache
445
+ model: Model identifier (for potential invalidation)
446
+ """
447
+ key = self._compute_hash(prompt, content)
448
+ now = int(time.time())
449
+ size_bytes = len(value.encode("utf-8"))
450
+
451
+ with self._get_connection() as conn:
452
+ # Check current total size
453
+ total_size = conn.execute(
454
+ "SELECT COALESCE(SUM(size_bytes), 0) as total FROM cache"
455
+ ).fetchone()["total"]
456
+
457
+ # Evict LRU entries if needed
458
+ while total_size + size_bytes > self._max_size_bytes:
459
+ oldest = conn.execute(
460
+ "SELECT key, size_bytes FROM cache ORDER BY accessed_at ASC LIMIT 1"
461
+ ).fetchone()
462
+
463
+ if oldest is None:
464
+ break
465
+
466
+ conn.execute("DELETE FROM cache WHERE key = ?", (oldest["key"],))
467
+ total_size -= oldest["size_bytes"]
468
+ logger.debug(f"[Cache] Evicted LRU entry: {oldest['key'][:8]}...")
469
+
470
+ # Insert or replace
471
+ conn.execute(
472
+ """
473
+ INSERT OR REPLACE INTO cache (key, value, model, created_at, accessed_at, size_bytes)
474
+ VALUES (?, ?, ?, ?, ?, ?)
475
+ """,
476
+ (key, value, model, now, now, size_bytes),
477
+ )
478
+ conn.commit()
479
+
480
+ def clear(self) -> int:
481
+ """Clear all entries.
482
+
483
+ Returns:
484
+ Number of entries deleted
485
+ """
486
+ with self._get_connection() as conn:
487
+ count = conn.execute("SELECT COUNT(*) as cnt FROM cache").fetchone()["cnt"]
488
+ conn.execute("DELETE FROM cache")
489
+ conn.commit()
490
+ return count
491
+
492
+ def stats(self) -> dict[str, Any]:
493
+ """Return cache statistics.
494
+
495
+ Returns:
496
+ Dict with count, size_bytes, size_mb, db_path
497
+ """
498
+ with self._get_connection() as conn:
499
+ row = conn.execute(
500
+ """
501
+ SELECT COUNT(*) as count, COALESCE(SUM(size_bytes), 0) as size_bytes
502
+ FROM cache
503
+ """
504
+ ).fetchone()
505
+
506
+ return {
507
+ "count": row["count"],
508
+ "size_bytes": row["size_bytes"],
509
+ "size_mb": round(row["size_bytes"] / (1024 * 1024), 2),
510
+ "max_size_mb": round(self._max_size_bytes / (1024 * 1024), 2),
511
+ "db_path": str(self._db_path),
512
+ }
513
+
514
+ def stats_by_model(self) -> dict[str, dict[str, Any]]:
515
+ """Get cache statistics grouped by model.
516
+
517
+ Returns:
518
+ Dict mapping model name to {"count": int, "size_bytes": int, "size_mb": float}
519
+ """
520
+ with self._get_connection() as conn:
521
+ cursor = conn.execute("""
522
+ SELECT
523
+ COALESCE(NULLIF(model, ''), 'unknown') as model_name,
524
+ COUNT(*) as count,
525
+ COALESCE(SUM(size_bytes), 0) as total_size
526
+ FROM cache
527
+ GROUP BY model_name
528
+ ORDER BY total_size DESC
529
+ """)
530
+ result = {}
531
+ for row in cursor.fetchall():
532
+ result[row["model_name"]] = {
533
+ "count": row["count"],
534
+ "size_bytes": row["total_size"],
535
+ "size_mb": round(row["total_size"] / (1024 * 1024), 2),
536
+ }
537
+ return result
538
+
539
+ def list_entries(self, limit: int = 20) -> list[dict[str, Any]]:
540
+ """List recent cache entries.
541
+
542
+ Args:
543
+ limit: Maximum number of entries to return.
544
+
545
+ Returns:
546
+ List of entry dicts with key, model, size_bytes, created_at,
547
+ accessed_at, preview.
548
+ """
549
+ from datetime import UTC, datetime
550
+
551
+ with self._get_connection() as conn:
552
+ cursor = conn.execute(
553
+ """
554
+ SELECT
555
+ key,
556
+ model,
557
+ size_bytes,
558
+ created_at,
559
+ accessed_at,
560
+ substr(value, 1, 200) as value_preview
561
+ FROM cache
562
+ ORDER BY accessed_at DESC
563
+ LIMIT ?
564
+ """,
565
+ (limit,),
566
+ )
567
+ entries = []
568
+ for row in cursor.fetchall():
569
+ entries.append(
570
+ {
571
+ "key": row["key"],
572
+ "model": row["model"] or "unknown",
573
+ "size_bytes": row["size_bytes"],
574
+ "created_at": datetime.fromtimestamp(
575
+ row["created_at"], tz=UTC
576
+ ).isoformat(),
577
+ "accessed_at": datetime.fromtimestamp(
578
+ row["accessed_at"], tz=UTC
579
+ ).isoformat(),
580
+ "preview": self._parse_value_preview(row["value_preview"]),
581
+ }
582
+ )
583
+ return entries
584
+
585
+ def _parse_value_preview(self, value: str | None) -> str:
586
+ """Parse cached value to generate a human-readable preview.
587
+
588
+ Args:
589
+ value: The cached value (JSON string or plain text).
590
+
591
+ Returns:
592
+ Preview string like "image: Colorful bar chart..." or "text: # Title..."
593
+ """
594
+ import json
595
+
596
+ if not value:
597
+ return ""
598
+ try:
599
+ data = json.loads(value)
600
+ if isinstance(data, dict):
601
+ # Image description cache: has "caption" field
602
+ if "caption" in data:
603
+ caption = str(data["caption"])[:40]
604
+ return f"image: {caption}..."
605
+ # Frontmatter cache: has "title" field
606
+ if "title" in data:
607
+ title = str(data["title"])[:40]
608
+ return f"frontmatter: {title}..."
609
+ # Plain text result
610
+ text = str(data) if not isinstance(data, str) else data
611
+ return f"text: {text[:40]}..."
612
+ except (json.JSONDecodeError, TypeError):
613
+ return f"text: {value[:40]}..."
614
+
615
+
616
+ class PersistentCache:
617
+ """Dual-layer persistent cache: project-level + global-level.
618
+
619
+ Lookup order: project cache -> global cache -> None
620
+ Write behavior: write to both caches simultaneously
621
+
622
+ Supports "no-cache" mode (skip_read=True) which:
623
+ - Skips reading from cache (always returns None)
624
+ - Still writes to cache (for future use)
625
+ This follows Bun's --no-cache semantics.
626
+
627
+ Supports pattern-based cache skip (no_cache_patterns) which:
628
+ - Skips reading from cache for matching files
629
+ - Still writes to cache (for future use)
630
+ - Patterns are glob-style, matched against relative paths
631
+ """
632
+
633
+ def __init__(
634
+ self,
635
+ project_dir: Path | None = None,
636
+ global_dir: Path | None = None,
637
+ max_size_bytes: int = DEFAULT_CACHE_SIZE_LIMIT,
638
+ enabled: bool = True,
639
+ skip_read: bool = False,
640
+ no_cache_patterns: list[str] | None = None,
641
+ ) -> None:
642
+ """Initialize dual-layer cache.
643
+
644
+ Args:
645
+ project_dir: Project directory (will create .markitai/cache.db)
646
+ global_dir: Global cache directory (default ~/.markitai)
647
+ max_size_bytes: Max size per cache file
648
+ enabled: Whether caching is enabled (both read and write)
649
+ skip_read: If True, skip reading from cache but still write
650
+ (Bun's --no-cache semantics: force fresh, update cache)
651
+ no_cache_patterns: List of glob patterns to skip cache for specific files.
652
+ Patterns are matched against relative paths from input_dir.
653
+ """
654
+ self._enabled = enabled
655
+ self._skip_read = skip_read
656
+ self._no_cache_patterns = no_cache_patterns or []
657
+ self._project_cache: SQLiteCache | None = None
658
+ self._global_cache: SQLiteCache | None = None
659
+ self._hits = 0
660
+ self._misses = 0
661
+
662
+ if not enabled:
663
+ return
664
+
665
+ # Initialize project cache
666
+ if project_dir:
667
+ project_cache_path = (
668
+ Path(project_dir)
669
+ / DEFAULT_PROJECT_CACHE_DIR
670
+ / DEFAULT_CACHE_DB_FILENAME
671
+ )
672
+ try:
673
+ self._project_cache = SQLiteCache(project_cache_path, max_size_bytes)
674
+ logger.debug(f"[Cache] Project cache: {project_cache_path}")
675
+ except Exception as e:
676
+ logger.warning(f"[Cache] Failed to init project cache: {e}")
677
+
678
+ # Initialize global cache
679
+ global_cache_dir = global_dir or Path(DEFAULT_GLOBAL_CACHE_DIR).expanduser()
680
+ global_cache_path = Path(global_cache_dir) / DEFAULT_CACHE_DB_FILENAME
681
+ try:
682
+ self._global_cache = SQLiteCache(global_cache_path, max_size_bytes)
683
+ logger.debug(f"[Cache] Global cache: {global_cache_path}")
684
+ except Exception as e:
685
+ logger.warning(f"[Cache] Failed to init global cache: {e}")
686
+
687
+ def _glob_match(self, path: str, pattern: str) -> bool:
688
+ """Enhanced glob matching that properly handles ** for zero-or-more directories.
689
+
690
+         Standard fnmatch gives ** no special meaning: the '/' in '**/' is still
691
+         required, so it never matches zero directories. This method adds that case.
692
+
693
+ Args:
694
+ path: Normalized path (forward slashes)
695
+ pattern: Glob pattern (forward slashes)
696
+
697
+ Returns:
698
+ True if path matches pattern
699
+ """
700
+ import fnmatch
701
+
702
+ # Standard fnmatch first
703
+ if fnmatch.fnmatch(path, pattern):
704
+ return True
705
+
706
+ # Handle **/ prefix: should match zero or more directories
707
+ # e.g., "**/*.pdf" should match both "file.pdf" and "a/b/file.pdf"
708
+ if pattern.startswith("**/"):
709
+ # Try matching without the **/ prefix (zero directories case)
710
+ pattern_without_prefix = pattern[3:]
711
+ if fnmatch.fnmatch(path, pattern_without_prefix):
712
+ return True
713
+
714
+ # Handle **/ in the middle of pattern
715
+ # e.g., "src/**/test.py" should match "src/test.py"
716
+ if "**/" in pattern and not pattern.startswith("**/"):
717
+ # Replace **/ with empty string to test zero-directory case
718
+ collapsed = pattern.replace("**/", "", 1)
719
+ if fnmatch.fnmatch(path, collapsed):
720
+ return True
721
+
722
+ return False
723
+
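The extra branches matter because plain `fnmatch` keeps the literal slash from `**/`. A small illustration; constructing the cache with `enabled=False` skips any database setup:

    import fnmatch

    fnmatch.fnmatch("file.pdf", "**/*.pdf")          # False: the '/' is still required
    fnmatch.fnmatch("a/b/file.pdf", "**/*.pdf")      # True

    pc = PersistentCache(enabled=False)
    pc._glob_match("file.pdf", "**/*.pdf")           # True: zero-directory case handled
    pc._glob_match("src/test.py", "src/**/test.py")  # True: middle '**/' collapsed
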
724
+ def _extract_matchable_path(self, context: str) -> str:
725
+ """Extract a matchable file path from various context formats.
726
+
727
+ Context can come in different formats:
728
+ - Simple filename: "candy.JPG"
729
+ - Relative path: "sub_dir/file.doc"
730
+ - Absolute path: "/home/user/project/sub_dir/file.doc"
731
+ - Path with suffix: "/home/user/project/candy.JPG:images"
732
+ - Windows path: "C:\\Users\\test\\candy.JPG"
733
+
734
+ This method extracts just the filename for matching against patterns.
735
+
736
+ Args:
737
+ context: Context identifier in any format
738
+
739
+ Returns:
740
+ Extracted path suitable for pattern matching
741
+ """
742
+ # Normalize path separators first (Windows -> Unix)
743
+ path = context.replace("\\", "/")
744
+
745
+ # Remove common suffixes like ":images", ":clean", ":frontmatter"
746
+ # But be careful with Windows drive letters like "C:"
747
+ if ":" in path:
748
+ # Check if it's a Windows drive letter (single char before colon)
749
+ colon_idx = path.index(":")
750
+ if colon_idx == 1:
751
+ # Windows drive letter, look for next colon
752
+ rest = path[2:]
753
+ if ":" in rest:
754
+ path = path[: 2 + rest.index(":")]
755
+ else:
756
+ # Regular suffix like ":images"
757
+ path = path[:colon_idx]
758
+
759
+ # Extract just the filename from paths
760
+ # This allows patterns like "*.JPG" to match "/full/path/to/candy.JPG"
761
+ if "/" in path:
762
+ filename = path.rsplit("/", 1)[-1]
763
+ else:
764
+ filename = path
765
+
766
+ return filename
767
+
768
+ def _should_skip_cache(self, context: str) -> bool:
769
+ """Check if cache should be skipped for the given context.
770
+
771
+ Args:
772
+ context: Context identifier (can be filename, relative path, or absolute path)
773
+
774
+ Returns:
775
+ True if cache should be skipped for this context
776
+ """
777
+ if not context or not self._no_cache_patterns:
778
+ return False
779
+
780
+ # Normalize path separators to forward slash for consistent matching
781
+ normalized_context = context.replace("\\", "/")
782
+
783
+ # Also extract just the filename for patterns like "*.JPG"
784
+ filename = self._extract_matchable_path(context)
785
+
786
+ for pattern in self._no_cache_patterns:
787
+ # Normalize pattern separators
788
+ normalized_pattern = pattern.replace("\\", "/")
789
+
790
+ # Try matching against full context path first
791
+ if self._glob_match(normalized_context, normalized_pattern):
792
+ logger.debug(
793
+ f"[Cache] Skipping cache for '{context}' (matched pattern: {pattern})"
794
+ )
795
+ return True
796
+
797
+ # Also try matching against just the filename
798
+ # This handles cases where context is absolute path but pattern is "*.JPG"
799
+ if filename != normalized_context and self._glob_match(
800
+ filename, normalized_pattern
801
+ ):
802
+ logger.debug(
803
+ f"[Cache] Skipping cache for '{context}' (filename '{filename}' matched pattern: {pattern})"
804
+ )
805
+ return True
806
+
807
+ return False
808
+
809
+ def get(self, prompt: str, content: str, context: str = "") -> Any | None:
810
+ """Lookup in project cache first, then global cache.
811
+
812
+ Args:
813
+ prompt: Prompt template used
814
+ content: Content being processed
815
+ context: Context identifier for pattern matching (e.g., relative file path)
816
+
817
+ Returns:
818
+ Cached result (deserialized from JSON) or None
819
+ """
820
+ if not self._enabled or self._skip_read:
821
+ # skip_read: Bun-style --no-cache (force fresh, still write)
822
+ return None
823
+
824
+ # Check pattern-based skip
825
+ if self._should_skip_cache(context):
826
+ return None
827
+
828
+ # Try project cache first
829
+ if self._project_cache:
830
+ result = self._project_cache.get(prompt, content)
831
+ if result is not None:
832
+ self._hits += 1
833
+ logger.debug("[Cache] Project cache hit")
834
+ return json.loads(result)
835
+
836
+ # Fallback to global cache
837
+ if self._global_cache:
838
+ result = self._global_cache.get(prompt, content)
839
+ if result is not None:
840
+ self._hits += 1
841
+ logger.debug("[Cache] Global cache hit")
842
+ return json.loads(result)
843
+
844
+ self._misses += 1
845
+ return None
846
+
847
+ def set(self, prompt: str, content: str, result: Any, model: str = "") -> None:
848
+ """Write to both caches.
849
+
850
+ Args:
851
+ prompt: Prompt template used
852
+ content: Content being processed
853
+ result: Result to cache (will be JSON serialized)
854
+ model: Model identifier
855
+ """
856
+ if not self._enabled:
857
+ return
858
+
859
+ value = json.dumps(result, ensure_ascii=False)
860
+
861
+ if self._project_cache:
862
+ try:
863
+ self._project_cache.set(prompt, content, value, model)
864
+ except Exception as e:
865
+ logger.warning(f"[Cache] Failed to write to project cache: {e}")
866
+
867
+ if self._global_cache:
868
+ try:
869
+ self._global_cache.set(prompt, content, value, model)
870
+ except Exception as e:
871
+ logger.warning(f"[Cache] Failed to write to global cache: {e}")
872
+
873
+ def clear(self, scope: str = "project") -> dict[str, int]:
874
+ """Clear cache entries.
875
+
876
+ Args:
877
+ scope: "project", "global", or "all"
878
+
879
+ Returns:
880
+ Dict with counts of deleted entries
881
+ """
882
+ result = {"project": 0, "global": 0}
883
+
884
+ if scope in ("project", "all") and self._project_cache:
885
+ result["project"] = self._project_cache.clear()
886
+
887
+ if scope in ("global", "all") and self._global_cache:
888
+ result["global"] = self._global_cache.clear()
889
+
890
+ return result
891
+
892
+ def stats(self) -> dict[str, Any]:
893
+ """Return cache statistics.
894
+
895
+ Returns:
896
+ Dict with project/global stats and hit rate
897
+ """
898
+ total_requests = self._hits + self._misses
899
+ hit_rate = (self._hits / total_requests * 100) if total_requests > 0 else 0.0
900
+
901
+ return {
902
+ "project": self._project_cache.stats() if self._project_cache else None,
903
+ "global": self._global_cache.stats() if self._global_cache else None,
904
+ "hits": self._hits,
905
+ "misses": self._misses,
906
+ "hit_rate": round(hit_rate, 2),
907
+ }
908
+
909
+
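Put together, `skip_read=True` gives the Bun-style behavior described in the class docstring: lookups always miss, but fresh results are still persisted for later runs. A hedged sketch (prompt and content strings are placeholders):

    pc = PersistentCache(project_dir=Path("."), skip_read=True)
    pc.get("cleaner-prompt", "raw markdown")              # always None: reads skipped
    pc.set("cleaner-prompt", "raw markdown", {"ok": 1})   # still written to both caches
    pc.stats()["hits"]                                     # remains 0 in this mode
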
910
+ class ContentCache:
911
+ """LRU cache with TTL for LLM responses based on content hash.
912
+
913
+ Uses OrderedDict for O(1) LRU eviction instead of O(n) min() search.
914
+ """
915
+
916
+ def __init__(
917
+ self,
918
+ maxsize: int = DEFAULT_CACHE_MAXSIZE,
919
+ ttl_seconds: int = DEFAULT_CACHE_TTL_SECONDS,
920
+ ) -> None:
921
+ """
922
+ Initialize content cache.
923
+
924
+ Args:
925
+ maxsize: Maximum number of entries to cache
926
+ ttl_seconds: Time-to-live in seconds
927
+ """
928
+ from collections import OrderedDict
929
+
930
+ self._cache: OrderedDict[str, tuple[Any, float]] = OrderedDict()
931
+ self._maxsize = maxsize
932
+ self._ttl = ttl_seconds
933
+
934
+ def _compute_hash(self, prompt: str, content: str) -> str:
935
+ """Compute hash key from prompt and content.
936
+
937
+         Uses the full content for accurate cache keys. Even for very large content
937
+         the hash stays cheap, since SHA-256 runs in linear time over the input.
939
+ """
940
+ import hashlib
941
+
942
+ combined = f"{prompt}|{content}"
943
+ return hashlib.sha256(combined.encode()).hexdigest()[:16]
944
+
945
+ def get(self, prompt: str, content: str) -> Any | None:
946
+ """
947
+ Get cached result if exists and not expired.
948
+
949
+ On hit, moves the entry to end (most recently used).
950
+
951
+ Args:
952
+ prompt: Prompt template used
953
+ content: Content being processed
954
+
955
+ Returns:
956
+ Cached result or None if not found/expired
957
+ """
958
+ key = self._compute_hash(prompt, content)
959
+ if key not in self._cache:
960
+ return None
961
+
962
+ result, timestamp = self._cache[key]
963
+ if time.time() - timestamp > self._ttl:
964
+ del self._cache[key]
965
+ return None
966
+
967
+ # Move to end on access (LRU behavior)
968
+ self._cache.move_to_end(key)
969
+ return result
970
+
971
+ def set(self, prompt: str, content: str, result: Any) -> None:
972
+ """
973
+ Cache a result.
974
+
975
+ Uses O(1) LRU eviction via OrderedDict.popitem(last=False).
976
+
977
+ Args:
978
+ prompt: Prompt template used
979
+ content: Content being processed
980
+ result: Result to cache
981
+ """
982
+ key = self._compute_hash(prompt, content)
983
+
984
+ # If key exists, update and move to end
985
+ if key in self._cache:
986
+ self._cache[key] = (result, time.time())
987
+ self._cache.move_to_end(key)
988
+ return
989
+
990
+ # Evict oldest entry if cache is full - O(1) operation
991
+ if len(self._cache) >= self._maxsize:
992
+ self._cache.popitem(last=False)
993
+
994
+ self._cache[key] = (result, time.time())
995
+
996
+ def clear(self) -> None:
997
+ """Clear all cached entries."""
998
+ self._cache.clear()
999
+
1000
+ @property
1001
+ def size(self) -> int:
1002
+ """Number of cached entries."""
1003
+ return len(self._cache)
1004
+
1005
+
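The OrderedDict provides the move-to-end / pop-from-front LRU discipline without any scanning. A short walk-through with `maxsize=2` (keys and values are placeholders):

    cache = ContentCache(maxsize=2, ttl_seconds=3600)
    cache.set("p", "a", 1)
    cache.set("p", "b", 2)
    cache.get("p", "a")     # hit: "a" becomes most recently used
    cache.set("p", "c", 3)  # evicts "b", the least recently used entry
    cache.get("p", "b")     # None
    cache.get("p", "a")     # 1, still cached while the TTL holds
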
1006
+ class LLMProcessor:
1007
+ """LLM processor using LiteLLM Router for load balancing."""
1008
+
1009
+ def __init__(
1010
+ self,
1011
+ config: LLMConfig,
1012
+ prompts_config: PromptsConfig | None = None,
1013
+ runtime: LLMRuntime | None = None,
1014
+ project_dir: Path | None = None,
1015
+ no_cache: bool = False,
1016
+ no_cache_patterns: list[str] | None = None,
1017
+ ) -> None:
1018
+ """
1019
+ Initialize LLM processor.
1020
+
1021
+ Args:
1022
+ config: LLM configuration
1023
+ prompts_config: Optional prompts configuration
1024
+ runtime: Optional shared runtime for concurrency control.
1025
+ If provided, uses runtime's semaphore instead of creating one.
1026
+ project_dir: Optional project directory for project-level cache.
1027
+ If None, only global cache is used.
1028
+ no_cache: If True, skip reading from cache but still write results.
1029
+ Follows Bun's --no-cache semantics (force fresh, update cache).
1030
+ no_cache_patterns: List of glob patterns to skip cache for specific files.
1031
+ Patterns are matched against relative paths from input_dir.
1032
+ E.g., ["*.pdf", "reports/**", "file.docx"]
1033
+ """
1034
+ self.config = config
1035
+ self._runtime = runtime
1036
+ self._router: Router | None = None
1037
+ self._vision_router: Router | None = None # Lazy-initialized vision router
1038
+ self._semaphore: asyncio.Semaphore | None = None
1039
+ self._prompt_manager = PromptManager(prompts_config)
1040
+
1041
+ # Usage tracking (global across all contexts)
1042
+ # Use defaultdict to avoid check-then-create race conditions
1043
+ def _make_usage_dict() -> dict[str, Any]:
1044
+ return {
1045
+ "requests": 0,
1046
+ "input_tokens": 0,
1047
+ "output_tokens": 0,
1048
+ "cost_usd": 0.0,
1049
+ }
1050
+
1051
+ self._usage: defaultdict[str, dict[str, Any]] = defaultdict(_make_usage_dict)
1052
+
1053
+ # Per-context usage tracking for batch processing
1054
+ self._context_usage: defaultdict[str, defaultdict[str, dict[str, Any]]] = (
1055
+ defaultdict(lambda: defaultdict(_make_usage_dict))
1056
+ )
1057
+
1058
+ # Call counter for each context (file)
1059
+ self._call_counter: defaultdict[str, int] = defaultdict(int)
1060
+
1061
+ # Lock for thread-safe access to usage tracking dicts in concurrent contexts
1062
+ # Using threading.Lock instead of asyncio.Lock because:
1063
+ # 1. Dict operations are CPU-bound and don't need await
1064
+ # 2. Works in both sync and async contexts
1065
+ # The lock hold time is minimal (only simple dict updates)
1066
+ self._usage_lock = threading.Lock()
1067
+
1068
+ # In-memory content cache for session-level deduplication (fast, no I/O)
1069
+ self._cache = ContentCache()
1070
+ self._cache_hits = 0
1071
+ self._cache_misses = 0
1072
+
1073
+ # Persistent cache for cross-session reuse (SQLite-based)
1074
+ # no_cache=True: skip reading but still write (Bun semantics)
1075
+ # no_cache_patterns: skip reading for specific files matching patterns
1076
+ self._persistent_cache = PersistentCache(
1077
+ project_dir=project_dir,
1078
+ skip_read=no_cache,
1079
+ no_cache_patterns=no_cache_patterns,
1080
+ )
1081
+
1082
+ # Image cache for avoiding repeated file reads during document processing
1083
+ # Key: file path string, Value: (bytes, base64_encoded_string)
1084
+ # Uses OrderedDict for LRU eviction when limits are reached
1085
+ from collections import OrderedDict
1086
+
1087
+ self._image_cache: OrderedDict[str, tuple[bytes, str]] = OrderedDict()
1088
+ self._image_cache_max_size = 200 # Max number of images to cache
1089
+ self._image_cache_max_bytes = 500 * 1024 * 1024 # 500MB max total cache size
1090
+ self._image_cache_bytes = 0 # Current total bytes in cache
1091
+
1092
+ # Register LiteLLM callback for additional details
1093
+ self._setup_callbacks()
1094
+
1095
+ def _setup_callbacks(self) -> None:
1096
+ """Register LiteLLM callbacks for detailed logging."""
1097
+ # Add our custom logger to litellm callbacks if not already added
1098
+ if _markitai_llm_logger not in (litellm.callbacks or []):
1099
+ if litellm.callbacks is None:
1100
+ litellm.callbacks = []
1101
+ litellm.callbacks.append(_markitai_llm_logger)
1102
+
1103
+ def _get_next_call_index(self, context: str) -> int:
1104
+ """Get the next call index for a given context.
1105
+
1106
+ Thread-safe: uses lock for atomic increment.
1107
+ """
1108
+ with self._usage_lock:
1109
+ self._call_counter[context] += 1
1110
+ return self._call_counter[context]
1111
+
1112
+ def reset_call_counter(self, context: str = "") -> None:
1113
+ """Reset call counter for a context or all contexts.
1114
+
1115
+ Thread-safe: uses lock for safe modification.
1116
+ """
1117
+ with self._usage_lock:
1118
+ if context:
1119
+ self._call_counter.pop(context, None)
1120
+ else:
1121
+ self._call_counter.clear()
1122
+
1123
+ @property
1124
+ def router(self) -> Router:
1125
+ """Get or create the LiteLLM Router."""
1126
+ if self._router is None:
1127
+ self._router = self._create_router()
1128
+ return self._router
1129
+
1130
+ @property
1131
+ def semaphore(self) -> asyncio.Semaphore:
1132
+ """Get the LLM concurrency semaphore.
1133
+
1134
+ If a runtime was provided, uses the shared semaphore from runtime.
1135
+ Otherwise creates a local semaphore.
1136
+ """
1137
+ if self._runtime is not None:
1138
+ return self._runtime.semaphore
1139
+ if self._semaphore is None:
1140
+ self._semaphore = asyncio.Semaphore(self.config.concurrency)
1141
+ return self._semaphore
1142
+
1143
+ @property
1144
+ def io_semaphore(self) -> asyncio.Semaphore:
1145
+ """Get the I/O concurrency semaphore for file operations.
1146
+
1147
+ Separate from LLM semaphore to allow higher I/O parallelism.
1148
+ """
1149
+ if self._runtime is not None:
1150
+ return self._runtime.io_semaphore
1151
+ # Fallback: use higher limit for local I/O operations
1152
+ return asyncio.Semaphore(DEFAULT_IO_CONCURRENCY)
1153
+
1154
+ def _create_router(self) -> Router:
1155
+ """Create LiteLLM Router from configuration."""
1156
+ if not self.config.model_list:
1157
+ raise ValueError("No models configured in llm.model_list")
1158
+
1159
+ # Build model list with resolved API keys and max_tokens
1160
+ model_list = []
1161
+ for model_config in self.config.model_list:
1162
+ model_id = model_config.litellm_params.model
1163
+ model_entry = {
1164
+ "model_name": model_config.model_name,
1165
+ "litellm_params": {
1166
+ "model": model_id,
1167
+ },
1168
+ }
1169
+
1170
+ # Add optional params
1171
+ api_key = model_config.litellm_params.get_resolved_api_key()
1172
+ if api_key:
1173
+ model_entry["litellm_params"]["api_key"] = api_key
1174
+
1175
+ if model_config.litellm_params.api_base:
1176
+ model_entry["litellm_params"]["api_base"] = (
1177
+ model_config.litellm_params.api_base
1178
+ )
1179
+
1180
+ if model_config.litellm_params.weight != 1:
1181
+ model_entry["litellm_params"]["weight"] = (
1182
+ model_config.litellm_params.weight
1183
+ )
1184
+
1185
+ # Note: max_tokens is NOT set at Router level
1186
+ # It will be calculated dynamically per-request based on input size
1187
+ # This avoids context overflow issues with shared context models
1188
+
1189
+ if model_config.model_info:
1190
+ model_entry["model_info"] = model_config.model_info.model_dump()
1191
+
1192
+ model_list.append(model_entry)
1193
+
1194
+ # Build router settings
1195
+ router_settings = self.config.router_settings.model_dump()
1196
+
1197
+ # Disable internal retries - we handle retries ourselves for better logging
1198
+ router_settings["num_retries"] = 0
1199
+
1200
+ # Log router configuration (compact format)
1201
+ model_names = [e["litellm_params"]["model"].split("/")[-1] for e in model_list]
1202
+ logger.info(
1203
+ f"[Router] Creating with strategy={router_settings.get('routing_strategy')}, "
1204
+ f"models={len(model_list)}"
1205
+ )
1206
+ logger.debug(f"[Router] Models: {', '.join(model_names)}")
1207
+
1208
+ return Router(model_list=model_list, **router_settings)
1209
+
1210
+ def _create_router_from_models(
1211
+ self, models: list[Any], router_settings: dict[str, Any] | None = None
1212
+ ) -> Router:
1213
+ """Create a Router from a subset of model configurations.
1214
+
1215
+ Args:
1216
+ models: List of ModelConfig objects from self.config.model_list
1217
+ router_settings: Optional router settings (uses default if not provided)
1218
+
1219
+ Returns:
1220
+ LiteLLM Router instance
1221
+ """
1222
+ # Build model list with resolved API keys and max_tokens
1223
+ model_list = []
1224
+ for model_config in models:
1225
+ model_id = model_config.litellm_params.model
1226
+ model_entry = {
1227
+ "model_name": model_config.model_name,
1228
+ "litellm_params": {
1229
+ "model": model_id,
1230
+ },
1231
+ }
1232
+
1233
+ # Add optional params
1234
+ api_key = model_config.litellm_params.get_resolved_api_key()
1235
+ if api_key:
1236
+ model_entry["litellm_params"]["api_key"] = api_key
1237
+
1238
+ if model_config.litellm_params.api_base:
1239
+ model_entry["litellm_params"]["api_base"] = (
1240
+ model_config.litellm_params.api_base
1241
+ )
1242
+
1243
+ if model_config.litellm_params.weight != 1:
1244
+ model_entry["litellm_params"]["weight"] = (
1245
+ model_config.litellm_params.weight
1246
+ )
1247
+
1248
+ # Note: max_tokens calculated dynamically per-request
1249
+
1250
+ if model_config.model_info:
1251
+ model_entry["model_info"] = model_config.model_info.model_dump()
1252
+
1253
+ model_list.append(model_entry)
1254
+
1255
+ # Use provided settings or default
1256
+ settings = router_settings or self.config.router_settings.model_dump()
1257
+ settings["num_retries"] = 0 # We handle retries ourselves
1258
+
1259
+ return Router(model_list=model_list, **settings)
1260
+
1261
+ def _is_vision_model(self, model_config: Any) -> bool:
1262
+ """Check if a model supports vision.
1263
+
1264
+ Priority:
1265
+ 1. Config override (model_info.supports_vision) if explicitly set
1266
+ 2. Auto-detect from litellm.get_model_info()
1267
+
1268
+ Args:
1269
+ model_config: Model configuration object
1270
+
1271
+ Returns:
1272
+ True if model supports vision
1273
+ """
1274
+ # Check config override first
1275
+ if (
1276
+ model_config.model_info
1277
+ and model_config.model_info.supports_vision is not None
1278
+ ):
1279
+ return model_config.model_info.supports_vision
1280
+
1281
+ # Auto-detect from litellm
1282
+ model_id = model_config.litellm_params.model
1283
+ info = get_model_info_cached(model_id)
1284
+ return info.get("supports_vision", False)
1285
+
1286
+ @property
1287
+ def vision_router(self) -> Router:
1288
+ """Get or create Router with only vision-capable models (lazy).
1289
+
1290
+ Filters models using auto-detection from litellm or config override.
1291
+ Falls back to main router if no vision models found.
1292
+
1293
+ Returns:
1294
+ LiteLLM Router with vision-capable models only
1295
+ """
1296
+ if self._vision_router is None:
1297
+ vision_models = [
1298
+ m for m in self.config.model_list if self._is_vision_model(m)
1299
+ ]
1300
+
1301
+ if not vision_models:
1302
+ # No dedicated vision models - fall back to main router
1303
+ logger.warning(
1304
+ "[Router] No vision-capable models configured, using main router"
1305
+ )
1306
+ self._vision_router = self.router
1307
+ else:
1308
+ model_names = [
1309
+ m.litellm_params.model.split("/")[-1] for m in vision_models
1310
+ ]
1311
+ logger.info(
1312
+ f"[Router] Creating vision router with {len(vision_models)} models"
1313
+ )
1314
+ logger.debug(f"[Router] Vision models: {', '.join(model_names)}")
1315
+ self._vision_router = self._create_router_from_models(vision_models)
1316
+
1317
+ return self._vision_router
1318
+
1319
+ def _message_contains_image(self, messages: list[dict[str, Any]]) -> bool:
1320
+ """Detect if messages contain image content.
1321
+
1322
+ Checks for image_url type in message content parts.
1323
+
1324
+ Args:
1325
+ messages: List of chat messages
1326
+
1327
+ Returns:
1328
+ True if any message contains an image
1329
+ """
1330
+ for msg in messages:
1331
+ content = msg.get("content")
1332
+ if isinstance(content, list):
1333
+ for part in content:
1334
+ if isinstance(part, dict) and part.get("type") == "image_url":
1335
+ return True
1336
+ return False
1337
+
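Detection keys off OpenAI-style `image_url` content parts, which is what later routes a request to `vision_router` in `_call_llm`. An example message that would trigger it (`processor` stands for an `LLMProcessor` instance and the data URL is truncated on purpose):

    messages = [{
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this chart"},
            {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
        ],
    }]
    processor._message_contains_image(messages)   # True -> vision_router is selected
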
1338
+ async def _call_llm(
1339
+ self,
1340
+ model: str,
1341
+ messages: list[dict[str, Any]],
1342
+ context: str = "",
1343
+ ) -> LLMResponse:
1344
+ """
1345
+ Make an LLM call with rate limiting, retry logic, and detailed logging.
1346
+
1347
+ Smart router selection: automatically uses vision_router when messages
1348
+ contain images, otherwise uses the main router.
1349
+
1350
+ Args:
1351
+ model: Logical model name (e.g., "default")
1352
+ messages: Chat messages
1353
+ context: Context identifier for logging (e.g., filename)
1354
+
1355
+ Returns:
1356
+ LLMResponse with content and usage info
1357
+ """
1358
+ # Generate call ID for logging
1359
+ call_index = self._get_next_call_index(context) if context else 0
1360
+ call_id = f"{context}:{call_index}" if context else f"call:{call_index}"
1361
+
1362
+ # Smart router selection based on message content
1363
+ requires_vision = self._message_contains_image(messages)
1364
+ router = self.vision_router if requires_vision else self.router
1365
+
1366
+ max_retries = self.config.router_settings.num_retries
1367
+ return await self._call_llm_with_retry(
1368
+ model=model,
1369
+ messages=messages,
1370
+ call_id=call_id,
1371
+ context=context,
1372
+ max_retries=max_retries,
1373
+ router=router,
1374
+ )
1375
+
1376
+ def _calculate_dynamic_max_tokens(
1377
+ self, messages: list[Any], model_hint: str | None = None
1378
+ ) -> int:
1379
+ """Calculate dynamic max_tokens based on input size and model limits.
1380
+
1381
+ Uses conservative estimates to avoid context overflow across all models
1382
+ in the router's model list.
1383
+
1384
+ Args:
1385
+ messages: Chat messages to estimate input tokens
1386
+ model_hint: Optional model name for more accurate limits
1387
+
1388
+ Returns:
1389
+ Safe max_tokens value that won't exceed context limits
1390
+ """
1391
+ import re
1392
+
1393
+ # Estimate input tokens (use gpt-4 tokenizer as reasonable approximation)
1394
+ try:
1395
+ input_tokens = litellm.token_counter(model="gpt-4", messages=messages)
1396
+ except Exception:
1397
+ # Fallback: rough estimate based on character count
1398
+ total_chars = sum(len(str(m.get("content", ""))) for m in messages)
1399
+ input_tokens = total_chars // 4 # ~4 chars per token
1400
+
1401
+ # Detect table-heavy content (tables require more output tokens for formatting)
1402
+ content_str = str(messages)
1403
+ table_rows = len(re.findall(r"\|[^|]+\|", content_str))
1404
+ is_table_heavy = table_rows > 20 # More than 20 table rows
1405
+
1406
+ # Get model limits - use minimum across all configured models for safety
1407
+ min_context = float("inf")
1408
+ min_output = float("inf")
1409
+
1410
+ for model_config in self.config.model_list:
1411
+ model_id = model_config.litellm_params.model
1412
+ info = get_model_info_cached(model_id)
1413
+ min_context = min(min_context, info["max_input_tokens"])
1414
+ min_output = min(min_output, info["max_output_tokens"])
1415
+
1416
+ # Use defaults if no models configured
1417
+ if min_context == float("inf"):
1418
+ min_context = 128000
1419
+ if min_output == float("inf"):
1420
+ min_output = DEFAULT_MAX_OUTPUT_TOKENS
1421
+
1422
+ # Calculate available output space
1423
+ # Reserve buffer for safety (tokenizer differences, system overhead)
1424
+ buffer = max(500, int(input_tokens * 0.1)) # 10% or 500, whichever is larger
1425
+ available_context = int(min_context) - input_tokens - buffer
1426
+
1427
+ # For table-heavy content, ensure output has at least 1.5x input tokens
1428
+ # since reformatting tables to Markdown often expands token count
1429
+ if is_table_heavy:
1430
+ min_required_output = int(input_tokens * 1.5)
1431
+ available_context = max(available_context, min_required_output)
1432
+ logger.debug(
1433
+ f"[DynamicTokens] Table-heavy content detected ({table_rows} rows), "
1434
+ f"min_required_output={min_required_output}"
1435
+ )
1436
+
1437
+ # max_tokens = min(model's max_output, available context space)
1438
+ max_tokens = min(int(min_output), available_context)
1439
+
1440
+ # Ensure reasonable minimum (higher for table-heavy content)
1441
+ min_floor = 4000 if is_table_heavy else 1000
1442
+ max_tokens = max(max_tokens, min_floor)
1443
+
1444
+ logger.debug(
1445
+ f"[DynamicTokens] input={input_tokens}, context={int(min_context)}, "
1446
+ f"max_output={int(min_output)}, calculated={max_tokens}"
1447
+ )
1448
+
1449
+ return max_tokens
1450
+
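The budget is always computed against the weakest configured model. A worked example, assuming the smallest configured context is 64,000 tokens, the smallest max output is 8,192, and the content is not table-heavy:

    input_tokens = 10_000
    buffer = max(500, int(input_tokens * 0.1))      # 1_000
    available = 64_000 - input_tokens - buffer      # 53_000
    max_tokens = max(min(8_192, available), 1_000)  # 8_192, capped by model output
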
1451
+ async def _call_llm_with_retry(
1452
+ self,
1453
+ model: str,
1454
+ messages: list[dict[str, Any]],
1455
+ call_id: str,
1456
+ context: str = "",
1457
+ max_retries: int = DEFAULT_MAX_RETRIES,
1458
+ router: Router | None = None,
1459
+ ) -> LLMResponse:
1460
+ """
1461
+ Make an LLM call with custom retry logic and detailed logging.
1462
+
1463
+ Args:
1464
+ model: Logical model name (e.g., "default")
1465
+ messages: Chat messages
1466
+ call_id: Unique identifier for this call (for logging)
1467
+ context: Context identifier for usage tracking (e.g., filename)
1468
+ max_retries: Maximum number of retry attempts
1469
+ router: Router to use (defaults to self.router if not provided)
1470
+
1471
+ Returns:
1472
+ LLMResponse with content and usage info
1473
+ """
1474
+ # Use provided router or default to main router
1475
+ active_router = router or self.router
1476
+ last_exception: Exception | None = None
1477
+
1478
+ # Calculate dynamic max_tokens based on input size
1479
+ max_tokens = self._calculate_dynamic_max_tokens(messages)
1480
+
1481
+ for attempt in range(max_retries + 1):
1482
+ start_time = time.perf_counter()
1483
+
1484
+ async with self.semaphore:
1485
+ try:
1486
+ # Log request start
1487
+ if attempt == 0:
1488
+ logger.debug(f"[LLM:{call_id}] Request to {model}")
1489
+ else:
1490
+ # Log retry attempt
1491
+ error_type = (
1492
+ type(last_exception).__name__
1493
+ if last_exception
1494
+ else "Unknown"
1495
+ )
1496
+ status_code = getattr(last_exception, "status_code", "N/A")
1497
+ logger.warning(
1498
+ f"[LLM:{call_id}] Retry #{attempt}: {error_type} "
1499
+ f"status={status_code}"
1500
+ )
1501
+
1502
+ response = await active_router.acompletion(
1503
+ model=model,
1504
+ messages=cast(list[AllMessageValues], messages),
1505
+ max_tokens=max_tokens,
1506
+ metadata={"call_id": call_id, "attempt": attempt},
1507
+ )
1508
+
1509
+ elapsed_ms = (time.perf_counter() - start_time) * 1000
1510
+ # litellm returns Choices (not StreamingChoices) for non-streaming
1511
+ choice = cast(Choices, response.choices[0])
1512
+ content = choice.message.content or ""
1513
+ actual_model = response.model or model
1514
+
1515
+ # Calculate cost
1516
+ try:
1517
+ cost = completion_cost(completion_response=response)
1518
+ except Exception:
1519
+ cost = 0.0
1520
+
1521
+ # Track usage (usage attr exists at runtime but not in type stubs)
1522
+ usage = getattr(response, "usage", None)
1523
+ input_tokens = usage.prompt_tokens if usage else 0
1524
+ output_tokens = usage.completion_tokens if usage else 0
1525
+
1526
+ self._track_usage(
1527
+ actual_model, input_tokens, output_tokens, cost, context
1528
+ )
1529
+
1530
+ # Log result
1531
+ logger.info(
1532
+ f"[LLM:{call_id}] {actual_model} "
1533
+ f"tokens={input_tokens}+{output_tokens} "
1534
+ f"time={elapsed_ms:.0f}ms cost=${cost:.6f}"
1535
+ )
1536
+
1537
+ # Detect empty response (0 output tokens with substantial input)
1538
+ # This usually indicates a model failure that should be retried
1539
+ if output_tokens == 0 and input_tokens > 100:
1540
+ if attempt < max_retries:
1541
+ logger.warning(
1542
+ f"[LLM:{call_id}] Empty response (0 output tokens), "
1543
+ f"retrying with different model..."
1544
+ )
1545
+ # Treat as retryable error
1546
+ await asyncio.sleep(
1547
+ min(
1548
+ DEFAULT_RETRY_BASE_DELAY * (2**attempt),
1549
+ DEFAULT_RETRY_MAX_DELAY,
1550
+ )
1551
+ )
1552
+ continue
1553
+ else:
1554
+ logger.error(
1555
+ f"[LLM:{call_id}] Empty response after {max_retries + 1} "
1556
+ f"attempts, returning empty content"
1557
+ )
1558
+
1559
+ return LLMResponse(
1560
+ content=content,
1561
+ model=actual_model,
1562
+ input_tokens=input_tokens,
1563
+ output_tokens=output_tokens,
1564
+ cost_usd=cost,
1565
+ )
1566
+
1567
+ except RETRYABLE_ERRORS as e:
1568
+ elapsed_ms = (time.perf_counter() - start_time) * 1000
1569
+ last_exception = e
1570
+
1571
+ if attempt == max_retries:
1572
+ # Final failure after all retries
1573
+ error_type = type(e).__name__
1574
+ status_code = getattr(e, "status_code", "N/A")
1575
+ provider = getattr(e, "llm_provider", "N/A")
1576
+ logger.error(
1577
+ f"[LLM:{call_id}] Failed after {max_retries + 1} attempts: "
1578
+ f"{error_type} status={status_code} provider={provider} "
1579
+ f"time={elapsed_ms:.0f}ms"
1580
+ )
1581
+ raise
1582
+
1583
+ # Calculate exponential backoff delay
1584
+ delay = min(
1585
+ DEFAULT_RETRY_BASE_DELAY * (2**attempt), DEFAULT_RETRY_MAX_DELAY
1586
+ )
1587
+ await asyncio.sleep(delay)
1588
+
1589
+ except Exception as e:
1590
+ elapsed_ms = (time.perf_counter() - start_time) * 1000
1591
+ error_type = type(e).__name__
1592
+ status_code = getattr(e, "status_code", "N/A")
1593
+ error_msg = str(e)[:200] # Truncate long messages
1594
+ logger.error(
1595
+ f"[LLM:{call_id}] Failed: {error_type} "
1596
+ f"status={status_code} msg={error_msg} "
1597
+ f"time={elapsed_ms:.0f}ms"
1598
+ )
1599
+ raise
1600
+
1601
+ # Should not reach here, but just in case
1602
+ raise RuntimeError(f"[LLM:{call_id}] Unexpected state in retry loop")
1603
+
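Between retryable failures the delay grows exponentially and is clamped by the constants imported from markitai.constants; their actual values are not shown in this diff, so the numbers below are purely illustrative (base 1 s, cap 30 s):

    for attempt in range(5):
        delay = min(1.0 * (2 ** attempt), 30.0)  # 1, 2, 4, 8, 16 seconds, capped at 30
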
1604
+ def _track_usage(
1605
+ self,
1606
+ model: str,
1607
+ input_tokens: int,
1608
+ output_tokens: int,
1609
+ cost: float,
1610
+ context: str = "",
1611
+ ) -> None:
1612
+ """Track usage statistics per model (and optionally per context).
1613
+
1614
+ Thread-safe: uses lock to protect concurrent access to usage dicts.
1615
+
1616
+ Args:
1617
+ model: Model name
1618
+ input_tokens: Number of input tokens
1619
+ output_tokens: Number of output tokens
1620
+ cost: Cost in USD
1621
+ context: Optional context identifier (e.g., filename)
1622
+ """
1623
+ with self._usage_lock:
1624
+ # Track global usage (defaultdict auto-creates entries)
1625
+ self._usage[model]["requests"] += 1
1626
+ self._usage[model]["input_tokens"] += input_tokens
1627
+ self._usage[model]["output_tokens"] += output_tokens
1628
+ self._usage[model]["cost_usd"] += cost
1629
+
1630
+ # Track per-context usage if context provided
1631
+ if context:
1632
+ self._context_usage[context][model]["requests"] += 1
1633
+ self._context_usage[context][model]["input_tokens"] += input_tokens
1634
+ self._context_usage[context][model]["output_tokens"] += output_tokens
1635
+ self._context_usage[context][model]["cost_usd"] += cost
1636
+
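
_track_usage above accumulates counters in nested defaultdicts keyed by model, and optionally by context. A small standalone sketch of the same accumulation pattern (names here are illustrative, not the markitai API):

    # Illustrative per-model / per-context accumulation sketch.
    from collections import defaultdict

    def new_counters():
        return defaultdict(lambda: {"requests": 0, "input_tokens": 0,
                                    "output_tokens": 0, "cost_usd": 0.0})

    usage = new_counters()                  # model -> counters
    by_context = defaultdict(new_counters)  # context -> model -> counters

    def track(model, inp, out, cost, context=""):
        buckets = [usage[model]]
        if context:
            buckets.append(by_context[context][model])
        for bucket in buckets:
            bucket["requests"] += 1
            bucket["input_tokens"] += inp
            bucket["output_tokens"] += out
            bucket["cost_usd"] += cost

    track("gpt-4o-mini", 1200, 300, 0.0009, context="report.pdf")
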
1637
+ def get_usage(self) -> dict[str, dict[str, Any]]:
1638
+ """Get global usage statistics.
1639
+
1640
+ Thread-safe: uses lock and returns a deep copy.
1641
+ """
1642
+ import copy
1643
+
1644
+ with self._usage_lock:
1645
+ return copy.deepcopy(self._usage)
1646
+
1647
+ def get_total_cost(self) -> float:
1648
+ """Get total cost across all models.
1649
+
1650
+ Thread-safe: uses lock for consistent read.
1651
+ """
1652
+ with self._usage_lock:
1653
+ return sum(u["cost_usd"] for u in self._usage.values())
1654
+
1655
+ def get_context_usage(self, context: str) -> dict[str, dict[str, Any]]:
1656
+ """Get usage statistics for a specific context.
1657
+
1658
+ Thread-safe: uses lock and returns a deep copy.
1659
+
1660
+ Args:
1661
+ context: Context identifier (e.g., filename)
1662
+
1663
+ Returns:
1664
+ Usage statistics for that context, or empty dict if not found
1665
+ """
1666
+ import copy
1667
+
1668
+ with self._usage_lock:
1669
+ return copy.deepcopy(self._context_usage.get(context, {}))
1670
+
1671
+ def get_context_cost(self, context: str) -> float:
1672
+ """Get total cost for a specific context.
1673
+
1674
+ Thread-safe: uses lock for consistent read.
1675
+
1676
+ Args:
1677
+ context: Context identifier (e.g., filename)
1678
+
1679
+ Returns:
1680
+ Total cost for that context
1681
+ """
1682
+ with self._usage_lock:
1683
+ context_usage = self._context_usage.get(context, {})
1684
+ return sum(u["cost_usd"] for u in context_usage.values())
1685
+
1686
+ def clear_context_usage(self, context: str) -> None:
1687
+ """Clear usage tracking for a specific context.
1688
+
1689
+ Thread-safe: uses lock for safe modification.
1690
+
1691
+ Args:
1692
+ context: Context identifier to clear
1693
+ """
1694
+ with self._usage_lock:
1695
+ self._context_usage.pop(context, None)
1696
+ self._call_counter.pop(context, None)
1697
+
1698
+ def get_cache_stats(self) -> dict[str, Any]:
1699
+ """Get cache statistics.
1700
+
1701
+ Returns:
1702
+ Dict with memory cache stats, persistent cache stats, and combined hit rate
1703
+ """
1704
+ total = self._cache_hits + self._cache_misses
1705
+ hit_rate = self._cache_hits / total if total > 0 else 0.0
1706
+ return {
1707
+ "memory": {
1708
+ "hits": self._cache_hits,
1709
+ "misses": self._cache_misses,
1710
+ "hit_rate": round(hit_rate * 100, 2),
1711
+ "size": self._cache.size,
1712
+ },
1713
+ "persistent": self._persistent_cache.stats(),
1714
+ }
1715
+
1716
+ def clear_cache(self, scope: str = "memory") -> dict[str, Any]:
1717
+ """Clear the content cache and reset statistics.
1718
+
1719
+ Args:
1720
+ scope: "memory" (in-memory only), "project", "global", or "all"
1721
+
1722
+ Returns:
1723
+ Dict with counts of cleared entries
1724
+ """
1725
+ result: dict[str, Any] = {"memory": 0, "project": 0, "global": 0}
1726
+
1727
+ if scope in ("memory", "all"):
1728
+ result["memory"] = self._cache.size
1729
+ self._cache.clear()
1730
+ self._cache_hits = 0
1731
+ self._cache_misses = 0
1732
+
1733
+ if scope in ("project", "global", "all"):
1734
+ persistent_result = self._persistent_cache.clear(scope)
1735
+ result["project"] = persistent_result.get("project", 0)
1736
+ result["global"] = persistent_result.get("global", 0)
1737
+
1738
+ return result
1739
+
1740
+ def clear_image_cache(self) -> None:
1741
+ """Clear the image cache to free memory after document processing."""
1742
+ self._image_cache.clear()
1743
+ self._image_cache_bytes = 0
1744
+
1745
+ def _get_cached_image(self, image_path: Path) -> tuple[bytes, str]:
1746
+ """Get image bytes and base64 encoding, using cache if available.
1747
+
1748
+ Uses LRU eviction when cache limits are reached (both count and bytes).
1749
+ Also ensures image is under 5MB limit for LLM API compatibility.
1750
+
1751
+ Args:
1752
+ image_path: Path to the image file
1753
+
1754
+ Returns:
1755
+ Tuple of (raw bytes, base64 encoded string)
1756
+ """
1757
+ path_key = str(image_path)
1758
+
1759
+ if path_key in self._image_cache:
1760
+ # Move to end for LRU (most recently used)
1761
+ self._image_cache.move_to_end(path_key)
1762
+ return self._image_cache[path_key]
1763
+
1764
+ # Read and encode image
1765
+ image_data = image_path.read_bytes()
1766
+
1767
+ # Check size limit (5MB for Anthropic/LiteLLM safety)
1768
+ # Using 4.5MB to be safe
1769
+ MAX_IMAGE_SIZE = 4.5 * 1024 * 1024
1770
+ if len(image_data) > MAX_IMAGE_SIZE:
1771
+ try:
1772
+ import io
1773
+
1774
+ from PIL import Image
1775
+
1776
+ with io.BytesIO(image_data) as buffer:
1777
+ img = Image.open(buffer)
1778
+ # Resize logic: iterative downscaling if needed
1779
+ quality = 85
1780
+ max_dim = 2048
1781
+
1782
+ while True:
1783
+ if max(img.size) > max_dim:
1784
+ img.thumbnail((max_dim, max_dim), Image.Resampling.LANCZOS)
1785
+
1786
+ out_buffer = io.BytesIO()
1787
+ # Use JPEG for compression efficiency unless transparency is needed
1788
+ fmt = "JPEG"
1789
+ if img.mode in ("RGBA", "LA") or (
1790
+ img.format and img.format.upper() == "PNG"
1791
+ ):
1792
+ # If PNG is too big, convert to JPEG (losing transparency) or resize more
1793
+ # For document analysis, JPEG is usually fine
1794
+ if len(image_data) > 8 * 1024 * 1024: # If huge, force JPEG
1795
+ img = img.convert("RGB")
1796
+ fmt = "JPEG"
1797
+ else:
1798
+ fmt = "PNG"
1799
+
1800
+ if fmt == "JPEG" and img.mode != "RGB":
1801
+ img = img.convert("RGB")
1802
+
1803
+ if fmt == "JPEG":
1804
+ img.save(out_buffer, format=fmt, quality=quality)
1805
+ else:
1806
+ img.save(out_buffer, format=fmt)
1807
+ new_data = out_buffer.getvalue()
1808
+
1809
+ if len(new_data) <= MAX_IMAGE_SIZE:
1810
+ image_data = new_data
1811
+ logger.debug(
1812
+ f"Resized large image {image_path.name}: {len(new_data) / 1024 / 1024:.2f}MB"
1813
+ )
1814
+ break
1815
+
1816
+ # If still too big, reduce quality/size
1817
+ if quality > 50 and fmt == "JPEG":
1818
+ quality -= 15
1819
+ else:
1820
+ max_dim = int(max_dim * 0.75)
1821
+ if max_dim < 512: # Safety floor
1822
+ logger.warning(
1823
+ f"Could not compress {image_path.name} below 5MB even at 512px"
1824
+ )
1825
+ break
1826
+
1827
+ except Exception as e:
1828
+ logger.warning(f"Failed to resize large image {image_path.name}: {e}")
1829
+
1830
+ base64_image = base64.b64encode(image_data).decode()
1831
+
1832
+ # Calculate entry size: raw bytes + base64 string (roughly 1.33x raw size)
1833
+ entry_bytes = len(image_data) + len(base64_image)
1834
+
1835
+ # Evict old entries if adding this would exceed limits
1836
+ while self._image_cache and (
1837
+ len(self._image_cache) >= self._image_cache_max_size
1838
+ or self._image_cache_bytes + entry_bytes > self._image_cache_max_bytes
1839
+ ):
1840
+ # Remove oldest entry (first item in OrderedDict)
1841
+ _, oldest_value = self._image_cache.popitem(last=False)
1842
+ old_bytes = len(oldest_value[0]) + len(oldest_value[1])
1843
+ self._image_cache_bytes -= old_bytes
1844
+
1845
+ # Cache if entry size is reasonable (skip very large single images)
1846
+ if entry_bytes < self._image_cache_max_bytes // 2:
1847
+ self._image_cache[path_key] = (image_data, base64_image)
1848
+ self._image_cache_bytes += entry_bytes
1849
+
1850
+ return image_data, base64_image
1851
+
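
The image cache above is an OrderedDict-based LRU bounded by both entry count and total bytes. A self-contained sketch of that eviction policy; the limits are assumptions for illustration, not markitai's defaults.

    # Illustrative LRU-with-byte-budget sketch.
    from collections import OrderedDict

    class ByteBudgetLRU:
        def __init__(self, max_items=32, max_bytes=64 * 1024 * 1024):
            self._data = OrderedDict()   # key -> bytes
            self._bytes = 0
            self._max_items = max_items
            self._max_bytes = max_bytes

        def get(self, key):
            if key in self._data:
                self._data.move_to_end(key)        # mark as most recently used
                return self._data[key]
            return None

        def put(self, key, value):
            size = len(value)
            # Evict least recently used entries until the new entry fits.
            while self._data and (
                len(self._data) >= self._max_items
                or self._bytes + size > self._max_bytes
            ):
                _, old = self._data.popitem(last=False)
                self._bytes -= len(old)
            if size < self._max_bytes // 2:        # skip very large single entries
                self._data[key] = value
                self._bytes += size
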
1852
+ @staticmethod
1853
+ def _smart_truncate(text: str, max_chars: int, preserve_end: bool = False) -> str:
1854
+ """Truncate text at sentence/paragraph boundary to preserve readability.
1855
+
1856
+ Instead of cutting at arbitrary positions, finds the nearest sentence
1857
+ or paragraph ending before the limit.
1858
+
1859
+ Args:
1860
+ text: Text to truncate
1861
+ max_chars: Maximum character limit
1862
+ preserve_end: If True, preserve the end instead of the beginning
1863
+
1864
+ Returns:
1865
+ Truncated text at a natural boundary
1866
+ """
1867
+ if len(text) <= max_chars:
1868
+ return text
1869
+
1870
+ if preserve_end:
1871
+ # Find a good starting point from the end
1872
+ search_start = len(text) - max_chars
1873
+ search_text = text[search_start : search_start + 500]
1874
+
1875
+ # Look for paragraph or sentence boundary
1876
+ for marker in ["\n\n", "\n", "。", ".", "!", "!", "?", "?"]:
1877
+ idx = search_text.find(marker)
1878
+ if idx != -1:
1879
+ return text[search_start + idx + len(marker) :]
1880
+
1881
+ return text[-max_chars:]
1882
+
1883
+ # Default: preserve beginning, find a good ending point
1884
+ search_text = (
1885
+ text[max_chars - 500 : max_chars + 200]
1886
+ if max_chars > 500
1887
+ else text[: max_chars + 200]
1888
+ )
1889
+ search_offset = max(0, max_chars - 500)
1890
+
1891
+ # Priority: paragraph > sentence > any break
1892
+ for marker in ["\n\n", "。\n", ".\n", "。", ".", "!", "!", "?", "?"]:
1893
+ idx = search_text.rfind(marker)
1894
+ if idx != -1:
1895
+ end_pos = search_offset + idx + len(marker)
1896
+ if (
1897
+ end_pos <= max_chars + 100
1898
+ ): # Allow slight overflow for better breaks
1899
+ return text[:end_pos].rstrip()
1900
+
1901
+ # Fall back to simple truncation
1902
+ return text[:max_chars]
1903
+
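
_smart_truncate prefers to cut at a paragraph or sentence boundary instead of mid-sentence. A simplified standalone version showing the idea (not the exact markitai algorithm, which also searches past the limit and handles CJK punctuation):

    # Simplified boundary-aware truncation sketch.
    def truncate_at_boundary(text: str, max_chars: int) -> str:
        if len(text) <= max_chars:
            return text
        window = text[:max_chars]
        for marker in ("\n\n", ". ", "! ", "? "):   # paragraph first, then sentences
            idx = window.rfind(marker)
            if idx != -1:
                return window[: idx + len(marker)].rstrip()
        return window                               # no boundary found: hard cut

    print(truncate_at_boundary("First sentence. Second sentence. Third.", 20))
    # -> "First sentence."
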
1904
+ @staticmethod
1905
+ def extract_protected_content(content: str) -> dict[str, list[str]]:
1906
+ """Extract content that must be preserved through LLM processing.
1907
+
1908
+ Extracts:
1909
+ - Image links: ![...](...)
1910
+ - Slide comments: <!-- Slide X --> or <!-- Slide number: X -->
1911
+ - Page number comments: <!-- Page number: X -->
1912
+ - Page image comments: <!-- ![Page X](...) --> and <!-- Page images... -->
1913
+
1914
+ Args:
1915
+ content: Original markdown content
1916
+
1917
+ Returns:
1918
+ Dict with 'images', 'slides', 'page_numbers', 'page_comments' lists
1919
+ """
1920
+ import re
1921
+
1922
+ protected: dict[str, list[str]] = {
1923
+ "images": [],
1924
+ "slides": [],
1925
+ "page_numbers": [],
1926
+ "page_comments": [],
1927
+ }
1928
+
1929
+ # Extract image links
1930
+ protected["images"] = re.findall(r"!\[[^\]]*\]\([^)]+\)", content)
1931
+
1932
+ # Extract slide comments: <!-- Slide X --> or <!-- Slide number: X -->
1933
+ protected["slides"] = re.findall(
1934
+ r"<!--\s*Slide\s+(?:number:\s*)?\d+\s*-->", content
1935
+ )
1936
+
1937
+ # Extract page number comments: <!-- Page number: X -->
1938
+ protected["page_numbers"] = re.findall(
1939
+ r"<!--\s*Page number:\s*\d+\s*-->", content
1940
+ )
1941
+
1942
+ # Extract page image comments
1943
+ # Pattern 1: <!-- Page images for reference -->
1944
+ # Pattern 2: <!-- ![Page X](screenshots/...) -->
1945
+ page_header_pattern = r"<!--\s*Page images for reference\s*-->"
1946
+ page_img_pattern = r"<!--\s*!\[Page\s+\d+\]\([^)]*\)\s*-->"
1947
+ protected["page_comments"] = re.findall(
1948
+ page_header_pattern, content
1949
+ ) + re.findall(page_img_pattern, content)
1950
+
1951
+ return protected
1952
+
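
The extraction above is plain regex matching over the markdown. A short demo of the image-link and slide-marker patterns against a made-up snippet:

    import re

    sample = "Intro\n\n<!-- Slide number: 2 -->\n\n![chart](assets/chart.png)\n"
    images = re.findall(r"!\[[^\]]*\]\([^)]+\)", sample)
    slides = re.findall(r"<!--\s*Slide\s+(?:number:\s*)?\d+\s*-->", sample)
    print(images)   # ['![chart](assets/chart.png)']
    print(slides)   # ['<!-- Slide number: 2 -->']
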
1953
+ @staticmethod
1954
+ def _protect_content(content: str) -> tuple[str, dict[str, str]]:
1955
+ """Replace protected content with placeholders before LLM processing.
1956
+
1957
+ This preserves the position of images, slides, and page comments
1958
+ by replacing them with unique placeholders that the LLM is unlikely
1959
+ to modify.
1960
+
1961
+ Args:
1962
+ content: Original markdown content
1963
+
1964
+ Returns:
1965
+ Tuple of (content with placeholders, mapping of placeholder -> original)
1966
+ """
1967
+ import re
1968
+
1969
+ mapping: dict[str, str] = {}
1970
+ result = content
1971
+
1972
+ # Note: Images are NOT protected anymore.
1973
+ # The prompt instructs LLM to preserve image positions and only add alt text.
1974
+ # Protecting images with placeholders caused issues where LLM would delete
1975
+ # the placeholders, and then images would be appended to the end of the file.
1976
+
1977
+ # 1. Protect Page number markers (PDF): <!-- Page number: X -->
1978
+ # These must stay at the beginning of each page's content
1979
+ page_num_pattern = r"<!--\s*Page number:\s*\d+\s*-->"
1980
+ for page_num_idx, match in enumerate(re.finditer(page_num_pattern, result)):
1981
+ placeholder = f"__MARKITAI_PAGENUM_{page_num_idx}__"
1982
+ mapping[placeholder] = match.group(0)
1983
+ result = result.replace(match.group(0), placeholder, 1)
1984
+
1985
+ # 2. Protect Slide number markers (PPTX/PPT): <!-- Slide number: X -->
1986
+ # These must stay at the beginning of each slide's content
1987
+ slide_num_pattern = r"<!--\s*Slide number:\s*\d+\s*-->"
1988
+ for slide_num_idx, match in enumerate(re.finditer(slide_num_pattern, result)):
1989
+ placeholder = f"__MARKITAI_SLIDENUM_{slide_num_idx}__"
1990
+ mapping[placeholder] = match.group(0)
1991
+ result = result.replace(match.group(0), placeholder, 1)
1992
+
1993
+ # 3. Protect page image comments: <!-- ![Page X](...) --> and <!-- Page images... -->
1994
+ # Use separate patterns for header and individual page image comments
1995
+ page_header_pattern = r"<!--\s*Page images for reference\s*-->"
1996
+ page_img_pattern = r"<!--\s*!\[Page\s+\d+\]\([^)]*\)\s*-->"
1997
+ page_idx = 0
1998
+ for match in re.finditer(page_header_pattern, result):
1999
+ placeholder = f"__MARKITAI_PAGE_{page_idx}__"
2000
+ mapping[placeholder] = match.group(0)
2001
+ result = result.replace(match.group(0), placeholder, 1)
2002
+ page_idx += 1
2003
+ for match in re.finditer(page_img_pattern, result):
2004
+ placeholder = f"__MARKITAI_PAGE_{page_idx}__"
2005
+ mapping[placeholder] = match.group(0)
2006
+ result = result.replace(match.group(0), placeholder, 1)
2007
+ page_idx += 1
2008
+
2009
+ return result, mapping
2010
+
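
The protect step above and the unprotect step below form a placeholder round trip: markers are swapped for stable tokens before the LLM call and swapped back afterwards. A minimal round-trip sketch, with a case-mangling function standing in for the LLM and only page-number markers handled:

    # Minimal placeholder round-trip sketch (simplified).
    import re

    def protect(text):
        mapping = {}
        out = text
        for i, m in enumerate(re.finditer(r"<!--\s*Page number:\s*\d+\s*-->", text)):
            placeholder = f"__MARKITAI_PAGENUM_{i}__"
            mapping[placeholder] = m.group(0)
            out = out.replace(m.group(0), placeholder, 1)
        return out, mapping

    def unprotect(text, mapping):
        for placeholder, original in mapping.items():
            text = text.replace(placeholder, original)
        return text

    doc = "<!-- Page number: 1 -->\nSome body text"
    masked, mapping = protect(doc)
    mangled = masked.upper()            # stand-in for an LLM rewriting the body
    print(unprotect(mangled, mapping))  # marker restored intact, body uppercased
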
2011
+ @staticmethod
2012
+ def _unprotect_content(
2013
+ content: str,
2014
+ mapping: dict[str, str],
2015
+ protected: dict[str, list[str]] | None = None,
2016
+ ) -> str:
2017
+ """Restore protected content from placeholders after LLM processing.
2018
+
2019
+ Also handles cases where the LLM removed placeholders by appending
2020
+ missing content at the end, and detects garbage content replacement.
2021
+
2022
+ Args:
2023
+ content: LLM output with placeholders
2024
+ mapping: Mapping of placeholder -> original content
2025
+ protected: Optional dict of protected content for fallback restoration
2026
+
2027
+ Returns:
2028
+ Content with placeholders replaced by original content
2029
+ """
2030
+ import re
2031
+
2032
+ result = content
2033
+
2034
+ # Remove any slide/page number comments that LLM hallucinated
2035
+ # These are NOT from our placeholders and should be removed
2036
+ # Pattern: <!-- Slide number: X --> or <!-- Page number: X -->
2037
+ hallucinated_slide_pattern = r"<!--\s*Slide\s+number:\s*\d+\s*-->\s*\n?"
2038
+ hallucinated_page_pattern = r"<!--\s*Page\s+number:\s*\d+\s*-->\s*\n?"
2039
+
2040
+ # Remove hallucinated markers BEFORE replacing placeholders:
2041
+ # - If original had markers (placeholders exist): ALL raw markers are hallucinated
2042
+ # because the real ones are protected as __MARKITAI_SLIDENUM_X__ placeholders
2043
+ # - If original had NO markers (placeholders empty): ALL raw markers are hallucinated
2044
+ # Either way, we should remove all raw slide/page markers at this point
2045
+ result = re.sub(hallucinated_slide_pattern, "", result)
2046
+ result = re.sub(hallucinated_page_pattern, "", result)
2047
+
2048
+ # First pass: replace placeholders with original content
2049
+ # Ensure page/slide number markers have proper blank lines around them
2050
+ for placeholder, original in mapping.items():
2051
+ # Check if this is a page or slide number marker
2052
+ is_page_slide_marker = "PAGENUM" in placeholder or "SLIDENUM" in placeholder
2053
+ if is_page_slide_marker:
2054
+ # Find the placeholder and ensure blank lines around it
2055
+ # Pattern: optional whitespace/newlines before placeholder
2056
+ pattern = rf"(\n*)\s*{re.escape(placeholder)}\s*(\n*)"
2057
+ match = re.search(pattern, result)
2058
+ if match:
2059
+ # Replace with proper spacing: \n\n before, \n\n after
2060
+ result = re.sub(pattern, f"\n\n{original}\n\n", result, count=1)
2061
+ else:
2062
+ result = result.replace(placeholder, original)
2063
+ else:
2064
+ result = result.replace(placeholder, original)
2065
+
2066
+ # Clean up any residual placeholders that LLM might have duplicated or misplaced
2067
+ # Pattern: __MARKITAI_*__ (any of our placeholder formats)
2068
+ residual_placeholder_pattern = r"__MARKITAI_[A-Z]+_\d+__\s*\n?"
2069
+ residual_count = len(re.findall(residual_placeholder_pattern, result))
2070
+ if residual_count > 0:
2071
+ logger.debug(
2072
+ f"Removing {residual_count} residual placeholders from LLM output"
2073
+ )
2074
+ result = re.sub(residual_placeholder_pattern, "", result)
2075
+
2076
+ # NOTE: Removed heuristic logic that auto-inserted images into short slide sections.
2077
+ # This caused false positives where legitimate short slides like "Agenda", "Thanks",
2078
+ # "Q&A" were incorrectly replaced with images. The LLM should preserve slide content
2079
+ # as-is, and missing images will be handled by the fallback restoration below.
2080
+
2081
+ # Second pass: if protected content was provided, restore any missing items
2082
+ # This handles cases where the LLM removed placeholders entirely
2083
+ if protected:
2084
+ import re
2085
+
2086
+ # Helper to check if an image is already in result (by filename)
2087
+ def image_exists_in_result(img_syntax: str, text: str) -> bool:
2088
+ """Check if image already exists in result by filename."""
2089
+ match = re.search(r"\]\(([^)]+)\)", img_syntax)
2090
+ if match:
2091
+ img_path = match.group(1)
2092
+ img_name = img_path.split("/")[-1]
2093
+ # Check if same filename exists in any image reference
2094
+ return bool(
2095
+ re.search(rf"!\[[^\]]*\]\([^)]*{re.escape(img_name)}\)", text)
2096
+ )
2097
+ return False
2098
+
2099
+ # Restore missing images at end (fallback)
2100
+ # Only restore if the image filename doesn't already exist
2101
+ for img in protected.get("images", []):
2102
+ if img not in result and not image_exists_in_result(img, result):
2103
+ match = re.search(r"\]\(([^)]+)\)", img)
2104
+ if match:
2105
+ img_name = match.group(1).split("/")[-1]
2106
+ logger.debug(f"Restoring missing image at end: {img_name}")
2107
+ result = result.rstrip() + "\n\n" + img
2108
+
2109
+ # Restore missing slide comments at heading boundaries
2110
+ # Key fix: Match slides to H1/H2 headings more intelligently
2111
+ missing_slides = [s for s in protected.get("slides", []) if s not in result]
2112
+ if missing_slides:
2113
+ slide_info = []
2114
+ for slide in missing_slides:
2115
+ # Support both "Slide X" and "Slide number: X" formats
2116
+ match = re.search(r"Slide\s+(?:number:\s*)?(\d+)", slide)
2117
+ if match:
2118
+ slide_info.append((int(match.group(1)), slide))
2119
+ slide_info.sort()
2120
+
2121
+ lines = result.split("\n")
2122
+ # Find H1 and H2 headings as potential slide boundaries
2123
+ heading_positions = [
2124
+ i
2125
+ for i, line in enumerate(lines)
2126
+ if line.startswith("# ") or line.startswith("## ")
2127
+ ]
2128
+
2129
+ # Only insert if we have matching heading positions
2130
+ # Don't append orphan slide comments to the end
2131
+ inserted_count = 0
2132
+ for idx, (slide_num, slide) in enumerate(slide_info):
2133
+ if idx < len(heading_positions):
2134
+ insert_pos = heading_positions[idx] + inserted_count * 2
2135
+ lines.insert(insert_pos, slide)
2136
+ lines.insert(insert_pos + 1, "")
2137
+ inserted_count += 1
2138
+ logger.debug(
2139
+ f"Restored slide {slide_num} before heading at line {insert_pos}"
2140
+ )
2141
+ # Don't append orphan slides to the end - they look wrong
2142
+ result = "\n".join(lines)
2143
+
2144
+ # Restore missing page number markers
2145
+ # Page number markers should be at the beginning of each page's content
2146
+ missing_page_nums = [
2147
+ p for p in protected.get("page_numbers", []) if p not in result
2148
+ ]
2149
+ if missing_page_nums:
2150
+ # Sort by page number
2151
+ page_info = []
2152
+ for page_marker in missing_page_nums:
2153
+ match = re.search(r"Page number:\s*(\d+)", page_marker)
2154
+ if match:
2155
+ page_info.append((int(match.group(1)), page_marker))
2156
+ page_info.sort()
2157
+
2158
+ # Find major content boundaries (H1/H2 headings) as insertion points
2159
+ lines = result.split("\n")
2160
+ heading_positions = []
2161
+ for i, line in enumerate(lines):
2162
+ if line.startswith("# ") or line.startswith("## "):
2163
+ heading_positions.append(i)
2164
+
2165
+ # Insert missing page markers before headings
2166
+ # Only when there are enough heading positions
2167
+ inserted_count = 0
2168
+ for idx, (page_num, marker) in enumerate(page_info):
2169
+ if idx < len(heading_positions):
2170
+ insert_pos = heading_positions[idx] + inserted_count * 2
2171
+ lines.insert(insert_pos, marker)
2172
+ lines.insert(insert_pos + 1, "")
2173
+ inserted_count += 1
2174
+ logger.debug(
2175
+ f"Restored page number {page_num} before heading at line {insert_pos}"
2176
+ )
2177
+
2178
+ result = "\n".join(lines)
2179
+
2180
+ # Restore missing page comments at end
2181
+ # Only restore if not already present (avoid duplicates)
2182
+ page_header = "<!-- Page images for reference -->"
2183
+ has_page_header = page_header in result
2184
+
2185
+ for comment in protected.get("page_comments", []):
2186
+ if comment not in result:
2187
+ # For page header, only add if not present
2188
+ if comment == page_header:
2189
+ if not has_page_header:
2190
+ result = result.rstrip() + "\n\n" + comment
2191
+ has_page_header = True
2192
+ # For individual page image comments, check if already exists
2193
+ else:
2194
+ # Extract page number to check for duplicates
2195
+ page_match = re.search(r"!\[Page\s+(\d+)\]", comment)
2196
+ if page_match:
2197
+ page_num = page_match.group(1)
2198
+ # Check if this page is already referenced (commented or not)
2199
+ page_pattern = rf"!\[Page\s+{page_num}\]"
2200
+ if not re.search(page_pattern, result):
2201
+ result = result.rstrip() + "\n" + comment
2202
+
2203
+ return result
2204
+
2205
+ @staticmethod
2206
+ def _fix_malformed_image_refs(text: str) -> str:
2207
+ """Fix malformed image references with extra closing parentheses.
2208
+
2209
+ Fixes cases like: ![alt](path.jpg)) -> ![alt](path.jpg)
2210
+
2211
+ This handles a common LLM output error where extra ) are added
2212
+ after image references. Uses context-aware parsing to avoid
2213
+ breaking legitimate nested structures like:
2214
+ - [![alt](img)](link) - clickable image
2215
+ - (text: ![alt](img)) - image inside parentheses
2216
+
2217
+ Args:
2218
+ text: Content that may contain malformed image refs
2219
+
2220
+ Returns:
2221
+ Content with fixed image references
2222
+ """
2223
+ result = []
2224
+ i = 0
2225
+ while i < len(text):
2226
+ # Check for image reference start: ![
2227
+ if text[i : i + 2] == "![":
2228
+ # Find the ]( delimiter
2229
+ bracket_end = text.find("](", i + 2)
2230
+ if bracket_end != -1:
2231
+ # Find the matching ) for the image path
2232
+ # Handle nested parens in path like: ![alt](path(1).jpg)
2233
+ paren_start = bracket_end + 2
2234
+ paren_count = 1
2235
+ j = paren_start
2236
+ while j < len(text) and paren_count > 0:
2237
+ if text[j] == "(":
2238
+ paren_count += 1
2239
+ elif text[j] == ")":
2240
+ paren_count -= 1
2241
+ j += 1
2242
+
2243
+ # j now points to position after the closing )
2244
+ img_ref = text[i:j]
2245
+ result.append(img_ref)
2246
+
2247
+ # Count extra ) immediately after the image ref
2248
+ extra_parens = 0
2249
+ while (
2250
+ j + extra_parens < len(text) and text[j + extra_parens] == ")"
2251
+ ):
2252
+ extra_parens += 1
2253
+
2254
+ if extra_parens > 0:
2255
+ # Check if these ) are legitimate closers for outer parens
2256
+ # by counting unmatched ( in the content before this image
2257
+ prefix = "".join(
2258
+ result[:-1]
2259
+ ) # Exclude the image ref just added
2260
+ open_parens = prefix.count("(") - prefix.count(")")
2261
+
2262
+ # Only keep ) that match unclosed (
2263
+ keep_parens = min(extra_parens, max(0, open_parens))
2264
+ result.append(")" * keep_parens)
2265
+ i = j + extra_parens
2266
+ else:
2267
+ i = j
2268
+ continue
2269
+
2270
+ result.append(text[i])
2271
+ i += 1
2272
+
2273
+ return "".join(result)
2274
+
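
If markitai is installed, the repair behaviour is easy to poke at directly, since the helper is a staticmethod on LLMProcessor (calling a private method like this is purely for illustration):

    # Assumes markitai is installed; private helper called only to illustrate behaviour.
    from markitai.llm import LLMProcessor

    fix = LLMProcessor._fix_malformed_image_refs
    print(fix("![alt](img.jpg))"))        # stray ')' dropped -> ![alt](img.jpg)
    print(fix("(see ![alt](img.jpg))"))   # balanced outer paren kept, unchanged
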
2275
+ @staticmethod
2276
+ def restore_protected_content(result: str, protected: dict[str, list[str]]) -> str:
2277
+ """Restore any protected content that was lost during LLM processing.
2278
+
2279
+ Legacy method - use _unprotect_content for new code.
2280
+
2281
+ Args:
2282
+ result: LLM output
2283
+ protected: Dict of protected content from extract_protected_content
2284
+
2285
+ Returns:
2286
+ Result with missing protected content restored
2287
+ """
2288
+ return LLMProcessor._unprotect_content(result, {}, protected)
2289
+
2290
+ async def clean_markdown(self, content: str, context: str = "") -> str:
2291
+ """
2292
+ Clean and optimize markdown content.
2293
+
2294
+ Uses placeholder-based protection to preserve images, slides, and
2295
+ page comments in their original positions during LLM processing.
2296
+
2297
+ Cache lookup order:
2298
+ 1. In-memory cache (session-level, fast)
2299
+ 2. Persistent cache (cross-session, SQLite)
2300
+ 3. LLM API call
2301
+
2302
+ Args:
2303
+ content: Raw markdown content
2304
+ context: Context identifier for logging (e.g., filename)
2305
+
2306
+ Returns:
2307
+ Cleaned markdown content
2308
+ """
2309
+ cache_key = "cleaner"
2310
+
2311
+ # 1. Check in-memory cache first (fastest)
2312
+ cached = self._cache.get(cache_key, content)
2313
+ if cached is not None:
2314
+ self._cache_hits += 1
2315
+ logger.debug(
2316
+ f"[{_context_display_name(context)}] Memory cache hit for clean_markdown"
2317
+ )
2318
+ return cached
2319
+
2320
+ # 2. Check persistent cache (cross-session)
2321
+ cached = self._persistent_cache.get(cache_key, content, context=context)
2322
+ if cached is not None:
2323
+ self._cache_hits += 1
2324
+ logger.debug(
2325
+ f"[{_context_display_name(context)}] Persistent cache hit for clean_markdown"
2326
+ )
2327
+ # Also populate in-memory cache for faster subsequent access
2328
+ self._cache.set(cache_key, content, cached)
2329
+ return cached
2330
+
2331
+ self._cache_misses += 1
2332
+
2333
+ # 3. Extract and protect content before LLM processing
2334
+ protected = self.extract_protected_content(content)
2335
+ protected_content, mapping = self._protect_content(content)
2336
+
2337
+ prompt = self._prompt_manager.get_prompt("cleaner", content=protected_content)
2338
+
2339
+ response = await self._call_llm(
2340
+ model="default",
2341
+ messages=[{"role": "user", "content": prompt}],
2342
+ context=context,
2343
+ )
2344
+
2345
+ # Restore protected content from placeholders, with fallback for removed items
2346
+ result = self._unprotect_content(response.content, mapping, protected)
2347
+
2348
+ # Cache the result in both layers
2349
+ self._cache.set(cache_key, content, result)
2350
+ self._persistent_cache.set(cache_key, content, result, model="default")
2351
+
2352
+ return result
2353
+
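
The memory-then-persistent lookup order used by clean_markdown (and by generate_frontmatter below) reduces to a promote-on-hit pattern. A generic sketch, with plain dicts standing in for the two cache layers:

    # Generic two-tier lookup sketch; plain dicts stand in for the real cache layers.
    def cached_lookup(key, memory, persistent):
        if key in memory:               # 1. in-memory, session-scoped
            return memory[key]
        value = persistent.get(key)     # 2. persistent, cross-session
        if value is not None:
            memory[key] = value         # promote for faster re-access
            return value
        return None                     # 3. miss: caller calls the LLM and fills both layers
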
2354
+ async def generate_frontmatter(
2355
+ self,
2356
+ content: str,
2357
+ source: str,
2358
+ ) -> str:
2359
+ """
2360
+ Generate YAML frontmatter for markdown content.
2361
+
2362
+ Cache lookup order:
2363
+ 1. In-memory cache (session-level, fast)
2364
+ 2. Persistent cache (cross-session, SQLite)
2365
+ 3. LLM API call
2366
+
2367
+ Args:
2368
+ content: Markdown content
2369
+ source: Source file name
2370
+
2371
+ Returns:
2372
+ YAML frontmatter string (without --- markers)
2373
+ """
2374
+ cache_key = f"frontmatter:{source}"
2375
+
2376
+ # 1. Check in-memory cache first (fastest)
2377
+ cached = self._cache.get(cache_key, content)
2378
+ if cached is not None:
2379
+ self._cache_hits += 1
2380
+ logger.debug(f"[{source}] Memory cache hit for generate_frontmatter")
2381
+ return cached
2382
+
2383
+ # 2. Check persistent cache (cross-session)
2384
+ cached = self._persistent_cache.get(cache_key, content, context=source)
2385
+ if cached is not None:
2386
+ self._cache_hits += 1
2387
+ logger.debug(f"[{source}] Persistent cache hit for generate_frontmatter")
2388
+ # Also populate in-memory cache for faster subsequent access
2389
+ self._cache.set(cache_key, content, cached)
2390
+ return cached
2391
+
2392
+ self._cache_misses += 1
2393
+
2394
+ # 3. Detect document language
2395
+ language = get_language_name(detect_language(content))
2396
+
2397
+ prompt = self._prompt_manager.get_prompt(
2398
+ "frontmatter",
2399
+ content=self._smart_truncate(content, 4000),
2400
+ source=source,
2401
+ language=language,
2402
+ )
2403
+
2404
+ response = await self._call_llm(
2405
+ model="default",
2406
+ messages=[{"role": "user", "content": prompt}],
2407
+ context=source,
2408
+ )
2409
+
2410
+ result = response.content
2411
+
2412
+ # Cache the result in both layers
2413
+ self._cache.set(cache_key, content, result)
2414
+ self._persistent_cache.set(cache_key, content, result, model="default")
2415
+
2416
+ return result
2417
+
2418
+ async def analyze_image(
2419
+ self, image_path: Path, language: str = "en", context: str = ""
2420
+ ) -> ImageAnalysis:
2421
+ """
2422
+ Analyze an image using vision model.
2423
+
2424
+ Uses Instructor for structured output with fallback mechanisms:
2425
+ 1. Try Instructor with structured output
2426
+ 2. Fallback to JSON mode + manual parsing
2427
+ 3. Fallback to original two-call method
2428
+
2429
+ Args:
2430
+ image_path: Path to the image file
2431
+ language: Language for output (e.g., "en", "zh")
2432
+ context: Context identifier for usage tracking (e.g., source filename)
2433
+
2434
+ Returns:
2435
+ ImageAnalysis with caption and description
2436
+ """
2437
+ # Filter unsupported image formats (SVG, BMP, ICO etc.)
2438
+ if not is_llm_supported_image(image_path.suffix):
2439
+ logger.debug(
2440
+ f"[{image_path.name}] Skipping unsupported format: {image_path.suffix}"
2441
+ )
2442
+ return ImageAnalysis(
2443
+ caption=image_path.stem,
2444
+ description=f"Image format {image_path.suffix} not supported for analysis",
2445
+ )
2446
+
2447
+ # Get cached image data and base64 encoding
2448
+ _, base64_image = self._get_cached_image(image_path)
2449
+
2450
+ # Check persistent cache using image hash + language as key
2451
+ # Use SHA256 hash of base64 as image fingerprint to avoid collisions
2452
+ # (JPEG files share the same header, so first N chars are identical)
2453
+ cache_key = f"image_analysis:{language}"
2454
+ image_fingerprint = hashlib.sha256(base64_image.encode()).hexdigest()
2455
+ cached = self._persistent_cache.get(
2456
+ cache_key, image_fingerprint, context=context
2457
+ )
2458
+ if cached is not None:
2459
+ logger.debug(f"[{image_path.name}] Persistent cache hit for analyze_image")
2460
+ # Reconstruct ImageAnalysis from cached dict
2461
+ return ImageAnalysis(
2462
+ caption=cached.get("caption", ""),
2463
+ description=cached.get("description", ""),
2464
+ extracted_text=cached.get("extracted_text"),
2465
+ )
2466
+
2467
+ # Determine MIME type
2468
+ mime_type = get_mime_type(image_path.suffix)
2469
+
2470
+ # Language instruction
2471
+ lang_instruction = (
2472
+ "Output in English." if language == "en" else "使用中文输出。"
2473
+ )
2474
+
2475
+ # Get combined prompt
2476
+ prompt = self._prompt_manager.get_prompt("image_analysis")
2477
+ prompt = prompt.replace(
2478
+ "**输出语言必须与源文档保持一致** - 英文文档用英文,中文文档用中文",
2479
+ lang_instruction,
2480
+ )
2481
+
2482
+ # Build message with image
2483
+ messages = [
2484
+ {
2485
+ "role": "user",
2486
+ "content": [
2487
+ {"type": "text", "text": prompt},
2488
+ {
2489
+ "type": "image_url",
2490
+ "image_url": {"url": f"data:{mime_type};base64,{base64_image}"},
2491
+ },
2492
+ ],
2493
+ }
2494
+ ]
2495
+
2496
+ # Use "default" model name - smart router will auto-select vision-capable model
2497
+ # since the message contains image content
2498
+ vision_model = "default"
2499
+
2500
+ # Try structured output methods with fallbacks
2501
+ result = await self._analyze_image_with_fallback(
2502
+ messages, vision_model, image_path.name, context
2503
+ )
2504
+
2505
+ # Store in persistent cache
2506
+ cache_value = {
2507
+ "caption": result.caption,
2508
+ "description": result.description,
2509
+ "extracted_text": result.extracted_text,
2510
+ }
2511
+ self._persistent_cache.set(
2512
+ cache_key, image_fingerprint, cache_value, model="vision"
2513
+ )
2514
+
2515
+ return result
2516
+
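
The persistent-cache key for image analysis combines the prompt family, the output language, and a SHA-256 fingerprint of the base64 payload, so identical image bytes hit the same entry regardless of filename. A sketch of the fingerprinting step:

    # Fingerprinting sketch; the bytes are a placeholder, not a real image.
    import base64
    import hashlib

    image_bytes = b"\x89PNG\r\n\x1a\n..."
    b64 = base64.b64encode(image_bytes).decode()
    fingerprint = hashlib.sha256(b64.encode()).hexdigest()
    cache_key = "image_analysis:en"     # language-scoped key family
    print(cache_key, fingerprint[:16], "...")
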
2517
+ async def analyze_images_batch(
2518
+ self,
2519
+ image_paths: list[Path],
2520
+ language: str = "en",
2521
+ max_images_per_batch: int = DEFAULT_MAX_IMAGES_PER_BATCH,
2522
+ context: str = "",
2523
+ ) -> list[ImageAnalysis]:
2524
+ """
2525
+ Analyze multiple images in batches with parallel execution.
2526
+
2527
+ Batches are processed concurrently using asyncio.gather for better
2528
+ throughput. LLM concurrency is controlled by the shared semaphore.
2529
+
2530
+ Args:
2531
+ image_paths: List of image paths to analyze
2532
+ language: Language for output ("en" or "zh")
2533
+ max_images_per_batch: Max images per LLM call (default 10)
2534
+ context: Context identifier for usage tracking (e.g., source filename)
2535
+
2536
+ Returns:
2537
+ List of ImageAnalysis results in same order as input
2538
+ """
2539
+ if not image_paths:
2540
+ return []
2541
+
2542
+ # Split into batches
2543
+ num_batches = (
2544
+ len(image_paths) + max_images_per_batch - 1
2545
+ ) // max_images_per_batch
2546
+
2547
+ batches: list[tuple[int, list[Path]]] = []
2548
+ for batch_num in range(num_batches):
2549
+ batch_start = batch_num * max_images_per_batch
2550
+ batch_end = min(batch_start + max_images_per_batch, len(image_paths))
2551
+ batch_paths = image_paths[batch_start:batch_end]
2552
+ batches.append((batch_num, batch_paths))
2553
+
2554
+ # Limit concurrent batches to avoid memory pressure from loading all images
2555
+ # at once. The semaphore controls LLM API calls, but images are loaded
2556
+ # before acquiring the semaphore. This batch-level limit prevents that.
2557
+ max_concurrent_batches = min(self.config.concurrency, num_batches)
2558
+ batch_semaphore = asyncio.Semaphore(max_concurrent_batches)
2559
+
2560
+ display_name = _context_display_name(context)
2561
+ logger.info(
2562
+ f"[{display_name}] Analyzing {len(image_paths)} images in "
2563
+ f"{num_batches} batches (max {max_concurrent_batches} concurrent)"
2564
+ )
2565
+
2566
+ # Process batches with backpressure and streaming
2567
+ async def process_batch(
2568
+ batch_num: int, batch_paths: list[Path]
2569
+ ) -> tuple[int, list[ImageAnalysis]]:
2570
+ """Process a single batch with backpressure control."""
2571
+ async with batch_semaphore:
2572
+ try:
2573
+ results = await self.analyze_batch(batch_paths, language, context)
2574
+ return (batch_num, results)
2575
+ except Exception as e:
2576
+ logger.warning(
2577
+ f"[{display_name}] Batch {batch_num + 1}/{num_batches} failed: {e}"
2578
+ )
2579
+ # Return empty results with placeholder for failed images
2580
+ return (
2581
+ batch_num,
2582
+ [
2583
+ ImageAnalysis(
2584
+ caption=f"Image {i + 1}",
2585
+ description="Analysis failed",
2586
+ )
2587
+ for i in range(len(batch_paths))
2588
+ ],
2589
+ )
2590
+
2591
+ # Launch all batches and process results as they complete
2592
+ # Using as_completed allows earlier batches to free resources sooner
2593
+ tasks = {
2594
+ asyncio.create_task(process_batch(batch_num, paths)): batch_num
2595
+ for batch_num, paths in batches
2596
+ }
2597
+
2598
+ batch_results: list[tuple[int, list[ImageAnalysis]]] = []
2599
+ for coro in asyncio.as_completed(tasks.keys()):
2600
+ try:
2601
+ result = await coro
2602
+ batch_results.append(result)
2603
+ except Exception as e:
2604
+ # Failures inside process_batch are already handled; log anything unexpected
2605
+ logger.error(f"[{display_name}] Batch processing error: {e}")
2606
+
2607
+ # Sort by batch number and flatten results
2608
+ batch_results_sorted = sorted(batch_results, key=lambda x: x[0])
2609
+ all_results: list[ImageAnalysis] = []
2610
+ for _, results in batch_results_sorted:
2611
+ all_results.extend(results)
2612
+
2613
+ return all_results
2614
+
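
Stripped of caching and error handling, the batching strategy above is: chunk the inputs, bound batch concurrency with a semaphore, consume completions as they finish, then restore input order. A compact sketch with stand-in work instead of real LLM calls:

    import asyncio

    async def run_batches(items, batch_size=10, max_concurrent=4):
        batches = [items[i:i + batch_size] for i in range(0, len(items), batch_size)]
        sem = asyncio.Semaphore(min(max_concurrent, len(batches) or 1))

        async def worker(idx, batch):
            async with sem:
                await asyncio.sleep(0)                       # stand-in for the LLM call
                return idx, [f"analyzed:{name}" for name in batch]

        tasks = [asyncio.create_task(worker(i, b)) for i, b in enumerate(batches)]
        done = [await fut for fut in asyncio.as_completed(tasks)]
        done.sort(key=lambda pair: pair[0])                  # restore original batch order
        return [r for _, batch_results in done for r in batch_results]

    print(asyncio.run(run_batches([f"img{i}" for i in range(23)])))
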
2615
+ async def analyze_batch(
2616
+ self,
2617
+ image_paths: list[Path],
2618
+ language: str,
2619
+ context: str = "",
2620
+ ) -> list[ImageAnalysis]:
2621
+ """Batch image analysis using Instructor.
2622
+
2623
+ Uses the same prompt template as single image analysis for consistency.
2624
+ Checks persistent cache first and only calls LLM for uncached images.
2625
+
2626
+ Args:
2627
+ image_paths: List of image paths to analyze
2628
+ language: Language for output ("en" or "zh")
2629
+ context: Context identifier for usage tracking
2630
+
2631
+ Returns:
2632
+ List of ImageAnalysis results
2633
+ """
2634
+ # Filter unsupported formats and track their indices
2635
+ unsupported_results: dict[int, ImageAnalysis] = {}
2636
+ supported_paths: list[tuple[int, Path]] = []
2637
+ for i, image_path in enumerate(image_paths):
2638
+ if not is_llm_supported_image(image_path.suffix):
2639
+ logger.debug(
2640
+ f"[{image_path.name}] Skipping unsupported format: {image_path.suffix}"
2641
+ )
2642
+ unsupported_results[i] = ImageAnalysis(
2643
+ caption=image_path.stem,
2644
+ description=f"Image format {image_path.suffix} not supported for analysis",
2645
+ )
2646
+ else:
2647
+ supported_paths.append((i, image_path))
2648
+
2649
+ # If all images are unsupported, return placeholder results
2650
+ if not supported_paths:
2651
+ return [unsupported_results[i] for i in range(len(image_paths))]
2652
+
2653
+ # Check persistent cache for all images first
2654
+ # Use same cache key format as analyze_image for consistency
2655
+ cache_key = f"image_analysis:{language}"
2656
+ cached_results: dict[int, ImageAnalysis] = {}
2657
+ uncached_indices: list[int] = []
2658
+ image_fingerprints: dict[int, str] = {}
2659
+
2660
+ for orig_idx, image_path in supported_paths:
2661
+ _, base64_image = self._get_cached_image(image_path)
2662
+ # Use SHA256 hash to avoid collisions (JPEG files share same header)
2663
+ fingerprint = hashlib.sha256(base64_image.encode()).hexdigest()
2664
+ image_fingerprints[orig_idx] = fingerprint
2665
+
2666
+ cached = self._persistent_cache.get(cache_key, fingerprint, context=context)
2667
+ if cached is not None:
2668
+ logger.debug(f"[{image_path.name}] Cache hit in batch analysis")
2669
+ cached_results[orig_idx] = ImageAnalysis(
2670
+ caption=cached.get("caption", ""),
2671
+ description=cached.get("description", ""),
2672
+ extracted_text=cached.get("extracted_text"),
2673
+ )
2674
+ else:
2675
+ uncached_indices.append(orig_idx)
2676
+
2677
+ # If all supported images are cached, return merged results
2678
+ display_name = _context_display_name(context)
2679
+ if not uncached_indices:
2680
+ logger.info(
2681
+ f"[{display_name}] All {len(supported_paths)} supported images found in cache"
2682
+ )
2683
+ # Merge unsupported and cached results
2684
+ return [
2685
+ unsupported_results.get(i) or cached_results[i]
2686
+ for i in range(len(image_paths))
2687
+ ]
2688
+
2689
+ # Only process uncached images
2690
+ uncached_paths = [image_paths[i] for i in uncached_indices]
2691
+ logger.debug(
2692
+ f"[{display_name}] Cache: {len(cached_results)} hits, "
2693
+ f"{len(uncached_indices)} misses"
2694
+ )
2695
+
2696
+ # Get base prompt from template (same as single image analysis)
2697
+ lang_instruction = (
2698
+ "Output in English." if language == "en" else "使用中文输出。"
2699
+ )
2700
+ base_prompt = self._prompt_manager.get_prompt("image_analysis")
2701
+ base_prompt = base_prompt.replace(
2702
+ "**输出语言必须与源文档保持一致** - 英文文档用英文,中文文档用中文",
2703
+ lang_instruction,
2704
+ )
2705
+
2706
+ # Build batch prompt with the same base prompt
2707
+ batch_header = (
2708
+ f"请依次分析以下 {len(uncached_paths)} 张图片。对每张图片,"
2709
+ if language == "zh"
2710
+ else f"Analyze the following {len(uncached_paths)} images in order. For each image, "
2711
+ )
2712
+ prompt = f"{batch_header}{base_prompt}\n\nReturn a JSON object with an 'images' array containing results for each image in order."
2713
+
2714
+ # Build content parts with uncached images only
2715
+ content_parts: list[dict] = [{"type": "text", "text": prompt}]
2716
+
2717
+ for i, image_path in enumerate(uncached_paths, 1):
2718
+ _, base64_image = self._get_cached_image(image_path)
2719
+ mime_type = get_mime_type(image_path.suffix)
2720
+
2721
+ # Unique image label that won't conflict with document content
2722
+ content_parts.append(
2723
+ {"type": "text", "text": f"\n__MARKITAI_IMG_LABEL_{i}__"}
2724
+ )
2725
+ content_parts.append(
2726
+ {
2727
+ "type": "image_url",
2728
+ "image_url": {"url": f"data:{mime_type};base64,{base64_image}"},
2729
+ }
2730
+ )
2731
+
2732
+ try:
2733
+ async with self.semaphore:
2734
+ # Calculate dynamic max_tokens
2735
+ messages = [{"role": "user", "content": content_parts}]
2736
+ max_tokens = self._calculate_dynamic_max_tokens(messages)
2737
+
2738
+ client = instructor.from_litellm(
2739
+ self.vision_router.acompletion, mode=instructor.Mode.JSON
2740
+ )
2741
+ # max_retries allows Instructor to retry with validation error
2742
+ # feedback, which helps LLM fix JSON escaping issues
2743
+ (
2744
+ response,
2745
+ raw_response,
2746
+ ) = await client.chat.completions.create_with_completion(
2747
+ model="default",
2748
+ messages=cast(
2749
+ list[ChatCompletionMessageParam],
2750
+ messages,
2751
+ ),
2752
+ response_model=BatchImageAnalysisResult,
2753
+ max_retries=DEFAULT_INSTRUCTOR_MAX_RETRIES,
2754
+ max_tokens=max_tokens,
2755
+ )
2756
+
2757
+ # Check for truncation
2758
+ if hasattr(raw_response, "choices") and raw_response.choices:
2759
+ finish_reason = getattr(
2760
+ raw_response.choices[0], "finish_reason", None
2761
+ )
2762
+ if finish_reason == "length":
2763
+ raise ValueError("Output truncated due to max_tokens limit")
2764
+
2765
+ # Track usage
2766
+ actual_model = getattr(raw_response, "model", None) or "default"
2767
+ input_tokens = 0
2768
+ output_tokens = 0
2769
+ cost = 0.0
2770
+ if hasattr(raw_response, "usage") and raw_response.usage is not None:
2771
+ input_tokens = getattr(raw_response.usage, "prompt_tokens", 0) or 0
2772
+ output_tokens = (
2773
+ getattr(raw_response.usage, "completion_tokens", 0) or 0
2774
+ )
2775
+ try:
2776
+ cost = completion_cost(completion_response=raw_response)
2777
+ except Exception:
2778
+ cost = 0.0
2779
+ self._track_usage(
2780
+ actual_model, input_tokens, output_tokens, cost, context
2781
+ )
2782
+
2783
+ # Calculate per-image usage (divide batch usage by number of images)
2784
+ num_images = max(len(response.images), 1)
2785
+ per_image_llm_usage: LLMUsageByModel = {
2786
+ actual_model: cast(
2787
+ "ModelUsageStats",
2788
+ {
2789
+ "requests": 1, # Each image counts as 1 request share
2790
+ "input_tokens": input_tokens // num_images,
2791
+ "output_tokens": output_tokens // num_images,
2792
+ "cost_usd": cost / num_images,
2793
+ },
2794
+ )
2795
+ }
2796
+
2797
+ # Convert to ImageAnalysis list and store in cache
2798
+ new_results: list[ImageAnalysis] = []
2799
+ for idx, img_result in enumerate(response.images):
2800
+ analysis = ImageAnalysis(
2801
+ caption=img_result.caption,
2802
+ description=img_result.description,
2803
+ extracted_text=img_result.extracted_text,
2804
+ llm_usage=per_image_llm_usage,
2805
+ )
2806
+ new_results.append(analysis)
2807
+
2808
+ # Store in persistent cache using original index
2809
+ if idx < len(uncached_indices):
2810
+ original_idx = uncached_indices[idx]
2811
+ fingerprint = image_fingerprints[original_idx]
2812
+ cache_value = {
2813
+ "caption": analysis.caption,
2814
+ "description": analysis.description,
2815
+ "extracted_text": analysis.extracted_text,
2816
+ }
2817
+ self._persistent_cache.set(
2818
+ cache_key, fingerprint, cache_value, model="vision"
2819
+ )
2820
+
2821
+ # Ensure we have results for all uncached images
2822
+ while len(new_results) < len(uncached_paths):
2823
+ new_results.append(
2824
+ ImageAnalysis(
2825
+ caption="Image",
2826
+ description="Image analysis failed",
2827
+ extracted_text=None,
2828
+ llm_usage=per_image_llm_usage,
2829
+ )
2830
+ )
2831
+
2832
+ # Merge unsupported, cached and new results in original order
2833
+ final_results: list[ImageAnalysis] = []
2834
+ new_result_iter = iter(new_results)
2835
+ for i in range(len(image_paths)):
2836
+ if i in unsupported_results:
2837
+ final_results.append(unsupported_results[i])
2838
+ elif i in cached_results:
2839
+ final_results.append(cached_results[i])
2840
+ else:
2841
+ final_results.append(next(new_result_iter))
2842
+
2843
+ return final_results
2844
+
2845
+ except Exception as e:
2846
+ logger.warning(
2847
+ f"Batch image analysis failed: {e}, falling back to individual analysis"
2848
+ )
2849
+ # Fallback: analyze each image individually (uses persistent cache)
2850
+ # Pass context to maintain accurate per-file usage tracking
2851
+ # Note: cached_results may already have some hits from the initial check
2852
+ fallback_results: list[ImageAnalysis] = []
2853
+ for i, image_path in enumerate(image_paths):
2854
+ if i in unsupported_results:
2855
+ # Use unsupported placeholder result
2856
+ fallback_results.append(unsupported_results[i])
2857
+ elif i in cached_results:
2858
+ # Use already-cached result
2859
+ fallback_results.append(cached_results[i])
2860
+ else:
2861
+ try:
2862
+ # analyze_image will also check/populate cache
2863
+ result = await self.analyze_image(image_path, language, context)
2864
+ fallback_results.append(result)
2865
+ except Exception:
2866
+ fallback_results.append(
2867
+ ImageAnalysis(
2868
+ caption="Image",
2869
+ description="Image analysis failed",
2870
+ extracted_text=None,
2871
+ )
2872
+ )
2873
+ return fallback_results
2874
+
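
When a whole batch is billed by a single API call, the per-image usage attached to each ImageAnalysis is simply the batch totals divided by the number of images. A worked example with made-up numbers:

    # Worked example of the per-image usage split (numbers are invented).
    input_tokens, output_tokens, cost_usd = 4200, 900, 0.0126
    num_images = 3
    per_image = {
        "requests": 1,                                  # each image counts as one request share
        "input_tokens": input_tokens // num_images,     # 1400
        "output_tokens": output_tokens // num_images,   # 300
        "cost_usd": cost_usd / num_images,              # one third of the batch cost
    }
    print(per_image)
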
2875
+ def _get_actual_model_name(self, logical_name: str) -> str:
2876
+ """Get actual model name from router configuration."""
2877
+ for model_config in self.config.model_list:
2878
+ if model_config.model_name == logical_name:
2879
+ return model_config.litellm_params.model
2880
+ # Fallback to first model if logical name not found
2881
+ if self.config.model_list:
2882
+ return self.config.model_list[0].litellm_params.model
2883
+ return "gpt-4o-mini" # Ultimate fallback
2884
+
2885
+ async def _analyze_image_with_fallback(
2886
+ self,
2887
+ messages: list[dict],
2888
+ model: str,
2889
+ image_name: str,
2890
+ context: str = "",
2891
+ ) -> ImageAnalysis:
2892
+ """
2893
+ Analyze image with multiple fallback strategies.
2894
+
2895
+ Strategy 1: Instructor structured output (most precise)
2896
+ Strategy 2: JSON mode + manual parsing
2897
+ Strategy 3: Original two-call method (most compatible)
2898
+
2899
+ Args:
2900
+ messages: LLM messages with image
2901
+ model: Model name to use
2902
+ image_name: Image filename for logging
2903
+ context: Context identifier for usage tracking
2904
+ """
2905
+ # Strategy 1: Try Instructor
2906
+ try:
2907
+ # Deep copy to prevent Instructor from modifying original messages
2908
+ result = await self._analyze_with_instructor(
2909
+ copy.deepcopy(messages), model, context
2910
+ )
2911
+ return result
2912
+ except Exception as e:
2913
+ logger.debug(f"[{image_name}] Instructor failed: {e}, trying JSON mode")
2914
+
2915
+ # Strategy 2: Try JSON mode
2916
+ try:
2917
+ result = await self._analyze_with_json_mode(
2918
+ copy.deepcopy(messages), model, context
2919
+ )
2920
+ logger.debug(f"[{image_name}] Used JSON mode fallback")
2921
+ return result
2922
+ except Exception as e:
2923
+ logger.debug(
2924
+ f"[{image_name}] JSON mode failed: {e}, using two-call fallback"
2925
+ )
2926
+
2927
+ # Strategy 3: Original two-call method
2928
+ return await self._analyze_with_two_calls(
2929
+ copy.deepcopy(messages), model, context=context or image_name
2930
+ )
2931
+
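
The three strategies are attempted strictly in order of precision, and only the last one is allowed to raise to the caller. As a standalone pattern it reduces to an ordered fallback chain:

    # Generic ordered-fallback sketch mirroring the three-strategy cascade above.
    # Assumes at least one strategy is supplied.
    async def first_that_works(*strategies):
        for strategy in strategies[:-1]:
            try:
                return await strategy()
            except Exception:
                continue                   # fall through to the next, more compatible path
        return await strategies[-1]()      # final strategy's error propagates
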
2932
+ async def _analyze_with_instructor(
2933
+ self,
2934
+ messages: list[dict],
2935
+ model: str,
2936
+ context: str = "",
2937
+ ) -> ImageAnalysis:
2938
+ """Analyze using Instructor for structured output."""
2939
+ async with self.semaphore:
2940
+ # Calculate dynamic max_tokens
2941
+ max_tokens = self._calculate_dynamic_max_tokens(messages)
2942
+
2943
+ # Create instructor client from vision router for load balancing
2944
+ client = instructor.from_litellm(
2945
+ self.vision_router.acompletion, mode=instructor.Mode.JSON
2946
+ )
2947
+
2948
+ # Use create_with_completion to get both the model and the raw response
2949
+ # max_retries allows Instructor to retry with validation error
2950
+ # feedback, which helps LLM fix JSON escaping issues
2951
+ (
2952
+ response,
2953
+ raw_response,
2954
+ ) = await client.chat.completions.create_with_completion(
2955
+ model=model,
2956
+ messages=cast(list[ChatCompletionMessageParam], messages),
2957
+ response_model=ImageAnalysisResult,
2958
+ max_retries=DEFAULT_INSTRUCTOR_MAX_RETRIES,
2959
+ max_tokens=max_tokens,
2960
+ )
2961
+
2962
+ # Check for truncation
2963
+ if hasattr(raw_response, "choices") and raw_response.choices:
2964
+ finish_reason = getattr(raw_response.choices[0], "finish_reason", None)
2965
+ if finish_reason == "length":
2966
+ raise ValueError("Output truncated due to max_tokens limit")
2967
+
2968
+ # Track usage from raw API response
2969
+ # Get actual model from response for accurate tracking
2970
+ actual_model = getattr(raw_response, "model", None) or model
2971
+ input_tokens = 0
2972
+ output_tokens = 0
2973
+ cost = 0.0
2974
+ if hasattr(raw_response, "usage") and raw_response.usage is not None:
2975
+ input_tokens = getattr(raw_response.usage, "prompt_tokens", 0) or 0
2976
+ output_tokens = getattr(raw_response.usage, "completion_tokens", 0) or 0
2977
+ try:
2978
+ cost = completion_cost(completion_response=raw_response)
2979
+ except Exception:
2980
+ cost = 0.0
2981
+ self._track_usage(
2982
+ actual_model, input_tokens, output_tokens, cost, context
2983
+ )
2984
+
2985
+ # Build llm_usage dict for this analysis
2986
+ llm_usage: LLMUsageByModel = {
2987
+ actual_model: cast(
2988
+ "ModelUsageStats",
2989
+ {
2990
+ "requests": 1,
2991
+ "input_tokens": input_tokens,
2992
+ "output_tokens": output_tokens,
2993
+ "cost_usd": cost,
2994
+ },
2995
+ )
2996
+ }
2997
+
2998
+ return ImageAnalysis(
2999
+ caption=response.caption.strip(),
3000
+ description=response.description,
3001
+ extracted_text=response.extracted_text,
3002
+ llm_usage=llm_usage,
3003
+ )
3004
+
3005
+ async def _analyze_with_json_mode(
3006
+ self,
3007
+ messages: list[dict],
3008
+ model: str,
3009
+ context: str = "",
3010
+ ) -> ImageAnalysis:
3011
+ """Analyze using JSON mode with manual parsing."""
3012
+ # Add JSON instruction to the prompt
3013
+ json_messages = messages.copy()
3014
+ json_messages[0] = {
3015
+ **messages[0],
3016
+ "content": [
3017
+ {
3018
+ "type": "text",
3019
+ "text": messages[0]["content"][0]["text"]
3020
+ + "\n\nReturn a JSON object with 'caption' and 'description' fields.",
3021
+ },
3022
+ messages[0]["content"][1], # image
3023
+ ],
3024
+ }
3025
+
3026
+ async with self.semaphore:
3027
+ # Calculate dynamic max_tokens for vision request
3028
+ max_tokens = self._calculate_dynamic_max_tokens(json_messages)
3029
+
3030
+ # Use vision_router for image analysis (not main router)
3031
+ response = await self.vision_router.acompletion(
3032
+ model=model,
3033
+ messages=cast(list[AllMessageValues], json_messages),
3034
+ max_tokens=max_tokens,
3035
+ response_format={"type": "json_object"},
3036
+ )
3037
+
3038
+ # litellm returns Choices (not StreamingChoices) for non-streaming
3039
+ choice = cast(Choices, response.choices[0])
3040
+ content = choice.message.content if choice.message else "{}"
3041
+ actual_model = response.model or model
3042
+
3043
+ # Track usage
3044
+ usage = getattr(response, "usage", None)
3045
+ input_tokens = usage.prompt_tokens if usage else 0
3046
+ output_tokens = usage.completion_tokens if usage else 0
3047
+ try:
3048
+ cost = completion_cost(completion_response=response)
3049
+ except Exception:
3050
+ cost = 0.0
3051
+ self._track_usage(actual_model, input_tokens, output_tokens, cost, context)
3052
+
3053
+ # Parse JSON
3054
+ data = json.loads(content or "{}")
3055
+
3056
+ # Build llm_usage dict for this analysis
3057
+ llm_usage: LLMUsageByModel = {
3058
+ actual_model: cast(
3059
+ "ModelUsageStats",
3060
+ {
3061
+ "requests": 1,
3062
+ "input_tokens": input_tokens,
3063
+ "output_tokens": output_tokens,
3064
+ "cost_usd": cost,
3065
+ },
3066
+ )
3067
+ }
3068
+
3069
+ return ImageAnalysis(
3070
+ caption=data.get("caption", "").strip(),
3071
+ description=data.get("description", ""),
3072
+ extracted_text=data.get("extracted_text"),
3073
+ llm_usage=llm_usage,
3074
+ )
3075
+
3076
+ async def _analyze_with_two_calls(
3077
+ self,
3078
+ messages: list[dict],
3079
+ model: str, # noqa: ARG002
3080
+ context: str = "",
3081
+ ) -> ImageAnalysis:
3082
+ """Original two-call method as final fallback."""
3083
+ # Extract original prompt and image from messages
3084
+ original_content = messages[0]["content"]
3085
+ image_content = original_content[1] # The image part
3086
+
3087
+ # Language instruction (extract from original prompt)
3088
+ lang_instruction = "Output in English."
3089
+ if "使用中文输出" in original_content[0]["text"]:
3090
+ lang_instruction = "使用中文输出。"
3091
+
3092
+ # Generate caption
3093
+ caption_prompt = self._prompt_manager.get_prompt("image_caption")
3094
+ caption_prompt = caption_prompt.replace(
3095
+ "**输出语言必须与源文档保持一致** - 英文文档用英文,中文文档用中文",
3096
+ lang_instruction,
3097
+ )
3098
+ caption_response = await self._call_llm(
3099
+ model="default",
3100
+ messages=[
3101
+ {
3102
+ "role": "user",
3103
+ "content": [
3104
+ {"type": "text", "text": caption_prompt},
3105
+ image_content,
3106
+ ],
3107
+ }
3108
+ ],
3109
+ context=context,
3110
+ )
3111
+
3112
+ # Generate description
3113
+ desc_prompt = self._prompt_manager.get_prompt("image_description")
3114
+ desc_response = await self._call_llm(
3115
+ model="default",
3116
+ messages=[
3117
+ {
3118
+ "role": "user",
3119
+ "content": [
3120
+ {"type": "text", "text": desc_prompt},
3121
+ image_content,
3122
+ ],
3123
+ }
3124
+ ],
3125
+ context=context,
3126
+ )
3127
+
3128
+ # Build aggregated llm_usage from both calls
3129
+ llm_usage: LLMUsageByModel = {}
3130
+ for resp in [caption_response, desc_response]:
3131
+ if resp.model not in llm_usage:
3132
+ llm_usage[resp.model] = cast(
3133
+ "ModelUsageStats",
3134
+ {
3135
+ "requests": 0,
3136
+ "input_tokens": 0,
3137
+ "output_tokens": 0,
3138
+ "cost_usd": 0.0,
3139
+ },
3140
+ )
3141
+ llm_usage[resp.model]["requests"] += 1
3142
+ llm_usage[resp.model]["input_tokens"] += resp.input_tokens
3143
+ llm_usage[resp.model]["output_tokens"] += resp.output_tokens
3144
+ llm_usage[resp.model]["cost_usd"] += resp.cost_usd
3145
+
3146
+ return ImageAnalysis(
3147
+ caption=caption_response.content.strip(),
3148
+ description=desc_response.content,
3149
+ llm_usage=llm_usage,
3150
+ )
3151
+
3152
+ async def extract_page_content(self, image_path: Path, context: str = "") -> str:
3153
+ """
3154
+ Extract text content from a document page image.
3155
+
3156
+ Used for OCR+LLM mode and PPTX+LLM mode where pages are rendered
3157
+ as images and we want to extract structured text content.
3158
+
3159
+ Args:
3160
+ image_path: Path to the page image file
3161
+ context: Context identifier for logging (e.g., parent document name)
3162
+
3163
+ Returns:
3164
+ Extracted markdown content from the page
3165
+ """
3166
+ # Get cached image data and base64 encoding
3167
+ _, base64_image = self._get_cached_image(image_path)
3168
+
3169
+ # Determine MIME type
3170
+ mime_type = get_mime_type(image_path.suffix)
3171
+
3172
+ # Get page content extraction prompt
3173
+ prompt = self._prompt_manager.get_prompt("page_content")
3174
+
3175
+ # Use image_path.name as context if not provided
3176
+ call_context = context or image_path.name
3177
+
3178
+ response = await self._call_llm(
3179
+ model="default",
3180
+ messages=[
3181
+ {
3182
+ "role": "user",
3183
+ "content": [
3184
+ {"type": "text", "text": prompt},
3185
+ {
3186
+ "type": "image_url",
3187
+ "image_url": {
3188
+ "url": f"data:{mime_type};base64,{base64_image}"
3189
+ },
3190
+ },
3191
+ ],
3192
+ }
3193
+ ],
3194
+ context=call_context,
3195
+ )
3196
+
3197
+ return response.content
3198
+
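+ # Illustrative shape of the request built above (values are placeholders, not a
+ # real payload); the page image is inlined as a base64 data URI, with the MIME
+ # type taken from get_mime_type(image_path.suffix), e.g. "image/png" for .png:
+ #   [{"role": "user", "content": [
+ #       {"type": "text", "text": "<page_content prompt>"},
+ #       {"type": "image_url",
+ #        "image_url": {"url": "data:image/png;base64,iVBORw0KGgo..."}},
+ #   ]}]
+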
3199
+ @staticmethod
3200
+ def _protect_image_positions(text: str) -> tuple[str, dict[str, str]]:
3201
+ """Replace image references with position markers to prevent LLM from moving them.
3202
+
3203
+ Args:
3204
+ text: Markdown text with image references
3205
+
3206
+ Returns:
3207
+ Tuple of (text with markers, mapping of marker -> original image reference)
3208
+ """
3209
+ import re
3210
+
3211
+ mapping: dict[str, str] = {}
3212
+ result = text
3213
+
3214
+ # Match ALL image references: ![...](...)
3215
+ # This includes both local assets and external URLs
3216
+ # Excludes screenshot placeholders, which have their own protection
3217
+ img_pattern = r"!\[[^\]]*\]\([^)]+\)"
3218
+ for i, match in enumerate(re.finditer(img_pattern, text)):
3219
+ img_ref = match.group(0)
3220
+ # Skip screenshot placeholders (handled separately)
3221
+ if "screenshots/" in img_ref:
3222
+ continue
3223
+ marker = f"<!-- IMG_MARKER: {i} -->"
3224
+ mapping[marker] = img_ref
3225
+ result = result.replace(img_ref, marker, 1)
3226
+
3227
+ return result, mapping
3228
+
3229
+ @staticmethod
3230
+ def _restore_image_positions(text: str, mapping: dict[str, str]) -> str:
3231
+ """Restore original image references from position markers.
3232
+
3233
+ Args:
3234
+ text: Text with position markers
3235
+ mapping: Mapping of marker -> original image reference
3236
+
3237
+ Returns:
3238
+ Text with original image references restored
3239
+ """
3240
+ result = text
3241
+ for marker, original in mapping.items():
3242
+ result = result.replace(marker, original)
3243
+ return result
3244
+
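+ # Example round trip for the two helpers above (hypothetical input):
+ #   protect: "See ![chart](assets/chart.png) for details."
+ #        ->  "See <!-- IMG_MARKER: 0 --> for details."
+ #            mapping = {"<!-- IMG_MARKER: 0 -->": "![chart](assets/chart.png)"}
+ #   restore: replacing each marker in the LLM output with its mapped reference
+ #            puts the original image back at whatever position the marker survived.
+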
3245
+ async def enhance_url_with_vision(
3246
+ self,
3247
+ content: str,
3248
+ screenshot_path: Path,
3249
+ context: str = "",
3250
+ ) -> tuple[str, str]:
3251
+ """
3252
+ Enhance URL content using screenshot as visual reference.
3253
+
3254
+ Unlike enhance_document_with_vision, this method:
3255
+ - Does NOT use slide/page number protection (URLs don't have these)
3256
+ - Generates frontmatter along with cleaned content
3257
+ - Uses a simpler content protection strategy
3258
+
3259
+ Args:
3260
+ content: URL content (may be multi-source combined)
3261
+ screenshot_path: Path to full-page screenshot
3262
+ context: Source URL for logging
3263
+
3264
+ Returns:
3265
+ Tuple of (cleaned_markdown, frontmatter_yaml)
3266
+ """
3267
+ import time
3268
+
3269
+ import yaml
3270
+
3271
+ start_time = time.perf_counter()
3272
+
3273
+ # Check persistent cache
3274
+ cache_key = f"enhance_url:{context}"
3275
+ cache_content = f"{screenshot_path.name}|{content[:1000]}"
3276
+ cached = self._persistent_cache.get(cache_key, cache_content, context=context)
3277
+ if cached is not None:
3278
+ logger.debug(
3279
+ f"[{context}] Persistent cache hit for enhance_url_with_vision"
3280
+ )
3281
+ return cached.get("cleaned_markdown", content), cached.get(
3282
+ "frontmatter_yaml", ""
3283
+ )
3284
+
3285
+ # Only protect image references, NOT slide/page markers (URLs don't have them)
3286
+ protected_text, img_mapping = self._protect_image_positions(content)
3287
+
3288
+ # Get URL-specific prompt (not document_enhance_complete which has slide/page markers)
3289
+ prompt = self._prompt_manager.get_prompt(
3290
+ "url_enhance",
3291
+ source=context,
3292
+ )
3293
+
3294
+ # Build content parts
3295
+ content_parts: list[dict] = [
3296
+ {
3297
+ "type": "text",
3298
+ "text": f"{prompt}\n\n## URL Content:\n\n{protected_text}",
3299
+ },
3300
+ ]
3301
+
3302
+ # Add screenshot
3303
+ _, base64_image = self._get_cached_image(screenshot_path)
3304
+ mime_type = get_mime_type(screenshot_path.suffix)
3305
+ content_parts.append({"type": "text", "text": "\n__MARKITAI_SCREENSHOT__"})
3306
+ content_parts.append(
3307
+ {
3308
+ "type": "image_url",
3309
+ "image_url": {"url": f"data:{mime_type};base64,{base64_image}"},
3310
+ }
3311
+ )
3312
+
3313
+ async with self.semaphore:
3314
+ # Calculate dynamic max_tokens
3315
+ messages = [{"role": "user", "content": content_parts}]
3316
+ max_tokens = self._calculate_dynamic_max_tokens(messages)
3317
+
3318
+ client = instructor.from_litellm(
3319
+ self.vision_router.acompletion, mode=instructor.Mode.JSON
3320
+ )
3321
+ (
3322
+ response,
3323
+ raw_response,
3324
+ ) = await client.chat.completions.create_with_completion(
3325
+ model="default",
3326
+ messages=cast(
3327
+ list[ChatCompletionMessageParam],
3328
+ messages,
3329
+ ),
3330
+ response_model=EnhancedDocumentResult,
3331
+ max_retries=DEFAULT_INSTRUCTOR_MAX_RETRIES,
3332
+ max_tokens=max_tokens,
3333
+ )
3334
+
3335
+ # Track usage and log completion
3336
+ actual_model = getattr(raw_response, "model", None) or "default"
3337
+ input_tokens = 0
3338
+ output_tokens = 0
3339
+ cost = 0.0
3340
+ elapsed = time.perf_counter() - start_time
3341
+
3342
+ if hasattr(raw_response, "usage") and raw_response.usage is not None:
3343
+ input_tokens = getattr(raw_response.usage, "prompt_tokens", 0) or 0
3344
+ output_tokens = getattr(raw_response.usage, "completion_tokens", 0) or 0
3345
+ try:
3346
+ cost = completion_cost(completion_response=raw_response)
3347
+ except Exception:
3348
+ cost = 0.0
3349
+ self._track_usage(
3350
+ actual_model,
3351
+ input_tokens,
3352
+ output_tokens,
3353
+ cost,
3354
+ context,
3355
+ )
3356
+
3357
+ logger.info(
3358
+ f"[LLM:{context}] url_vision_enhance: {actual_model} "
3359
+ f"tokens={input_tokens}+{output_tokens} "
3360
+ f"time={int(elapsed * 1000)}ms cost=${cost:.6f}"
3361
+ )
3362
+
3363
+ # Restore image positions
3364
+ cleaned_markdown = self._restore_image_positions(
3365
+ response.cleaned_markdown, img_mapping
3366
+ )
3367
+
3368
+ # Remove any hallucinated or leaked markers that shouldn't be in URL output
3369
+ import re
3370
+
3371
+ # Remove hallucinated slide/page markers (URLs shouldn't have these)
3372
+ cleaned_markdown = re.sub(
3373
+ r"<!--\s*Slide\s+number:\s*\d+\s*-->\s*\n?", "", cleaned_markdown
3374
+ )
3375
+ cleaned_markdown = re.sub(
3376
+ r"<!--\s*Page\s+number:\s*\d+\s*-->\s*\n?", "", cleaned_markdown
3377
+ )
3378
+ # Remove source labels that may leak from multi-source content
3379
+ cleaned_markdown = re.sub(
3380
+ r"<!--\s*Source:\s*[^>]+-->\s*\n?", "", cleaned_markdown
3381
+ )
3382
+ cleaned_markdown = re.sub(
3383
+ r"##\s*(Static Content|Browser Content|Screenshot Reference)\s*\n+",
3384
+ "",
3385
+ cleaned_markdown,
3386
+ )
3387
+ # Also remove any residual MARKITAI placeholders
3388
+ cleaned_markdown = re.sub(
3389
+ r"__MARKITAI_[A-Z_]+_?\d*__\s*\n?", "", cleaned_markdown
3390
+ )
3391
+
3392
+ # Fix malformed image refs
3393
+ cleaned_markdown = self._fix_malformed_image_refs(cleaned_markdown)
3394
+
3395
+ # Build frontmatter
3396
+ frontmatter_dict = {
3397
+ "title": response.frontmatter.title,
3398
+ "source": context,
3399
+ "description": response.frontmatter.description,
3400
+ "tags": response.frontmatter.tags,
3401
+ }
3402
+ frontmatter_yaml = yaml.dump(
3403
+ frontmatter_dict, allow_unicode=True, default_flow_style=False
3404
+ ).strip()
3405
+
3406
+ # Cache result
3407
+ cache_value = {
3408
+ "cleaned_markdown": cleaned_markdown,
3409
+ "frontmatter_yaml": frontmatter_yaml,
3410
+ }
3411
+ self._persistent_cache.set(
3412
+ cache_key, cache_content, cache_value, model="vision"
3413
+ )
3414
+
3415
+ return cleaned_markdown, frontmatter_yaml
3416
+
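+ # For reference, the frontmatter dict assembled above serializes roughly as below
+ # (illustrative values; key order follows yaml.dump's default alphabetical sort):
+ #   description: One-line summary produced by the model
+ #   source: https://example.com/article
+ #   tags:
+ #   - example
+ #   - illustration
+ #   title: Example Page Title
+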
3417
+ async def enhance_document_with_vision(
3418
+ self,
3419
+ extracted_text: str,
3420
+ page_images: list[Path],
3421
+ context: str = "",
3422
+ ) -> str:
3423
+ """
3424
+ Clean document format using extracted text and page images as reference.
3425
+
3426
+ This method only cleans formatting issues (removes residuals, fixes structure).
3427
+ It does NOT restructure or rewrite content.
3428
+
3429
+ Uses placeholder-based protection to preserve images, slides, and
3430
+ page comments in their original positions during LLM processing.
3431
+
3432
+ Args:
3433
+ extracted_text: Text extracted by pymupdf4llm/markitdown
3434
+ page_images: List of paths to page/slide images
3435
+ context: Context identifier for logging (e.g., document name)
3436
+
3437
+ Returns:
3438
+ Cleaned markdown content (same content, cleaner format)
3439
+ """
3440
+ if not page_images:
3441
+ return extracted_text
3442
+
3443
+ # Check persistent cache using page count + text fingerprint as key
3444
+ # Create a fingerprint from text + page image names for cache lookup
3445
+ page_names = "|".join(p.name for p in page_images[:10]) # First 10 page names
3446
+ cache_key = f"enhance_vision:{context}:{len(page_images)}"
3447
+ cache_content = f"{page_names}|{extracted_text[:1000]}"
3448
+ cached = self._persistent_cache.get(cache_key, cache_content, context=context)
3449
+ if cached is not None:
3450
+ logger.debug(
3451
+ f"[{_context_display_name(context)}] Persistent cache hit for enhance_document_with_vision"
3452
+ )
3453
+ # Fix malformed image refs even for cached content (handles old cache entries)
3454
+ return self._fix_malformed_image_refs(cached)
3455
+
3456
+ # Extract and protect content before LLM processing
3457
+ protected = self.extract_protected_content(extracted_text)
3458
+ protected_content, mapping = self._protect_content(extracted_text)
3459
+
3460
+ # Build message with text + images
3461
+ prompt = self._prompt_manager.get_prompt("document_enhance")
3462
+
3463
+ # Prepare content parts
3464
+ content_parts: list[dict] = [
3465
+ {
3466
+ "type": "text",
3467
+ "text": f"{prompt}\n\n## Extracted Text:\n\n{protected_content}",
3468
+ },
3469
+ ]
3470
+
3471
+ # Add page images (using cache to avoid repeated reads)
3472
+ for i, image_path in enumerate(page_images, 1):
3473
+ _, base64_image = self._get_cached_image(image_path)
3474
+ mime_type = get_mime_type(image_path.suffix)
3475
+
3476
+ # Unique page label that won't conflict with document content
3477
+ content_parts.append(
3478
+ {
3479
+ "type": "text",
3480
+ "text": f"\n__MARKITAI_PAGE_LABEL_{i}__",
3481
+ }
3482
+ )
3483
+ content_parts.append(
3484
+ {
3485
+ "type": "image_url",
3486
+ "image_url": {"url": f"data:{mime_type};base64,{base64_image}"},
3487
+ }
3488
+ )
3489
+
3490
+ response = await self._call_llm(
3491
+ model="default",
3492
+ messages=[{"role": "user", "content": content_parts}],
3493
+ context=context,
3494
+ )
3495
+
3496
+ # Restore protected content from placeholders, with fallback for removed items
3497
+ result = self._unprotect_content(response.content, mapping, protected)
3498
+
3499
+ # Fix malformed image references (e.g., extra closing parentheses)
3500
+ result = self._fix_malformed_image_refs(result)
3501
+
3502
+ # Store in persistent cache
3503
+ self._persistent_cache.set(cache_key, cache_content, result, model="vision")
3504
+
3505
+ return result
3506
+
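+ # Cache key sketch for the lookup above, using hypothetical inputs
+ # (context="report.pdf" with three rendered pages):
+ #   cache_key     = "enhance_vision:report.pdf:3"
+ #   cache_content = "report.page0001.jpg|report.page0002.jpg|report.page0003.jpg|"
+ #                   + extracted_text[:1000]
+ # Changing the page count, page file names, or the leading text invalidates the entry.
+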
3507
+ async def enhance_document_complete(
3508
+ self,
3509
+ extracted_text: str,
3510
+ page_images: list[Path],
3511
+ source: str = "",
3512
+ max_pages_per_batch: int = DEFAULT_MAX_PAGES_PER_BATCH,
3513
+ ) -> tuple[str, str]:
3514
+ """
3515
+ Complete document enhancement: clean format + generate frontmatter.
3516
+
3517
+ Architecture:
3518
+ - Single batch (pages <= max_pages_per_batch): Use Instructor for combined
3519
+ cleaning + frontmatter in one LLM call (saves one API call)
3520
+ - Multi batch (pages > max_pages_per_batch): Clean in batches, then
3521
+ generate frontmatter separately
3522
+
3523
+ Args:
3524
+ extracted_text: Text extracted by pymupdf4llm/markitdown
3525
+ page_images: List of paths to page/slide images
3526
+ source: Source file name
3527
+ max_pages_per_batch: Max pages per batch (default 10)
3528
+
3529
+ Returns:
3530
+ Tuple of (cleaned_markdown, frontmatter_yaml)
3531
+ """
3532
+ if not page_images:
3533
+ # No images, fall back to regular process_document
3534
+ return await self.process_document(extracted_text, source)
3535
+
3536
+ # Single batch: use combined Instructor call (saves one API call)
3537
+ if len(page_images) <= max_pages_per_batch:
3538
+ logger.info(
3539
+ f"[{source}] Processing {len(page_images)} pages with combined call"
3540
+ )
3541
+ try:
3542
+ return await self._enhance_with_frontmatter(
3543
+ extracted_text, page_images, source
3544
+ )
3545
+ except Exception as e:
3546
+ # Log a succinct warning instead of the full exception trace
3547
+ err_msg = str(e)
3548
+ if len(err_msg) > 200:
3549
+ err_msg = err_msg[:200] + "..."
3550
+ logger.warning(
3551
+ f"[{source}] Combined call failed: {type(e).__name__}: {err_msg}, "
3552
+ "falling back to separate calls"
3553
+ )
3554
+ # Fallback to separate calls
3555
+ cleaned = await self.enhance_document_with_vision(
3556
+ extracted_text, page_images, context=source
3557
+ )
3558
+ frontmatter = await self.generate_frontmatter(cleaned, source)
3559
+ return cleaned, frontmatter
3560
+
3561
+ # Multi batch: clean in batches AND generate frontmatter in parallel
3562
+ # Frontmatter can be generated from original text while cleaning proceeds
3563
+ logger.info(
3564
+ f"[{source}] Processing {len(page_images)} pages in batches of "
3565
+ f"{max_pages_per_batch} (parallel frontmatter)"
3566
+ )
3567
+
3568
+ # Launch cleaning and frontmatter generation concurrently
3569
+ clean_task = asyncio.create_task(
3570
+ self._enhance_document_batched_simple(
3571
+ extracted_text, page_images, max_pages_per_batch, source
3572
+ )
3573
+ )
3574
+ # Generate frontmatter from beginning of original text (first 5000 chars)
3575
+ frontmatter_task = asyncio.create_task(
3576
+ self.generate_frontmatter(extracted_text[:5000], source)
3577
+ )
3578
+
3579
+ cleaned, frontmatter = await asyncio.gather(clean_task, frontmatter_task)
3580
+
3581
+ return cleaned, frontmatter
3582
+
3583
+ async def _enhance_with_frontmatter(
3584
+ self,
3585
+ extracted_text: str,
3586
+ page_images: list[Path],
3587
+ source: str,
3588
+ ) -> tuple[str, str]:
3589
+ """Enhance document with vision and generate frontmatter in one call.
3590
+
3591
+ Uses Instructor for structured output.
3592
+
3593
+ Args:
3594
+ extracted_text: Text to clean
3595
+ page_images: Page images for visual reference
3596
+ source: Source file name
3597
+
3598
+ Returns:
3599
+ Tuple of (cleaned_markdown, frontmatter_yaml)
3600
+ """
3601
+ import time
3602
+
3603
+ import yaml
3604
+
3605
+ start_time = time.perf_counter()
3606
+
3607
+ # Check persistent cache first
3608
+ # Use page count + source + text fingerprint as cache key
3609
+ page_names = "|".join(p.name for p in page_images[:10]) # First 10 page names
3610
+ cache_key = f"enhance_frontmatter:{source}:{len(page_images)}"
3611
+ cache_content = f"{page_names}|{extracted_text[:1000]}"
3612
+ cached = self._persistent_cache.get(cache_key, cache_content, context=source)
3613
+ if cached is not None:
3614
+ logger.debug(
3615
+ f"[{source}] Persistent cache hit for _enhance_with_frontmatter"
3616
+ )
3617
+ # Fix malformed image refs even for cached content (handles old cache entries)
3618
+ cleaned = self._fix_malformed_image_refs(cached.get("cleaned_markdown", ""))
3619
+ return cleaned, cached.get("frontmatter_yaml", "")
3620
+
3621
+ # Extract protected content for fallback restoration
3622
+ protected = self.extract_protected_content(extracted_text)
3623
+
3624
+ # Protect slide comments and images with placeholders before LLM processing
3625
+ protected_text, mapping = self._protect_content(extracted_text)
3626
+
3627
+ # Get combined prompt
3628
+ prompt = self._prompt_manager.get_prompt(
3629
+ "document_enhance_complete",
3630
+ source=source,
3631
+ )
3632
+
3633
+ # Build content parts
3634
+ content_parts: list[dict] = [
3635
+ {
3636
+ "type": "text",
3637
+ "text": f"{prompt}\n\n## Extracted Text:\n\n{protected_text}",
3638
+ },
3639
+ ]
3640
+
3641
+ # Add page images
3642
+ for i, image_path in enumerate(page_images, 1):
3643
+ _, base64_image = self._get_cached_image(image_path)
3644
+ mime_type = get_mime_type(image_path.suffix)
3645
+ # Unique page label that won't conflict with document content
3646
+ content_parts.append(
3647
+ {"type": "text", "text": f"\n__MARKITAI_PAGE_LABEL_{i}__"}
3648
+ )
3649
+ content_parts.append(
3650
+ {
3651
+ "type": "image_url",
3652
+ "image_url": {"url": f"data:{mime_type};base64,{base64_image}"},
3653
+ }
3654
+ )
3655
+
3656
+ async with self.semaphore:
3657
+ # Calculate dynamic max_tokens
3658
+ messages = [{"role": "user", "content": content_parts}]
3659
+ max_tokens = self._calculate_dynamic_max_tokens(messages)
3660
+
3661
+ client = instructor.from_litellm(
3662
+ self.vision_router.acompletion, mode=instructor.Mode.JSON
3663
+ )
3664
+ # max_retries allows Instructor to retry with validation error
3665
+ # feedback, which helps LLM fix JSON escaping issues
3666
+ (
3667
+ response,
3668
+ raw_response,
3669
+ ) = await client.chat.completions.create_with_completion(
3670
+ model="default",
3671
+ messages=cast(
3672
+ list[ChatCompletionMessageParam],
3673
+ messages,
3674
+ ),
3675
+ response_model=EnhancedDocumentResult,
3676
+ max_retries=DEFAULT_INSTRUCTOR_MAX_RETRIES,
3677
+ max_tokens=max_tokens,
3678
+ )
3679
+
3680
+ # Check for truncation
3681
+ if hasattr(raw_response, "choices") and raw_response.choices:
3682
+ finish_reason = getattr(raw_response.choices[0], "finish_reason", None)
3683
+ if finish_reason == "length":
3684
+ raise ValueError("Output truncated due to max_tokens limit")
3685
+
3686
+ # Track usage and log completion
3687
+ actual_model = getattr(raw_response, "model", None) or "default"
3688
+ input_tokens = 0
3689
+ output_tokens = 0
3690
+ cost = 0.0
3691
+ if hasattr(raw_response, "usage") and raw_response.usage is not None:
3692
+ input_tokens = getattr(raw_response.usage, "prompt_tokens", 0) or 0
3693
+ output_tokens = getattr(raw_response.usage, "completion_tokens", 0) or 0
3694
+ try:
3695
+ cost = completion_cost(completion_response=raw_response)
3696
+ except Exception:
3697
+ cost = 0.0
3698
+ self._track_usage(
3699
+ actual_model, input_tokens, output_tokens, cost, source
3700
+ )
3701
+
3702
+ # Log completion with timing
3703
+ elapsed_ms = int((time.perf_counter() - start_time) * 1000)
3704
+ logger.info(
3705
+ f"[LLM:{source}] vision_enhance: {actual_model} "
3706
+ f"tokens={input_tokens}+{output_tokens} time={elapsed_ms}ms cost=${cost:.6f}"
3707
+ )
3708
+
3709
+ # Build frontmatter YAML
3710
+ frontmatter_dict = {
3711
+ "title": response.frontmatter.title,
3712
+ "description": response.frontmatter.description,
3713
+ "tags": response.frontmatter.tags,
3714
+ "source": source,
3715
+ }
3716
+ frontmatter_yaml = yaml.dump(
3717
+ frontmatter_dict, allow_unicode=True, default_flow_style=False
3718
+ ).strip()
3719
+
3720
+ # Restore protected content from placeholders
3721
+ # Pass protected dict for fallback restoration if LLM removed placeholders
3722
+ cleaned_markdown = self._unprotect_content(
3723
+ response.cleaned_markdown, mapping, protected
3724
+ )
3725
+
3726
+ # Fix malformed image references (e.g., extra closing parentheses)
3727
+ cleaned_markdown = self._fix_malformed_image_refs(cleaned_markdown)
3728
+
3729
+ # Store in persistent cache
3730
+ cache_value = {
3731
+ "cleaned_markdown": cleaned_markdown,
3732
+ "frontmatter_yaml": frontmatter_yaml,
3733
+ }
3734
+ self._persistent_cache.set(
3735
+ cache_key, cache_content, cache_value, model="vision"
3736
+ )
3737
+
3738
+ return cleaned_markdown, frontmatter_yaml
3739
+
3740
+ @staticmethod
3741
+ def _split_text_by_pages(text: str, num_pages: int) -> list[str]:
3742
+ """Split text into chunks corresponding to page ranges.
3743
+
3744
+ Split strategy (in priority order):
3745
+ 1. Remove trailing page image reference section first
3746
+ 2. Use <!-- Slide number: N --> markers (PPTX/PPT)
3747
+ 3. Use <!-- Page number: N --> markers (PDF)
3748
+ 4. Fallback: split by paragraphs proportionally
3749
+
3750
+ Args:
3751
+ text: Full document text
3752
+ num_pages: Number of pages/images
3753
+
3754
+ Returns:
3755
+ List of text chunks, one per page
3756
+ """
3757
+ import re
3758
+
3759
+ # Step 1: Remove trailing page image reference section
3760
+ # These are screenshot references at the end, not content separators
3761
+ ref_marker = "<!-- Page images for reference -->"
3762
+ ref_idx = text.find(ref_marker)
3763
+ if ref_idx != -1:
3764
+ main_content = text[:ref_idx].rstrip()
3765
+ else:
3766
+ main_content = text
3767
+
3768
+ # Step 2: Try slide markers (PPTX/PPT)
3769
+ slide_pattern = r"<!-- Slide number: (\d+) -->"
3770
+ slide_markers = list(re.finditer(slide_pattern, main_content))
3771
+
3772
+ if len(slide_markers) >= num_pages:
3773
+ # Use slide markers to split - each chunk starts with its slide marker
3774
+ chunks = []
3775
+ for i in range(num_pages):
3776
+ start = slide_markers[i].start()
3777
+ if i + 1 < len(slide_markers):
3778
+ end = slide_markers[i + 1].start()
3779
+ else:
3780
+ end = len(main_content)
3781
+ chunks.append(main_content[start:end].strip())
3782
+ return chunks
3783
+
3784
+ # Step 3: Try page markers (PDF)
3785
+ page_pattern = r"<!-- Page number: (\d+) -->"
3786
+ page_markers = list(re.finditer(page_pattern, main_content))
3787
+
3788
+ if len(page_markers) >= num_pages:
3789
+ # Use page markers to split - each chunk starts with its page marker
3790
+ chunks = []
3791
+ for i in range(num_pages):
3792
+ start = page_markers[i].start()
3793
+ if i + 1 < len(page_markers):
3794
+ end = page_markers[i + 1].start()
3795
+ else:
3796
+ end = len(main_content)
3797
+ chunks.append(main_content[start:end].strip())
3798
+ return chunks
3799
+
3800
+ # Step 4: Fallback - split by paragraphs proportionally
3801
+ paragraphs = main_content.split("\n\n")
3802
+ if len(paragraphs) < num_pages:
3803
+ # Very short text, just return whole text for each page
3804
+ return [main_content] * num_pages
3805
+
3806
+ paragraphs_per_page = len(paragraphs) // num_pages
3807
+ chunks = []
3808
+ for i in range(num_pages):
3809
+ start_idx = i * paragraphs_per_page
3810
+ if i == num_pages - 1:
3811
+ # Last chunk gets remaining paragraphs
3812
+ end_idx = len(paragraphs)
3813
+ else:
3814
+ end_idx = start_idx + paragraphs_per_page
3815
+ chunks.append("\n\n".join(paragraphs[start_idx:end_idx]))
3816
+
3817
+ return chunks
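+ # Example of the marker-based split above (hypothetical two-slide input):
+ #   text = "<!-- Slide number: 1 -->\n# Intro\n...\n<!-- Slide number: 2 -->\n# Details\n..."
+ #   _split_text_by_pages(text, 2)
+ #   -> ["<!-- Slide number: 1 -->\n# Intro\n...",
+ #       "<!-- Slide number: 2 -->\n# Details\n..."]
+ # Without slide/page markers the proportional fallback applies, e.g. 8 paragraphs
+ # over 4 pages yields 2 paragraphs per chunk, with the last chunk absorbing any remainder.
+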
3818
+
3819
+ async def _enhance_document_batched_simple(
3820
+ self,
3821
+ extracted_text: str,
3822
+ page_images: list[Path],
3823
+ batch_size: int,
3824
+ source: str = "",
3825
+ ) -> str:
3826
+ """Process long documents in batches - vision cleaning only.
3827
+
3828
+ All batches use the same method for consistent output format.
3829
+
3830
+ Args:
3831
+ extracted_text: Full document text
3832
+ page_images: All page images
3833
+ batch_size: Pages per batch
3834
+ source: Source file name
3835
+
3836
+ Returns:
3837
+ Merged cleaned content
3838
+ """
3839
+ num_pages = len(page_images)
3840
+ num_batches = (num_pages + batch_size - 1) // batch_size
3841
+
3842
+ # Split text by pages
3843
+ page_texts = self._split_text_by_pages(extracted_text, num_pages)
3844
+
3845
+ cleaned_parts = []
3846
+
3847
+ for batch_num in range(num_batches):
3848
+ batch_start = batch_num * batch_size
3849
+ batch_end = min(batch_start + batch_size, num_pages)
3850
+
3851
+ # Get text and images for this batch
3852
+ batch_texts = page_texts[batch_start:batch_end]
3853
+ batch_images = page_images[batch_start:batch_end]
3854
+ batch_text = "\n\n".join(batch_texts)
3855
+
3856
+ logger.info(
3857
+ f"[{source}] Batch {batch_num + 1}/{num_batches}: "
3858
+ f"pages {batch_start + 1}-{batch_end}"
3859
+ )
3860
+
3861
+ # All batches: clean only (no frontmatter)
3862
+ # Use source as context (not batch-specific) so all usage aggregates to the same context
3863
+ batch_cleaned = await self.enhance_document_with_vision(
3864
+ batch_text, batch_images, context=source
3865
+ )
3866
+
3867
+ cleaned_parts.append(batch_cleaned)
3868
+
3869
+ # Merge all batches
3870
+ return "\n\n".join(cleaned_parts)
3871
+
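+ # Worked example of the batch indexing above (hypothetical sizes):
+ #   num_pages=23, batch_size=10 -> num_batches = (23 + 10 - 1) // 10 = 3
+ #   batch 0: pages  1-10 (indices  0-9)
+ #   batch 1: pages 11-20 (indices 10-19)
+ #   batch 2: pages 21-23 (indices 20-22)
+ # Each batch pairs its text chunks from _split_text_by_pages with the matching
+ # page images, is cleaned independently, and the parts are joined with blank lines.
+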
3872
+ async def process_document(
3873
+ self,
3874
+ markdown: str,
3875
+ source: str,
3876
+ ) -> tuple[str, str]:
3877
+ """
3878
+ Process a document with LLM: clean and generate frontmatter.
3879
+
3880
+ Uses placeholder-based protection to preserve images, slides, and
3881
+ page comments in their original positions during LLM processing.
3882
+
3883
+ Uses a combined prompt with Instructor for structured output,
3884
+ falling back to parallel separate calls if structured output fails.
3885
+
3886
+ Args:
3887
+ markdown: Raw markdown content
3888
+ source: Source file name
3889
+
3890
+ Returns:
3891
+ Tuple of (cleaned_markdown, frontmatter_yaml)
3892
+ """
3893
+ # Extract and protect content before LLM processing
3894
+ protected = self.extract_protected_content(markdown)
3895
+ protected_content, mapping = self._protect_content(markdown)
3896
+
3897
+ # Try combined approach with Instructor first
3898
+ try:
3899
+ result = await self._process_document_combined(protected_content, source)
3900
+
3901
+ # Restore protected content from placeholders, with fallback
3902
+ cleaned = self._unprotect_content(
3903
+ result.cleaned_markdown, mapping, protected
3904
+ )
3905
+
3906
+ # Convert Frontmatter to YAML string
3907
+ import yaml
3908
+
3909
+ frontmatter_dict = {
3910
+ "title": result.frontmatter.title,
3911
+ "description": result.frontmatter.description,
3912
+ "tags": result.frontmatter.tags,
3913
+ "source": source,
3914
+ }
3915
+ frontmatter_yaml = yaml.dump(
3916
+ frontmatter_dict, allow_unicode=True, default_flow_style=False
3917
+ ).strip()
3918
+ logger.debug(f"[{source}] Used combined document processing")
3919
+ return cleaned, frontmatter_yaml
3920
+ except Exception as e:
3921
+ logger.debug(
3922
+ f"[{source}] Combined processing failed: {e}, using parallel fallback"
3923
+ )
3924
+
3925
+ # Fallback: Run cleaning and frontmatter generation in parallel
3926
+ # clean_markdown uses its own protection mechanism
3927
+ clean_task = asyncio.create_task(self.clean_markdown(markdown, context=source))
3928
+ frontmatter_task = asyncio.create_task(
3929
+ self.generate_frontmatter(markdown, source)
3930
+ )
3931
+
3932
+ cleaned_result, frontmatter_result = await asyncio.gather(
3933
+ clean_task, frontmatter_task, return_exceptions=True
3934
+ )
3935
+
3936
+ cleaned: str = (
3937
+ markdown if isinstance(cleaned_result, BaseException) else cleaned_result
3938
+ )
3939
+ if isinstance(cleaned_result, BaseException):
3940
+ logger.warning(f"Markdown cleaning failed: {cleaned_result}")
3941
+
3942
+ frontmatter: str = (
3943
+ f"title: {source}\nsource: {source}"
3944
+ if isinstance(frontmatter_result, BaseException)
3945
+ else frontmatter_result
3946
+ )
3947
+ if isinstance(frontmatter_result, BaseException):
3948
+ logger.warning(f"Frontmatter generation failed: {frontmatter_result}")
3949
+
3950
+ return cleaned, frontmatter
3951
+
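+ # Sketch of the fallback error handling above, assuming the frontmatter task fails
+ # for a hypothetical source "report.pdf" while cleaning succeeds:
+ #   cleaned_result     -> "## Cleaned text ..."   (used as-is)
+ #   frontmatter_result -> ValueError("boom")      (returned, not raised, because
+ #                                                  return_exceptions=True)
+ #   frontmatter then falls back to "title: report.pdf\nsource: report.pdf".
+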
3952
+ async def _process_document_combined(
3953
+ self,
3954
+ markdown: str,
3955
+ source: str,
3956
+ ) -> DocumentProcessResult:
3957
+ """
3958
+ Process document with combined cleaner + frontmatter using Instructor.
3959
+
3960
+ Cache lookup order:
3961
+ 1. In-memory cache (session-level, fast)
3962
+ 2. Persistent cache (cross-session, SQLite)
3963
+ 3. LLM API call
3964
+
3965
+ Args:
3966
+ markdown: Raw markdown content
3967
+ source: Source file name
3968
+
3969
+ Returns:
3970
+ DocumentProcessResult with cleaned markdown and frontmatter
3971
+ """
3972
+ cache_key = f"document_process:{source}"
3973
+
3974
+ # Helper to reconstruct DocumentProcessResult from cached dict
3975
+ def _from_cache(cached: dict) -> DocumentProcessResult:
3976
+ return DocumentProcessResult(
3977
+ cleaned_markdown=cached.get("cleaned_markdown", ""),
3978
+ frontmatter=Frontmatter(
3979
+ title=cached.get("title", source),
3980
+ description=cached.get("description", ""),
3981
+ tags=cached.get("tags", []),
3982
+ ),
3983
+ )
3984
+
3985
+ # 1. Check in-memory cache first (fastest)
3986
+ cached = self._cache.get(cache_key, markdown)
3987
+ if cached is not None:
3988
+ self._cache_hits += 1
3989
+ logger.debug(f"[{source}] Memory cache hit for _process_document_combined")
3990
+ return _from_cache(cached)
3991
+
3992
+ # 2. Check persistent cache (cross-session)
3993
+ cached = self._persistent_cache.get(cache_key, markdown, context=source)
3994
+ if cached is not None:
3995
+ self._cache_hits += 1
3996
+ logger.debug(
3997
+ f"[{source}] Persistent cache hit for _process_document_combined"
3998
+ )
3999
+ # Also populate in-memory cache for faster subsequent access
4000
+ self._cache.set(cache_key, markdown, cached)
4001
+ return _from_cache(cached)
4002
+
4003
+ self._cache_misses += 1
4004
+
4005
+ # Detect document language
4006
+ language = get_language_name(detect_language(markdown))
4007
+
4008
+ # Truncate content if needed (with warning)
4009
+ original_len = len(markdown)
4010
+ truncated_content = self._smart_truncate(markdown, DEFAULT_MAX_CONTENT_CHARS)
4011
+ if len(truncated_content) < original_len:
4012
+ logger.warning(
4013
+ f"[LLM:{source}] Content truncated: {original_len} -> {len(truncated_content)} chars "
4014
+ f"(limit: {DEFAULT_MAX_CONTENT_CHARS}). Some content may be lost."
4015
+ )
4016
+
4017
+ # Get combined prompt with language
4018
+ prompt = self._prompt_manager.get_prompt(
4019
+ "document_process",
4020
+ content=truncated_content,
4021
+ source=source,
4022
+ language=language,
4023
+ )
4024
+
4025
+ async with self.semaphore:
4026
+ start_time = time.perf_counter()
4027
+
4028
+ # Calculate dynamic max_tokens
4029
+ messages = cast(
4030
+ list[ChatCompletionMessageParam],
4031
+ [{"role": "user", "content": prompt}],
4032
+ )
4033
+ max_tokens = self._calculate_dynamic_max_tokens(messages)
4034
+
4035
+ # Create instructor client from router for load balancing
4036
+ client = instructor.from_litellm(
4037
+ self.router.acompletion, mode=instructor.Mode.JSON
4038
+ )
4039
+
4040
+ # Use create_with_completion to get both the model and the raw response
4041
+ # Use logical model name for router load balancing
4042
+ # max_retries allows Instructor to retry with validation error
4043
+ # feedback, which helps LLM fix JSON escaping issues
4044
+ (
4045
+ response,
4046
+ raw_response,
4047
+ ) = await client.chat.completions.create_with_completion(
4048
+ model="default",
4049
+ messages=messages,
4050
+ response_model=DocumentProcessResult,
4051
+ max_retries=DEFAULT_INSTRUCTOR_MAX_RETRIES,
4052
+ max_tokens=max_tokens,
4053
+ )
4054
+
4055
+ elapsed_ms = (time.perf_counter() - start_time) * 1000
4056
+
4057
+ # Check for truncation
4058
+ if hasattr(raw_response, "choices") and raw_response.choices:
4059
+ finish_reason = getattr(raw_response.choices[0], "finish_reason", None)
4060
+ if finish_reason == "length":
4061
+ raise ValueError("Output truncated due to max_tokens limit")
4062
+
4063
+ # Track usage from raw API response
4064
+ # Get actual model from response for accurate tracking
4065
+ actual_model = getattr(raw_response, "model", None) or "default"
4066
+ input_tokens = 0
4067
+ output_tokens = 0
4068
+ cost = 0.0
4069
+ if hasattr(raw_response, "usage") and raw_response.usage is not None:
4070
+ input_tokens = getattr(raw_response.usage, "prompt_tokens", 0) or 0
4071
+ output_tokens = getattr(raw_response.usage, "completion_tokens", 0) or 0
4072
+ try:
4073
+ cost = completion_cost(completion_response=raw_response)
4074
+ except Exception:
4075
+ cost = 0.0
4076
+ self._track_usage(
4077
+ actual_model, input_tokens, output_tokens, cost, source
4078
+ )
4079
+
4080
+ # Log detailed timing for performance analysis
4081
+ logger.info(
4082
+ f"[LLM:{source}] document_process: {actual_model} "
4083
+ f"tokens={input_tokens}+{output_tokens} "
4084
+ f"time={elapsed_ms:.0f}ms cost=${cost:.6f}"
4085
+ )
4086
+
4087
+ # Store in both cache layers
4088
+ cache_value = {
4089
+ "cleaned_markdown": response.cleaned_markdown,
4090
+ "title": response.frontmatter.title,
4091
+ "description": response.frontmatter.description,
4092
+ "tags": response.frontmatter.tags,
4093
+ }
4094
+ self._cache.set(cache_key, markdown, cache_value)
4095
+ self._persistent_cache.set(
4096
+ cache_key, markdown, cache_value, model="default"
4097
+ )
4098
+
4099
+ return response
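+ # Cache interplay sketch for the method above (hypothetical repeated runs):
+ #   run 1: memory miss, persistent miss -> LLM call; result written to both caches
+ #   rerun, same session: memory hit -> no persistent lookup, no LLM call
+ #   rerun, new session:  memory miss, persistent hit -> result copied back into the
+ #                        in-memory cache for subsequent lookups
+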
4100
+
4101
+ def format_llm_output(
4102
+ self,
4103
+ markdown: str,
4104
+ frontmatter: str,
4105
+ ) -> str:
4106
+ """
4107
+ Format final output with frontmatter.
4108
+
4109
+ Args:
4110
+ markdown: Cleaned markdown content
4111
+ frontmatter: YAML frontmatter (without --- markers)
4112
+
4113
+ Returns:
4114
+ Complete markdown with frontmatter
4115
+ """
4116
+ from datetime import datetime
4117
+
4118
+ import yaml
4119
+
4120
+ from markitai.workflow.helpers import normalize_frontmatter
4121
+
4122
+ frontmatter = self._clean_frontmatter(frontmatter)
4123
+
4124
+ # Parse frontmatter to dict, add timestamp, then normalize
4125
+ try:
4126
+ frontmatter_dict = yaml.safe_load(frontmatter) or {}
4127
+ except yaml.YAMLError:
4128
+ frontmatter_dict = {}
4129
+
4130
+ # Add markitai_processed timestamp (use local time)
4131
+ timestamp = datetime.now().astimezone().isoformat()
4132
+ frontmatter_dict["markitai_processed"] = timestamp
4133
+
4134
+ # Normalize to ensure consistent field order
4135
+ frontmatter = normalize_frontmatter(frontmatter_dict)
4136
+
4137
+ # Remove non-commented screenshot references that shouldn't be in content
4138
+ # These are page screenshots that should only appear as comments at the end
4139
+ # Pattern: ![Page N](screenshots/...) or ![Page N](path/screenshots/...)
4140
+ # But NOT: <!-- ![Page N](screenshots/...) --> (already commented)
4141
+ markdown = self._remove_uncommented_screenshots(markdown)
4142
+
4143
+ from markitai.utils.text import (
4144
+ clean_ppt_headers_footers,
4145
+ clean_residual_placeholders,
4146
+ fix_broken_markdown_links,
4147
+ normalize_markdown_whitespace,
4148
+ )
4149
+
4150
+ # Post-processing: fix broken links and clean PPT headers/footers
4151
+ # These are fallback cleanups when LLM doesn't fully follow instructions
4152
+ markdown = fix_broken_markdown_links(markdown)
4153
+ markdown = clean_ppt_headers_footers(markdown)
4154
+ markdown = clean_residual_placeholders(markdown)
4155
+ markdown = normalize_markdown_whitespace(markdown)
4156
+ return f"---\n{frontmatter}\n---\n\n{markdown}"
4157
+
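+ # Illustrative shape of the string returned above (field values are placeholders;
+ # the actual field order comes from normalize_frontmatter, and the timestamp is
+ # local-time ISO 8601):
+ #   ---
+ #   title: Example Document
+ #   source: example.docx
+ #   markitai_processed: '2025-01-01T12:00:00+08:00'
+ #   ---
+ #
+ #   # Example Document
+ #   ...body after link, header/footer, and whitespace cleanup...
+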
4158
+ @staticmethod
4159
+ def _remove_uncommented_screenshots(content: str) -> str:
4160
+ """Remove non-commented page screenshot references from content.
4161
+
4162
+ Page screenshots should only appear as HTML comments at the end of the document.
4163
+ If LLM accidentally outputs them as regular image references, remove them.
4164
+
4165
+ Also ensures that any screenshot references in the "Page images for reference"
4166
+ section are properly commented.
4167
+
4168
+ Args:
4169
+ content: Markdown content
4170
+
4171
+ Returns:
4172
+ Content with uncommented screenshots removed/fixed
4173
+ """
4174
+ import re
4175
+
4176
+ # Find the position of "<!-- Page images for reference -->" if it exists
4177
+ page_images_header = "<!-- Page images for reference -->"
4178
+ header_pos = content.find(page_images_header)
4179
+
4180
+ if header_pos == -1:
4181
+ # No page images section, just remove any stray screenshot references
4182
+ # IMPORTANT: Only match markitai-generated screenshot patterns to avoid
4183
+ # removing user's original screenshots/ references (P0-5 fix).
4184
+ # markitai naming format: {filename}.page{NNNN}.{ext} in screenshots/
4185
+ # Patterns to remove:
4186
+ # 1. ![Page N](screenshots/*.page*.jpg) - markitai standard pattern
4187
+ # 2. ![...](screenshots/*.page*.jpg) - LLM-generated variants with same filename
4188
+ patterns = [
4189
+ # Matches: ![Page N](screenshots/anything.pageNNNN.jpg)
4190
+ r"^!\[Page\s+\d+\]\(screenshots/[^)]+\.page\d{4}\.\w+\)\s*$",
4191
+ # Matches: ![...](screenshots/anything.pageNNNN.jpg)
4192
+ r"^!\[[^\]]*\]\(screenshots/[^)]+\.page\d{4}\.\w+\)\s*$",
4193
+ ]
4194
+ for pattern in patterns:
4195
+ content = re.sub(pattern, "", content, flags=re.MULTILINE)
4196
+
4197
+ # Also remove any page/image labels that LLM may have copied
4198
+ # Pattern: ## or ### Page N Image: followed by empty line (legacy format)
4199
+ # Pattern: [Page N] or [Image N] on its own line (simple format)
4200
+ # Pattern: __MARKITAI_PAGE_LABEL_N__ or __MARKITAI_IMG_LABEL_N__ (unique format)
4201
+ content = re.sub(
4202
+ r"^#{2,3}\s+Page\s+\d+\s+Image:\s*\n\s*\n",
4203
+ "",
4204
+ content,
4205
+ flags=re.MULTILINE,
4206
+ )
4207
+ content = re.sub(
4208
+ r"^\[(Page|Image)\s+\d+\]\s*\n",
4209
+ "",
4210
+ content,
4211
+ flags=re.MULTILINE,
4212
+ )
4213
+ content = re.sub(
4214
+ r"^__MARKITAI_(PAGE|IMG)_LABEL_\d+__\s*\n",
4215
+ "",
4216
+ content,
4217
+ flags=re.MULTILINE,
4218
+ )
4219
+ # Remove any leftover slide placeholders (shouldn't exist but cleanup)
4220
+ content = re.sub(
4221
+ r"^__MARKITAI_SLIDE_\d+__\s*\n",
4222
+ "",
4223
+ content,
4224
+ flags=re.MULTILINE,
4225
+ )
4226
+
4227
+ # Clean up any resulting empty lines
4228
+ content = re.sub(r"\n{3,}", "\n\n", content)
4229
+ else:
4230
+ # Split at the page images section
4231
+ before = content[:header_pos]
4232
+ after = content[header_pos:]
4233
+
4234
+ # Remove screenshot references from BEFORE the page images header
4235
+ # IMPORTANT: Only match markitai-generated screenshot patterns (P0-5 fix)
4236
+ patterns = [
4237
+ # Matches: ![Page N](screenshots/anything.pageNNNN.jpg)
4238
+ r"^!\[Page\s+\d+\]\(screenshots/[^)]+\.page\d{4}\.\w+\)\s*$",
4239
+ # Matches: ![...](screenshots/anything.pageNNNN.jpg)
4240
+ r"^!\[[^\]]*\]\(screenshots/[^)]+\.page\d{4}\.\w+\)\s*$",
4241
+ ]
4242
+ for pattern in patterns:
4243
+ before = re.sub(pattern, "", before, flags=re.MULTILINE)
4244
+
4245
+ # Also remove any page/image labels that LLM may have copied
4246
+ before = re.sub(
4247
+ r"^#{2,3}\s+Page\s+\d+\s+Image:\s*\n\s*\n",
4248
+ "",
4249
+ before,
4250
+ flags=re.MULTILINE,
4251
+ )
4252
+ before = re.sub(
4253
+ r"^\[(Page|Image)\s+\d+\]\s*\n",
4254
+ "",
4255
+ before,
4256
+ flags=re.MULTILINE,
4257
+ )
4258
+ before = re.sub(
4259
+ r"^__MARKITAI_(PAGE|IMG)_LABEL_\d+__\s*\n",
4260
+ "",
4261
+ before,
4262
+ flags=re.MULTILINE,
4263
+ )
4264
+ # Remove any leftover slide placeholders (shouldn't exist but cleanup)
4265
+ before = re.sub(
4266
+ r"^__MARKITAI_SLIDE_\d+__\s*\n",
4267
+ "",
4268
+ before,
4269
+ flags=re.MULTILINE,
4270
+ )
4271
+ before = re.sub(r"\n{3,}", "\n\n", before)
4272
+
4273
+ # Fix the AFTER section: convert any non-commented page images to comments
4274
+ # Match lines with page image references that are not already commented
4275
+ # This handles: ![Page N](screenshots/...)
4276
+ after_lines = after.split("\n")
4277
+ fixed_lines = []
4278
+ for line in after_lines:
4279
+ stripped = line.strip()
4280
+ # Check if it's an uncommented page image reference
4281
+ if (
4282
+ stripped.startswith("![Page")
4283
+ and "screenshots/" in stripped
4284
+ and not stripped.startswith("<!--")
4285
+ ):
4286
+ fixed_lines.append(f"<!-- {stripped} -->")
4287
+ else:
4288
+ fixed_lines.append(line)
4289
+ after = "\n".join(fixed_lines)
4290
+
4291
+ content = before + after
4292
+
4293
+ # Clean up screenshot comments section: remove blank lines between comments
4294
+ # Pattern: <!-- Page images for reference --> followed by page image comments
4295
+ page_section_pattern = (
4296
+ r"(<!-- Page images for reference -->)"
4297
+ r"((?:\s*<!-- !\[Page \d+\]\([^)]+\) -->)+)"
4298
+ )
4299
+
4300
+ def clean_page_section(match: re.Match) -> str:
4301
+ header = match.group(1)
4302
+ comments_section = match.group(2)
4303
+ # Extract individual comments and rejoin without blank lines
4304
+ comments = re.findall(r"<!-- !\[Page \d+\]\([^)]+\) -->", comments_section)
4305
+ return header + "\n" + "\n".join(comments)
4306
+
4307
+ content = re.sub(page_section_pattern, clean_page_section, content)
4308
+
4309
+ return content
4310
+
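+ # Examples for the markitai-only screenshot patterns above (hypothetical paths,
+ # each on its own line as the ^...$ patterns require):
+ #   ![Page 3](screenshots/report.page0003.jpg)  -> removed (matches .pageNNNN naming)
+ #   ![chart](screenshots/report.page0001.jpg)   -> removed (any alt text, same naming)
+ #   ![photo](screenshots/user_photo.png)        -> kept (no .pageNNNN suffix, so it
+ #                                                  is treated as a user asset)
+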
4311
+ @staticmethod
4312
+ def _clean_frontmatter(frontmatter: str) -> str:
4313
+ """
4314
+ Clean frontmatter by removing code block markers and --- markers.
4315
+
4316
+ Args:
4317
+ frontmatter: Raw frontmatter from LLM
4318
+
4319
+ Returns:
4320
+ Clean YAML frontmatter
4321
+ """
4322
+ import re
4323
+
4324
+ frontmatter = frontmatter.strip()
4325
+
4326
+ # Remove code block markers (```yaml, ```yml, ```)
4327
+ # Pattern: ```yaml or ```yml at start, ``` at end
4328
+ code_block_pattern = r"^```(?:ya?ml)?\s*\n?(.*?)\n?```$"
4329
+ match = re.match(code_block_pattern, frontmatter, re.DOTALL | re.IGNORECASE)
4330
+ if match:
4331
+ frontmatter = match.group(1).strip()
4332
+
4333
+ # Remove --- markers
4334
+ if frontmatter.startswith("---"):
4335
+ frontmatter = frontmatter[3:].strip()
4336
+ if frontmatter.endswith("---"):
4337
+ frontmatter = frontmatter[:-3].strip()
4338
+
4339
+ return frontmatter
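+
+ # Example of the cleanup above, assuming the LLM wrapped its YAML in a code fence
+ # and --- markers:
+ #   input:  "```yaml\n---\ntitle: Demo\n---\n```"
+ #   output: "title: Demo"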