markitai 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48):
  1. markitai/__init__.py +3 -0
  2. markitai/batch.py +1316 -0
  3. markitai/cli.py +3979 -0
  4. markitai/config.py +602 -0
  5. markitai/config.schema.json +748 -0
  6. markitai/constants.py +222 -0
  7. markitai/converter/__init__.py +49 -0
  8. markitai/converter/_patches.py +98 -0
  9. markitai/converter/base.py +164 -0
  10. markitai/converter/image.py +181 -0
  11. markitai/converter/legacy.py +606 -0
  12. markitai/converter/office.py +526 -0
  13. markitai/converter/pdf.py +679 -0
  14. markitai/converter/text.py +63 -0
  15. markitai/fetch.py +1725 -0
  16. markitai/image.py +1335 -0
  17. markitai/json_order.py +550 -0
  18. markitai/llm.py +4339 -0
  19. markitai/ocr.py +347 -0
  20. markitai/prompts/__init__.py +159 -0
  21. markitai/prompts/cleaner.md +93 -0
  22. markitai/prompts/document_enhance.md +77 -0
  23. markitai/prompts/document_enhance_complete.md +65 -0
  24. markitai/prompts/document_process.md +60 -0
  25. markitai/prompts/frontmatter.md +28 -0
  26. markitai/prompts/image_analysis.md +21 -0
  27. markitai/prompts/image_caption.md +8 -0
  28. markitai/prompts/image_description.md +13 -0
  29. markitai/prompts/page_content.md +17 -0
  30. markitai/prompts/url_enhance.md +78 -0
  31. markitai/security.py +286 -0
  32. markitai/types.py +30 -0
  33. markitai/urls.py +187 -0
  34. markitai/utils/__init__.py +33 -0
  35. markitai/utils/executor.py +69 -0
  36. markitai/utils/mime.py +85 -0
  37. markitai/utils/office.py +262 -0
  38. markitai/utils/output.py +53 -0
  39. markitai/utils/paths.py +81 -0
  40. markitai/utils/text.py +359 -0
  41. markitai/workflow/__init__.py +37 -0
  42. markitai/workflow/core.py +760 -0
  43. markitai/workflow/helpers.py +509 -0
  44. markitai/workflow/single.py +369 -0
  45. markitai-0.3.0.dist-info/METADATA +159 -0
  46. markitai-0.3.0.dist-info/RECORD +48 -0
  47. markitai-0.3.0.dist-info/WHEEL +4 -0
  48. markitai-0.3.0.dist-info/entry_points.txt +2 -0
markitai/config.py ADDED
@@ -0,0 +1,602 @@
1
+ """Configuration management for Markitai."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import os
7
+ from pathlib import Path
8
+ from typing import Any, Literal
9
+
10
+ from pydantic import BaseModel, Field
11
+
12
+ from markitai.constants import (
13
+ CONFIG_FILENAME,
14
+ DEFAULT_AGENT_BROWSER_COMMAND,
15
+ DEFAULT_AGENT_BROWSER_EXTRA_WAIT_MS,
16
+ DEFAULT_AGENT_BROWSER_TIMEOUT,
17
+ DEFAULT_AGENT_BROWSER_WAIT_FOR,
18
+ DEFAULT_BATCH_CONCURRENCY,
19
+ DEFAULT_CACHE_SIZE_LIMIT,
20
+ DEFAULT_FETCH_FALLBACK_PATTERNS,
21
+ DEFAULT_FETCH_STRATEGY,
22
+ DEFAULT_GLOBAL_CACHE_DIR,
23
+ DEFAULT_IMAGE_FILTER_MIN_AREA,
24
+ DEFAULT_IMAGE_FILTER_MIN_HEIGHT,
25
+ DEFAULT_IMAGE_FILTER_MIN_WIDTH,
26
+ DEFAULT_IMAGE_FORMAT,
27
+ DEFAULT_IMAGE_MAX_HEIGHT,
28
+ DEFAULT_IMAGE_MAX_WIDTH,
29
+ DEFAULT_IMAGE_QUALITY,
30
+ DEFAULT_JINA_TIMEOUT,
31
+ DEFAULT_LLM_CONCURRENCY,
32
+ DEFAULT_LOG_DIR,
33
+ DEFAULT_LOG_LEVEL,
34
+ DEFAULT_LOG_RETENTION,
35
+ DEFAULT_LOG_ROTATION,
36
+ DEFAULT_MODEL_WEIGHT,
37
+ DEFAULT_OCR_LANG,
38
+ DEFAULT_ON_CONFLICT,
39
+ DEFAULT_OUTPUT_DIR,
40
+ DEFAULT_PROMPTS_DIR,
41
+ DEFAULT_ROUTER_NUM_RETRIES,
42
+ DEFAULT_ROUTER_TIMEOUT,
43
+ DEFAULT_ROUTING_STRATEGY,
44
+ DEFAULT_SCAN_MAX_DEPTH,
45
+ DEFAULT_SCAN_MAX_FILES,
46
+ DEFAULT_SCREENSHOT_MAX_HEIGHT,
47
+ DEFAULT_SCREENSHOT_QUALITY,
48
+ DEFAULT_SCREENSHOT_VIEWPORT_HEIGHT,
49
+ DEFAULT_SCREENSHOT_VIEWPORT_WIDTH,
50
+ DEFAULT_STATE_FLUSH_INTERVAL_SECONDS,
51
+ DEFAULT_URL_CONCURRENCY,
52
+ )
53
+
54
+
55
class EnvVarNotFoundError(ValueError):
    """Error for an ``env:``-style reference whose environment variable is missing.

    Attributes:
        var_name: Name of the environment variable that could not be found.
    """

    def __init__(self, var_name: str) -> None:
        message = f"Environment variable not found: {var_name}"
        super().__init__(message)
        # Kept on the instance so callers can report which variable is missing.
        self.var_name = var_name
61
+
62
+
63
+ def resolve_env_value(value: str, strict: bool = True) -> str | None:
64
+ """Resolve env:VAR_NAME syntax to actual environment variable value.
65
+
66
+ Args:
67
+ value: The value to resolve. If starts with "env:", looks up environment variable.
68
+ strict: If True, raises EnvVarNotFoundError when variable not found.
69
+ If False, returns None when variable not found.
70
+
71
+ Returns:
72
+ The resolved value, or None if env var not found and strict=False.
73
+
74
+ Raises:
75
+ EnvVarNotFoundError: If strict=True and environment variable not found.
76
+ """
77
+ if isinstance(value, str) and value.startswith("env:"):
78
+ env_var = value[4:]
79
+ env_value = os.environ.get(env_var)
80
+ if env_value is None:
81
+ if strict:
82
+ raise EnvVarNotFoundError(env_var)
83
+ return None
84
+ return env_value
85
+ return value
86
+
87
+
88
class OutputConfig(BaseModel):
    """Settings for where and how output is written."""

    # Output directory; default comes from markitai.constants.
    dir: str = DEFAULT_OUTPUT_DIR
    # Conflict policy; restricted to the three listed values.
    on_conflict: Literal[
        "skip",
        "overwrite",
        "rename",
    ] = DEFAULT_ON_CONFLICT
    # Symlinks are disallowed unless explicitly enabled.
    allow_symlinks: bool = False
94
+
95
+
96
class LiteLLMParams(BaseModel):
    """Per-model parameters handed to LiteLLM."""

    model: str
    api_key: str | None = None
    api_base: str | None = None
    weight: int = DEFAULT_MODEL_WEIGHT
    # Override max_output_tokens for this model
    max_tokens: int | None = None

    def get_resolved_api_key(self, strict: bool = True) -> str | None:
        """Return the API key after expanding any ``env:`` reference.

        Args:
            strict: If True, a missing environment variable raises
                EnvVarNotFoundError; if False, None is returned instead.

        Returns:
            The resolved API key, or None when no key is configured or the
            referenced environment variable is missing (non-strict mode).

        Raises:
            EnvVarNotFoundError: If strict=True and the environment variable
                is not found.
        """
        # An unset or empty key means there is nothing to resolve.
        if not self.api_key:
            return None
        return resolve_env_value(self.api_key, strict=strict)
121
+
122
+
123
class ModelInfo(BaseModel):
    """Model metadata.

    Every field is optional; per the original note, unset values are
    auto-detected from litellm.
    """

    supports_vision: bool | None = None
    max_tokens: int | None = None
    max_input_tokens: int | None = None
129
+
130
+
131
class ModelConfig(BaseModel):
    """One model entry for the LiteLLM Router."""

    # Both of these are required (no default).
    model_name: str
    litellm_params: LiteLLMParams
    # Optional metadata block; None when not provided.
    model_info: ModelInfo | None = None
137
+
138
+
139
class RouterSettings(BaseModel):
    """Settings for the LiteLLM Router."""

    # Only these four strategy names are accepted.
    routing_strategy: Literal[
        "simple-shuffle",
        "least-busy",
        "usage-based-routing",
        "latency-based-routing",
    ] = DEFAULT_ROUTING_STRATEGY
    num_retries: int = DEFAULT_ROUTER_NUM_RETRIES
    timeout: int = DEFAULT_ROUTER_TIMEOUT
    # Fresh empty list per instance.
    fallbacks: list[dict[str, Any]] = Field(default_factory=list)
148
+
149
+
150
class LLMConfig(BaseModel):
    """Top-level LLM settings."""

    # LLM features are off unless explicitly enabled.
    enabled: bool = False
    # No models are configured by default.
    model_list: list[ModelConfig] = Field(default_factory=list)
    router_settings: RouterSettings = Field(default_factory=RouterSettings)
    concurrency: int = DEFAULT_LLM_CONCURRENCY
157
+
158
+
159
class ImageFilterConfig(BaseModel):
    """Thresholds and switches used when filtering images."""

    # Minimum-size thresholds; defaults come from markitai.constants.
    min_width: int = DEFAULT_IMAGE_FILTER_MIN_WIDTH
    min_height: int = DEFAULT_IMAGE_FILTER_MIN_HEIGHT
    min_area: int = DEFAULT_IMAGE_FILTER_MIN_AREA
    # Deduplication is on by default.
    deduplicate: bool = True
166
+
167
+
168
class ImageConfig(BaseModel):
    """Settings for image processing."""

    # Generate alt text for images via LLM
    alt_enabled: bool = False
    # Generate description files for images
    desc_enabled: bool = False
    compress: bool = True
    # Validated to lie within 1..100 inclusive.
    quality: int = Field(default=DEFAULT_IMAGE_QUALITY, ge=1, le=100)
    format: Literal["jpeg", "png", "webp"] = DEFAULT_IMAGE_FORMAT
    max_width: int = DEFAULT_IMAGE_MAX_WIDTH
    max_height: int = DEFAULT_IMAGE_MAX_HEIGHT
    # Nested filter settings; a default ImageFilterConfig per instance.
    filter: ImageFilterConfig = Field(default_factory=ImageFilterConfig)
179
+
180
+
181
class OCRConfig(BaseModel):
    """Settings for OCR."""

    # OCR is off unless explicitly enabled.
    enabled: bool = False
    # Language setting; default comes from markitai.constants.
    lang: str = DEFAULT_OCR_LANG
186
+
187
+
188
class ScreenshotConfig(BaseModel):
    """Settings for screenshot rendering.

    PDF/PPTX inputs: pages are rendered as JPEG images.
    URL inputs: full-page screenshots are captured using agent-browser.
    """

    enabled: bool = False

    # URL screenshot settings
    viewport_width: int = DEFAULT_SCREENSHOT_VIEWPORT_WIDTH
    viewport_height: int = DEFAULT_SCREENSHOT_VIEWPORT_HEIGHT
    # Validated to lie within 1..100 inclusive.
    quality: int = Field(default=DEFAULT_SCREENSHOT_QUALITY, ge=1, le=100)
    # Max height for full-page URL screenshots
    max_height: int = DEFAULT_SCREENSHOT_MAX_HEIGHT
203
+
204
+
205
class PromptsConfig(BaseModel):
    """Settings for prompts.

    Besides ``dir``, every field is an optional string that defaults to None.
    """

    dir: str = DEFAULT_PROMPTS_DIR
    cleaner: str | None = None
    frontmatter: str | None = None
    image_caption: str | None = None
    image_description: str | None = None
    # Combined caption + description
    image_analysis: str | None = None
    # Page content extraction
    page_content: str | None = None
    # Document enhancement with vision
    document_enhance: str | None = None
    # URL/web page content enhancement
    url_enhance: str | None = None
217
+
218
+
219
class BatchConfig(BaseModel):
    """Settings for batch processing.

    The concurrency and scan limits are validated to be at least 1.
    """

    concurrency: int = Field(default=DEFAULT_BATCH_CONCURRENCY, ge=1)
    # Separate concurrency for URL fetching
    url_concurrency: int = Field(default=DEFAULT_URL_CONCURRENCY, ge=1)
    # No lower bound is enforced on this one.
    state_flush_interval_seconds: int = DEFAULT_STATE_FLUSH_INTERVAL_SECONDS
    scan_max_depth: int = Field(default=DEFAULT_SCAN_MAX_DEPTH, ge=1)
    scan_max_files: int = Field(default=DEFAULT_SCAN_MAX_FILES, ge=1)
229
+
230
+
231
class LogConfig(BaseModel):
    """Settings for logging."""

    # Only the five standard level names are accepted.
    level: Literal[
        "DEBUG",
        "INFO",
        "WARNING",
        "ERROR",
        "CRITICAL",
    ] = DEFAULT_LOG_LEVEL
    # May be None.
    dir: str | None = DEFAULT_LOG_DIR
    rotation: str = DEFAULT_LOG_ROTATION
    retention: str = DEFAULT_LOG_RETENTION
238
+
239
+
240
class CacheConfig(BaseModel):
    """Cache configuration."""

    enabled: bool = True
    # Skip reading cache but still write (Bun semantics)
    no_cache: bool = False
    # Patterns to skip cache (glob, relative to input_dir).
    # Declared with default_factory, like every other list field in this
    # module, instead of a bare mutable ``[]`` literal.
    no_cache_patterns: list[str] = Field(default_factory=list)
    max_size_bytes: int = DEFAULT_CACHE_SIZE_LIMIT
    global_dir: str = DEFAULT_GLOBAL_CACHE_DIR
250
+
251
+
252
class AgentBrowserConfig(BaseModel):
    """Settings for agent-browser, used for JS-rendered pages."""

    command: str = DEFAULT_AGENT_BROWSER_COMMAND
    # milliseconds
    timeout: int = DEFAULT_AGENT_BROWSER_TIMEOUT
    # Only these three values are accepted.
    wait_for: Literal[
        "load",
        "domcontentloaded",
        "networkidle",
    ] = DEFAULT_AGENT_BROWSER_WAIT_FOR
    # Extra wait after load
    extra_wait_ms: int = DEFAULT_AGENT_BROWSER_EXTRA_WAIT_MS
    # Optional session name for isolated browser instances
    session: str | None = None
262
+
263
+
264
class JinaConfig(BaseModel):
    """Settings for the Jina Reader API."""

    # Supports env: syntax
    api_key: str | None = None
    # seconds
    timeout: int = DEFAULT_JINA_TIMEOUT

    def get_resolved_api_key(self, strict: bool = False) -> str | None:
        """Return the API key after expanding any ``env:`` reference.

        Args:
            strict: If True, a missing environment variable raises
                EnvVarNotFoundError. If False (default), None is returned.

        Returns:
            The resolved API key, or None when no key is configured or the
            referenced environment variable is missing (non-strict mode).

        Raises:
            EnvVarNotFoundError: If strict=True and the environment variable
                is not found.
        """
        # An unset or empty key means there is nothing to resolve.
        if not self.api_key:
            return None
        return resolve_env_value(self.api_key, strict=strict)
283
+
284
+
285
class FetchConfig(BaseModel):
    """URL fetch settings covering static and JS-rendered pages."""

    # Only these four strategy names are accepted.
    strategy: Literal[
        "auto",
        "static",
        "browser",
        "jina",
    ] = DEFAULT_FETCH_STRATEGY
    agent_browser: AgentBrowserConfig = Field(default_factory=AgentBrowserConfig)
    jina: JinaConfig = Field(default_factory=JinaConfig)
    # Each instance gets its own list copied from the shared default patterns.
    fallback_patterns: list[str] = Field(
        default_factory=lambda: list(DEFAULT_FETCH_FALLBACK_PATTERNS),
    )
294
+
295
+
296
class PresetConfig(BaseModel):
    """A preset: a set of feature switches.

    Every switch is off by default.
    """

    llm: bool = False
    ocr: bool = False
    alt: bool = False
    desc: bool = False
    screenshot: bool = False
304
+
305
+
306
# Built-in preset definitions, keyed by preset name.
BUILTIN_PRESETS: dict[str, PresetConfig] = {
    # Everything except OCR.
    "rich": PresetConfig(
        llm=True,
        alt=True,
        desc=True,
        screenshot=True,
    ),
    # Same as "rich" but without screenshots.
    "standard": PresetConfig(
        llm=True,
        alt=True,
        desc=True,
    ),
    # All switches left at their defaults (off).
    "minimal": PresetConfig(),
}
312
+
313
+
314
class MarkitaiConfig(BaseModel):
    """Root configuration model.

    Each section is a nested model created with its own defaults when the
    section is not supplied.
    """

    output: OutputConfig = Field(default_factory=OutputConfig)
    llm: LLMConfig = Field(default_factory=LLMConfig)
    image: ImageConfig = Field(default_factory=ImageConfig)
    ocr: OCRConfig = Field(default_factory=OCRConfig)
    screenshot: ScreenshotConfig = Field(default_factory=ScreenshotConfig)
    prompts: PromptsConfig = Field(default_factory=PromptsConfig)
    batch: BatchConfig = Field(default_factory=BatchConfig)
    log: LogConfig = Field(default_factory=LogConfig)
    cache: CacheConfig = Field(default_factory=CacheConfig)
    fetch: FetchConfig = Field(default_factory=FetchConfig)
    # Presets by name; empty by default.
    presets: dict[str, PresetConfig] = Field(default_factory=dict)
328
+
329
+
330
+ def _deep_update(base: dict[str, Any], updates: dict[str, Any]) -> dict[str, Any]:
331
+ """Deep merge updates into base dict, preserving base structure.
332
+
333
+ Only updates keys that exist in updates, preserving other keys in base.
334
+ """
335
+ result = base.copy()
336
+ for key, value in updates.items():
337
+ if key in result and isinstance(result[key], dict) and isinstance(value, dict):
338
+ result[key] = _deep_update(result[key], value)
339
+ else:
340
+ result[key] = value
341
+ return result
342
+
343
+
344
+ def _set_nested_value(data: dict[str, Any], key_path: str, value: Any) -> None:
345
+ """Set a nested value in a dict using dot-separated key path.
346
+
347
+ Creates intermediate dicts if they don't exist.
348
+ """
349
+ parts = key_path.split(".")
350
+ current = data
351
+ for part in parts[:-1]:
352
+ if part not in current or not isinstance(current[part], dict):
353
+ current[part] = {}
354
+ current = current[part]
355
+ current[parts[-1]] = value
356
+
357
+
358
class ConfigManager:
    """Configuration manager for loading, querying, mutating and saving configs.

    The manager keeps the validated ``MarkitaiConfig``, the path it was loaded
    from (if any), the raw JSON data as read from disk, and the set of dotted
    key paths changed through ``set()``. The raw data and the modified-key set
    allow ``save()`` to write only what changed.
    """

    CONFIG_FILENAME = CONFIG_FILENAME
    DEFAULT_USER_CONFIG_DIR = Path.home() / ".markitai"

    def __init__(self) -> None:
        self._config: MarkitaiConfig | None = None
        self._config_path: Path | None = None
        self._raw_data: dict[str, Any] = {}  # Preserve original JSON structure
        self._modified_keys: set[str] = set()  # Track modified key paths

    @property
    def config(self) -> MarkitaiConfig:
        """Get current configuration, loading if necessary."""
        if self._config is None:
            self._config = self.load()
        return self._config

    @property
    def config_path(self) -> Path | None:
        """Get the path of the loaded configuration file (None if none was loaded)."""
        return self._config_path

    def load(
        self,
        config_path: Path | str | None = None,
        env_override: bool = True,
    ) -> MarkitaiConfig:
        """Load configuration from file with fallback chain.

        Priority (highest to lowest):
        1. Explicit config_path parameter
        2. MARKITAI_CONFIG environment variable
        3. ./markitai.json (current directory)
        4. ~/.markitai/config.json (user directory)
        5. Default values

        Args:
            config_path: Explicit configuration file to load.
            env_override: Whether the MARKITAI_CONFIG environment variable is
                consulted when no explicit path is given.

        Returns:
            The validated configuration. Defaults are used when no existing
            file is resolved.
        """
        config_data: dict[str, Any] = {}
        loaded_path: Path | None = None

        # Determine config file path
        resolved_path = self._resolve_config_path(config_path, env_override)

        if resolved_path and resolved_path.exists():
            config_data = self._load_json(resolved_path)
            loaded_path = resolved_path

        # Always overwrite the remembered path so that a reload which finds no
        # file does not keep pointing at a previously loaded one while
        # _raw_data is reset below.
        self._config_path = loaded_path

        # Preserve original JSON structure for minimal-diff saves
        self._raw_data = config_data.copy()
        self._modified_keys.clear()

        self._config = MarkitaiConfig.model_validate(config_data)
        return self._config

    def _resolve_config_path(
        self,
        config_path: Path | str | None,
        env_override: bool,
    ) -> Path | None:
        """Resolve configuration file path based on priority.

        Explicit and environment-provided paths are returned without checking
        that they exist; the current-directory and user-directory candidates
        are returned only when the file exists.
        """
        # 1. Explicit path
        if config_path:
            return Path(config_path)

        # 2. Environment variable
        if env_override:
            env_path = os.environ.get("MARKITAI_CONFIG")
            if env_path:
                return Path(env_path)

        # 3. Current directory
        cwd_config = Path.cwd() / self.CONFIG_FILENAME
        if cwd_config.exists():
            return cwd_config

        # 4. User directory
        user_config = self.DEFAULT_USER_CONFIG_DIR / "config.json"
        if user_config.exists():
            return user_config

        return None

    def _load_json(self, path: Path) -> dict[str, Any]:
        """Load JSON configuration file."""
        with open(path, encoding="utf-8") as f:
            return json.load(f)

    def _generate_minimal_config(self) -> dict[str, Any]:
        """Generate minimal template config for init command.

        Only includes essential fields that users typically need to configure.
        Auto-detectable values (max_tokens, supports_vision) are omitted.
        """
        return {
            "output": {"dir": "./output"},
            "llm": {
                "enabled": False,
                "model_list": [
                    {
                        "model_name": "default",
                        "litellm_params": {
                            "model": "gemini/gemini-2.5-flash",
                            "api_key": "env:GEMINI_API_KEY",
                        },
                    }
                ],
            },
            "image": {
                "compress": True,
                "quality": 75,
            },
        }

    @staticmethod
    def _to_jsonable(value: Any) -> Any:
        """Convert pydantic models nested in ``value`` to JSON-compatible data."""
        if isinstance(value, BaseModel):
            return value.model_dump(mode="json")
        if isinstance(value, dict):
            return {k: ConfigManager._to_jsonable(v) for k, v in value.items()}
        if isinstance(value, (list, tuple)):
            return [ConfigManager._to_jsonable(item) for item in value]
        return value

    def save(
        self,
        path: Path | str | None = None,
        full_dump: bool = False,
        minimal: bool = False,
    ) -> Path:
        """Save current configuration to file.

        Args:
            path: Optional path to save to. If None, uses loaded config path,
                falling back to ``config.json`` in the user config directory.
                If it is an existing directory, ``markitai.json`` inside it
                is used.
            full_dump: If True, dumps entire config including defaults.
                If False (default), only updates modified keys in original JSON.
            minimal: If True, generates a minimal template config (for init
                command). Takes precedence over ``full_dump``.

        Returns:
            The path the configuration was written to.
        """
        if self._config is None:
            self._config = MarkitaiConfig()

        save_path = Path(path) if path else self._config_path
        if save_path is None:
            save_path = self.DEFAULT_USER_CONFIG_DIR / "config.json"
        elif save_path.is_dir():
            # If path is a directory, append default filename
            save_path = save_path / "markitai.json"

        save_path.parent.mkdir(parents=True, exist_ok=True)

        if minimal:
            # Generate minimal template config for init command
            output_data = self._generate_minimal_config()
        elif full_dump:
            # Full dump for init command or explicit full export
            output_data = self._config.model_dump(mode="json")
        else:
            # Minimal-diff save: only update modified keys in original JSON.
            # Keys are applied in sorted order so newly created entries land
            # in a stable position, and values are converted first because
            # get() may hand back pydantic models for section-level keys.
            output_data = self._raw_data.copy()
            for key in sorted(self._modified_keys):
                _set_nested_value(
                    output_data, key, self._to_jsonable(self.get(key))
                )

        # Serialize before opening the file: a serialization error must not
        # leave an already-truncated config file behind.
        payload = json.dumps(output_data, indent=2, ensure_ascii=False)

        with open(save_path, "w", encoding="utf-8") as f:
            f.write(payload)
            f.write("\n")  # Trailing newline for POSIX compliance

        return save_path

    def get(self, key: str, default: Any = None) -> Any:
        """Get a configuration value by dot-separated key path.

        Example: config_manager.get("llm.enabled")

        Returns:
            The value found at ``key``, or ``default`` when any step of the
            path is missing, is None, or cannot be descended into.
        """
        value: Any = self.config

        for part in key.split("."):
            if isinstance(value, BaseModel):
                value = getattr(value, part, None)
            elif isinstance(value, dict):
                value = value.get(part)
            else:
                return default

            if value is None:
                return default

        return value

    def set(self, key: str, value: Any) -> None:
        """Set a configuration value by dot-separated key path.

        Example: config_manager.set("llm.enabled", True)

        The key is recorded for minimal-diff saving only after the value has
        actually been assigned, so failed or no-op calls do not cause
        ``save()`` to write an entry for them.

        Raises:
            AttributeError: If an intermediate path segment is not an
                attribute of the model being traversed.
            KeyError: If an intermediate path segment is missing from a dict
                being traversed.
        """
        *parent_parts, final_key = key.split(".")

        # Navigate to parent
        parent: Any = self.config
        for part in parent_parts:
            if isinstance(parent, BaseModel):
                parent = getattr(parent, part)
            elif isinstance(parent, dict):
                parent = parent[part]

        # Set the value
        if isinstance(parent, BaseModel):
            setattr(parent, final_key, value)
        elif isinstance(parent, dict):
            parent[final_key] = value
        else:
            # The path runs through a plain value; nothing can be assigned.
            return

        # Track modified key for minimal-diff save
        self._modified_keys.add(key)

    @staticmethod
    def _cli_key_to_path(root: Any, key: str) -> str:
        """Translate an underscore-joined CLI name into a dotted config path.

        Walks the config model and, at each level, consumes the longest run of
        underscore-separated tokens that names a field of the current model.
        This keeps field names that contain underscores intact, e.g.
        ``output_on_conflict`` -> ``output.on_conflict``. When the name cannot
        be matched against the model, every underscore is replaced by a dot.
        """
        fallback = key.replace("_", ".")
        tokens = key.split("_")
        segments: list[str] = []
        node: Any = root
        start = 0
        while start < len(tokens):
            if not isinstance(node, BaseModel):
                return fallback
            fields = type(node).model_fields
            # Try the longest candidate first so multi-word field names win.
            for end in range(len(tokens), start, -1):
                candidate = "_".join(tokens[start:end])
                if candidate in fields:
                    break
            else:
                return fallback
            segments.append(candidate)
            node = getattr(node, candidate)
            start = end
        return ".".join(segments)

    def merge_cli_args(self, **kwargs: Any) -> None:
        """Merge CLI arguments into configuration.

        Arguments whose value is None are ignored. Every other argument name
        is converted to a config path (e.g. ``output_dir`` -> ``output.dir``)
        and applied through ``set()``.
        """
        for key, value in kwargs.items():
            if value is not None:
                config_key = self._cli_key_to_path(self.config, key)
                self.set(config_key, value)
574
+
575
+
576
# Module-level ConfigManager instance shared through get_config().
config_manager = ConfigManager()
578
+
579
+
580
def get_config() -> MarkitaiConfig:
    """Return the configuration held by the module-level ``config_manager``.

    Accessing the manager's ``config`` property loads the configuration on
    first use.
    """
    current = config_manager.config
    return current
583
+
584
+
585
def get_preset(name: str, config: MarkitaiConfig | None = None) -> PresetConfig | None:
    """Look up a preset by name.

    Presets defined in ``config`` take precedence; built-in presets are the
    fallback.

    Args:
        name: Preset name (e.g., "rich", "standard", "minimal")
        config: Optional config whose ``presets`` mapping is searched first

    Returns:
        The matching PresetConfig, or None when the name is unknown.
    """
    # No config, or the config does not define this name: use the built-ins.
    if not config or name not in config.presets:
        return BUILTIN_PRESETS.get(name)

    return config.presets[name]