markitai 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. markitai/__init__.py +3 -0
  2. markitai/batch.py +1316 -0
  3. markitai/cli.py +3979 -0
  4. markitai/config.py +602 -0
  5. markitai/config.schema.json +748 -0
  6. markitai/constants.py +222 -0
  7. markitai/converter/__init__.py +49 -0
  8. markitai/converter/_patches.py +98 -0
  9. markitai/converter/base.py +164 -0
  10. markitai/converter/image.py +181 -0
  11. markitai/converter/legacy.py +606 -0
  12. markitai/converter/office.py +526 -0
  13. markitai/converter/pdf.py +679 -0
  14. markitai/converter/text.py +63 -0
  15. markitai/fetch.py +1725 -0
  16. markitai/image.py +1335 -0
  17. markitai/json_order.py +550 -0
  18. markitai/llm.py +4339 -0
  19. markitai/ocr.py +347 -0
  20. markitai/prompts/__init__.py +159 -0
  21. markitai/prompts/cleaner.md +93 -0
  22. markitai/prompts/document_enhance.md +77 -0
  23. markitai/prompts/document_enhance_complete.md +65 -0
  24. markitai/prompts/document_process.md +60 -0
  25. markitai/prompts/frontmatter.md +28 -0
  26. markitai/prompts/image_analysis.md +21 -0
  27. markitai/prompts/image_caption.md +8 -0
  28. markitai/prompts/image_description.md +13 -0
  29. markitai/prompts/page_content.md +17 -0
  30. markitai/prompts/url_enhance.md +78 -0
  31. markitai/security.py +286 -0
  32. markitai/types.py +30 -0
  33. markitai/urls.py +187 -0
  34. markitai/utils/__init__.py +33 -0
  35. markitai/utils/executor.py +69 -0
  36. markitai/utils/mime.py +85 -0
  37. markitai/utils/office.py +262 -0
  38. markitai/utils/output.py +53 -0
  39. markitai/utils/paths.py +81 -0
  40. markitai/utils/text.py +359 -0
  41. markitai/workflow/__init__.py +37 -0
  42. markitai/workflow/core.py +760 -0
  43. markitai/workflow/helpers.py +509 -0
  44. markitai/workflow/single.py +369 -0
  45. markitai-0.3.0.dist-info/METADATA +159 -0
  46. markitai-0.3.0.dist-info/RECORD +48 -0
  47. markitai-0.3.0.dist-info/WHEEL +4 -0
  48. markitai-0.3.0.dist-info/entry_points.txt +2 -0
markitai/cli.py ADDED
@@ -0,0 +1,3979 @@
1
+ """Command-line interface for Markitai."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import json
7
+ import logging
8
+ import os
9
+ import re
10
+ import sys
11
+ import tempfile
12
+ import warnings
13
+ from collections.abc import Callable
14
+ from pathlib import Path
15
+ from typing import TYPE_CHECKING, Any
16
+ from urllib.parse import urlparse
17
+
18
+ if TYPE_CHECKING:
19
+ from markitai.fetch import FetchCache, FetchStrategy
20
+ from markitai.llm import ImageAnalysis, LLMProcessor
21
+
22
+ # Suppress noisy messages before imports
23
+ os.environ.setdefault("PYMUPDF_SUGGEST_LAYOUT_ANALYZER", "0")
24
+ warnings.filterwarnings("ignore", category=UserWarning, module="pydantic")
25
+ # Suppress litellm async client cleanup warning (harmless, occurs at exit)
26
+ warnings.filterwarnings(
27
+ "ignore",
28
+ message="coroutine 'close_litellm_async_clients' was never awaited",
29
+ category=RuntimeWarning,
30
+ )
31
+
32
+ import click
33
+ from dotenv import load_dotenv
34
+
35
+ # Load .env file from current directory and parent directories
36
+ load_dotenv()
37
+
38
+ from click import Context
39
+ from loguru import logger
40
+ from rich.console import Console
41
+ from rich.panel import Panel
42
+ from rich.syntax import Syntax
43
+
44
+ from markitai import __version__
45
+ from markitai.config import ConfigManager, MarkitaiConfig
46
+ from markitai.constants import (
47
+ DEFAULT_MAX_IMAGES_PER_BATCH,
48
+ IMAGE_EXTENSIONS,
49
+ MAX_DOCUMENT_SIZE,
50
+ )
51
+ from markitai.converter import FileFormat, detect_format
52
+ from markitai.converter.base import EXTENSION_MAP
53
+ from markitai.image import ImageProcessor
54
+ from markitai.json_order import order_report
55
+ from markitai.security import (
56
+ atomic_write_json,
57
+ atomic_write_text,
58
+ validate_file_size,
59
+ )
60
+ from markitai.utils.output import resolve_output_path
61
+ from markitai.utils.paths import ensure_dir, ensure_screenshots_dir
62
+ from markitai.workflow.helpers import (
63
+ add_basic_frontmatter as _add_basic_frontmatter,
64
+ )
65
+ from markitai.workflow.helpers import (
66
+ create_llm_processor,
67
+ write_images_json,
68
+ )
69
+ from markitai.workflow.helpers import (
70
+ detect_language as _detect_language,
71
+ )
72
+ from markitai.workflow.helpers import (
73
+ merge_llm_usage as _merge_llm_usage,
74
+ )
75
+ from markitai.workflow.single import ImageAnalysisResult
76
+
77
+ console = Console()
78
+ # Separate stderr console for status/progress (doesn't mix with stdout output)
79
+ stderr_console = Console(stderr=True)
80
+
81
+
82
class ProgressReporter:
    """Progress reporter for single file/URL conversion.

    When enabled (non-verbose mode) it:
    1. Shows a spinner on stderr during conversion/processing stages
    2. Prints a dimmed completion message after each stage
    3. Erases those messages before the final result is printed

    When disabled (verbose mode) every method is a no-op, because logging
    already provides the feedback.
    """

    def __init__(self, enabled: bool = True):
        """Initialize progress reporter.

        Args:
            enabled: Whether to show progress (False in verbose mode)
        """
        self.enabled = enabled
        # Active rich status (spinner) object, or None when nothing is spinning
        self._status = None
        # Messages printed so far; their count drives how many lines get erased
        self._messages: list[str] = []

    def start_spinner(self, message: str) -> None:
        """Start showing a spinner with message, replacing any running one."""
        if not self.enabled:
            return
        self.stop_spinner()  # Stop any existing spinner
        self._status = stderr_console.status(f"[cyan]{message}[/cyan]", spinner="dots")
        self._status.start()

    def stop_spinner(self) -> None:
        """Stop the current spinner, if any."""
        if self._status is not None:
            self._status.stop()
            self._status = None

    def log(self, message: str) -> None:
        """Print a progress message and remember it for later clearing."""
        if not self.enabled:
            return
        self.stop_spinner()
        self._messages.append(message)
        stderr_console.print(f"[dim]{message}[/dim]")

    def clear_and_finish(self) -> None:
        """Clear all progress output before printing final result.

        Uses ANSI escape codes to move the cursor up and clear one line per
        logged message. The escapes are only emitted when stderr is a
        terminal; when stderr is piped or redirected they would end up as
        raw control bytes in the captured output, so the messages are left
        in place instead.
        """
        if not self.enabled:
            return
        self.stop_spinner()

        if self._messages and stderr_console.is_terminal:
            # One "cursor up + erase line" sequence per message.
            # NOTE(review): a message that wraps over several terminal rows
            # is still counted as one line here.
            stderr_console.file.write("\033[A\033[2K" * len(self._messages))
            stderr_console.file.flush()
        self._messages.clear()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Always stop the spinner; returning False lets exceptions propagate.
        self.stop_spinner()
        return False
149
+
150
+
151
+ # URL pattern for detecting URLs
152
_URL_PATTERN = re.compile(r"^https?://", re.IGNORECASE)


def is_url(s: str) -> bool:
    """Return True when *s* begins with an http:// or https:// scheme.

    The scheme comparison is case-insensitive; no further URL validation
    is performed.
    """
    return _URL_PATTERN.match(s) is not None
158
+
159
+
160
def url_to_filename(url: str) -> str:
    """Generate a safe Markdown filename from a URL.

    The last path segment is used when the URL has a non-empty path; the
    query string and fragment are ignored. When the path is empty (or
    consists only of slashes) the host is used instead, with "." and ":"
    replaced by "_".

    Examples:
        https://example.com/page.html -> page.html.md
        https://example.com/path/to/doc -> doc.md
        https://example.com/ -> example_com.md
        https://youtube.com/watch?v=abc -> watch.md

    Args:
        url: URL to derive the filename from.

    Returns:
        Sanitized filename ending in ".md".
    """
    parsed = urlparse(url)

    path = parsed.path.rstrip("/")
    if path:
        # After rstrip("/") the path cannot end in a slash, so its last
        # segment is always non-empty.
        last_segment = path.rsplit("/", 1)[-1]
        return f"{_sanitize_filename(last_segment)}.md"

    # No usable path: fall back to the host name (and port, if any)
    domain = parsed.netloc.replace(".", "_").replace(":", "_")
    return f"{_sanitize_filename(domain)}.md"
187
+
188
+
189
+ def _sanitize_filename(name: str) -> str:
190
+ """Sanitize filename for cross-platform compatibility.
191
+
192
+ Removes or replaces characters that are invalid on Windows/Linux/macOS.
193
+ """
194
+ # Characters invalid on Windows: \ / : * ? " < > |
195
+ # Also replace other problematic characters
196
+ invalid_chars = r'<>:"/\|?*'
197
+ for char in invalid_chars:
198
+ name = name.replace(char, "_")
199
+ # Remove leading/trailing spaces and dots (Windows issue)
200
+ name = name.strip(". ")
201
+ # Limit length (255 is common max, but leave room for .md extension)
202
+ if len(name) > 200:
203
+ name = name[:200]
204
+ return name or "unnamed"
205
+
206
+
207
+ # Import shared ThreadPoolExecutor shutdown function from utils.executor
208
+ # This module provides a global executor shared across all conversion operations
209
+ from markitai.utils.executor import shutdown_converter_executor
210
+
211
+
212
def compute_task_hash(
    input_path: Path,
    output_dir: Path,
    options: dict[str, Any] | None = None,
) -> str:
    """Derive a short, stable identifier for a conversion task.

    The identifier is built from the resolved input path, the resolved
    output directory and the subset of options that influence the output
    (llm, ocr, screenshot, alt, desc). Any other option keys are ignored,
    so two tasks differing only in unrelated options share a hash.

    Args:
        input_path: Input file or directory path
        output_dir: Output directory path
        options: Task options dict (llm, ocr, etc.)

    Returns:
        6-character hex hash string
    """
    import hashlib

    relevant_keys = ("llm", "ocr", "screenshot", "alt", "desc")

    # Keep only the options that affect what gets produced
    selected: dict[str, Any] = {}
    if options:
        for name, value in options.items():
            if name in relevant_keys:
                selected[name] = value

    # sort_keys makes the serialized form independent of insertion order
    payload = json.dumps(
        {
            "input": str(input_path.resolve()),
            "output": str(output_dir.resolve()),
            "options": selected,
        },
        sort_keys=True,
    )
    digest = hashlib.md5(payload.encode()).hexdigest()
    return digest[:6]
259
+
260
+
261
def get_report_file_path(
    output_dir: Path,
    task_hash: str,
    on_conflict: str = "rename",
) -> Path:
    """Work out where the report for a task should be written.

    The default location is ``reports/markitai.<hash>.report.json`` under
    the output directory. If that file already exists, "skip" and
    "overwrite" still return the same path (the caller decides what to do
    with it), while any other strategy is treated as rename and yields the
    first free ``markitai.<hash>.v<N>.report.json`` with N starting at 2.

    Args:
        output_dir: Output directory
        task_hash: Task hash string
        on_conflict: Conflict resolution strategy

    Returns:
        Path to the report file
    """
    reports_dir = output_dir / "reports"
    candidate = reports_dir / f"markitai.{task_hash}.report.json"

    # Free name, or a strategy that reuses the existing name
    if not candidate.exists() or on_conflict in ("skip", "overwrite"):
        return candidate

    # Rename strategy: probe versioned names until an unused one is found
    version = 2
    candidate = reports_dir / f"markitai.{task_hash}.v{version}.report.json"
    while candidate.exists():
        version += 1
        candidate = reports_dir / f"markitai.{task_hash}.v{version}.report.json"
    return candidate
296
+
297
+
298
+ # =============================================================================
299
+ # Custom CLI Group
300
+ # =============================================================================
301
+
302
+
303
class MarkitaiGroup(click.Group):
    """Custom Group that supports main command with arguments and subcommands.

    This allows:
        markitai document.docx --llm # Convert file (main command)
        markitai urls.urls -o out # URL list batch (.urls auto-detected)
        markitai config list # Subcommand
    """

    # Options that take a path argument (so we skip their values when looking for INPUT).
    # Kept as a baseline; parse_args also skips the values of every other
    # value-taking option declared on the group.
    _PATH_OPTIONS = {"-o", "--output", "-c", "--config"}

    def _value_option_names(self) -> set[str]:
        """Return every option name on this group that consumes a value."""
        names = set(self._PATH_OPTIONS)
        for param in self.params:
            # Flags and counters stand alone; everything else is followed by
            # a value that must not be mistaken for INPUT.
            if (
                isinstance(param, click.Option)
                and not param.is_flag
                and not param.count
            ):
                names.update(param.opts)
                names.update(param.secondary_opts)
        return names

    def parse_args(self, ctx: Context, args: list[str]) -> list[str]:
        """Parse arguments, detecting if first arg is a subcommand or file path.

        The first positional argument that is not a registered subcommand is
        stored in ``ctx.obj["_input_path"]`` and removed from ``args`` before
        delegating to click, so the Group does not try to resolve it as a
        command name.

        Args:
            ctx: Click context for this invocation.
            args: Raw command-line arguments.

        Returns:
            Whatever ``click.Group.parse_args`` returns for the remaining args.
        """
        # Find INPUT: first positional arg that's not:
        # - An option flag (starts with -)
        # - A subcommand
        # - A value for an option that takes one
        ctx.ensure_object(dict)
        value_options = self._value_option_names()
        skip_next = False
        input_idx = None

        for i, arg in enumerate(args):
            if skip_next:
                skip_next = False
                continue

            # "--opt value" form: the next arg is the option's value
            if arg in value_options:
                skip_next = True
                continue

            # Flags and "--opt=value" forms carry no separate value
            if arg.startswith("-"):
                continue

            # First positional argument: either a subcommand (left for click
            # to dispatch) or the INPUT (stored for the main command).
            if arg not in self.commands:
                ctx.obj["_input_path"] = arg
                input_idx = i
            break

        # Remove INPUT from args so Group doesn't treat it as subcommand
        if input_idx is not None:
            args = args[:input_idx] + args[input_idx + 1 :]

        return super().parse_args(ctx, args)

    def format_usage(
        self,
        ctx: Context,
        formatter: click.HelpFormatter,
    ) -> None:
        """Custom usage line to show INPUT argument."""
        formatter.write_usage(
            ctx.command_path,
            "[OPTIONS] INPUT [COMMAND]",
        )

    def format_help(self, ctx: Context, formatter: click.HelpFormatter) -> None:
        """Custom help formatting to show INPUT argument."""
        # Usage
        self.format_usage(ctx, formatter)

        # Help text
        self.format_help_text(ctx, formatter)

        # Arguments section
        with formatter.section("Arguments"):
            formatter.write_dl(
                [
                    (
                        "INPUT",
                        "File, directory, URL, or .urls file to convert",
                    )
                ]
            )

        # Options (not format_options which may include epilog)
        opts = []
        for param in self.get_params(ctx):
            rv = param.get_help_record(ctx)
            if rv is not None:
                opts.append(rv)
        if opts:
            with formatter.section("Options"):
                formatter.write_dl(opts)

        # Commands
        commands = []
        for name in self.list_commands(ctx):
            cmd = self.get_command(ctx, name)
            if cmd is None or cmd.hidden:
                continue
            commands.append((name, cmd.get_short_help_str(limit=formatter.width)))
        if commands:
            with formatter.section("Commands"):
                formatter.write_dl(commands)
409
+
410
+
411
+ # =============================================================================
412
+ # Utility functions
413
+ # =============================================================================
414
+
415
+
416
class LoggingContext:
    """Context manager that pauses loguru console output for a while.

    Entering the context removes the console handler; leaving it installs a
    fresh stderr handler (DEBUG when verbose, INFO otherwise) and records
    the new handler ID. Handy for batch runs where a Rich progress bar
    would otherwise fight with console log lines.

    Usage:
        logging_ctx = LoggingContext(console_handler_id, verbose)
        with logging_ctx.suspend_console():
            # Rich progress bar here - no console log conflicts
            ...
        # Console logging automatically restored
    """

    def __init__(self, console_handler_id: int | None, verbose: bool = False) -> None:
        self.original_handler_id = console_handler_id
        self.verbose = verbose
        # Starts equal to the original ID; replaced each time logging resumes
        self._current_handler_id: int | None = console_handler_id
        self._suspended = False

    @property
    def current_handler_id(self) -> int | None:
        """ID of the console handler currently tracked by this context."""
        return self._current_handler_id

    def suspend_console(self) -> LoggingContext:
        """Return this object so it can be used in a ``with`` statement."""
        return self

    def __enter__(self) -> LoggingContext:
        """Remove the console handler unless there is none or it is already suspended."""
        handler_id = self._current_handler_id
        if handler_id is None or self._suspended:
            return self
        try:
            logger.remove(handler_id)
        except ValueError:
            # Handler already removed
            return self
        self._suspended = True
        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Re-add a console handler if this context suspended one."""
        if not self._suspended:
            return
        level_name = "DEBUG" if self.verbose else "INFO"
        self._current_handler_id = logger.add(
            sys.stderr,
            level=level_name,
            format="<green>{time:HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{message}</cyan>",
        )
        self._suspended = False
465
+
466
+
467
class InterceptHandler(logging.Handler):
    """Bridge from the standard ``logging`` module into loguru.

    Attach an instance to a stdlib logger and every record it handles is
    re-emitted through loguru, so output from dependencies (litellm,
    instructor, etc.) lands in the same sinks as our own logs.
    """

    def emit(self, record: logging.LogRecord) -> None:
        # Map the stdlib level name to a loguru level; fall back to the
        # numeric level when loguru does not know the name.
        try:
            loguru_level = logger.level(record.levelname).name
        except ValueError:
            loguru_level = record.levelno

        # Walk out of the logging module's own frames so loguru attributes
        # the message to the code that originally logged it.
        caller, stack_depth = logging.currentframe(), 2
        while caller and caller.f_code.co_filename == logging.__file__:
            caller, stack_depth = caller.f_back, stack_depth + 1

        bound_logger = logger.opt(depth=stack_depth, exception=record.exc_info)
        bound_logger.log(loguru_level, record.getMessage())
490
+
491
+
492
def setup_logging(
    verbose: bool,
    log_dir: str | None = None,
    log_level: str = "DEBUG",
    rotation: str = "10 MB",
    retention: str = "7 days",
    quiet: bool = False,
) -> tuple[int | None, Path | None]:
    """Install loguru sinks for console and file output.

    All existing loguru handlers are removed first. A stderr handler is
    added unless ``quiet`` is set, a serialized file handler is added when a
    log directory is configured, and selected stdlib loggers are redirected
    into loguru.

    Args:
        verbose: Enable DEBUG level for console output (INFO otherwise).
        log_dir: Directory for log files. Supports ~ expansion.
            Overridden by the MARKITAI_LOG_DIR env var when that is set.
        log_level: Log level for file output.
        rotation: Log file rotation size.
        retention: Log file retention period.
        quiet: If True, disable console logging entirely (for single file mode).
            Logs will still be written to file if log_dir is configured.

    Returns:
        Tuple of (console_handler_id, log_file_path).
        Console handler ID is None in quiet mode; it can be used to
        temporarily disable console logging.
        Log file path is None if file logging is disabled.
    """
    from datetime import datetime

    logger.remove()

    # Console sink: skipped entirely in quiet mode
    console_handler_id: int | None = None
    if not quiet:
        console_handler_id = logger.add(
            sys.stderr,
            level="DEBUG" if verbose else "INFO",
            format="<green>{time:HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{message}</cyan>",
        )

    # A non-empty environment variable wins over the configured directory
    log_dir = os.environ.get("MARKITAI_LOG_DIR") or log_dir

    # File sink: independent of the console handler
    log_file_path: Path | None = None
    if log_dir:
        directory = Path(log_dir).expanduser()
        directory.mkdir(parents=True, exist_ok=True)
        # One file per run, named after the start time (microsecond precision)
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        log_file_path = directory / f"markitai_{stamp}.log"
        logger.add(
            log_file_path,
            level=log_level,
            rotation=rotation,
            retention=retention,
            serialize=True,
        )

    # Route stdlib logging from dependencies through loguru. Existing
    # handlers are dropped and propagation is disabled so records are not
    # also handled by the root logger.
    bridge = InterceptHandler()
    for dependency in ("LiteLLM", "LiteLLM Router", "LiteLLM Proxy", "httpx"):
        dependency_logger = logging.getLogger(dependency)
        dependency_logger.handlers.clear()
        dependency_logger.addHandler(bridge)
        dependency_logger.propagate = False

    return console_handler_id, log_file_path
562
+
563
+
564
def print_version(ctx: Context, param: Any, value: bool) -> None:
    """Click option callback: print the package version, then exit with code 0.

    Does nothing when the flag was not given or when click is in resilient
    parsing mode.
    """
    if value and not ctx.resilient_parsing:
        console.print(f"markitai {__version__}")
        ctx.exit(0)
570
+
571
+
572
+ # =============================================================================
573
+ # Main CLI app
574
+ # =============================================================================
575
+
576
+
577
+ @click.group(
578
+ cls=MarkitaiGroup,
579
+ invoke_without_command=True,
580
+ context_settings={"help_option_names": ["-h", "--help"]},
581
+ )
582
+ @click.option(
583
+ "--output",
584
+ "-o",
585
+ type=click.Path(path_type=Path),
586
+ default=Path("./output"),
587
+ help="Output directory.",
588
+ )
589
+ @click.option(
590
+ "--config",
591
+ "-c",
592
+ "config_path",
593
+ type=click.Path(exists=True, path_type=Path),
594
+ default=None,
595
+ help="Path to configuration file.",
596
+ )
597
+ @click.option(
598
+ "--preset",
599
+ "-p",
600
+ type=click.Choice(["rich", "standard", "minimal"], case_sensitive=False),
601
+ default=None,
602
+ help="Use a preset configuration (rich/standard/minimal).",
603
+ )
604
+ @click.option(
605
+ "--llm/--no-llm",
606
+ default=None,
607
+ help="Enable/disable LLM processing.",
608
+ )
609
+ @click.option(
610
+ "--alt/--no-alt",
611
+ default=None,
612
+ help="Enable/disable alt text generation for images.",
613
+ )
614
+ @click.option(
615
+ "--desc/--no-desc",
616
+ default=None,
617
+ help="Enable/disable JSON description file for images.",
618
+ )
619
+ @click.option(
620
+ "--ocr/--no-ocr",
621
+ default=None,
622
+ help="Enable/disable OCR for scanned documents.",
623
+ )
624
+ @click.option(
625
+ "--screenshot/--no-screenshot",
626
+ default=None,
627
+ help="Enable/disable page screenshots for PDF/PPTX.",
628
+ )
629
+ @click.option(
630
+ "--resume",
631
+ is_flag=True,
632
+ help="Resume interrupted batch processing.",
633
+ )
634
+ @click.option(
635
+ "--no-compress",
636
+ is_flag=True,
637
+ help="Disable image compression.",
638
+ )
639
+ @click.option(
640
+ "--no-cache",
641
+ is_flag=True,
642
+ help="Disable LLM result caching (force fresh API calls).",
643
+ )
644
+ @click.option(
645
+ "--no-cache-for",
646
+ type=str,
647
+ default=None,
648
+ help="Disable cache for specific files/patterns (comma-separated, supports glob). "
649
+ "E.g., 'file.pdf', '*.docx', '**/reports/*.pdf'.",
650
+ )
651
+ @click.option(
652
+ "--llm-concurrency",
653
+ type=int,
654
+ default=None,
655
+ help="Number of concurrent LLM requests (default from config).",
656
+ )
657
+ @click.option(
658
+ "--batch-concurrency",
659
+ "-j",
660
+ type=int,
661
+ default=None,
662
+ help="Number of concurrent batch tasks (default from config).",
663
+ )
664
+ @click.option(
665
+ "--url-concurrency",
666
+ type=int,
667
+ default=None,
668
+ help="Number of concurrent URL fetches (default from config, separate from file processing).",
669
+ )
670
+ @click.option(
671
+ "--agent-browser",
672
+ "use_agent_browser",
673
+ is_flag=True,
674
+ help="Force browser rendering for URLs via agent-browser.",
675
+ )
676
+ @click.option(
677
+ "--jina",
678
+ "use_jina",
679
+ is_flag=True,
680
+ help="Force Jina Reader API for URL fetching.",
681
+ )
682
+ @click.option(
683
+ "--verbose",
684
+ is_flag=True,
685
+ help="Enable verbose output.",
686
+ )
687
+ @click.option(
688
+ "--dry-run",
689
+ is_flag=True,
690
+ help="Preview conversion without writing files.",
691
+ )
692
+ @click.option(
693
+ "--version",
694
+ "-v",
695
+ is_flag=True,
696
+ callback=print_version,
697
+ expose_value=False,
698
+ is_eager=True,
699
+ help="Show version and exit.",
700
+ )
701
+ @click.pass_context
702
+ def app(
703
+ ctx: Context,
704
+ output: Path,
705
+ config_path: Path | None,
706
+ preset: str | None,
707
+ llm: bool | None,
708
+ alt: bool | None,
709
+ desc: bool | None,
710
+ ocr: bool | None,
711
+ screenshot: bool | None,
712
+ resume: bool,
713
+ no_compress: bool,
714
+ no_cache: bool,
715
+ no_cache_for: str | None,
716
+ batch_concurrency: int | None,
717
+ url_concurrency: int | None,
718
+ llm_concurrency: int | None,
719
+ use_agent_browser: bool,
720
+ use_jina: bool,
721
+ verbose: bool,
722
+ dry_run: bool,
723
+ ) -> None:
724
+ """Markitai - Document to Markdown converter with LLM enhancement.
725
+
726
+ Convert various document formats and URLs to Markdown with optional
727
+ LLM-powered enhancement for format optimization and image analysis.
728
+
729
+ \b
730
+ Presets:
731
+ rich - LLM + alt + desc + screenshot (complex documents)
732
+ standard - LLM + alt + desc (normal documents)
733
+ minimal - No enhancement (just convert)
734
+
735
+ \b
736
+ Examples:
737
+ markitai document.docx # Convert single file
738
+ markitai https://example.com/page # Convert web page
739
+ markitai urls.urls -o ./output/ # Batch URL processing
740
+ markitai https://youtube.com/watch?v=abc # Convert YouTube video
741
+ markitai document.pdf --preset rich # Use rich preset
742
+ markitai document.pdf --preset rich --ocr # Rich + OCR for scans
743
+ markitai document.pdf --preset rich --no-desc # Rich without desc
744
+ markitai ./docs/ -o ./output/ --resume # Batch conversion
745
+ markitai config list # Show configuration
746
+ """
747
+ # If subcommand is invoked, let it handle
748
+ if ctx.invoked_subcommand is not None:
749
+ return
750
+
751
+ # Get input path from context (set by MarkitaiGroup.parse_args)
752
+ ctx.ensure_object(dict)
753
+ input_path_str = ctx.obj.get("_input_path")
754
+
755
+ if not input_path_str:
756
+ click.echo(ctx.get_help())
757
+ ctx.exit(0)
758
+
759
+ # Check if input is a URL
760
+ is_url_input = is_url(input_path_str)
761
+
762
+ # Initialize URL list mode variables
763
+ url_entries: list = []
764
+ is_url_list_mode = False
765
+ input_path: Path | None = None
766
+
767
+ # For file/directory inputs, validate existence and check for .urls file
768
+ if not is_url_input:
769
+ input_path = Path(input_path_str)
770
+ if not input_path.exists():
771
+ console.print(f"[red]Error: Path '{input_path}' does not exist.[/red]")
772
+ ctx.exit(1)
773
+
774
+ # Auto-detect .urls file
775
+ if input_path.is_file() and input_path.suffix == ".urls":
776
+ from markitai.urls import UrlListParseError, parse_url_list
777
+
778
+ try:
779
+ url_entries = parse_url_list(input_path)
780
+ except UrlListParseError as e:
781
+ console.print(f"[red]Error parsing URL list: {e}[/red]")
782
+ ctx.exit(1)
783
+
784
+ if not url_entries:
785
+ console.print(f"[yellow]No valid URLs found in {input_path}[/yellow]")
786
+ ctx.exit(0)
787
+
788
+ is_url_list_mode = True
789
+ input_path = None # Clear input_path for URL list mode
790
+
791
+ # Load configuration first
792
+ config_manager = ConfigManager()
793
+ cfg = config_manager.load(config_path=config_path)
794
+
795
+ # Determine if we're in single file/URL mode (not batch)
796
+ # Single file/URL mode: quiet console unless --verbose is specified
797
+ # URL list mode is batch mode
798
+ is_single_mode = (
799
+ is_url_input or (input_path is not None and input_path.is_file())
800
+ ) and not is_url_list_mode
801
+ quiet_console = is_single_mode and not verbose
802
+
803
+ # Setup logging with configuration
804
+ console_handler_id, log_file_path = setup_logging(
805
+ verbose=verbose,
806
+ log_dir=cfg.log.dir,
807
+ log_level=cfg.log.level,
808
+ rotation=cfg.log.rotation,
809
+ retention=cfg.log.retention,
810
+ quiet=quiet_console,
811
+ )
812
+
813
+ # Log configuration status after logging is set up
814
+ if config_manager.config_path:
815
+ logger.info(f"[Config] Loaded from: {config_manager.config_path}")
816
+ else:
817
+ logger.warning("[Config] No config file found, using defaults")
818
+
819
+ # Warn if LLM is enabled but no models configured
820
+ if cfg.llm.enabled and not cfg.llm.model_list:
821
+ logger.warning(
822
+ "[Config] LLM enabled but no models configured. "
823
+ "Add models to llm.model_list in config file or specify -c <config_path>"
824
+ )
825
+ elif cfg.llm.enabled and cfg.llm.model_list:
826
+ model_names = [m.litellm_params.model for m in cfg.llm.model_list]
827
+ unique_models = set(model_names)
828
+ logger.debug(
829
+ f"[Config] LLM models configured: {len(model_names)} entries, "
830
+ f"{len(unique_models)} unique models"
831
+ )
832
+
833
+ # Store handler ID, log file path and verbose in context for batch processing
834
+ ctx.obj["_console_handler_id"] = console_handler_id
835
+ ctx.obj["_log_file_path"] = log_file_path
836
+ ctx.obj["_verbose"] = verbose
837
+
838
+ # Apply preset first (if specified)
839
+ from markitai.config import get_preset
840
+
841
+ if preset:
842
+ preset_config = get_preset(preset, cfg)
843
+ if preset_config:
844
+ # Apply preset values as base
845
+ cfg.llm.enabled = preset_config.llm
846
+ cfg.image.alt_enabled = preset_config.alt
847
+ cfg.image.desc_enabled = preset_config.desc
848
+ cfg.ocr.enabled = preset_config.ocr
849
+ cfg.screenshot.enabled = preset_config.screenshot
850
+ logger.debug(f"Applied preset: {preset}")
851
+ else:
852
+ console.print(f"[yellow]Warning: Unknown preset '{preset}'[/yellow]")
853
+
854
+ # Override with explicit CLI options (--flag or --no-flag)
855
+ # None means not specified, so we don't override
856
+ if llm is not None:
857
+ cfg.llm.enabled = llm
858
+ if alt is not None:
859
+ cfg.image.alt_enabled = alt
860
+ if desc is not None:
861
+ cfg.image.desc_enabled = desc
862
+ if ocr is not None:
863
+ cfg.ocr.enabled = ocr
864
+ if screenshot is not None:
865
+ cfg.screenshot.enabled = screenshot
866
+ if no_compress:
867
+ cfg.image.compress = False
868
+ if no_cache:
869
+ cfg.cache.no_cache = True
870
+ if no_cache_for:
871
+ # Parse comma-separated patterns
872
+ cfg.cache.no_cache_patterns = [
873
+ p.strip() for p in no_cache_for.split(",") if p.strip()
874
+ ]
875
+ if batch_concurrency is not None:
876
+ cfg.batch.concurrency = batch_concurrency
877
+ if url_concurrency is not None:
878
+ cfg.batch.url_concurrency = url_concurrency
879
+ if llm_concurrency is not None:
880
+ cfg.llm.concurrency = llm_concurrency
881
+
882
+ # Validate vision model configuration if image analysis is enabled
883
+ _check_vision_model_config(cfg, console, verbose)
884
+
885
+ # Validate fetch strategy flags (mutually exclusive)
886
+ if use_agent_browser and use_jina:
887
+ console.print(
888
+ "[red]Error: --agent-browser and --jina are mutually exclusive.[/red]"
889
+ )
890
+ ctx.exit(1)
891
+
892
+ # Determine fetch strategy
893
+ from markitai.fetch import FetchStrategy
894
+
895
+ if use_agent_browser:
896
+ fetch_strategy = FetchStrategy.BROWSER
897
+ explicit_fetch_strategy = True
898
+ elif use_jina:
899
+ fetch_strategy = FetchStrategy.JINA
900
+ explicit_fetch_strategy = True
901
+ else:
902
+ # Use config default or auto
903
+ fetch_strategy = FetchStrategy(cfg.fetch.strategy)
904
+ explicit_fetch_strategy = False
905
+
906
+ # Log input info
907
+ if is_url_list_mode:
908
+ logger.debug(f"Processing URL list: {len(url_entries)} URLs")
909
+ elif is_url_input:
910
+ logger.debug(f"Processing URL: {input_path_str}")
911
+ else:
912
+ assert input_path is not None # Already validated above
913
+ logger.debug(f"Processing: {input_path.resolve()}")
914
+ logger.debug(f"Output directory: {output.resolve()}")
915
+
916
async def run_workflow() -> None:
    """Route this invocation to the processor matching the detected input.

    Precedence: URL list file, then a single URL, then a directory (batch
    mode), and finally a single local file. All inputs are read from the
    enclosing command's local variables.
    """
    if is_url_list_mode:
        # A file containing many URLs: processed with the URL concurrency limit.
        await process_url_batch(
            url_entries,
            output,
            cfg,
            dry_run,
            verbose,
            log_file_path,
            concurrency=cfg.batch.url_concurrency,
            fetch_strategy=fetch_strategy,
            explicit_fetch_strategy=explicit_fetch_strategy,
        )
    elif is_url_input:
        # One URL given directly on the command line.
        assert input_path_str is not None  # Guaranteed when is_url_input is True
        await process_url(
            input_path_str,
            output,
            cfg,
            dry_run,
            verbose,
            log_file_path,
            fetch_strategy=fetch_strategy,
            explicit_fetch_strategy=explicit_fetch_strategy,
        )
    else:
        # Local filesystem input: a directory means batch mode.
        assert input_path is not None  # Already validated above
        if input_path.is_dir():
            await process_batch(
                input_path,
                output,
                cfg,
                resume,
                dry_run,
                verbose=verbose,
                console_handler_id=console_handler_id,
                log_file_path=log_file_path,
                fetch_strategy=fetch_strategy,
                explicit_fetch_strategy=explicit_fetch_strategy,
            )
        else:
            await process_single_file(
                input_path, output, cfg, dry_run, log_file_path, verbose
            )
970
+
971
async def run_workflow_with_cleanup() -> None:
    """Run workflow with explicit resource cleanup on exit.

    Shared HTTP clients and the converter thread pool are released whether
    the workflow finishes normally, raises, or exits via ``SystemExit``.
    """
    from markitai.fetch import close_shared_clients

    try:
        await run_workflow()
    finally:
        # Cleanup shared resources. The nested try/finally guarantees the
        # executor is shut down even if closing the HTTP clients raises.
        try:
            await close_shared_clients()  # Close httpx.AsyncClient for Jina
        finally:
            shutdown_converter_executor()  # Shutdown ThreadPoolExecutor
        # Note: FetchCache cleanup happens automatically when process exits
        # as SQLite handles connection cleanup. For explicit cleanup, the
        # global _fetch_cache.close() could be called, but it's not critical.
984
+
985
+ asyncio.run(run_workflow_with_cleanup())
986
+
987
+
988
+ # =============================================================================
989
+ # Config subcommands
990
+ # =============================================================================
991
+
992
+
993
# Parent group for the `config ...` subcommands; the docstring below is the
# text click shows in --help, and the function body is intentionally empty.
@app.group()
def config() -> None:
    """Configuration management commands."""
997
+
998
+
999
@config.command("list")
def config_list() -> None:
    """Show current effective configuration."""
    # Load the merged configuration and serialize it, omitting unset (None) fields.
    effective = ConfigManager().load()
    rendered = json.dumps(
        effective.model_dump(mode="json", exclude_none=True),
        indent=2,
        ensure_ascii=False,
    )

    # Pretty-print with JSON syntax highlighting.
    console.print(Syntax(rendered, "json", theme="monokai", line_numbers=False))
1010
+
1011
+
1012
@config.command("path")
def config_path_cmd() -> None:
    """Show configuration file paths."""
    manager = ConfigManager()
    manager.load()

    # Lookup order, printed one line per console call.
    user_config = manager.DEFAULT_USER_CONFIG_DIR / "config.json"
    search_order = (
        "[bold]Configuration file search order:[/bold]",
        " 1. --config CLI argument",
        " 2. MARKITAI_CONFIG environment variable",
        " 3. ./markitai.json (current directory)",
        f" 4. {user_config}",
    )
    for line in search_order:
        console.print(line)
    console.print()

    # Report which file (if any) the load above actually picked up.
    active = manager.config_path
    if not active:
        console.print(
            "[yellow]Using default configuration (no config file found)[/yellow]"
        )
        return
    console.print(f"[green]Currently using:[/green] {active}")
1031
+
1032
+
1033
@config.command("init")
@click.option(
    "--output",
    "-o",
    "output_path",
    type=click.Path(path_type=Path),
    default=None,
    help="Output path for configuration file.",
)
def config_init(output_path: Path | None) -> None:
    """Initialize a configuration file with defaults."""
    manager = ConfigManager()

    # Resolve the destination: default user config, a file inside the given
    # directory, or the path exactly as supplied.
    if output_path is None:
        target = manager.DEFAULT_USER_CONFIG_DIR / "config.json"
    elif output_path.is_dir():
        target = output_path / "markitai.json"
    else:
        target = output_path

    # Only an existing regular file needs an overwrite confirmation; a
    # negative answer aborts the command.
    if target.is_file():
        click.confirm(f"{target} already exists. Overwrite?", abort=True)

    # Save minimal template config (essential fields only)
    saved_path = manager.save(target, minimal=True)
    follow_up = (
        f"[green]Configuration file created:[/green] {saved_path}",
        "\nEdit this file to customize your settings.",
        "[dim]Note: max_tokens, supports_vision are auto-detected from litellm.[/dim]",
        "Run 'markitai config list' to see the current configuration.",
    )
    for line in follow_up:
        console.print(line)
1065
+
1066
+
1067
@config.command("validate")
@click.argument(
    "config_file",
    type=click.Path(exists=True, path_type=Path),
    required=False,
)
def config_validate(config_file: Path | None) -> None:
    """Validate a configuration file."""
    # Args:
    #   config_file: Explicit file to validate; None lets ConfigManager use
    #     its normal lookup.
    # Exits with status 2 when loading fails.
    from rich.markup import escape

    manager = ConfigManager()

    try:
        manager.load(config_path=config_file)
    except Exception as e:
        # Escape the message: validation errors commonly contain bracketed
        # detail (e.g. "[type=..., input_value=...]") that Rich would
        # otherwise parse as a markup tag and silently drop.
        console.print(f"[red]Configuration error:[/red] {escape(str(e))}")
        raise SystemExit(2)
    else:
        console.print("[green]Configuration is valid![/green]")

        if manager.config_path:
            console.print(
                f"[dim]Validated: {escape(str(manager.config_path))}[/dim]"
            )
1088
+
1089
+
1090
@config.command("get")
@click.argument("key")
def config_get(key: str) -> None:
    """Get a configuration value."""
    # Args:
    #   key: Configuration key passed to ConfigManager.get().
    # Exits with status 1 when the lookup returns None.
    from rich.markup import escape

    manager = ConfigManager()
    manager.load()

    value = manager.get(key)
    if value is None:
        console.print(f"[yellow]Key not found:[/yellow] {escape(key)}")
        raise SystemExit(1)

    # Format output. Markup parsing is disabled so that configuration values
    # containing square brackets are shown verbatim instead of being
    # interpreted (and swallowed) as Rich style tags.
    if isinstance(value, (dict, list)):
        console.print(json.dumps(value, indent=2, ensure_ascii=False), markup=False)
    else:
        console.print(str(value), markup=False)
1107
+
1108
+
1109
@config.command("set")
@click.argument("key")
@click.argument("value")
def config_set(key: str, value: str) -> None:
    """Set a configuration value."""
    # Args:
    #   key: Configuration key passed to ConfigManager.set().
    #   value: Raw command-line text; coerced to bool, int or float when it
    #     parses as one, otherwise stored as a string.
    # Exits with status 1 when setting or saving fails.
    from rich.markup import escape

    manager = ConfigManager()
    manager.load()

    # Parse value: booleans first (case-insensitive), then int, then float,
    # falling back to the raw string.
    parsed_value: bool | int | float | str
    if value.lower() in ("true", "false"):
        parsed_value = value.lower() == "true"
    else:
        try:
            parsed_value = int(value)
        except ValueError:
            try:
                parsed_value = float(value)
            except ValueError:
                parsed_value = value

    try:
        manager.set(key, parsed_value)
        manager.save()
        # Key and value are user-supplied text; escape them so square brackets
        # are printed literally rather than parsed as Rich markup.
        console.print(
            f"[green]Set {escape(key)} = {escape(str(parsed_value))}[/green]"
        )

    except Exception as e:
        # Validation errors often carry bracketed detail that Rich would
        # otherwise drop from the output.
        console.print(f"[red]Error setting value:[/red] {escape(str(e))}")
        raise SystemExit(1)
1138
+
1139
+
1140
+ # =============================================================================
1141
+ # Cache subcommands
1142
+ # =============================================================================
1143
+
1144
+
1145
# Parent group for the `cache ...` subcommands; the docstring below is the
# text click shows in --help, and the function body is intentionally empty.
@app.group()
def cache() -> None:
    """Cache management commands."""
1149
+
1150
+
1151
@cache.command("stats")
@click.option(
    "--json",
    "as_json",
    is_flag=True,
    help="Output as JSON.",
)
@click.option(
    "-v",
    "--verbose",
    is_flag=True,
    help="Show detailed cache entries and model breakdown.",
)
@click.option(
    "--limit",
    default=20,
    type=int,
    help="Number of entries to show in verbose mode (default: 20).",
)
@click.option(
    "--scope",
    type=click.Choice(["project", "global", "all"]),
    default="all",
    help="Cache scope to display (default: all).",
)
def cache_stats(as_json: bool, verbose: bool, limit: int, scope: str) -> None:
    """Show cache statistics."""
    # Args:
    #   as_json: Emit one JSON document instead of Rich-formatted text.
    #   verbose: Also collect the per-model breakdown and recent entries.
    #   limit: Maximum number of entries listed in verbose mode.
    #   scope: Which cache(s) to inspect: "project", "global" or "all".
    from rich.markup import escape
    from rich.table import Table

    from markitai.constants import (
        DEFAULT_CACHE_DB_FILENAME,
        DEFAULT_PROJECT_CACHE_DIR,
    )
    from markitai.llm import SQLiteCache

    def format_size(size_bytes: int) -> str:
        """Format size in human-readable format."""
        if size_bytes < 1024:
            return f"{size_bytes} B"
        elif size_bytes < 1024 * 1024:
            return f"{size_bytes / 1024:.1f} KB"
        else:
            return f"{size_bytes / (1024 * 1024):.2f} MB"

    def plain(value: Any) -> Any:
        """Escape Rich markup in string cells; pass other values through.

        Cached previews are arbitrary text (often Markdown with ``[link]``
        syntax) that Rich would otherwise interpret as style tags.
        """
        return escape(value) if isinstance(value, str) else value

    def print_verbose_details(
        cache: SQLiteCache, cache_name: str, limit: int, as_json: bool
    ) -> dict[str, Any]:
        """Collect and optionally print verbose cache details."""
        by_model = cache.stats_by_model()
        entries = cache.list_entries(limit)

        if not as_json:
            # Print By Model table
            if by_model:
                model_table = Table(title=f"{cache_name} - By Model")
                model_table.add_column("Model", style="cyan")
                model_table.add_column("Entries", justify="right")
                model_table.add_column("Size", justify="right")
                for model, data in by_model.items():
                    model_table.add_row(
                        plain(model),
                        str(data["count"]),
                        format_size(data["size_bytes"]),
                    )
                console.print(model_table)
                console.print()

            # Print Recent Entries table
            if entries:
                entry_table = Table(title=f"{cache_name} - Recent Entries")
                entry_table.add_column("Key", style="dim", max_width=18)
                entry_table.add_column("Model", max_width=30)
                entry_table.add_column("Size", justify="right")
                entry_table.add_column("Preview", max_width=40)
                for entry in entries:
                    # Shorten long keys to 16 characters plus an ellipsis.
                    key_display = (
                        entry["key"][:16] + "..."
                        if len(entry["key"]) > 16
                        else entry["key"]
                    )
                    entry_table.add_row(
                        plain(key_display),
                        plain(entry["model"]),
                        format_size(entry["size_bytes"]),
                        plain(entry["preview"]),
                    )
                console.print(entry_table)

        return {"by_model": by_model, "entries": entries}

    manager = ConfigManager()
    cfg = manager.load()

    stats_data: dict[str, Any] = {
        "project": None,
        "global": None,
        "enabled": cfg.cache.enabled,
    }

    # Check project cache (current directory)
    project_cache: SQLiteCache | None = None
    if scope in ("project", "all"):
        project_cache_path = (
            Path.cwd() / DEFAULT_PROJECT_CACHE_DIR / DEFAULT_CACHE_DB_FILENAME
        )
        if project_cache_path.exists():
            try:
                project_cache = SQLiteCache(
                    project_cache_path, cfg.cache.max_size_bytes
                )
                stats_data["project"] = project_cache.stats()
            except Exception as e:
                # Keep going: the failure is reported in the output instead.
                stats_data["project"] = {"error": str(e)}

    # Check global cache
    global_cache: SQLiteCache | None = None
    if scope in ("global", "all"):
        global_cache_path = (
            Path(cfg.cache.global_dir).expanduser() / DEFAULT_CACHE_DB_FILENAME
        )
        if global_cache_path.exists():
            try:
                global_cache = SQLiteCache(global_cache_path, cfg.cache.max_size_bytes)
                stats_data["global"] = global_cache.stats()
            except Exception as e:
                stats_data["global"] = {"error": str(e)}

    # Collect verbose data if needed (only for caches that opened cleanly)
    if verbose:
        if (
            project_cache
            and stats_data["project"]
            and "error" not in stats_data["project"]
        ):
            verbose_data = print_verbose_details(
                project_cache, "Project Cache", limit, as_json
            )
            stats_data["project"]["by_model"] = verbose_data["by_model"]
            stats_data["project"]["entries"] = verbose_data["entries"]

        if (
            global_cache
            and stats_data["global"]
            and "error" not in stats_data["global"]
        ):
            verbose_data = print_verbose_details(
                global_cache, "Global Cache", limit, as_json
            )
            stats_data["global"]["by_model"] = verbose_data["by_model"]
            stats_data["global"]["entries"] = verbose_data["entries"]

    if as_json:
        # Use click.echo for raw JSON: Rich would wrap long lines and parse
        # bracketed text inside cached previews as markup, corrupting the
        # document.
        click.echo(json.dumps(stats_data, indent=2, ensure_ascii=False))
    else:
        console.print("[bold]Cache Statistics[/bold]")
        console.print(f"Enabled: {cfg.cache.enabled}")
        console.print()

        if scope in ("project", "all"):
            if stats_data["project"]:
                p = stats_data["project"]
                if "error" in p:
                    console.print(
                        f"[red]Project cache error:[/red] {escape(str(p['error']))}"
                    )
                else:
                    console.print("[bold]Project Cache[/bold]")
                    console.print(f" Path: {escape(str(p['db_path']))}")
                    console.print(f" Entries: {p['count']}")
                    console.print(f" Size: {p['size_mb']} MB / {p['max_size_mb']} MB")
                console.print()
            else:
                console.print("[dim]No project cache found in current directory[/dim]")
                console.print()

        if scope in ("global", "all"):
            if stats_data["global"]:
                g = stats_data["global"]
                if "error" in g:
                    console.print(
                        f"[red]Global cache error:[/red] {escape(str(g['error']))}"
                    )
                else:
                    console.print("[bold]Global Cache[/bold]")
                    console.print(f" Path: {escape(str(g['db_path']))}")
                    console.print(f" Entries: {g['count']}")
                    console.print(f" Size: {g['size_mb']} MB / {g['max_size_mb']} MB")
                console.print()
            else:
                console.print("[dim]No global cache found[/dim]")
1338
+
1339
+
1340
@cache.command("clear")
@click.option(
    "--scope",
    type=click.Choice(["project", "global", "all"]),
    default="project",
    help="Which cache to clear (default: project).",
)
@click.option(
    "--yes",
    "-y",
    is_flag=True,
    help="Skip confirmation prompt.",
)
def cache_clear(scope: str, yes: bool) -> None:
    """Clear cache entries."""
    # Args:
    #   scope: Which cache(s) to clear: "project", "global" or "all".
    #   yes: Skip the interactive confirmation.
    from rich.markup import escape

    from markitai.constants import (
        DEFAULT_CACHE_DB_FILENAME,
        DEFAULT_PROJECT_CACHE_DIR,
    )
    from markitai.llm import SQLiteCache

    manager = ConfigManager()
    cfg = manager.load()

    # Confirm if not --yes
    if not yes:
        scope_desc = {
            "project": "project cache (current directory)",
            "global": "global cache (~/.markitai)",
            "all": "ALL caches (project + global)",
        }
        if not click.confirm(f"Clear {scope_desc[scope]}?"):
            console.print("[yellow]Aborted[/yellow]")
            return

    # Number of entries removed per cache, plus whether any clear failed.
    result = {"project": 0, "global": 0}
    had_failure = False

    # Clear project cache
    if scope in ("project", "all"):
        project_cache_path = (
            Path.cwd() / DEFAULT_PROJECT_CACHE_DIR / DEFAULT_CACHE_DB_FILENAME
        )
        if project_cache_path.exists():
            try:
                project_cache = SQLiteCache(
                    project_cache_path, cfg.cache.max_size_bytes
                )
                result["project"] = project_cache.clear()
            except Exception as e:
                had_failure = True
                # Escape so bracketed text in the error is printed literally.
                console.print(
                    f"[red]Failed to clear project cache:[/red] {escape(str(e))}"
                )

    # Clear global cache
    if scope in ("global", "all"):
        global_cache_path = (
            Path(cfg.cache.global_dir).expanduser() / DEFAULT_CACHE_DB_FILENAME
        )
        if global_cache_path.exists():
            try:
                global_cache = SQLiteCache(global_cache_path, cfg.cache.max_size_bytes)
                result["global"] = global_cache.clear()
            except Exception as e:
                had_failure = True
                console.print(
                    f"[red]Failed to clear global cache:[/red] {escape(str(e))}"
                )

    # Report results
    total = result["project"] + result["global"]
    if total > 0:
        console.print(f"[green]Cleared {total} cache entries[/green]")
        if result["project"] > 0:
            console.print(f" Project: {result['project']}")
        if result["global"] > 0:
            console.print(f" Global: {result['global']}")
    elif not had_failure:
        # Only claim there was nothing to clear when no clear attempt failed;
        # otherwise the error printed above already explains the outcome.
        console.print("[dim]No cache entries to clear[/dim]")
1413
+
1414
+
1415
+ # =============================================================================
1416
+ # Check dependencies command
1417
+ # =============================================================================
1418
+
1419
+
1420
@app.command("check-deps")
@click.option(
    "--json",
    "as_json",
    is_flag=True,
    help="Output as JSON.",
)
def check_deps(as_json: bool) -> None:
    """Check all optional dependencies and their status.

    This command helps diagnose setup issues by verifying:
    - agent-browser (for dynamic URL fetching)
    - LibreOffice (for Office document conversion)
    - Tesseract OCR (for scanned document processing)
    - LLM API configuration (for content enhancement)
    """
    import json
    import shutil
    import subprocess

    from rich.markup import escape
    from rich.panel import Panel
    from rich.table import Table

    from markitai.fetch import verify_agent_browser_ready

    def probe_version(executable: str) -> str:
        """Run ``<executable> --version`` and return its first output line.

        Returns "unknown" when the command exits non-zero or prints nothing.
        Exceptions from ``subprocess.run`` (timeout, OS errors) propagate so
        the caller can report the tool as found but not runnable.
        """
        proc = subprocess.run(
            [executable, "--version"],
            capture_output=True,
            text=True,
            timeout=10,
        )
        if proc.returncode != 0:
            return "unknown"
        # Fall back to stderr: some builds print their version banner there,
        # which would otherwise produce an empty "()" in the message.
        banner = proc.stdout.strip() or proc.stderr.strip()
        return banner.split("\n")[0] if banner else "unknown"

    manager = ConfigManager()
    cfg = manager.load()

    # One record per dependency, keyed by a stable identifier. Each record has
    # the same five fields so both the JSON and table renderers can rely on them.
    results: dict[str, dict[str, Any]] = {}

    # 1. Check agent-browser
    is_ready, message = verify_agent_browser_ready(use_cache=False)
    results["agent-browser"] = {
        "name": "agent-browser",
        "description": "Browser automation for dynamic URLs",
        "status": "ok" if is_ready else "missing",
        "message": message,
        "install_hint": "npm install -g agent-browser && npx playwright install chromium",
    }

    # 2. Check LibreOffice
    soffice_path = shutil.which("soffice") or shutil.which("libreoffice")
    if soffice_path:
        try:
            version = probe_version(soffice_path)
            results["libreoffice"] = {
                "name": "LibreOffice",
                "description": "Office document conversion (doc, docx, xls, xlsx, ppt, pptx)",
                "status": "ok",
                "message": f"Found at {soffice_path} ({version})",
                "install_hint": "",
            }
        except Exception as e:
            results["libreoffice"] = {
                "name": "LibreOffice",
                "description": "Office document conversion (doc, docx, xls, xlsx, ppt, pptx)",
                "status": "error",
                "message": f"Found but failed to run: {e}",
                "install_hint": "Reinstall LibreOffice",
            }
    else:
        results["libreoffice"] = {
            "name": "LibreOffice",
            "description": "Office document conversion (doc, docx, xls, xlsx, ppt, pptx)",
            "status": "missing",
            "message": "soffice/libreoffice command not found",
            "install_hint": "apt install libreoffice (Linux) / brew install libreoffice (macOS)",
        }

    # 3. Check Tesseract OCR
    tesseract_path = shutil.which("tesseract")
    if tesseract_path:
        try:
            # Run the resolved path (as for LibreOffice) so the binary that is
            # reported is the one that was actually executed.
            version = probe_version(tesseract_path)
            results["tesseract"] = {
                "name": "Tesseract OCR",
                "description": "OCR for scanned documents",
                "status": "ok",
                "message": f"Found at {tesseract_path} ({version})",
                "install_hint": "",
            }
        except Exception as e:
            results["tesseract"] = {
                "name": "Tesseract OCR",
                "description": "OCR for scanned documents",
                "status": "error",
                "message": f"Found but failed to run: {e}",
                "install_hint": "Reinstall tesseract",
            }
    else:
        results["tesseract"] = {
            "name": "Tesseract OCR",
            "description": "OCR for scanned documents",
            "status": "missing",
            "message": "tesseract command not found",
            "install_hint": "apt install tesseract-ocr (Linux) / brew install tesseract (macOS)",
        }

    # 4. Check LLM API configuration (check model_list for configured models)
    configured_models = cfg.llm.model_list if cfg.llm.model_list else []
    if configured_models:
        # The provider is inferred from the first configured model's
        # "provider/model" prefix; a bare model name is reported as "openai".
        first_model = configured_models[0].litellm_params.model
        provider = first_model.split("/")[0] if "/" in first_model else "openai"
        results["llm-api"] = {
            "name": f"LLM API ({provider})",
            "description": "Content enhancement and image analysis",
            "status": "ok",
            "message": f"{len(configured_models)} model(s) configured",
            "install_hint": "",
        }
    else:
        results["llm-api"] = {
            "name": "LLM API",
            "description": "Content enhancement and image analysis",
            "status": "missing",
            "message": "No models configured in llm.model_list",
            "install_hint": "Configure llm.model_list in markitai.json",
        }

    # 5. Check vision model configuration (models with supports_vision=true)
    vision_models = [
        m for m in configured_models if m.model_info and m.model_info.supports_vision
    ]
    if vision_models:
        vision_model_names = [m.litellm_params.model for m in vision_models]
        results["vision-model"] = {
            "name": "Vision Model",
            "description": "Image analysis (alt text, descriptions)",
            "status": "ok",
            # Show at most two model names, with an ellipsis when more exist.
            "message": f"Configured: {', '.join(vision_model_names[:2])}{'...' if len(vision_model_names) > 2 else ''}",
            "install_hint": "",
        }
    else:
        results["vision-model"] = {
            "name": "Vision Model",
            "description": "Image analysis (alt text, descriptions)",
            "status": "warning",
            "message": "No vision model configured (set model_info.supports_vision=true)",
            "install_hint": "Add supports_vision: true to model_info in model_list",
        }

    # Output results
    if as_json:
        # Use click.echo for raw JSON (avoid Rich formatting which breaks JSON)
        click.echo(json.dumps(results, indent=2))
        return

    # Rich table output
    table = Table(title="Dependency Status")
    table.add_column("Component", style="cyan")
    table.add_column("Status", justify="center")
    table.add_column("Description")
    table.add_column("Details")

    status_icons = {
        "ok": "[green]✓[/green]",
        "warning": "[yellow]⚠[/yellow]",
        "missing": "[red]✗[/red]",
        "error": "[red]![/red]",
    }

    for info in results.values():
        status_icon = status_icons.get(info["status"], "?")
        # Names and messages can carry model names, paths or exception text;
        # escape them so square brackets are not parsed as Rich markup.
        table.add_row(
            escape(str(info["name"])),
            status_icon,
            info["description"],
            escape(str(info["message"])),
        )

    console.print(table)
    console.print()

    # Show install hints for missing/error items
    hints = [
        (info["name"], info["install_hint"])
        for info in results.values()
        if info["status"] in ("missing", "error") and info["install_hint"]
    ]

    if hints:
        hint_text = "\n".join([f" • {escape(str(name))}: {hint}" for name, hint in hints])
        console.print(
            Panel(
                f"[yellow]To fix missing dependencies:[/yellow]\n{hint_text}",
                title="Installation Hints",
                border_style="yellow",
            )
        )
    else:
        console.print("[green]All dependencies are properly configured![/green]")
1632
+
1633
+
1634
+ # =============================================================================
1635
+ # Processing functions
1636
+ # =============================================================================
1637
+
1638
+
1639
async def process_single_file(
    input_path: Path,
    output_dir: Path,
    cfg: MarkitaiConfig,
    dry_run: bool,
    log_file_path: Path | None = None,
    verbose: bool = False,
) -> None:
    """Process a single file using workflow/core pipeline.

    After conversion completes, outputs the final markdown to stdout.
    If LLM is enabled, outputs .llm.md content; otherwise outputs .md content.

    Args:
        input_path: File to convert.
        output_dir: Directory that receives the markdown output and report.
        cfg: Effective configuration.
        dry_run: If True, only print what would be done, then exit with 0.
        log_file_path: Path to the log file, recorded in the report.
        verbose: If True, the progress spinner is disabled.

    Raises:
        SystemExit: With code 1 when the size check fails, the format is
            unsupported, or conversion fails; with code 0 after a dry run.
    """
    from datetime import datetime

    from rich.markup import escape

    from markitai.workflow.core import (
        ConversionContext,
        convert_document_core,
    )

    # Validate file size to prevent DoS
    try:
        validate_file_size(input_path, MAX_DOCUMENT_SIZE)
    except ValueError as e:
        # Error text is escaped throughout this function so that square
        # brackets in messages or paths are not parsed as Rich markup.
        console.print(Panel(f"[red]{escape(str(e))}[/red]", title="Error"))
        raise SystemExit(1)

    # Detect file format for dry-run display
    fmt = detect_format(input_path)
    if fmt == FileFormat.UNKNOWN:
        console.print(
            Panel(
                f"[red]Unsupported file format: {escape(input_path.suffix)}[/red]",
                title="Error",
            )
        )
        raise SystemExit(1)

    # Handle dry-run
    if dry_run:
        cache_status = "enabled" if cfg.cache.enabled else "disabled"
        planned_output = output_dir / (input_path.name + ".md")
        dry_run_msg = (
            f"[yellow]Would convert:[/yellow] {escape(str(input_path))}\n"
            f"[yellow]Format:[/yellow] {fmt.value.upper()}\n"
            f"[yellow]Output:[/yellow] {escape(str(planned_output))}\n"
            f"[yellow]Cache:[/yellow] {cache_status}"
        )
        console.print(Panel(dry_run_msg, title="Dry Run"))
        if cfg.cache.enabled:
            console.print(
                "[dim]Tip: Use 'markitai cache stats -v' to view cached entries[/dim]"
            )
        raise SystemExit(0)

    # Progress reporter for non-verbose mode feedback
    progress = ProgressReporter(enabled=not verbose)
    started_at = datetime.now()
    error_msg = None

    try:
        progress.start_spinner(f"Converting {input_path.name}...")

        # Create conversion context
        ctx = ConversionContext(
            input_path=input_path,
            output_dir=output_dir,
            config=cfg,
            project_dir=output_dir.parent,
        )

        # Run core conversion pipeline
        result = await convert_document_core(ctx, MAX_DOCUMENT_SIZE)

        if not result.success:
            if result.error:
                raise RuntimeError(result.error)
            raise RuntimeError("Unknown conversion error")

        if result.skip_reason == "exists":
            progress.stop_spinner()
            base_output_file = output_dir / f"{input_path.name}.md"
            console.print(
                f"[yellow]Skipped (exists):[/yellow] {escape(str(base_output_file))}"
            )
            return

        # Show conversion complete message
        progress.log(f"Converted: {input_path.name}")

        # Write image descriptions (single file)
        if ctx.image_analysis and cfg.image.desc_enabled:
            write_images_json(output_dir, [ctx.image_analysis])

        # Generate report
        finished_at = datetime.now()
        duration = (finished_at - started_at).total_seconds()

        # Aggregate usage across every entry of ctx.llm_usage.
        input_tokens = sum(u.get("input_tokens", 0) for u in ctx.llm_usage.values())
        output_tokens = sum(u.get("output_tokens", 0) for u in ctx.llm_usage.values())
        requests = sum(u.get("requests", 0) for u in ctx.llm_usage.values())

        report = {
            "version": "1.0",
            "generated_at": datetime.now().astimezone().isoformat(),
            "log_file": str(log_file_path) if log_file_path else None,
            "summary": {
                "total_documents": 1,
                "completed_documents": 1,
                "failed_documents": 0,
                "duration": duration,
            },
            "llm_usage": {
                "models": ctx.llm_usage,
                "requests": requests,
                "input_tokens": input_tokens,
                "output_tokens": output_tokens,
                "cost_usd": ctx.llm_cost,
            },
            "documents": {
                input_path.name: {
                    "status": "completed",
                    "error": None,
                    # With LLM enabled the final artifact is the .llm.md sibling.
                    "output": str(
                        ctx.output_file.with_suffix(".llm.md")
                        if cfg.llm.enabled and ctx.output_file
                        else ctx.output_file
                    ),
                    "images": ctx.embedded_images_count,
                    "screenshots": ctx.screenshots_count,
                    "duration": duration,
                    "llm_usage": {
                        "input_tokens": input_tokens,
                        "output_tokens": output_tokens,
                        "cost_usd": ctx.llm_cost,
                    },
                }
            },
        }

        # Generate report file path
        task_options = {
            "llm": cfg.llm.enabled,
            "ocr": cfg.ocr.enabled,
            "screenshot": cfg.screenshot.enabled,
            "alt": cfg.image.alt_enabled,
            "desc": cfg.image.desc_enabled,
        }
        task_hash = compute_task_hash(input_path, output_dir, task_options)
        report_path = get_report_file_path(
            output_dir, task_hash, cfg.output.on_conflict
        )
        report_path.parent.mkdir(parents=True, exist_ok=True)

        atomic_write_json(report_path, report, order_func=order_report)
        logger.info(f"Report saved: {report_path}")

        # Clear progress output before printing final result
        progress.clear_and_finish()

        # Output final markdown to stdout
        if ctx.output_file:
            final_output_file = (
                ctx.output_file.with_suffix(".llm.md")
                if cfg.llm.enabled
                else ctx.output_file
            )
            if final_output_file.exists():
                final_content = final_output_file.read_text(encoding="utf-8")
                # Plain print (not Rich) so the markdown reaches stdout verbatim.
                print(final_content)

    except Exception as e:
        error_msg = str(e)
        # Escaped: an unescaped message containing e.g. "[/x]" would raise a
        # Rich MarkupError here and mask the original failure.
        console.print(Panel(f"[red]{escape(error_msg)}[/red]", title="Error"))
        sys.exit(1)

    finally:
        # Runs even while SystemExit propagates, so the failure is also logged.
        if error_msg:
            logger.warning(f"Failed to process {input_path.name}: {error_msg}")
1815
+
1816
+
1817
+ async def process_url(
1818
+ url: str,
1819
+ output_dir: Path,
1820
+ cfg: MarkitaiConfig,
1821
+ dry_run: bool,
1822
+ verbose: bool,
1823
+ log_file_path: Path | None = None,
1824
+ fetch_strategy: FetchStrategy | None = None,
1825
+ explicit_fetch_strategy: bool = False,
1826
+ ) -> None:
1827
+ """Process a URL and convert to Markdown.
1828
+
1829
+ Supports multiple fetch strategies:
1830
+ - auto: Detect JS-required pages and fallback automatically
1831
+ - static: Direct HTTP request via markitdown (fastest)
1832
+ - browser: Headless browser via agent-browser (for JS-rendered pages)
1833
+ - jina: Jina Reader API (cloud-based, no local dependencies)
1834
+
1835
+ Also supports:
1836
+ - LLM enhancement via --llm flag for document cleaning and frontmatter
1837
+ - Image downloading and analysis via --alt/--desc flags
1838
+
1839
+ Note: --screenshot and --ocr are not supported for URLs.
1840
+
1841
+ Args:
1842
+ url: URL to convert (http:// or https://)
1843
+ output_dir: Output directory for the markdown file
1844
+ cfg: Configuration
1845
+ dry_run: If True, only show what would be done
1846
+ verbose: If True, print logs before output
1847
+ log_file_path: Path to log file (for report)
1848
+ fetch_strategy: Strategy to use for fetching URL content
1849
+ explicit_fetch_strategy: If True, strategy was explicitly set via CLI flag
1850
+ """
1851
+ from markitai.fetch import (
1852
+ AgentBrowserNotFoundError,
1853
+ FetchError,
1854
+ FetchStrategy,
1855
+ JinaRateLimitError,
1856
+ fetch_url,
1857
+ )
1858
+ from markitai.image import download_url_images
1859
+
1860
+ # Default to auto strategy if not specified
1861
+ if fetch_strategy is None:
1862
+ fetch_strategy = FetchStrategy(cfg.fetch.strategy)
1863
+ # At this point fetch_strategy is guaranteed to be non-None
1864
+ assert fetch_strategy is not None # for type checker
1865
+
1866
+ # Warn about unsupported/ignored options for URL mode
1867
+ # Note: --alt and --desc are now supported (images will be downloaded)
1868
+ # --screenshot is now supported for URLs (captures full-page screenshot via browser)
1869
+ # --ocr is not applicable for URLs
1870
+ if cfg.ocr.enabled:
1871
+ logger.warning("[URL] --ocr is not supported for URL conversion, ignored")
1872
+
1873
+ # Generate output filename from URL
1874
+ filename = url_to_filename(url)
1875
+
1876
+ if dry_run:
1877
+ llm_status = "enabled" if cfg.llm.enabled else "disabled"
1878
+ cache_status = "enabled" if cfg.cache.enabled else "disabled"
1879
+ fetch_strategy_str = fetch_strategy.value if fetch_strategy else "auto"
1880
+ dry_run_msg = (
1881
+ f"[yellow]Would convert URL:[/yellow] {url}\n"
1882
+ f"[yellow]Output:[/yellow] {output_dir / filename}\n"
1883
+ f"[yellow]Fetch strategy:[/yellow] {fetch_strategy_str}\n"
1884
+ f"[yellow]LLM:[/yellow] {llm_status}\n"
1885
+ f"[yellow]Cache:[/yellow] {cache_status}"
1886
+ )
1887
+ console.print(Panel(dry_run_msg, title="Dry Run"))
1888
+ if cfg.cache.enabled:
1889
+ console.print(
1890
+ "[dim]Tip: Use 'markitai cache stats -v' to view cached entries[/dim]"
1891
+ )
1892
+ raise SystemExit(0)
1893
+
1894
+ # Create output directory
1895
+ from markitai.security import check_symlink_safety
1896
+
1897
+ check_symlink_safety(output_dir, allow_symlinks=cfg.output.allow_symlinks)
1898
+ ensure_dir(output_dir)
1899
+
1900
+ from datetime import datetime
1901
+
1902
+ started_at = datetime.now()
1903
+ llm_cost = 0.0
1904
+ llm_usage: dict[str, dict[str, Any]] = {}
1905
+
1906
+ # Progress reporter for non-verbose mode feedback
1907
+ progress = ProgressReporter(enabled=not verbose)
1908
+
1909
+ # Track cache hit for reporting
1910
+ fetch_cache_hit = False
1911
+
1912
+ # Initialize fetch cache if caching is enabled
1913
+ fetch_cache: FetchCache | None = None
1914
+ if cfg.cache.enabled:
1915
+ from markitai.fetch import get_fetch_cache
1916
+
1917
+ cache_dir = output_dir.parent / ".markitai"
1918
+ fetch_cache = get_fetch_cache(cache_dir, cfg.cache.max_size_bytes)
1919
+
1920
+ try:
1921
+ logger.info(f"Fetching URL: {url} (strategy: {fetch_strategy.value})")
1922
+ progress.start_spinner(f"Fetching {url}...")
1923
+
1924
+ # Fetch URL using the configured strategy
1925
+ # Prepare screenshot options if enabled
1926
+ screenshot_dir = (
1927
+ ensure_screenshots_dir(output_dir) if cfg.screenshot.enabled else None
1928
+ )
1929
+
1930
+ try:
1931
+ fetch_result = await fetch_url(
1932
+ url,
1933
+ fetch_strategy,
1934
+ cfg.fetch,
1935
+ explicit_strategy=explicit_fetch_strategy,
1936
+ cache=fetch_cache,
1937
+ skip_read_cache=cfg.cache.no_cache,
1938
+ screenshot=cfg.screenshot.enabled,
1939
+ screenshot_dir=screenshot_dir,
1940
+ screenshot_config=cfg.screenshot if cfg.screenshot.enabled else None,
1941
+ )
1942
+ fetch_cache_hit = fetch_result.cache_hit
1943
+ used_strategy = fetch_result.strategy_used
1944
+ original_markdown = fetch_result.content
1945
+ screenshot_path = fetch_result.screenshot_path
1946
+ logger.info(f"Fetched via {used_strategy}: {url}")
1947
+ except AgentBrowserNotFoundError:
1948
+ console.print(
1949
+ Panel(
1950
+ "[red]agent-browser is not installed.[/red]\n\n"
1951
+ "Install with:\n"
1952
+ " npm install -g agent-browser\n"
1953
+ " agent-browser install\n\n"
1954
+ "[dim]Or use --jina for cloud-based rendering.[/dim]",
1955
+ title="Error",
1956
+ )
1957
+ )
1958
+ raise SystemExit(1)
1959
+ except JinaRateLimitError:
1960
+ console.print(
1961
+ Panel(
1962
+ "[red]Jina Reader rate limit exceeded (free tier: 20 RPM).[/red]\n\n"
1963
+ "[dim]Try again later or use --agent-browser for local rendering.[/dim]",
1964
+ title="Error",
1965
+ )
1966
+ )
1967
+ raise SystemExit(1)
1968
+ except FetchError as e:
1969
+ console.print(Panel(f"[red]{e}[/red]", title="Error"))
1970
+ raise SystemExit(1)
1971
+
1972
+ if not original_markdown.strip():
1973
+ console.print(
1974
+ Panel(
1975
+ f"[red]No content extracted from URL: {url}[/red]\n"
1976
+ "[dim]The page may be empty, require JavaScript, or use an unsupported format.[/dim]",
1977
+ title="Error",
1978
+ )
1979
+ )
1980
+ raise SystemExit(1)
1981
+
1982
+ # Generate output path with conflict resolution
1983
+ base_output_file = output_dir / filename
1984
+ output_file = resolve_output_path(base_output_file, cfg.output.on_conflict)
1985
+
1986
+ if output_file is None:
1987
+ logger.info(f"[SKIP] Output exists: {base_output_file}")
1988
+ console.print(f"[yellow]Skipped (exists):[/yellow] {base_output_file}")
1989
+ return
1990
+
1991
+ # original_markdown was already set from fetch_result.content above
1992
+ markdown_for_llm = original_markdown
1993
+ progress.log(f"Fetched via {used_strategy}: {url}")
1994
+
1995
+ # Download images from URLs if --alt or --desc is enabled
1996
+ # Only update markdown_for_llm, keep original_markdown unchanged
1997
+ downloaded_images: list[Path] = []
1998
+ images_count = 0
1999
+ screenshots_count = 1 if screenshot_path and screenshot_path.exists() else 0
2000
+ img_analysis: ImageAnalysisResult | None = None
2001
+
2002
+ # Log screenshot capture if successful
2003
+ if screenshot_path and screenshot_path.exists():
2004
+ progress.log(f"Screenshot captured: {screenshot_path.name}")
2005
+ logger.info(f"Screenshot saved: {screenshot_path}")
2006
+
2007
+ if cfg.image.alt_enabled or cfg.image.desc_enabled:
2008
+ progress.start_spinner("Downloading images...")
2009
+ download_result = await download_url_images(
2010
+ markdown=original_markdown,
2011
+ output_dir=output_dir,
2012
+ base_url=url,
2013
+ config=cfg.image,
2014
+ source_name=url_to_filename(url).replace(".md", ""),
2015
+ concurrency=5,
2016
+ timeout=30,
2017
+ )
2018
+ markdown_for_llm = download_result.updated_markdown
2019
+ downloaded_images = download_result.downloaded_paths
2020
+ images_count = len(downloaded_images)
2021
+
2022
+ if download_result.failed_urls:
2023
+ for failed_url in download_result.failed_urls:
2024
+ logger.warning(f"Failed to download image: {failed_url}")
2025
+
2026
+ if downloaded_images:
2027
+ progress.log(f"Downloaded {len(downloaded_images)} images")
2028
+ else:
2029
+ progress.log("No images to download")
2030
+
2031
+ # Write base .md file with original content (no image link replacement)
2032
+ base_content = _add_basic_frontmatter(
2033
+ original_markdown,
2034
+ url,
2035
+ fetch_strategy=used_strategy,
2036
+ screenshot_path=screenshot_path,
2037
+ output_dir=output_dir,
2038
+ )
2039
+ atomic_write_text(output_file, base_content)
2040
+ logger.info(f"Written output: {output_file}")
2041
+
2042
+ # LLM processing (if enabled) uses markdown with local image paths
2043
+ final_content = base_content
2044
+ if cfg.llm.enabled:
2045
+ logger.info(f"[LLM] Processing URL content: {url}")
2046
+
2047
+ # Check if image analysis should run
2048
+ should_analyze_images = (
2049
+ cfg.image.alt_enabled or cfg.image.desc_enabled
2050
+ ) and downloaded_images
2051
+
2052
+ # Check for multi-source content (static + browser + screenshot)
2053
+ has_multi_source = (
2054
+ fetch_result.static_content is not None
2055
+ or fetch_result.browser_content is not None
2056
+ )
2057
+ has_screenshot = screenshot_path and screenshot_path.exists()
2058
+ use_vision_enhancement = has_multi_source and has_screenshot
2059
+
2060
+ if use_vision_enhancement and screenshot_path:
2061
+ # Multi-source URL with screenshot: use vision LLM
2062
+ progress.start_spinner("Processing with Vision LLM (multi-source)...")
2063
+ multi_source_content = _build_multi_source_content(
2064
+ fetch_result.static_content,
2065
+ fetch_result.browser_content,
2066
+ markdown_for_llm,
2067
+ )
2068
+ logger.info(
2069
+ f"[URL] Using vision enhancement for multi-source URL: {url}"
2070
+ )
2071
+
2072
+ _, doc_cost, doc_usage = await _process_url_with_vision(
2073
+ multi_source_content,
2074
+ screenshot_path,
2075
+ url,
2076
+ cfg,
2077
+ output_file,
2078
+ project_dir=output_dir.parent,
2079
+ )
2080
+ llm_cost += doc_cost
2081
+ _merge_llm_usage(llm_usage, doc_usage)
2082
+
2083
+ # Run image analysis if needed
2084
+ if should_analyze_images:
2085
+ (
2086
+ _,
2087
+ image_cost,
2088
+ image_usage,
2089
+ img_analysis,
2090
+ ) = await analyze_images_with_llm(
2091
+ downloaded_images,
2092
+ multi_source_content,
2093
+ output_file,
2094
+ cfg,
2095
+ Path(url),
2096
+ concurrency_limit=cfg.llm.concurrency,
2097
+ project_dir=output_dir.parent,
2098
+ )
2099
+ llm_cost += image_cost
2100
+ _merge_llm_usage(llm_usage, image_usage)
2101
+ progress.log("LLM processing complete (vision enhanced)")
2102
+ elif should_analyze_images:
2103
+ # Standard processing with image analysis
2104
+ progress.start_spinner("Processing document and images with LLM...")
2105
+
2106
+ # Create parallel tasks
2107
+ doc_task = process_with_llm(
2108
+ markdown_for_llm,
2109
+ url, # Use URL as source identifier
2110
+ cfg,
2111
+ output_file,
2112
+ project_dir=output_dir.parent,
2113
+ )
2114
+ img_task = analyze_images_with_llm(
2115
+ downloaded_images,
2116
+ markdown_for_llm,
2117
+ output_file,
2118
+ cfg,
2119
+ Path(url), # Use URL as source path
2120
+ concurrency_limit=cfg.llm.concurrency,
2121
+ project_dir=output_dir.parent,
2122
+ )
2123
+
2124
+ # Execute in parallel
2125
+ doc_result, img_result = await asyncio.gather(doc_task, img_task)
2126
+
2127
+ # Unpack results
2128
+ _, doc_cost, doc_usage = doc_result
2129
+ _, image_cost, image_usage, img_analysis = img_result
2130
+
2131
+ llm_cost += doc_cost + image_cost
2132
+ _merge_llm_usage(llm_usage, doc_usage)
2133
+ _merge_llm_usage(llm_usage, image_usage)
2134
+ progress.log("LLM processing complete (document + images)")
2135
+ else:
2136
+ # Only document processing, no images to analyze
2137
+ progress.start_spinner("Processing with LLM...")
2138
+ _, doc_cost, doc_usage = await process_with_llm(
2139
+ markdown_for_llm,
2140
+ url, # Use URL as source identifier
2141
+ cfg,
2142
+ output_file,
2143
+ project_dir=output_dir.parent,
2144
+ )
2145
+ llm_cost += doc_cost
2146
+ _merge_llm_usage(llm_usage, doc_usage)
2147
+ progress.log("LLM processing complete")
2148
+
2149
+ # Read the LLM-processed content for stdout output
2150
+ llm_output_file = output_file.with_suffix(".llm.md")
2151
+ if llm_output_file.exists():
2152
+ final_content = llm_output_file.read_text(encoding="utf-8")
2153
+
2154
+ # Write image descriptions (if enabled and images were analyzed)
2155
+ if img_analysis and cfg.image.desc_enabled:
2156
+ write_images_json(output_dir, [img_analysis])
2157
+
2158
+ # Generate report before final output
2159
+ finished_at = datetime.now()
2160
+ duration = (finished_at - started_at).total_seconds()
2161
+
2162
+ input_tokens = sum(u.get("input_tokens", 0) for u in llm_usage.values())
2163
+ output_tokens = sum(u.get("output_tokens", 0) for u in llm_usage.values())
2164
+ requests = sum(u.get("requests", 0) for u in llm_usage.values())
2165
+
2166
+ task_options = {
2167
+ "llm": cfg.llm.enabled,
2168
+ "url": url,
2169
+ }
2170
+ task_hash = compute_task_hash(output_dir, output_dir, task_options)
2171
+ report_path = get_report_file_path(
2172
+ output_dir, task_hash, cfg.output.on_conflict
2173
+ )
2174
+ report_path.parent.mkdir(parents=True, exist_ok=True)
2175
+
2176
+ # Determine cache hit status (LLM was enabled but no tokens used)
2177
+ llm_cache_hit = cfg.llm.enabled and requests == 0
2178
+
2179
+ report = {
2180
+ "version": "1.0",
2181
+ "generated_at": datetime.now().astimezone().isoformat(),
2182
+ "log_file": str(log_file_path) if log_file_path else None,
2183
+ "options": {
2184
+ "llm": cfg.llm.enabled,
2185
+ "cache": cfg.cache.enabled,
2186
+ "fetch_strategy": used_strategy,
2187
+ "alt": cfg.image.alt_enabled,
2188
+ "desc": cfg.image.desc_enabled,
2189
+ },
2190
+ "summary": {
2191
+ "total_documents": 0,
2192
+ "completed_documents": 0,
2193
+ "failed_documents": 0,
2194
+ "total_urls": 1,
2195
+ "completed_urls": 1,
2196
+ "failed_urls": 0,
2197
+ "duration": duration,
2198
+ },
2199
+ "llm_usage": {
2200
+ "models": llm_usage,
2201
+ "requests": requests,
2202
+ "input_tokens": input_tokens,
2203
+ "output_tokens": output_tokens,
2204
+ "cost_usd": llm_cost,
2205
+ },
2206
+ "urls": {
2207
+ url: {
2208
+ "status": "completed",
2209
+ "source_file": "cli",
2210
+ "error": None,
2211
+ "output": str(
2212
+ output_file.with_suffix(".llm.md")
2213
+ if cfg.llm.enabled
2214
+ else output_file
2215
+ ),
2216
+ "fetch_strategy": used_strategy,
2217
+ "fetch_cache_hit": fetch_cache_hit,
2218
+ "llm_cache_hit": llm_cache_hit,
2219
+ "images": images_count,
2220
+ "screenshots": screenshots_count,
2221
+ "duration": duration,
2222
+ "llm_usage": {
2223
+ "input_tokens": input_tokens,
2224
+ "output_tokens": output_tokens,
2225
+ "cost_usd": llm_cost,
2226
+ },
2227
+ }
2228
+ },
2229
+ }
2230
+
2231
+ atomic_write_json(report_path, report, order_func=order_report)
2232
+ logger.info(f"Report saved: {report_path}")
2233
+
2234
+ # Clear progress output before printing final result
2235
+ progress.clear_and_finish()
2236
+
2237
+ # Output to stdout (single URL mode behavior, same as single file)
2238
+ print(final_content)
2239
+
2240
+ except SystemExit:
2241
+ raise
2242
+ except Exception as e:
2243
+ console.print(Panel(f"[red]{e}[/red]", title="Error"))
2244
+ raise SystemExit(1)
2245
+
2246
+
2247
async def process_url_batch(
    url_entries: list,  # list[UrlEntry] but imported dynamically
    output_dir: Path,
    cfg: MarkitaiConfig,
    dry_run: bool,
    verbose: bool,
    log_file_path: Path | None = None,
    concurrency: int = 3,
    fetch_strategy: FetchStrategy | None = None,
    explicit_fetch_strategy: bool = False,
) -> None:
    """Batch process multiple URLs from a URL list file.

    Shows a progress bar similar to file batch processing. Each URL is
    fetched, optionally has its images downloaded (when ``--alt``/``--desc``
    is enabled), is written as a base ``.md`` file and, when
    ``cfg.llm.enabled`` is set, is passed to ``process_with_llm``. After all
    URLs finish, a JSON report is written and a summary panel is printed.

    Args:
        url_entries: List of UrlEntry objects from parse_url_list(); each
            entry is read for ``.url`` and ``.output_name``
        output_dir: Output directory for all markdown files
        cfg: Configuration
        dry_run: If True, only show what would be done, then exit with code 0
        verbose: If True, enable verbose logging (not read by this function)
        log_file_path: Path to log file (for report)
        concurrency: Max concurrent URL processing (default 3); values below
            1 are treated as 1
        fetch_strategy: Strategy to use for fetching URL content; defaults to
            ``cfg.fetch.strategy`` when None
        explicit_fetch_strategy: If True, strategy was explicitly set via CLI flag

    Raises:
        SystemExit: With code 0 after the dry-run preview has been printed.
    """
    from datetime import datetime

    from rich.progress import (
        BarColumn,
        MofNCompleteColumn,
        Progress,
        SpinnerColumn,
        TextColumn,
        TimeElapsedColumn,
    )

    from markitai.fetch import (
        AgentBrowserNotFoundError,
        FetchError,
        FetchStrategy,
        JinaRateLimitError,
        fetch_url,
        get_fetch_cache,
    )
    from markitai.image import download_url_images
    from markitai.security import check_symlink_safety

    def _strip_md_suffix(name: str) -> str:
        """Drop a single trailing ``.md``; any inner ``.md`` is left untouched."""
        return name[: -len(".md")] if name.endswith(".md") else name

    # Default to auto strategy if not specified
    if fetch_strategy is None:
        fetch_strategy = FetchStrategy(cfg.fetch.strategy)
    assert fetch_strategy is not None  # for type checker

    # Dry run: just show what would be done
    if dry_run:
        console.print(
            Panel(
                f"[yellow]Would process {len(url_entries)} URLs[/yellow]\n"
                f"[yellow]Output directory:[/yellow] {output_dir}",
                title="Dry Run - URL Batch",
            )
        )
        # Only the first ten entries are previewed; the rest are summarized.
        for entry in url_entries[:10]:
            filename = entry.output_name or _strip_md_suffix(
                url_to_filename(entry.url)
            )
            console.print(f" - {entry.url} -> {filename}.md")
        if len(url_entries) > 10:
            console.print(f" ... and {len(url_entries) - 10} more")
        raise SystemExit(0)

    # Create output directory
    check_symlink_safety(output_dir, allow_symlinks=cfg.output.allow_symlinks)
    ensure_dir(output_dir)

    # Initialize fetch cache if caching is enabled
    fetch_cache = None
    if cfg.cache.enabled:
        cache_dir = output_dir.parent / ".markitai"
        fetch_cache = get_fetch_cache(cache_dir, cfg.cache.max_size_bytes)

    started_at = datetime.now()
    total_llm_cost = 0.0
    total_llm_usage: dict[str, dict[str, Any]] = {}
    completed = 0
    failed = 0
    results: dict[str, dict] = {}

    # A semaphore created with 0 never lets any task in (the batch would hang
    # forever) and a negative value raises ValueError, so clamp to at least 1.
    semaphore = asyncio.Semaphore(max(1, concurrency))

    async def process_single_url(entry, progress_task, progress_obj) -> None:
        """Process a single URL and record its outcome in ``results``."""
        nonlocal completed, failed, total_llm_cost

        url = entry.url
        custom_name = entry.output_name
        url_fetch_strategy = "unknown"

        async with semaphore:
            try:
                # Generate filename
                if custom_name:
                    filename = f"{custom_name}.md"
                else:
                    filename = url_to_filename(url)

                logger.info(f"Processing URL: {url} (strategy: {fetch_strategy.value})")
                progress_obj.update(progress_task, description=f"[cyan]{url[:50]}...")

                # Fetch URL using the configured strategy
                try:
                    fetch_result = await fetch_url(
                        url,
                        fetch_strategy,
                        cfg.fetch,
                        explicit_strategy=explicit_fetch_strategy,
                        cache=fetch_cache,
                        skip_read_cache=cfg.cache.no_cache,
                    )
                    url_fetch_strategy = fetch_result.strategy_used
                    markdown_content = fetch_result.content
                    cache_status = " [cache]" if fetch_result.cache_hit else ""
                    logger.info(
                        f"Fetched via {url_fetch_strategy}{cache_status}: {url}"
                    )
                except AgentBrowserNotFoundError:
                    logger.error(f"agent-browser not installed for: {url}")
                    results[url] = {
                        "status": "failed",
                        "error": "agent-browser not installed",
                    }
                    failed += 1
                    return
                except JinaRateLimitError:
                    logger.error(f"Jina Reader rate limit exceeded for: {url}")
                    results[url] = {
                        "status": "failed",
                        "error": "Jina Reader rate limit exceeded (20 RPM)",
                    }
                    failed += 1
                    return
                except FetchError as e:
                    logger.error(f"Failed to fetch {url}: {e}")
                    results[url] = {"status": "failed", "error": str(e)}
                    failed += 1
                    return

                if not markdown_content.strip():
                    logger.warning(f"No content extracted from URL: {url}")
                    results[url] = {
                        "status": "failed",
                        "error": "No content extracted",
                    }
                    failed += 1
                    return

                # Download images if --alt or --desc is enabled
                images_count = 0
                if cfg.image.alt_enabled or cfg.image.desc_enabled:
                    download_result = await download_url_images(
                        markdown=markdown_content,
                        output_dir=output_dir,
                        base_url=url,
                        config=cfg.image,
                        # Strip only the trailing extension so names that
                        # contain ".md" elsewhere are not mangled.
                        source_name=_strip_md_suffix(filename),
                        concurrency=5,
                        timeout=30,
                    )
                    markdown_content = download_result.updated_markdown
                    images_count = len(download_result.downloaded_paths)

                # Generate output path with conflict resolution.
                # NOTE(review): nothing is awaited between resolving the path
                # and writing it below; presumably that keeps concurrent tasks
                # from resolving to the same file - confirm before reordering.
                base_output_file = output_dir / filename
                output_file = resolve_output_path(
                    base_output_file, cfg.output.on_conflict
                )

                if output_file is None:
                    # Skipped URLs count as neither completed nor failed.
                    logger.info(f"[SKIP] Output exists: {base_output_file}")
                    results[url] = {"status": "skipped", "error": "Output exists"}
                    return

                # Write base .md file with frontmatter
                base_content = _add_basic_frontmatter(
                    markdown_content,
                    url,
                    fetch_strategy=url_fetch_strategy,
                    output_dir=output_dir,
                )
                atomic_write_text(output_file, base_content)

                llm_cost = 0.0
                llm_usage: dict[str, dict[str, Any]] = {}

                # LLM processing (if enabled)
                if cfg.llm.enabled:
                    _, doc_cost, doc_usage = await process_with_llm(
                        markdown_content,
                        url,
                        cfg,
                        output_file,
                        project_dir=output_dir.parent,
                    )
                    llm_cost += doc_cost
                    _merge_llm_usage(llm_usage, doc_usage)

                total_llm_cost += llm_cost
                _merge_llm_usage(total_llm_usage, llm_usage)

                results[url] = {
                    "status": "completed",
                    "error": None,
                    "output": str(
                        output_file.with_suffix(".llm.md")
                        if cfg.llm.enabled
                        else output_file
                    ),
                    "fetch_strategy": url_fetch_strategy,
                    "images": images_count,
                }
                completed += 1
                logger.info(f"Completed via {url_fetch_strategy}: {url}")

            except Exception as e:
                # One failing URL must not abort the rest of the batch.
                logger.error(f"Failed to process {url}: {e}")
                results[url] = {"status": "failed", "error": str(e)}
                failed += 1

            finally:
                # Runs on every exit path above, including the early returns.
                progress_obj.advance(progress_task)

    # Process all URLs with progress bar
    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        MofNCompleteColumn(),
        TimeElapsedColumn(),
        console=console,
    ) as progress:
        task = progress.add_task("[cyan]Processing URLs...", total=len(url_entries))

        tasks = [process_single_url(entry, task, progress) for entry in url_entries]
        await asyncio.gather(*tasks)

    # Generate report
    finished_at = datetime.now()
    duration = (finished_at - started_at).total_seconds()

    input_tokens = sum(u.get("input_tokens", 0) for u in total_llm_usage.values())
    output_tokens = sum(u.get("output_tokens", 0) for u in total_llm_usage.values())
    requests = sum(u.get("requests", 0) for u in total_llm_usage.values())

    task_options = {
        "llm": cfg.llm.enabled,
        "alt": cfg.image.alt_enabled,
        "desc": cfg.image.desc_enabled,
    }
    task_hash = compute_task_hash(output_dir, output_dir, task_options)
    report_path = get_report_file_path(output_dir, task_hash, cfg.output.on_conflict)
    report_path.parent.mkdir(parents=True, exist_ok=True)

    report = {
        "version": "1.0",
        "generated_at": datetime.now().astimezone().isoformat(),
        "log_file": str(log_file_path) if log_file_path else None,
        "summary": {
            "total_documents": 0,
            "completed_documents": 0,
            "failed_documents": 0,
            "total_urls": len(url_entries),
            "completed_urls": completed,
            "failed_urls": failed,
            "duration": duration,
        },
        "llm_usage": {
            "models": total_llm_usage,
            "requests": requests,
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "cost_usd": total_llm_cost,
        },
        "urls": results,
    }

    atomic_write_json(report_path, report, order_func=order_report)
    logger.info(f"Report saved: {report_path}")

    # Print summary
    console.print()
    console.print(
        Panel(
            f"[green]Completed:[/green] {completed}\n"
            f"[red]Failed:[/red] {failed}\n"
            f"[dim]Duration:[/dim] {duration:.1f}s\n"
            f"[dim]Report:[/dim] {report_path}",
            title="URL Batch Complete",
        )
    )
2547
+
2548
+
2549
+ def _build_multi_source_content(
2550
+ static_content: str | None,
2551
+ browser_content: str | None,
2552
+ fallback_content: str,
2553
+ ) -> str:
2554
+ """Build content from URL fetch result (single-source strategy).
2555
+
2556
+ With the static-first + browser-fallback strategy, we only have one
2557
+ valid source at a time. This function simply returns the primary content
2558
+ without adding any source labels (which would leak into the final output).
2559
+
2560
+ Args:
2561
+ static_content: Content from static/jina fetch (may be None)
2562
+ browser_content: Content from browser fetch (may be None)
2563
+ fallback_content: Primary content from FetchResult.content
2564
+
2565
+ Returns:
2566
+ Single-source content without labels
2567
+ """
2568
+ # With single-source strategy, fallback_content is already the best source
2569
+ # No need to merge or add labels - just return the primary content
2570
+ return fallback_content.strip() if fallback_content else ""
2571
+
2572
+
2573
async def _process_url_with_vision(
    content: str,
    screenshot_path: Path,
    url: str,
    cfg: MarkitaiConfig,
    output_file: Path,
    processor: LLMProcessor | None = None,
    project_dir: Path | None = None,
) -> tuple[str, float, dict[str, dict[str, Any]]]:
    """Enhance URL content with a vision LLM, using the page screenshot.

    The enhanced text is written next to ``output_file`` with a ``.llm.md``
    suffix, followed by a commented-out reference to the screenshot. If any
    step raises, a warning is logged and the work is handed to
    ``process_with_llm`` instead.

    Args:
        content: Markdown content (may be multi-source combined)
        screenshot_path: Path to the URL screenshot
        url: Original URL (used as the LLM context identifier)
        cfg: Configuration
        output_file: Output file path
        processor: Optional shared LLMProcessor (created if not provided)
        project_dir: Project directory for cache

    Returns:
        Tuple of (original_content, cost_usd, llm_usage)
    """
    from markitai.workflow.helpers import create_llm_processor

    try:
        # Rebind the parameter itself so the fallback below receives whatever
        # processor exists at the moment of failure.
        if processor is None:
            processor = create_llm_processor(cfg, project_dir=project_dir)

        # URL-specific vision enhancement (no slide/page marker protection)
        enhanced_text, meta = await processor.enhance_url_with_vision(
            content, screenshot_path, context=url
        )

        # Render the LLM document and append the screenshot reference comment
        target_path = output_file.with_suffix(".llm.md")
        rendered = processor.format_llm_output(enhanced_text, meta)
        rendered += (
            f"\n\n<!-- Screenshot for reference -->\n"
            f"<!-- ![Screenshot](screenshots/{screenshot_path.name}) -->"
        )

        atomic_write_text(target_path, rendered)
        logger.info(f"Written LLM version with vision: {target_path}")

        # Usage is looked up under the URL context
        url_cost = processor.get_context_cost(url)
        url_usage = processor.get_context_usage(url)
        return content, url_cost, url_usage

    except Exception as e:
        logger.warning(
            f"Vision enhancement failed for {url}: {e}, falling back to standard processing"
        )
        # Fallback to standard processing
        fallback_result = await process_with_llm(
            content,
            url,
            cfg,
            output_file,
            processor=processor,
            project_dir=project_dir,
        )
        return fallback_result
2642
+
2643
+
2644
async def process_with_llm(
    markdown: str,
    source: str,
    cfg: MarkitaiConfig,
    output_file: Path,
    page_images: list[dict] | None = None,
    processor: LLMProcessor | None = None,
    original_markdown: str | None = None,
    project_dir: Path | None = None,
) -> tuple[str, float, dict[str, dict[str, Any]]]:
    """Run the LLM over markdown and write the result to a ``.llm.md`` file.

    The enhanced document is written beside ``output_file`` (suffix replaced
    by ``.llm.md``); the input markdown is handed back untouched so the
    caller can use it for the base ``.md`` file.

    Args:
        markdown: Markdown content to process
        source: Source identifier (used as LLM context identifier)
        cfg: Configuration with LLM and prompt settings
        output_file: Base output file path (.llm.md suffix applied automatically)
        page_images: Optional page image info (dicts with ``page`` and
            ``name``) for adding commented references
        processor: Optional shared LLMProcessor (created if not provided)
        original_markdown: Original markdown for detecting hallucinated images;
            ``markdown`` is used when this is empty or None
        project_dir: Project directory for cache isolation

    Returns:
        Tuple of (original_markdown, cost_usd, llm_usage):
        - original_markdown: Input markdown unchanged (for base .md file)
        - cost_usd: LLM API cost for this source
        - llm_usage: Per-model usage mapping for this source

        On any exception a warning is logged and ``(markdown, 0.0, {})`` is
        returned.

    Side Effects:
        Writes LLM-enhanced content to the ``.llm.md`` sibling of output_file
    """
    import re

    try:
        if processor is None:
            processor = create_llm_processor(cfg, project_dir=project_dir)

        cleaned, frontmatter = await processor.process_document(markdown, source)

        # Drop image URLs that do not appear in the reference text
        reference_text = original_markdown or markdown
        cleaned = ImageProcessor.remove_hallucinated_images(cleaned, reference_text)

        # Drop local image references whose asset file is missing
        assets_dir = output_file.parent / "assets"
        if assets_dir.exists():
            cleaned = ImageProcessor.remove_nonexistent_images(cleaned, assets_dir)

        # Render the LLM version
        llm_output = output_file.with_suffix(".llm.md")
        llm_content = processor.format_llm_output(cleaned, frontmatter)

        # Make sure every page image has a commented reference in the output
        if page_images:
            page_header = "<!-- Page images for reference -->"
            ordered_pages = sorted(page_images, key=lambda item: item.get("page", 0))
            page_refs = [
                f"<!-- ![Page {item['page']}](screenshots/{item['name']}) -->"
                for item in ordered_pages
            ]

            if page_header in llm_content:
                # Header already present: append only the pages not yet
                # referenced, in commented or uncommented form.
                page_label = re.compile(r"!\[Page\s+(\d+)\]")
                for ref in page_refs:
                    label = page_label.search(ref)
                    if label is None:
                        continue
                    page_num = label.group(1)
                    if re.search(rf"!\[Page\s+{page_num}\]", llm_content):
                        continue
                    llm_content = llm_content.rstrip() + "\n" + ref
            else:
                # No header yet: add the complete section in one go
                llm_content += "\n\n" + page_header + "\n" + "\n".join(page_refs)

        atomic_write_text(llm_output, llm_content)
        logger.info(f"Written LLM version: {llm_output}")

        # Usage for THIS source only, not the global cumulative usage
        source_cost = processor.get_context_cost(source)
        source_usage = processor.get_context_usage(source)
        return markdown, source_cost, source_usage  # original for base .md file

    except Exception as e:
        logger.warning(f"LLM processing failed: {e}")
        return markdown, 0.0, {}
2740
+
2741
+
2742
def _format_standalone_image_markdown(
    input_path: Path,
    analysis: ImageAnalysis,
    image_ref_path: str,
    include_frontmatter: bool = False,
) -> str:
    """Format analysis results for a standalone image file.

    Thin wrapper: all four arguments are forwarded positionally, unchanged, to
    ``markitai.workflow.helpers.format_standalone_image_markdown``, which is
    imported at call time.

    Args:
        input_path: Original image file path
        analysis: ImageAnalysis result for the image
        image_ref_path: Relative path for image reference
        include_frontmatter: Whether to include YAML frontmatter

    Returns:
        Formatted markdown string
    """
    from markitai.workflow.helpers import (
        format_standalone_image_markdown as render_markdown,
    )

    formatted = render_markdown(
        input_path, analysis, image_ref_path, include_frontmatter
    )
    return formatted
2766
+
2767
+
2768
async def analyze_images_with_llm(
    image_paths: list[Path],
    markdown: str,
    output_file: Path,
    cfg: MarkitaiConfig,
    input_path: Path | None = None,
    concurrency_limit: int | None = None,
    processor: LLMProcessor | None = None,
    project_dir: Path | None = None,
) -> tuple[str, float, dict[str, dict[str, Any]], ImageAnalysisResult | None]:
    """Analyze images with LLM Vision using batch processing.

    All images are sent through ``processor.analyze_images_batch`` and the
    results are applied according to the config:

    - ``cfg.image.alt_enabled``: rewrite image references in ``markdown`` and
      in the sibling ``.llm.md`` file so they carry the generated caption as
      alt text and point at ``assets/<image name>``.
    - ``cfg.image.desc_enabled``: collect per-asset descriptions and return
      them for the caller to write as JSON.

    When ``input_path`` is itself an image and exactly one image was given,
    the ``.llm.md`` file is instead written from scratch with the formatted
    analysis, and ``markdown`` is left untouched.

    Images whose analysis is ``None`` are skipped rather than failing the
    whole call.

    Args:
        image_paths: List of image file paths
        markdown: Original markdown content
        output_file: Output markdown file path
        cfg: Configuration
        input_path: Source input file path (for absolute path in JSON)
        concurrency_limit: Max concurrent LLM requests (unused, kept for API compat)
        processor: Optional shared LLMProcessor (created if not provided)
        project_dir: Project directory for persistent cache scope

    Returns:
        Tuple of (updated_markdown, cost_usd, llm_usage, image_analysis_result):
        - updated_markdown: Markdown with updated alt text (if alt_enabled)
        - cost_usd: LLM API cost for image analysis
        - llm_usage: Per-model usage {model: {requests, input_tokens, output_tokens, cost_usd}}
        - image_analysis_result: Analysis data for JSON output (None if desc_enabled=False)

        On any exception the failure is logged as a warning and
        ``(markdown, 0.0, {}, None)`` is returned.
    """
    import re
    from datetime import datetime

    def apply_caption(text: str, image_name: str, caption: str) -> str:
        """Rewrite every markdown reference to ``image_name`` with ``caption`` as alt text."""
        pattern = rf"!\[[^\]]*\]\([^)]*{re.escape(image_name)}\)"
        new_ref = f"![{caption}](assets/{image_name})"
        # Use a callable replacement so the caption is inserted verbatim.
        # A plain replacement string is treated as a template: a backslash in
        # LLM-generated text would be expanded (``\n``) or rejected (``\d``).
        return re.sub(pattern, lambda _match: new_ref, text)

    alt_enabled = cfg.image.alt_enabled
    desc_enabled = cfg.image.desc_enabled

    try:
        if processor is None:
            processor = create_llm_processor(cfg, project_dir=project_dir)

        # Use unique context for image analysis to track usage separately from doc processing
        # Format: "full_path:images" ensures isolation even for files with same name in different dirs
        # This prevents usage from concurrent files being mixed together
        source_path = (
            str(input_path.resolve()) if input_path else str(output_file.resolve())
        )
        context = f"{source_path}:images"

        # Detect document language from markdown content
        language = _detect_language(markdown)

        # Use batch analysis
        logger.info(f"Analyzing {len(image_paths)} images in batches...")
        analyses = await processor.analyze_images_batch(
            image_paths,
            language=language,
            max_images_per_batch=DEFAULT_MAX_IMAGES_PER_BATCH,
            context=context,
        )

        timestamp = datetime.now().astimezone().isoformat()

        # Collect asset descriptions for JSON output
        asset_descriptions: list[dict[str, Any]] = []

        # Check if this is a standalone image file
        is_standalone_image = (
            input_path is not None
            and input_path.suffix.lower() in IMAGE_EXTENSIONS
            and len(image_paths) == 1
        )

        # Process results (analyses is in same order as image_paths)
        results: list[tuple[Path, ImageAnalysis | None, str]] = []
        for image_path, analysis in zip(image_paths, analyses):
            results.append((image_path, analysis, timestamp))

            if analysis is None:
                # Nothing to describe or caption for this image; keep going so
                # one missing result does not discard all the others.
                continue

            # Collect for JSON output (if desc_enabled)
            if desc_enabled:
                asset_descriptions.append(
                    {
                        "asset": str(image_path.resolve()),
                        "alt": analysis.caption,
                        "desc": analysis.description,
                        "text": analysis.extracted_text or "",
                        "llm_usage": analysis.llm_usage or {},
                        "created": timestamp,
                    }
                )

            # Update alt text in markdown (if alt_enabled)
            if alt_enabled and not is_standalone_image:
                markdown = apply_caption(markdown, image_path.name, analysis.caption)

        # Update .llm.md file
        llm_output = output_file.with_suffix(".llm.md")
        if is_standalone_image and results and results[0][1] is not None:
            # For standalone images, write the rich formatted content with frontmatter
            assert input_path is not None
            _, analysis, _ = results[0]
            if analysis:
                rich_content = _format_standalone_image_markdown(
                    input_path,
                    analysis,
                    f"assets/{input_path.name}",
                    include_frontmatter=True,
                )
                # Normalize whitespace (ensure headers have blank lines before/after)
                from markitai.utils.text import normalize_markdown_whitespace

                rich_content = normalize_markdown_whitespace(rich_content)
                atomic_write_text(llm_output, rich_content)
        elif alt_enabled:
            # For other files, update alt text in .llm.md
            # Wait for .llm.md file to exist (it's written by parallel doc processing)
            max_wait_seconds = 120  # Max wait time
            poll_interval = 0.5  # Check every 0.5 seconds
            waited = 0.0
            while not llm_output.exists() and waited < max_wait_seconds:
                await asyncio.sleep(poll_interval)
                waited += poll_interval

            if llm_output.exists():
                llm_content = llm_output.read_text(encoding="utf-8")
                for image_path, analysis, _ in results:
                    if analysis is None:
                        continue
                    llm_content = apply_caption(
                        llm_content, image_path.name, analysis.caption
                    )
                atomic_write_text(llm_output, llm_content)
            else:
                logger.warning(
                    f"Skipped alt text update: {llm_output} not created within {max_wait_seconds}s"
                )

        # Build analysis result for caller to aggregate
        analysis_result: ImageAnalysisResult | None = None
        if desc_enabled and asset_descriptions:
            source_path = str(input_path.resolve()) if input_path else output_file.stem
            analysis_result = ImageAnalysisResult(
                source_file=source_path,
                assets=asset_descriptions,
            )

        # Get usage for THIS file only using context-based tracking
        # This is concurrency-safe: only includes LLM calls tagged with this context
        incremental_usage = processor.get_context_usage(context)
        incremental_cost = processor.get_context_cost(context)

        return (
            markdown,
            incremental_cost,
            incremental_usage,
            analysis_result,
        )

    except Exception as e:
        logger.warning(f"Image analysis failed: {e}")
        return markdown, 0.0, {}, None
2935
+
2936
+
2937
async def enhance_document_with_vision(
    extracted_text: str,
    page_images: list[dict],
    cfg: MarkitaiConfig,
    source: str = "document",
    processor: LLMProcessor | None = None,
    project_dir: Path | None = None,
) -> tuple[str, str, float, dict[str, dict[str, Any]]]:
    """Combine extracted text and page images into enhanced markdown.

    Used for the OCR+LLM mode, where two inputs are available:
    1. Text extracted programmatically (pymupdf4llm/markitdown)
    2. Page images, used as a visual reference for layout/structure

    Page images are ordered by their ``page`` entry (missing entries sort as
    0) and handed, together with the text, to
    ``processor.enhance_document_complete``.

    Args:
        extracted_text: Text extracted by pymupdf4llm/markitdown
        page_images: Page image info dicts; each must have a 'path' key
        cfg: Configuration
        source: Source file name, used for logging and as the usage context
        processor: Optional shared LLMProcessor (created if not provided)
        project_dir: Project directory for persistent cache scope

    Returns:
        Tuple of (cleaned_markdown, frontmatter_yaml, cost_usd, llm_usage).
        If anything raises, the failure is logged and the original text is
        returned with a minimal title/source frontmatter, zero cost and
        empty usage.
    """
    try:
        llm = (
            processor
            if processor is not None
            else create_llm_processor(cfg, project_dir=project_dir)
        )

        # Order pages, then keep only their file paths
        ordered_pages = sorted(page_images, key=lambda info: info.get("page", 0))
        image_paths = [Path(page["path"]) for page in ordered_pages]

        logger.info(
            f"[START] {source}: Enhancing with {len(image_paths)} page images..."
        )

        # One combined call produces both cleaned content and frontmatter
        cleaned_content, frontmatter = await llm.enhance_document_complete(
            extracted_text, image_paths, source=source
        )

        # Usage is read per context so only THIS file's calls are counted
        cost_usd = llm.get_context_cost(source)
        usage = llm.get_context_usage(source)
        return cleaned_content, frontmatter, cost_usd, usage

    except Exception as exc:
        logger.warning(f"Document enhancement failed: {exc}")
        # Fall back to the untouched text plus a basic frontmatter
        fallback_frontmatter = f"title: {source}\nsource: {source}"
        return extracted_text, fallback_frontmatter, 0.0, {}
2999
+
3000
+
3001
def _check_vision_model_config(cfg: Any, console: Any, verbose: bool = False) -> None:
    """Warn when image analysis is requested but cannot work as configured.

    Does nothing unless alt text or descriptions are enabled. Otherwise it
    prints a panel if the LLM is disabled, or if models are configured but
    none of them is detected as vision-capable. With ``verbose`` set and at
    least one vision model present, the detected models are logged at debug
    level instead.

    Args:
        cfg: Configuration object
        console: Rich console used to print warning panels
        verbose: Whether to log the detected vision models
    """

    def show_warning(body: str, title: str) -> None:
        # rich is imported only when a panel is actually shown
        from rich.panel import Panel

        console.print(Panel(body, title=title, border_style="yellow"))

    # Nothing to check when image analysis is off
    if not cfg.image.alt_enabled and not cfg.image.desc_enabled:
        return

    # Image analysis without LLM cannot produce anything
    if not cfg.llm.enabled:
        show_warning(
            "[yellow]⚠ Image analysis (--alt/--desc) requires LLM to be enabled.[/yellow]\n\n"
            "[dim]Image alt text and descriptions will be skipped without LLM.[/dim]\n\n"
            "To enable LLM processing:\n"
            " [cyan]markitai --llm ...[/cyan] or use [cyan]--preset rich/standard[/cyan]",
            "LLM Required",
        )
        return

    from markitai.llm import get_model_info_cached

    def is_vision_model(model_config: Any) -> bool:
        """Return the explicit config override if set, else the looked-up model info."""
        override = model_config.model_info
        if override and override.supports_vision is not None:
            return override.supports_vision
        looked_up = get_model_info_cached(model_config.litellm_params.model)
        return looked_up.get("supports_vision", False)

    all_models = cfg.llm.model_list
    vision_models = [entry for entry in all_models if is_vision_model(entry)]

    if not vision_models and all_models:
        # Name up to three configured models, then summarize the rest
        configured_models = ", ".join(
            entry.litellm_params.model for entry in all_models[:3]
        )
        remaining = len(all_models) - 3
        if remaining > 0:
            configured_models += f" (+{remaining} more)"

        show_warning(
            "[yellow]⚠ No vision-capable models detected.[/yellow]\n\n"
            f"[dim]Current models: {configured_models}[/dim]\n"
            "[dim]Vision models are auto-detected from litellm. "
            "Add `supports_vision: true` in config to override.[/dim]",
            "Vision Model Recommended",
        )
    elif verbose and vision_models:
        # Verbose mode: report which vision models were found
        model_names = [entry.litellm_params.model for entry in vision_models]
        count = len(model_names)
        preview = ", ".join(model_names[:3])
        ellipsis = "" if count <= 3 else ", ..."
        logger.debug(f"Vision models configured: {count} ({preview}{ellipsis})")
3071
+
3072
+
3073
def _check_agent_browser_for_urls(cfg: Any, console: Any) -> None:
    """Print a warning panel when agent-browser is needed but not ready.

    The check is skipped when the configured fetch strategy is STATIC or
    JINA. The strategy defaults to AUTO when ``cfg.fetch`` has no
    ``strategy`` attribute, and the command defaults to ``"agent-browser"``
    when ``cfg.agent_browser.command`` is absent.

    Args:
        cfg: Configuration object
        console: Rich console used to print the warning panel
    """
    from markitai.fetch import FetchStrategy, verify_agent_browser_ready

    # Strategies that never launch a browser need no check
    strategy = getattr(cfg.fetch, "strategy", FetchStrategy.AUTO)
    if strategy in (FetchStrategy.STATIC, FetchStrategy.JINA):
        return

    # Resolve the command, tolerating configs without an agent_browser section
    browser_cfg = getattr(cfg, "agent_browser", None)
    command = getattr(browser_cfg, "command", "agent-browser")

    is_ready, message = verify_agent_browser_ready(command, use_cache=True)
    if is_ready:
        return

    from rich.panel import Panel

    warning_text = (
        f"[yellow]⚠ {message}[/yellow]\n\n"
        "[dim]URL processing will fall back to static fetch strategy.\n"
        "For JavaScript-rendered pages (Twitter/X, etc.), browser support is recommended.\n\n"
        "To install browser support:[/dim]\n"
        " [cyan]agent-browser install[/cyan] [dim]or[/dim] [cyan]npx playwright install chromium[/cyan]"
    )
    panel = Panel(warning_text, title="Browser Not Available", border_style="yellow")
    console.print(panel)
3109
+
3110
+
3111
def _warn_case_sensitivity_mismatches(
    files: list[Path],
    input_dir: Path,
    patterns: list[str],
) -> None:
    """Report files that miss a pattern only because of letter case.

    A file counts as a mismatch for a pattern when ``fnmatch.fnmatch``
    rejects its path as-is but accepts it once both path and pattern are
    lower-cased (e.g. '*.jpg' against 'IMAGE.JPG'). Paths are taken relative
    to ``input_dir`` in POSIX form, falling back to the bare file name, and
    backslashes in patterns are treated as forward slashes.

    When mismatches exist, a warning is logged and a per-pattern summary
    (at most three example files each) is printed to the console.

    Args:
        files: List of files discovered for processing
        input_dir: Base input directory for relative path calculation
        patterns: List of --no-cache-for patterns
    """
    import fnmatch

    # pattern (as the user wrote it) -> paths that only match case-insensitively
    by_pattern: dict[str, list[str]] = {}

    for candidate in files:
        try:
            shown_path = candidate.relative_to(input_dir).as_posix()
        except ValueError:
            shown_path = candidate.name
        lowered_path = shown_path.lower()

        for raw_pattern in patterns:
            normalized = raw_pattern.replace("\\", "/")

            if fnmatch.fnmatch(shown_path, normalized):
                continue  # a real match is not a mismatch
            if fnmatch.fnmatch(lowered_path, normalized.lower()):
                by_pattern.setdefault(raw_pattern, []).append(shown_path)

    if not by_pattern:
        return

    mismatch_count = sum(len(paths) for paths in by_pattern.values())

    # Log warning
    logger.warning(
        f"[Cache] Case-sensitivity: {mismatch_count} file(s) would match "
        "--no-cache-for patterns if case-insensitive"
    )

    # Show details in console
    console.print(
        f"[yellow]Warning: {mismatch_count} file(s) have case mismatches "
        "with --no-cache-for patterns[/yellow]"
    )
    for raw_pattern, matched_paths in by_pattern.items():
        console.print(f" Pattern: [cyan]{raw_pattern}[/cyan]")
        for example in matched_paths[:3]:  # Show max 3 examples
            console.print(f" - {example}")
        hidden = len(matched_paths) - 3
        if hidden > 0:
            console.print(f" ... and {hidden} more")
    console.print(
        "[dim]Hint: Pattern matching is case-sensitive. "
        "Use exact case or patterns like '*.[jJ][pP][gG]'[/dim]"
    )
3173
+
3174
+
3175
def _create_process_file(
    cfg: MarkitaiConfig,
    input_dir: Path,
    output_dir: Path,
    preconverted_map: dict[Path, Path],
    shared_processor: LLMProcessor | None,
):
    """Build the per-file coroutine used by batch mode.

    The returned closure captures the batch context and runs each file
    through ``convert_document_core``.

    Args:
        cfg: Markitai configuration
        input_dir: Input directory; output mirrors paths relative to it
        output_dir: Output directory
        preconverted_map: Map of pre-converted legacy Office files
        shared_processor: Shared LLM processor for batch mode

    Returns:
        An async function that processes a single file and returns ProcessResult
    """
    from markitai.batch import ProcessResult
    from markitai.workflow.core import ConversionContext, convert_document_core

    async def process_file(file_path: Path) -> ProcessResult:
        """Convert one file and summarize the outcome as a ProcessResult."""
        import time

        start_time = time.perf_counter()
        logger.info(f"[START] {file_path.name}")

        try:
            # Mirror the input directory layout; files outside input_dir go
            # straight into output_dir
            try:
                relative_parent = file_path.parent.relative_to(input_dir)
            except ValueError:
                file_output_dir = output_dir
            else:
                file_output_dir = output_dir / relative_parent

            ctx = ConversionContext(
                input_path=file_path,
                output_dir=file_output_dir,
                config=cfg,
                actual_file=preconverted_map.get(file_path),
                shared_processor=shared_processor,
                project_dir=output_dir.parent,
                use_multiprocess_images=True,
                input_base_dir=input_dir,
            )

            result = await convert_document_core(ctx, MAX_DOCUMENT_SIZE)

            total_time = time.perf_counter() - start_time

            if not result.success:
                logger.error(
                    f"[FAIL] {file_path.name}: {result.error} ({total_time:.2f}s)"
                )
                return ProcessResult(success=False, error=result.error)

            if result.skip_reason == "exists":
                existing_output = file_output_dir / f"{file_path.name}.md"
                logger.info(f"[SKIP] Output exists: {existing_output}")
                return ProcessResult(
                    success=True,
                    output_path=str(existing_output),
                    error="skipped (exists)",
                )

            # LLM enabled but no recorded usage is reported as a cache hit
            cache_hit = cfg.llm.enabled and not ctx.llm_usage
            cache_tag = " [cache]" if cache_hit else ""

            logger.info(
                f"[DONE] {file_path.name}: {total_time:.2f}s "
                f"(images={ctx.embedded_images_count}, screenshots={ctx.screenshots_count}, cost=${ctx.llm_cost:.4f})"
                f"{cache_tag}"
            )

            # Report the .llm.md sibling when LLM output was requested
            if cfg.llm.enabled and ctx.output_file:
                reported_output = ctx.output_file.with_suffix(".llm.md")
            else:
                reported_output = ctx.output_file

            return ProcessResult(
                success=True,
                output_path=str(reported_output),
                images=ctx.embedded_images_count,
                screenshots=ctx.screenshots_count,
                cost_usd=ctx.llm_cost,
                llm_usage=ctx.llm_usage,
                image_analysis_result=ctx.image_analysis,
                cache_hit=cache_hit,
            )

        except Exception as exc:
            total_time = time.perf_counter() - start_time
            logger.error(f"[FAIL] {file_path.name}: {exc} ({total_time:.2f}s)")
            return ProcessResult(success=False, error=str(exc))

    return process_file
3278
+
3279
+
3280
def _create_url_processor(
    cfg: MarkitaiConfig,
    output_dir: Path,
    fetch_strategy: FetchStrategy | None,
    explicit_fetch_strategy: bool,
    shared_processor: LLMProcessor | None = None,
) -> Callable:
    """Create a URL processing function for batch processing.

    Setup done once, outside the returned closure: the effective fetch
    strategy is resolved (falling back to ``cfg.fetch.strategy``), the fetch
    cache is opened under ``output_dir.parent / ".markitai"`` when caching is
    enabled, and the screenshot directory is prepared when screenshots are
    enabled.

    Args:
        cfg: Configuration
        output_dir: Output directory
        fetch_strategy: Fetch strategy to use (None = config default)
        explicit_fetch_strategy: Whether strategy was explicitly specified
        shared_processor: Optional shared LLMProcessor

    Returns:
        Async function that processes a single URL and returns a
        ``(ProcessResult, extra_info)`` tuple
    """
    from markitai.batch import ProcessResult
    from markitai.fetch import (
        AgentBrowserNotFoundError,
        FetchError,
        FetchStrategy,
        JinaRateLimitError,
        fetch_url,
        get_fetch_cache,
    )
    from markitai.image import download_url_images

    # Determine fetch strategy (use config default if not specified)
    _fetch_strategy = fetch_strategy
    if _fetch_strategy is None:
        _fetch_strategy = FetchStrategy(cfg.fetch.strategy)

    # Initialize fetch cache for URL processing
    url_fetch_cache = None
    if cfg.cache.enabled:
        url_cache_dir = output_dir.parent / ".markitai"
        url_fetch_cache = get_fetch_cache(url_cache_dir, cfg.cache.max_size_bytes)

    # Prepare screenshot directory if enabled
    url_screenshot_dir = (
        ensure_screenshots_dir(output_dir) if cfg.screenshot.enabled else None
    )

    async def process_url(
        url: str,
        source_file: Path,
        custom_name: str | None = None,
    ) -> tuple[ProcessResult, dict[str, Any]]:
        """Process a single URL.

        Fetches the page, optionally downloads its images, writes the base
        ``.md`` file and, when LLM is enabled, runs document processing
        and/or image analysis.

        Args:
            url: URL to process
            source_file: Path to the .urls file containing this URL
                (not read by this function)
            custom_name: Optional custom output name (without ``.md``)

        Returns:
            Tuple of (ProcessResult, extra_info dict with fetch_strategy).
            Failures are reported through ``ProcessResult.success`` /
            ``error`` rather than raised.
        """
        import time

        start_time = time.perf_counter()
        extra_info: dict[str, Any] = {
            "fetch_strategy": "unknown",
        }

        try:
            # Generate filename
            if custom_name:
                filename = f"{custom_name}.md"
            else:
                filename = url_to_filename(url)

            logger.info(f"[URL] Processing: {url} (strategy: {_fetch_strategy.value})")

            # Fetch URL using the configured strategy
            try:
                fetch_result = await fetch_url(
                    url,
                    _fetch_strategy,
                    cfg.fetch,
                    explicit_strategy=explicit_fetch_strategy,
                    cache=url_fetch_cache,
                    skip_read_cache=cfg.cache.no_cache,
                    screenshot=cfg.screenshot.enabled,
                    screenshot_dir=url_screenshot_dir,
                    screenshot_config=cfg.screenshot
                    if cfg.screenshot.enabled
                    else None,
                )
                extra_info["fetch_strategy"] = fetch_result.strategy_used
                original_markdown = fetch_result.content
                screenshot_path = fetch_result.screenshot_path
                cache_status = " [cache]" if fetch_result.cache_hit else ""
                logger.debug(
                    f"[URL] Fetched via {fetch_result.strategy_used}{cache_status}: {url}"
                )
            except AgentBrowserNotFoundError:
                logger.error(f"[URL] agent-browser not installed for: {url}")
                return ProcessResult(
                    success=False,
                    error="agent-browser not installed",
                ), extra_info
            except JinaRateLimitError:
                logger.error(f"[URL] Jina rate limit exceeded for: {url}")
                return ProcessResult(
                    success=False,
                    error="Jina Reader rate limit exceeded (20 RPM)",
                ), extra_info
            except FetchError as e:
                logger.error(f"[URL] Fetch failed {url}: {e}")
                return ProcessResult(success=False, error=str(e)), extra_info

            if not original_markdown.strip():
                logger.warning(f"[URL] No content: {url}")
                return ProcessResult(
                    success=False,
                    error="No content extracted",
                ), extra_info

            markdown_for_llm = original_markdown

            # Check for multi-source content (static + browser + screenshot)
            has_multi_source = (
                fetch_result.static_content is not None
                or fetch_result.browser_content is not None
            )
            has_screenshot = screenshot_path and screenshot_path.exists()

            logger.debug(
                f"[URL] Multi-source check: static={fetch_result.static_content is not None}, "
                f"browser={fetch_result.browser_content is not None}, "
                f"has_multi_source={has_multi_source}, has_screenshot={has_screenshot}"
            )

            # Download images if --alt or --desc is enabled
            images_count = 0
            screenshots_count = 1 if has_screenshot else 0
            downloaded_images: list[Path] = []

            if has_screenshot and screenshot_path:
                logger.debug(f"[URL] Screenshot captured: {screenshot_path.name}")
            if cfg.image.alt_enabled or cfg.image.desc_enabled:
                download_result = await download_url_images(
                    markdown=original_markdown,
                    output_dir=output_dir,
                    base_url=url,
                    config=cfg.image,
                    # Strip only the trailing extension: str.replace would
                    # also remove ".md" inside the name (e.g. a ".md" domain
                    # or a custom name containing ".md").
                    source_name=filename.removesuffix(".md"),
                    concurrency=5,
                    timeout=30,
                )
                markdown_for_llm = download_result.updated_markdown
                downloaded_images = download_result.downloaded_paths
                images_count = len(downloaded_images)

            # Generate output path
            base_output_file = output_dir / filename
            output_file = resolve_output_path(base_output_file, cfg.output.on_conflict)

            if output_file is None:
                logger.info(f"[URL] Skipped (exists): {base_output_file}")
                return ProcessResult(
                    success=True,
                    output_path=str(base_output_file),
                    error="skipped (exists)",
                ), extra_info

            # Write base .md file with original content
            base_content = _add_basic_frontmatter(
                original_markdown,
                url,
                fetch_strategy=fetch_result.strategy_used if fetch_result else None,
                screenshot_path=screenshot_path,
                output_dir=output_dir,
            )
            atomic_write_text(output_file, base_content)

            # LLM processing uses markdown with local image paths
            url_llm_usage: dict[str, dict[str, Any]] = {}
            llm_cost = 0.0
            img_analysis = None

            if cfg.llm.enabled:
                # Check if image analysis should run
                should_analyze_images = (
                    cfg.image.alt_enabled or cfg.image.desc_enabled
                ) and downloaded_images

                # Check if we should use vision enhancement (multi-source + screenshot)
                use_vision_enhancement = (
                    has_multi_source and has_screenshot and screenshot_path
                )

                if use_vision_enhancement:
                    # Multi-source URL with screenshot: use vision LLM for better content extraction
                    # Build multi-source markdown content for LLM
                    multi_source_content = _build_multi_source_content(
                        fetch_result.static_content,
                        fetch_result.browser_content,
                        markdown_for_llm,  # Fallback primary content
                    )

                    logger.info(
                        f"[URL] Using vision enhancement for multi-source URL: {url}"
                    )

                    # Use vision enhancement with screenshot
                    assert (
                        screenshot_path is not None
                    )  # Guaranteed by use_vision_enhancement check
                    _, cost, url_llm_usage = await _process_url_with_vision(
                        multi_source_content,
                        screenshot_path,
                        url,
                        cfg,
                        output_file,
                        processor=shared_processor,
                        project_dir=output_dir.parent,
                    )
                    llm_cost = cost

                    # Image analysis runs after (not alongside) the vision
                    # step on this path: it is awaited sequentially.
                    if should_analyze_images:
                        (
                            _,
                            image_cost,
                            image_usage,
                            img_analysis,
                        ) = await analyze_images_with_llm(
                            downloaded_images,
                            multi_source_content,
                            output_file,
                            cfg,
                            Path(url),
                            concurrency_limit=cfg.llm.concurrency,
                            processor=shared_processor,
                            project_dir=output_dir.parent,
                        )
                        _merge_llm_usage(url_llm_usage, image_usage)
                        llm_cost += image_cost
                elif should_analyze_images:
                    # Standard processing with image analysis
                    doc_task = process_with_llm(
                        markdown_for_llm,
                        url,
                        cfg,
                        output_file,
                        processor=shared_processor,
                        project_dir=output_dir.parent,
                    )
                    img_task = analyze_images_with_llm(
                        downloaded_images,
                        markdown_for_llm,
                        output_file,
                        cfg,
                        Path(url),  # Use URL as source path
                        concurrency_limit=cfg.llm.concurrency,
                        processor=shared_processor,
                        project_dir=output_dir.parent,
                    )

                    # Execute in parallel
                    doc_result, img_result = await asyncio.gather(doc_task, img_task)

                    # Unpack results
                    _, cost, url_llm_usage = doc_result
                    _, image_cost, image_usage, img_analysis = img_result

                    _merge_llm_usage(url_llm_usage, image_usage)
                    llm_cost = cost + image_cost
                else:
                    # Only document processing
                    _, cost, url_llm_usage = await process_with_llm(
                        markdown_for_llm,
                        url,
                        cfg,
                        output_file,
                        processor=shared_processor,
                        project_dir=output_dir.parent,
                    )
                    llm_cost = cost

            # Track cache hit: LLM enabled but no usage means cache hit
            # NOTE(review): analyze_images_with_llm also returns empty usage
            # when it fails, so a failure may be reported as a cache hit here
            # - confirm whether that is intended.
            is_cache_hit = cfg.llm.enabled and not url_llm_usage

            total_time = time.perf_counter() - start_time
            logger.info(
                f"[URL] Completed via {extra_info['fetch_strategy']}: {url} "
                f"({total_time:.2f}s)" + (" [cache]" if is_cache_hit else "")
            )

            return ProcessResult(
                success=True,
                output_path=str(
                    output_file.with_suffix(".llm.md")
                    if cfg.llm.enabled
                    else output_file
                ),
                images=images_count,
                screenshots=screenshots_count,
                cost_usd=llm_cost,
                llm_usage=url_llm_usage,
                image_analysis_result=img_analysis,
                cache_hit=is_cache_hit,
            ), extra_info

        except Exception as e:
            total_time = time.perf_counter() - start_time
            logger.error(f"[URL] Failed {url}: {e} ({total_time:.2f}s)")
            return ProcessResult(success=False, error=str(e)), extra_info

    return process_url
3595
+
3596
+
3597
+ async def process_batch(
3598
+ input_dir: Path,
3599
+ output_dir: Path,
3600
+ cfg: MarkitaiConfig,
3601
+ resume: bool,
3602
+ dry_run: bool,
3603
+ verbose: bool = False,
3604
+ console_handler_id: int | None = None,
3605
+ log_file_path: Path | None = None,
3606
+ fetch_strategy: FetchStrategy | None = None,
3607
+ explicit_fetch_strategy: bool = False,
3608
+ ) -> None:
3609
+ """Process directory in batch mode."""
3610
+ from markitai.batch import BatchProcessor
3611
+
3612
+ # Supported extensions
3613
+ extensions = set(EXTENSION_MAP.keys())
3614
+
3615
+ # Build task options for report (before BatchProcessor init for hash calculation)
3616
+ # Note: input_dir and output_dir will be converted to absolute paths by init_state()
3617
+ task_options: dict[str, Any] = {
3618
+ "concurrency": cfg.batch.concurrency,
3619
+ "llm": cfg.llm.enabled,
3620
+ "ocr": cfg.ocr.enabled,
3621
+ "screenshot": cfg.screenshot.enabled,
3622
+ "alt": cfg.image.alt_enabled,
3623
+ "desc": cfg.image.desc_enabled,
3624
+ }
3625
+ if cfg.llm.enabled and cfg.llm.model_list:
3626
+ task_options["models"] = [m.litellm_params.model for m in cfg.llm.model_list]
3627
+
3628
+ batch = BatchProcessor(
3629
+ cfg.batch,
3630
+ output_dir,
3631
+ input_path=input_dir,
3632
+ log_file=log_file_path,
3633
+ on_conflict=cfg.output.on_conflict,
3634
+ task_options=task_options,
3635
+ )
3636
+ files = batch.discover_files(input_dir, extensions)
3637
+
3638
+ # Discover .urls files for URL batch processing
3639
+ from markitai.urls import find_url_list_files, parse_url_list
3640
+
3641
+ url_list_files = find_url_list_files(input_dir)
3642
+ url_entries_from_files: list = [] # List of (source_file, UrlEntry)
3643
+
3644
+ for url_file in url_list_files:
3645
+ try:
3646
+ entries = parse_url_list(url_file)
3647
+ for entry in entries:
3648
+ url_entries_from_files.append((url_file, entry))
3649
+ if entries:
3650
+ logger.info(f"Found {len(entries)} URLs in {url_file.name}")
3651
+ except Exception as e:
3652
+ logger.warning(f"Failed to parse URL list {url_file}: {e}")
3653
+
3654
+ # Check agent-browser availability if URLs will be processed
3655
+ if url_entries_from_files:
3656
+ _check_agent_browser_for_urls(cfg, console)
3657
+
3658
+ if not files and not url_entries_from_files:
3659
+ console.print("[yellow]No supported files or URL lists found.[/yellow]")
3660
+ raise SystemExit(0)
3661
+
3662
+ # Warn about potential case-sensitivity mismatches in --no-cache-for patterns
3663
+ if cfg.cache.no_cache_patterns:
3664
+ _warn_case_sensitivity_mismatches(files, input_dir, cfg.cache.no_cache_patterns)
3665
+
3666
+ from markitai.security import check_symlink_safety
3667
+
3668
+ check_symlink_safety(output_dir, allow_symlinks=cfg.output.allow_symlinks)
3669
+ ensure_dir(output_dir)
3670
+
3671
+ if dry_run:
3672
+ # Build dry run message
3673
+ cache_status = "enabled" if cfg.cache.enabled else "disabled"
3674
+ dry_run_msg = f"[yellow]Would process {len(files)} files[/yellow]"
3675
+ if url_entries_from_files:
3676
+ dry_run_msg += f"\n[yellow]Would process {len(url_entries_from_files)} URLs from {len(url_list_files)} .urls files[/yellow]"
3677
+ dry_run_msg += f"\n[yellow]Input:[/yellow] {input_dir}\n[yellow]Output:[/yellow] {output_dir}"
3678
+ dry_run_msg += f"\n[yellow]Cache:[/yellow] {cache_status}"
3679
+
3680
+ console.print(Panel(dry_run_msg, title="Dry Run"))
3681
+ for f in files[:10]:
3682
+ console.print(f" - {f.name}")
3683
+ if len(files) > 10:
3684
+ console.print(f" ... and {len(files) - 10} more files")
3685
+ if url_entries_from_files:
3686
+ console.print("[dim]URL list files:[/dim]")
3687
+ for url_file in url_list_files[:5]:
3688
+ console.print(f" - {url_file.name}")
3689
+ if len(url_list_files) > 5:
3690
+ console.print(f" ... and {len(url_list_files) - 5} more .urls files")
3691
+ if cfg.cache.enabled:
3692
+ console.print(
3693
+ "[dim]Tip: Use 'markitai cache stats -v' to view cached entries[/dim]"
3694
+ )
3695
+ raise SystemExit(0)
3696
+
3697
+ # Record batch start time before any processing (including pre-conversion)
3698
+ from datetime import datetime
3699
+
3700
+ batch_started_at = datetime.now().astimezone().isoformat()
3701
+
3702
+ # Start Live display early to capture all logs (including URL processing)
3703
+ # This ensures all INFO+ logs go to the panel instead of console
3704
+ batch.start_live_display(
3705
+ verbose=verbose,
3706
+ console_handler_id=console_handler_id,
3707
+ total_files=len(files),
3708
+ total_urls=len(url_entries_from_files),
3709
+ )
3710
+
3711
+ # Pre-convert legacy Office files using batch COM (Windows only)
3712
+ # This reduces overhead by starting each Office app only once
3713
+ legacy_suffixes = {".doc", ".ppt", ".xls"}
3714
+ legacy_files = [f for f in files if f.suffix.lower() in legacy_suffixes]
3715
+ preconverted_map: dict[Path, Path] = {}
3716
+ preconvert_temp_dir: tempfile.TemporaryDirectory | None = None
3717
+
3718
+ if legacy_files:
3719
+ import platform
3720
+
3721
+ if platform.system() == "Windows":
3722
+ from markitai.converter.legacy import batch_convert_legacy_files
3723
+
3724
+ # Create temp directory for pre-converted files
3725
+ preconvert_temp_dir = tempfile.TemporaryDirectory(
3726
+ prefix="markitai_preconv_"
3727
+ )
3728
+ preconvert_path = Path(preconvert_temp_dir.name)
3729
+
3730
+ logger.info(f"Pre-converting {len(legacy_files)} legacy files...")
3731
+ preconverted_map = batch_convert_legacy_files(legacy_files, preconvert_path)
3732
+ if preconverted_map:
3733
+ logger.info(
3734
+ f"Pre-converted {len(preconverted_map)}/{len(legacy_files)} files with MS Office COM"
3735
+ )
3736
+
3737
+ # Create shared LLM runtime and processor for batch mode
3738
+ shared_processor = None
3739
+ if cfg.llm.enabled:
3740
+ from markitai.llm import LLMRuntime
3741
+
3742
+ runtime = LLMRuntime(concurrency=cfg.llm.concurrency)
3743
+ # Use output directory's parent as project dir for project-level cache
3744
+ project_dir = output_dir.parent if output_dir else Path.cwd()
3745
+ shared_processor = create_llm_processor(
3746
+ cfg, project_dir=project_dir, runtime=runtime
3747
+ )
3748
+ logger.info(
3749
+ f"Created shared LLMProcessor with concurrency={cfg.llm.concurrency}"
3750
+ )
3751
+
3752
+ # Create process_file using workflow/core implementation
3753
+ process_file = _create_process_file(
3754
+ cfg=cfg,
3755
+ input_dir=input_dir,
3756
+ output_dir=output_dir,
3757
+ preconverted_map=preconverted_map,
3758
+ shared_processor=shared_processor,
3759
+ )
3760
+ logger.debug("Using workflow/core implementation for batch processing")
3761
+
3762
+ # Initialize state for URL tracking
3763
+ from markitai.batch import FileStatus, UrlState
3764
+
3765
+ # Group URL entries by source file and collect source file list
3766
+ url_sources_set: set[str] = set()
3767
+ if url_entries_from_files:
3768
+ for source_file, _entry in url_entries_from_files:
3769
+ url_sources_set.add(str(source_file))
3770
+
3771
+ # Initialize batch state with files
3772
+ if files or url_entries_from_files:
3773
+ batch.state = batch.init_state(
3774
+ input_dir=input_dir,
3775
+ files=files,
3776
+ options=task_options,
3777
+ started_at=batch_started_at,
3778
+ )
3779
+ # Add URL source files to state
3780
+ batch.state.url_sources = list(url_sources_set)
3781
+
3782
+ # Initialize URL states in batch state
3783
+ for source_file, entry in url_entries_from_files:
3784
+ batch.state.urls[entry.url] = UrlState(
3785
+ url=entry.url,
3786
+ source_file=str(source_file),
3787
+ status=FileStatus.PENDING,
3788
+ )
3789
+
3790
+ # Create URL processor function
3791
+ url_processor = None
3792
+ if url_entries_from_files:
3793
+ url_processor = _create_url_processor(
3794
+ cfg=cfg,
3795
+ output_dir=output_dir,
3796
+ fetch_strategy=fetch_strategy,
3797
+ explicit_fetch_strategy=explicit_fetch_strategy,
3798
+ shared_processor=shared_processor,
3799
+ )
3800
+
3801
+ # Create separate semaphores for file and URL processing
3802
+ # This allows file processing and URL fetching to run at their own concurrency levels
3803
+ file_semaphore = asyncio.Semaphore(cfg.batch.concurrency)
3804
+ url_semaphore = asyncio.Semaphore(cfg.batch.url_concurrency)
3805
+
3806
async def process_url_with_state(
    url: str,
    source_file: Path,
    custom_name: str | None,
) -> None:
    """Process one URL and record the outcome in the batch state.

    Looks up the state entry registered for ``url`` in ``batch.state.urls``,
    marks it in progress, runs ``url_processor`` while holding
    ``url_semaphore``, and copies the result fields (or the error) onto the
    entry. Progress is advanced and the state is saved whether the URL
    succeeded or failed.

    Args:
        url: URL to process; also the key into ``batch.state.urls``.
        source_file: URL list file the entry was read from; forwarded to
            ``url_processor``.
        custom_name: Optional output name taken from the URL list entry;
            forwarded to ``url_processor``.

    Exceptions raised by ``url_processor`` are caught, logged and stored on
    the state entry instead of being propagated.
    """
    assert batch.state is not None
    assert url_processor is not None

    url_state = batch.state.urls.get(url)
    if url_state is None:
        # The URL has no entry in the batch state, so there is nothing to track.
        return

    # NOTE(review): status/started_at are set before the semaphore is
    # acquired, so time spent queued counts toward `duration` - confirm
    # that this is intended.
    url_state.status = FileStatus.IN_PROGRESS
    url_state.started_at = datetime.now().astimezone().isoformat()

    # We are inside a coroutine, so the running loop is the right clock source.
    loop = asyncio.get_running_loop()
    start_time = loop.time()

    try:
        async with url_semaphore:
            result, extra_info = await url_processor(url, source_file, custom_name)

        if result.success:
            url_state.status = FileStatus.COMPLETED
            url_state.output = result.output_path
            url_state.fetch_strategy = extra_info.get("fetch_strategy")
            url_state.images = result.images
            url_state.cost_usd = result.cost_usd
            url_state.llm_usage = result.llm_usage
            url_state.cache_hit = result.cache_hit
            # Collect image analysis for the aggregated JSON output.
            if result.image_analysis_result is not None:
                batch.image_analysis_results.append(result.image_analysis_result)
        else:
            url_state.status = FileStatus.FAILED
            url_state.error = result.error

    except Exception as e:
        # Some exceptions (e.g. a bare TimeoutError) stringify to "", which
        # would record a failure without any reason; fall back to the
        # exception class name in that case.
        reason = str(e) or type(e).__name__
        url_state.status = FileStatus.FAILED
        url_state.error = reason
        logger.error(f"[URL] Failed {url}: {reason}")

    finally:
        end_time = loop.time()
        url_state.completed_at = datetime.now().astimezone().isoformat()
        url_state.duration = end_time - start_time

        # Update progress for this URL regardless of the outcome.
        batch.update_url_status(url, completed=True)

        # Persist state in a worker thread so the event loop is not blocked.
        await asyncio.to_thread(batch.save_state)
3859
+
3860
async def process_file_with_state(file_path: Path) -> None:
    """Process one file and record the outcome in the batch state.

    Looks up the state entry stored under ``str(file_path)`` in
    ``batch.state.files``, marks it in progress, runs ``process_file`` while
    holding ``file_semaphore``, and copies the result fields (or the error)
    onto the entry. Progress is advanced and the state is saved whether the
    file succeeded or failed.

    Args:
        file_path: Path of the file to process.

    Exceptions raised by ``process_file`` are caught, logged and stored on
    the state entry instead of being propagated.
    """
    assert batch.state is not None

    file_state = batch.state.files.get(str(file_path))
    if file_state is None:
        # The file has no entry in the batch state, so there is nothing to track.
        return

    # NOTE(review): status/started_at are set before the semaphore is
    # acquired, so time spent queued counts toward `duration` - confirm
    # that this is intended.
    file_state.status = FileStatus.IN_PROGRESS
    file_state.started_at = datetime.now().astimezone().isoformat()

    # We are inside a coroutine, so the running loop is the right clock source.
    loop = asyncio.get_running_loop()
    start_time = loop.time()

    try:
        async with file_semaphore:
            result = await process_file(file_path)

        if result.success:
            file_state.status = FileStatus.COMPLETED
            file_state.output = result.output_path
            file_state.images = result.images
            file_state.screenshots = result.screenshots
            file_state.cost_usd = result.cost_usd
            file_state.llm_usage = result.llm_usage
            file_state.cache_hit = result.cache_hit
            # Collect image analysis for the aggregated JSON output.
            if result.image_analysis_result is not None:
                batch.image_analysis_results.append(result.image_analysis_result)
        else:
            file_state.status = FileStatus.FAILED
            file_state.error = result.error

    except Exception as e:
        # Some exceptions (e.g. a bare TimeoutError) stringify to "", which
        # would record a failure without any reason; fall back to the
        # exception class name in that case.
        reason = str(e) or type(e).__name__
        file_state.status = FileStatus.FAILED
        file_state.error = reason
        logger.error(f"[FAIL] {file_path.name}: {reason}")

    finally:
        end_time = loop.time()
        file_state.completed_at = datetime.now().astimezone().isoformat()
        file_state.duration = end_time - start_time

        # Update progress regardless of the outcome.
        batch.advance_progress()

        # Persist state in a worker thread so the event loop is not blocked.
        await asyncio.to_thread(batch.save_state)
3910
+
3911
+ # Run all tasks in parallel (URLs + files)
3912
+ state = batch.state
3913
+ try:
3914
+ if files or url_entries_from_files:
3915
+ # Build task list
3916
+ all_tasks = []
3917
+
3918
+ # Add URL tasks
3919
+ for source_file, entry in url_entries_from_files:
3920
+ all_tasks.append(
3921
+ process_url_with_state(entry.url, source_file, entry.output_name)
3922
+ )
3923
+
3924
+ # Add file tasks
3925
+ for file_path in files:
3926
+ all_tasks.append(process_file_with_state(file_path))
3927
+
3928
+ if all_tasks:
3929
+ logger.info(
3930
+ f"Processing {len(files)} files and {len(url_entries_from_files)} URLs "
3931
+ f"with concurrency {cfg.batch.concurrency}"
3932
+ )
3933
+
3934
+ # Run all tasks in parallel
3935
+ await asyncio.gather(*all_tasks, return_exceptions=True)
3936
+
3937
+ finally:
3938
+ # Stop Live display and restore console handler
3939
+ # This must be done before printing summary
3940
+ batch.stop_live_display()
3941
+
3942
+ # Clean up pre-conversion temp directory
3943
+ if preconvert_temp_dir is not None:
3944
+ preconvert_temp_dir.cleanup()
3945
+
3946
+ if state:
3947
+ # Update state timestamp
3948
+ state.updated_at = datetime.now().astimezone().isoformat()
3949
+ batch.save_state(force=True)
3950
+
3951
+ # Print summary (uses state for URL stats)
3952
+ batch.print_summary(
3953
+ url_completed=state.completed_urls_count,
3954
+ url_failed=state.failed_urls_count,
3955
+ url_cache_hits=sum(
3956
+ 1
3957
+ for u in state.urls.values()
3958
+ if u.status == FileStatus.COMPLETED and u.cache_hit
3959
+ ),
3960
+ url_sources=len(state.url_sources),
3961
+ )
3962
+
3963
+ # Write aggregated image analysis JSON (if any)
3964
+ if batch.image_analysis_results and cfg.image.desc_enabled:
3965
+ write_images_json(output_dir, batch.image_analysis_results)
3966
+
3967
+ # Save report (logging is done inside save_report)
3968
+ batch.save_report()
3969
+
3970
+ # Exit with appropriate code
3971
+ total_failed = (state.failed_count if state else 0) + (
3972
+ state.failed_urls_count if state else 0
3973
+ )
3974
+ if total_failed > 0:
3975
+ raise SystemExit(10) # PARTIAL_FAILURE
3976
+
3977
+
3978
+ if __name__ == "__main__":
3979
+ app()