markitai-0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. markitai/__init__.py +3 -0
  2. markitai/batch.py +1316 -0
  3. markitai/cli.py +3979 -0
  4. markitai/config.py +602 -0
  5. markitai/config.schema.json +748 -0
  6. markitai/constants.py +222 -0
  7. markitai/converter/__init__.py +49 -0
  8. markitai/converter/_patches.py +98 -0
  9. markitai/converter/base.py +164 -0
  10. markitai/converter/image.py +181 -0
  11. markitai/converter/legacy.py +606 -0
  12. markitai/converter/office.py +526 -0
  13. markitai/converter/pdf.py +679 -0
  14. markitai/converter/text.py +63 -0
  15. markitai/fetch.py +1725 -0
  16. markitai/image.py +1335 -0
  17. markitai/json_order.py +550 -0
  18. markitai/llm.py +4339 -0
  19. markitai/ocr.py +347 -0
  20. markitai/prompts/__init__.py +159 -0
  21. markitai/prompts/cleaner.md +93 -0
  22. markitai/prompts/document_enhance.md +77 -0
  23. markitai/prompts/document_enhance_complete.md +65 -0
  24. markitai/prompts/document_process.md +60 -0
  25. markitai/prompts/frontmatter.md +28 -0
  26. markitai/prompts/image_analysis.md +21 -0
  27. markitai/prompts/image_caption.md +8 -0
  28. markitai/prompts/image_description.md +13 -0
  29. markitai/prompts/page_content.md +17 -0
  30. markitai/prompts/url_enhance.md +78 -0
  31. markitai/security.py +286 -0
  32. markitai/types.py +30 -0
  33. markitai/urls.py +187 -0
  34. markitai/utils/__init__.py +33 -0
  35. markitai/utils/executor.py +69 -0
  36. markitai/utils/mime.py +85 -0
  37. markitai/utils/office.py +262 -0
  38. markitai/utils/output.py +53 -0
  39. markitai/utils/paths.py +81 -0
  40. markitai/utils/text.py +359 -0
  41. markitai/workflow/__init__.py +37 -0
  42. markitai/workflow/core.py +760 -0
  43. markitai/workflow/helpers.py +509 -0
  44. markitai/workflow/single.py +369 -0
  45. markitai-0.3.0.dist-info/METADATA +159 -0
  46. markitai-0.3.0.dist-info/RECORD +48 -0
  47. markitai-0.3.0.dist-info/WHEEL +4 -0
  48. markitai-0.3.0.dist-info/entry_points.txt +2 -0
markitai/batch.py ADDED
@@ -0,0 +1,1316 @@
+ """Batch processing module with resume capability."""
+
+ from __future__ import annotations
+
+ import asyncio
+ import json
+ from collections import deque
+ from collections.abc import Callable, Coroutine
+ from dataclasses import dataclass, field
+ from datetime import datetime
+ from enum import Enum
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any
+
+ from loguru import logger
+ from rich.console import Console, Group
+ from rich.live import Live
+ from rich.panel import Panel
+ from rich.progress import (
+     BarColumn,
+     Progress,
+     SpinnerColumn,
+     TaskID,
+     TaskProgressColumn,
+     TextColumn,
+     TimeElapsedColumn,
+ )
+ from rich.table import Table
+ from rich.text import Text
+
+ from markitai.constants import DEFAULT_LOG_PANEL_MAX_LINES
+ from markitai.json_order import order_report, order_state
+ from markitai.security import atomic_write_json
+
+ if TYPE_CHECKING:
+     from markitai.config import BatchConfig
+     from markitai.workflow.single import ImageAnalysisResult
+
+
+ class FileStatus(str, Enum):
+     """Status of a file in batch processing.
+
+     State transitions:
+         PENDING -> IN_PROGRESS -> COMPLETED
+                                -> FAILED
+
+     On resume: IN_PROGRESS files are treated as FAILED (re-processed).
+     """
+
+     PENDING = "pending"
+     IN_PROGRESS = "in_progress"
+     COMPLETED = "completed"
+     FAILED = "failed"
+
+
+ @dataclass
+ class FileState:
+     """State of a single file in batch processing.
+
+     Attributes:
+         path: Path to the source file (serialized relative to input_dir)
+         status: Current processing status
+         output: Relative path to output .md file from output_dir
+         error: Error message if status is FAILED
+         started_at: ISO timestamp when processing started
+         completed_at: ISO timestamp when processing completed
+         duration: Total processing time in seconds
+         images: Count of embedded images extracted from document content
+         screenshots: Count of page/slide screenshots rendered for OCR/LLM
+         cost_usd: Total LLM API cost for this file
+         llm_usage: Per-model usage stats {model: {requests, input_tokens, output_tokens, cost_usd}}
+         cache_hit: Whether LLM results were served from cache (no API calls made)
+     """
+
+     path: str
+     status: FileStatus = FileStatus.PENDING
+     output: str | None = None
+     error: str | None = None
+     started_at: str | None = None
+     completed_at: str | None = None
+     duration: float | None = None
+     images: int = 0
+     screenshots: int = 0
+     cost_usd: float = 0.0
+     llm_usage: dict[str, dict[str, Any]] = field(default_factory=dict)
+     cache_hit: bool = False
+
+
+ @dataclass
+ class UrlState:
+     """State of a single URL in batch processing.
+
+     Attributes:
+         url: The URL being processed
+         source_file: Path to the .urls file containing this URL
+         status: Current processing status
+         output: Relative path to output .md file from output_dir
+         error: Error message if status is FAILED
+         fetch_strategy: The fetch strategy that was used (static/browser/jina)
+         images: Count of images downloaded from the URL
+         started_at: ISO timestamp when processing started
+         completed_at: ISO timestamp when processing completed
+         duration: Total processing time in seconds
+         cost_usd: Total LLM API cost for this URL
+         llm_usage: Per-model usage stats {model: {requests, input_tokens, output_tokens, cost_usd}}
+         cache_hit: Whether LLM results were served from cache (no API calls made)
+     """
+
+     url: str
+     source_file: str
+     status: FileStatus = FileStatus.PENDING
+     output: str | None = None
+     error: str | None = None
+     fetch_strategy: str | None = None
+     images: int = 0
+     started_at: str | None = None
+     completed_at: str | None = None
+     duration: float | None = None
+     cost_usd: float = 0.0
+     llm_usage: dict[str, dict[str, Any]] = field(default_factory=dict)
+     cache_hit: bool = False
+
+
+ @dataclass
+ class BatchState:
+     """State of batch processing for resume capability."""
+
+     version: str = "1.0"
+     started_at: str = ""
+     updated_at: str = ""
+     input_dir: str = ""
+     output_dir: str = ""
+     log_file: str | None = None  # Path to log file for this run
+     options: dict = field(default_factory=dict)
+     files: dict[str, FileState] = field(default_factory=dict)
+     urls: dict[str, UrlState] = field(default_factory=dict)  # key: URL string
+     url_sources: list[str] = field(default_factory=list)  # .urls file paths
+
+     @property
+     def total(self) -> int:
+         """Total number of files."""
+         return len(self.files)
+
+     @property
+     def total_urls(self) -> int:
+         """Total number of URLs."""
+         return len(self.urls)
+
+     @property
+     def completed_count(self) -> int:
+         """Number of completed files."""
+         return sum(1 for f in self.files.values() if f.status == FileStatus.COMPLETED)
+
+     @property
+     def completed_urls_count(self) -> int:
+         """Number of completed URLs."""
+         return sum(1 for u in self.urls.values() if u.status == FileStatus.COMPLETED)
+
+     @property
+     def failed_count(self) -> int:
+         """Number of failed files."""
+         return sum(1 for f in self.files.values() if f.status == FileStatus.FAILED)
+
+     @property
+     def failed_urls_count(self) -> int:
+         """Number of failed URLs."""
+         return sum(1 for u in self.urls.values() if u.status == FileStatus.FAILED)
+
+     @property
+     def pending_count(self) -> int:
+         """Number of files still needing processing (pending, failed, or interrupted)."""
+         return sum(
+             1
+             for f in self.files.values()
+             if f.status
+             in (FileStatus.PENDING, FileStatus.IN_PROGRESS, FileStatus.FAILED)
+         )
+
+     @property
+     def pending_urls_count(self) -> int:
+         """Number of URLs still needing processing (pending, failed, or interrupted)."""
+         return sum(
+             1
+             for u in self.urls.values()
+             if u.status
+             in (FileStatus.PENDING, FileStatus.IN_PROGRESS, FileStatus.FAILED)
+         )
+
+     def get_pending_files(self) -> list[Path]:
+         """Get list of files that need processing.
+
+         Per the FileStatus contract, IN_PROGRESS entries (left over from an
+         interrupted run) are re-processed alongside PENDING and FAILED ones.
+         """
+         return [
+             Path(f.path)
+             for f in self.files.values()
+             if f.status
+             in (FileStatus.PENDING, FileStatus.IN_PROGRESS, FileStatus.FAILED)
+         ]
+
+     def get_pending_urls(self) -> list[str]:
+         """Get list of URLs that need processing."""
+         return [
+             u.url
+             for u in self.urls.values()
+             if u.status
+             in (FileStatus.PENDING, FileStatus.IN_PROGRESS, FileStatus.FAILED)
+         ]
+
+     def to_dict(self) -> dict[str, Any]:
+         """Convert to dictionary for JSON serialization.
+
+         Note: input_dir/output_dir are stored in options with absolute paths.
+         Files keys are stored as relative paths (relative to input_dir).
+         """
+         # Convert log_file to absolute path if it exists
+         log_file_abs = None
+         if self.log_file:
+             log_path = Path(self.log_file)
+             log_file_abs = (
+                 str(log_path.resolve()) if log_path.exists() else self.log_file
+             )
+
+         # Convert files keys to relative paths (relative to input_dir)
+         input_dir_path = Path(self.input_dir).resolve()
+         files_dict = {}
+         for path, state in self.files.items():
+             file_path = Path(path).resolve()
+             try:
+                 rel_path = str(file_path.relative_to(input_dir_path))
+             except ValueError:
+                 # File is not under input_dir, use filename only
+                 rel_path = file_path.name
+             files_dict[rel_path] = {
+                 "status": state.status.value,
+                 "output": state.output,
+                 "error": state.error,
+                 "started_at": state.started_at,
+                 "completed_at": state.completed_at,
+                 "duration": state.duration,
+                 "images": state.images,
+                 "screenshots": state.screenshots,
+                 "cost_usd": state.cost_usd,
+                 "llm_usage": state.llm_usage,
+                 "cache_hit": state.cache_hit,
+             }
+
+         # Convert URLs to dict
+         urls_dict = {}
+         for url, state in self.urls.items():
+             urls_dict[url] = {
+                 "source_file": state.source_file,
+                 "status": state.status.value,
+                 "output": state.output,
+                 "error": state.error,
+                 "fetch_strategy": state.fetch_strategy,
+                 "images": state.images,
+                 "started_at": state.started_at,
+                 "completed_at": state.completed_at,
+                 "duration": state.duration,
+                 "cost_usd": state.cost_usd,
+                 "llm_usage": state.llm_usage,
+                 "cache_hit": state.cache_hit,
+             }
+
+         return {
+             "version": self.version,
+             "started_at": self.started_at,
+             "updated_at": self.updated_at,
+             "log_file": log_file_abs,
+             "options": self.options,
+             "documents": files_dict,
+             "urls": urls_dict,
+             "url_sources": self.url_sources,
+         }
+
+     def to_minimal_dict(self) -> dict[str, Any]:
+         """Convert to minimal dictionary for state file (resume capability).
+
+         Only includes fields necessary for determining what needs to be reprocessed:
+         - version: For compatibility checking
+         - options: input_dir/output_dir needed to resolve paths
+         - documents: status + output (completed) or error (failed)
+         - urls: status + output + source_file
+         """
+         # Convert files keys to relative paths (relative to input_dir)
+         input_dir_path = Path(self.input_dir).resolve()
+         files_dict = {}
+         for path, state in self.files.items():
+             file_path = Path(path).resolve()
+             try:
+                 rel_path = str(file_path.relative_to(input_dir_path))
+             except ValueError:
+                 rel_path = file_path.name
+
+             # Minimal state: only what's needed for resume
+             entry: dict[str, Any] = {"status": state.status.value}
+             if state.status == FileStatus.COMPLETED and state.output:
+                 entry["output"] = state.output
+             elif state.status == FileStatus.FAILED and state.error:
+                 entry["error"] = state.error
+             files_dict[rel_path] = entry
+
+         # Convert URLs to minimal dict
+         urls_dict = {}
+         for url, state in self.urls.items():
+             url_entry: dict[str, Any] = {
+                 "status": state.status.value,
+                 "source_file": state.source_file,
+             }
+             if state.status == FileStatus.COMPLETED and state.output:
+                 url_entry["output"] = state.output
+             elif state.status == FileStatus.FAILED and state.error:
+                 url_entry["error"] = state.error
+             urls_dict[url] = url_entry
+
+         return {
+             "version": self.version,
+             "options": self.options,
+             "documents": files_dict,
+             "urls": urls_dict,
+         }
+
+     @classmethod
+     def from_dict(cls, data: dict[str, Any]) -> BatchState:
+         """Create from dictionary."""
+         options = data.get("options", {})
+
+         # Get input_dir/output_dir from options
+         input_dir = options.get("input_dir", "")
+         output_dir = options.get("output_dir", "")
+
+         state = cls(
+             version=data.get("version", "1.0"),
+             started_at=data.get("started_at", ""),
+             updated_at=data.get("updated_at", ""),
+             input_dir=input_dir,
+             output_dir=output_dir,
+             log_file=data.get("log_file"),
+             options=options,
+             url_sources=data.get("url_sources", []),
+         )
+
+         documents_data = data.get("documents", {})
+
+         # Reconstruct absolute file paths from relative paths
+         input_dir_path = Path(input_dir) if input_dir else Path(".")
+         for path, file_data in documents_data.items():
+             # If path is relative, make it absolute relative to input_dir
+             file_path = Path(path)
+             if not file_path.is_absolute():
+                 abs_path = str(input_dir_path / path)
+             else:
+                 abs_path = path
+
+             state.files[abs_path] = FileState(
+                 path=abs_path,
+                 status=FileStatus(file_data.get("status", "pending")),
+                 output=file_data.get("output"),
+                 error=file_data.get("error"),
+                 started_at=file_data.get("started_at"),
+                 completed_at=file_data.get("completed_at"),
+                 duration=file_data.get("duration"),
+                 images=file_data.get("images", 0),
+                 screenshots=file_data.get("screenshots", 0),
+                 cost_usd=file_data.get("cost_usd", 0.0),
+                 llm_usage=file_data.get("llm_usage", {}),
+                 cache_hit=file_data.get("cache_hit", False),
+             )
+
+         # Reconstruct URL states
+         for url, url_data in data.get("urls", {}).items():
+             state.urls[url] = UrlState(
+                 url=url,
+                 source_file=url_data.get("source_file", ""),
+                 status=FileStatus(url_data.get("status", "pending")),
+                 output=url_data.get("output"),
+                 error=url_data.get("error"),
+                 fetch_strategy=url_data.get("fetch_strategy"),
+                 images=url_data.get("images", 0),
+                 started_at=url_data.get("started_at"),
+                 completed_at=url_data.get("completed_at"),
+                 duration=url_data.get("duration"),
+                 cost_usd=url_data.get("cost_usd", 0.0),
+                 llm_usage=url_data.get("llm_usage", {}),
+                 cache_hit=url_data.get("cache_hit", False),
+             )
+
+         return state
+
+
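The full/minimal split above is what keeps crash recovery cheap: to_minimal_dict() persists only status plus output or error, while from_dict() rebuilds absolute file keys by joining the relative keys onto options["input_dir"]. A round-trip sketch (hypothetical POSIX paths, assuming the module is importable as markitai.batch):

    from markitai.batch import BatchState, FileState, FileStatus

    state = BatchState(
        input_dir="/data/in",
        output_dir="/data/out",
        options={"input_dir": "/data/in", "output_dir": "/data/out"},
    )
    state.files["/data/in/a.pdf"] = FileState(
        path="/data/in/a.pdf", status=FileStatus.COMPLETED, output="a.md"
    )

    # Serialize minimally, then rebuild; the relative key "a.pdf" is
    # re-anchored onto options["input_dir"] during from_dict().
    restored = BatchState.from_dict(state.to_minimal_dict())
    assert restored.files["/data/in/a.pdf"].status is FileStatus.COMPLETED
    assert restored.files["/data/in/a.pdf"].output == "a.md"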
+ @dataclass
+ class ProcessResult:
+     """Result of processing a single file.
+
+     Attributes:
+         success: Whether processing completed without errors
+         output_path: Path to generated .md file (None if failed)
+         error: Error message if success is False
+         images: Count of embedded images extracted from document
+         screenshots: Count of page/slide screenshots for OCR/LLM
+         cost_usd: Total LLM API cost for this file
+         llm_usage: Per-model usage {model: {requests, input_tokens, output_tokens, cost_usd}}
+         image_analysis_result: Aggregated image analysis for JSON output (None if disabled)
+         cache_hit: Whether LLM results were served entirely from cache
+     """
+
+     success: bool
+     output_path: str | None = None
+     error: str | None = None
+     images: int = 0
+     screenshots: int = 0
+     cost_usd: float = 0.0
+     llm_usage: dict[str, dict[str, Any]] = field(default_factory=dict)
+     image_analysis_result: ImageAnalysisResult | None = None
+     cache_hit: bool = False
+
+
+ # Type alias for process function
+ ProcessFunc = Callable[[Path], Coroutine[Any, Any, ProcessResult]]
+
+
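Any coroutine with this shape can drive the batch loop. A toy conforming implementation, purely for illustration (the real converters live in markitai.workflow):

    from pathlib import Path

    async def convert_one(path: Path) -> ProcessResult:
        # Stand-in "conversion": copy the text through to a sibling .md file.
        try:
            out = path.with_suffix(".md")
            out.write_text(path.read_text(errors="ignore"), encoding="utf-8")
            return ProcessResult(success=True, output_path=str(out))
        except Exception as e:
            return ProcessResult(success=False, error=str(e))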
+ class LogPanel:
+     """Log panel for verbose mode, displays scrolling log messages."""
+
+     def __init__(self, max_lines: int = DEFAULT_LOG_PANEL_MAX_LINES):
+         self.logs: deque[str] = deque(maxlen=max_lines)
+
+     def add(self, message: str) -> None:
+         """Add a log message to the panel."""
+         timestamp = datetime.now().strftime("%H:%M:%S")
+         self.logs.append(f"{timestamp} | {message}")
+
+     def __rich__(self) -> Panel:
+         """Render the log panel."""
+         content = "\n".join(self.logs) if self.logs else "(waiting for logs...)"
+         # Use Text object to prevent markup parsing (paths like [/foo/bar] would be misinterpreted)
+         return Panel(Text(content), title="Logs", border_style="dim")
+
+
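Since __rich__() builds a fresh Panel on every refresh, the deque can be fed from a loguru sink while Rich's Live repaints it. A standalone sketch of that wiring (assumed usage, not package code):

    from loguru import logger
    from rich.live import Live

    panel = LogPanel(max_lines=8)
    sink_id = logger.add(lambda m: panel.add(m.record["message"]), level="INFO")
    with Live(panel, refresh_per_second=4):
        logger.info("hello from the panel")
    logger.remove(sink_id)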
+ class BatchProcessor:
+     """Batch processor with concurrent execution and progress display."""
+
+     def __init__(
+         self,
+         config: BatchConfig,
+         output_dir: Path,
+         input_path: Path | None = None,
+         log_file: Path | str | None = None,
+         on_conflict: str = "rename",
+         task_options: dict[str, Any] | None = None,
+     ) -> None:
+         """
+         Initialize batch processor.
+
+         Args:
+             config: Batch processing configuration
+             output_dir: Output directory
+             input_path: Input file or directory (used for report file naming)
+             log_file: Path to the log file for this run
+             on_conflict: Conflict resolution strategy ("skip", "overwrite", "rename")
+             task_options: Task options dict (used for computing task hash)
+         """
+         self.config = config
+         self.output_dir = Path(output_dir)
+         self.input_path = Path(input_path) if input_path else None
+         self.log_file = str(log_file) if log_file else None
+         self.on_conflict = on_conflict
+         self.task_options = task_options or {}
+         self.task_hash = self._compute_task_hash()
+         self.state_file = self._get_state_file_path()
+         self.report_file = self._get_report_file_path()
+         self.state: BatchState | None = None
+         self.console = Console()
+         # Collect image analysis results for JSON aggregation
+         self.image_analysis_results: list[ImageAnalysisResult] = []
+
+         # Live display state (managed by start_live_display/stop_live_display)
+         self._live: Live | None = None
+         self._log_panel: LogPanel | None = None
+         self._panel_handler_id: int | None = None
+         self._console_handler_id: int | None = None
+         self._verbose: bool = False
+         self._progress: Progress | None = None
+         self._overall_task_id: TaskID | None = None
+         self._url_task_id: TaskID | None = None
+         self._total_urls: int = 0
+         self._total_files: int = 0
+         self._completed_urls: int = 0
+         self._completed_files: int = 0
+
+     def _compute_task_hash(self) -> str:
+         """Compute hash from task input parameters.
+
+         Hash is based on:
+         - input_path (resolved)
+         - output_dir (resolved)
+         - key task options (llm_enabled, ocr_enabled, etc.)
+
+         This ensures different parameter combinations produce different hashes,
+         so resuming with different options creates a new state file.
+         """
+         import hashlib
+
+         # Extract key options that affect output (exclude paths, they're added separately)
+         key_options = {
+             k: v
+             for k, v in self.task_options.items()
+             if k
+             in (
+                 "llm_enabled",
+                 "ocr_enabled",
+                 "screenshot_enabled",
+                 "image_alt_enabled",
+                 "image_desc_enabled",
+             )
+         }
+
+         hash_params = {
+             "input": str(self.input_path.resolve()) if self.input_path else "",
+             "output": str(self.output_dir.resolve()),
+             "options": key_options,
+         }
+         hash_str = json.dumps(hash_params, sort_keys=True)
+         return hashlib.md5(hash_str.encode()).hexdigest()[:6]
+
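The hash is short and non-cryptographic by design: it only namespaces state and report files per (input, output, feature-flag) combination. It can be reproduced outside the class like so (illustrative values):

    import hashlib
    import json

    params = {
        "input": "/data/in",
        "output": "/data/out",
        "options": {"llm_enabled": True, "ocr_enabled": False},
    }
    task_hash = hashlib.md5(
        json.dumps(params, sort_keys=True).encode()
    ).hexdigest()[:6]
    # Flipping any flag or path changes the digest, so a rerun with
    # different options gets its own state file (6 hex chars suffice
    # in practice for this namespacing).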
+     def _get_state_file_path(self) -> Path:
+         """Generate state file path for resume capability.
+
+         Format: states/markitai.<hash>.state.json
+         """
+         states_dir = self.output_dir / "states"
+         return states_dir / f"markitai.{self.task_hash}.state.json"
+
+     def _get_report_file_path(self) -> Path:
+         """Generate report file path based on task hash.
+
+         Format: reports/markitai.<hash>.report.json
+         Respects on_conflict strategy for rename.
+         """
+         reports_dir = self.output_dir / "reports"
+         base_path = reports_dir / f"markitai.{self.task_hash}.report.json"
+
+         if not base_path.exists():
+             return base_path
+
+         if self.on_conflict == "skip":
+             return base_path  # Will be handled by caller
+         elif self.on_conflict == "overwrite":
+             return base_path
+         else:  # rename
+             seq = 2
+             while True:
+                 new_path = reports_dir / f"markitai.{self.task_hash}.v{seq}.report.json"
+                 if not new_path.exists():
+                     return new_path
+                 seq += 1
+
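With on_conflict="rename", repeated runs of the same task therefore leave a versioned trail like the following (hash illustrative); the unsuffixed name is always the first run, and numbering starts at v2:

    reports/markitai.1a2b3c.report.json
    reports/markitai.1a2b3c.v2.report.json
    reports/markitai.1a2b3c.v3.report.json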
+     def start_live_display(
+         self,
+         verbose: bool = False,
+         console_handler_id: int | None = None,
+         total_files: int = 0,
+         total_urls: int = 0,
+     ) -> None:
+         """Start Live display with progress bar and optional log panel.
+
+         Call this before any processing (including pre-conversion) to capture
+         all logs in the panel instead of printing to console.
+
+         Args:
+             verbose: Whether to show log panel
+             console_handler_id: Loguru console handler ID to disable
+             total_files: Total number of files (for progress bar)
+             total_urls: Total number of URLs to process
+         """
+
+         self._verbose = verbose
+         self._console_handler_id = console_handler_id
+
+         # Create progress display
+         self._progress = Progress(
+             SpinnerColumn(),
+             TextColumn("[bold blue]{task.fields[filename]:<30}"),
+             BarColumn(),
+             TaskProgressColumn(),
+             TimeElapsedColumn(),
+         )
+
+         # Store totals for progress display
+         self._total_urls = total_urls
+         self._total_files = total_files
+         self._completed_urls = 0
+         self._completed_files = 0
+
+         # Add URL progress task if there are URLs to process
+         if total_urls > 0:
+             self._url_task_id = self._progress.add_task(
+                 "URLs",
+                 total=total_urls,
+                 filename=f"[URLs:0/{total_urls}]",
+             )
+
+         # Add file progress task (or overall if no URLs)
+         self._overall_task_id = self._progress.add_task(
+             "Overall",
+             total=total_files,
+             filename=f"[Files:0/{total_files}]"
+             if total_urls > 0
+             else "[Overall Progress]",
+         )
+
+         # Create log panel for verbose mode
+         if verbose:
+             self._log_panel = LogPanel()
+
+             def panel_sink(message: Any) -> None:
+                 """Sink function to write logs to the panel."""
+                 if self._log_panel is not None:
+                     self._log_panel.add(message.record["message"])
+
+             # Add a handler that writes to the log panel
+             self._panel_handler_id = logger.add(
+                 panel_sink,
+                 level="INFO",
+                 format="{message}",
+                 filter=lambda record: record["level"].no >= 20,  # INFO and above
+             )
+
+         # Disable console handler to avoid conflict with progress bar
+         if console_handler_id is not None:
+             try:
+                 logger.remove(console_handler_id)
+             except ValueError:
+                 pass  # Handler already removed
+
+         # Start Live display
+         if verbose and self._log_panel is not None:
+             display = Group(self._progress, self._log_panel)
+             self._live = Live(display, console=self.console, refresh_per_second=4)
+         else:
+             self._live = Live(
+                 self._progress, console=self.console, refresh_per_second=4
+             )
+
+         self._live.start()
+
+     def stop_live_display(self) -> None:
+         """Stop Live display and restore console handler."""
+         import sys
+
+         # Stop Live display
+         if self._live is not None:
+             self._live.stop()
+             self._live = None
+
+         # Remove panel handler if added
+         if self._panel_handler_id is not None:
+             try:
+                 logger.remove(self._panel_handler_id)
+             except ValueError:
+                 pass
+             self._panel_handler_id = None
+
+         # Re-add console handler (restore original state)
+         if self._console_handler_id is not None:
+             new_handler_id = logger.add(
+                 sys.stderr,
+                 level="DEBUG" if self._verbose else "INFO",
+                 format="<green>{time:HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{message}</cyan>",
+             )
+             self._restored_console_handler_id = new_handler_id
+             self._console_handler_id = None
+
+     def update_progress_total(self, total: int) -> None:
+         """Update progress bar total after file discovery."""
+         self._total_files = total
+         if self._progress is not None and self._overall_task_id is not None:
+             self._progress.update(self._overall_task_id, total=total)
+             # Update filename with new total
+             self._progress.update(
+                 self._overall_task_id,
+                 filename=f"[Files:{self._completed_files}/{total}]",
+             )
+
+     def advance_progress(self) -> None:
+         """Advance progress bar by one."""
+         if self._progress is not None and self._overall_task_id is not None:
+             self._completed_files += 1
+             self._progress.advance(self._overall_task_id)
+             # Update filename with current count
+             self._progress.update(
+                 self._overall_task_id,
+                 filename=f"[Files:{self._completed_files}/{self._total_files}]",
+             )
+
+     def update_url_status(self, url: str, completed: bool = False) -> None:
+         """Update URL processing status in progress display.
+
+         Args:
+             url: The URL being processed (displayed in progress bar)
+             completed: If True, advance the URL progress counter
+         """
+         if self._progress is not None and self._url_task_id is not None:
+             if completed:
+                 self._completed_urls += 1
+                 self._progress.advance(self._url_task_id)
+             # Update filename with current count
+             self._progress.update(
+                 self._url_task_id,
+                 filename=f"[URLs:{self._completed_urls}/{self._total_urls}]",
+             )
+
+     def finish_url_processing(self, completed: int, failed: int) -> None:
+         """Mark URL processing as complete.
+
+         Args:
+             completed: Number of URLs successfully processed
+             failed: Number of URLs that failed
+         """
+         if self._progress is not None and self._url_task_id is not None:
+             # Final status already shows count from update_url_status
+             pass
+
+     def discover_files(
+         self,
+         input_path: Path,
+         extensions: set[str],
+     ) -> list[Path]:
+         """
+         Discover files to process.
+
+         Args:
+             input_path: Input file or directory
+             extensions: Set of valid file extensions (e.g., {".docx", ".pdf"})
+
+         Returns:
+             List of file paths. Candidates that resolve outside the input
+             directory are skipped with a warning rather than raised on.
+         """
+         from markitai.security import validate_path_within_base
+
+         if input_path.is_file():
+             return [input_path]
+
+         input_resolved = input_path.resolve()
+         files: list[Path] = []
+         max_depth = max(0, self.config.scan_max_depth)
+         max_files = max(1, self.config.scan_max_files)
+
+         def should_include(path: Path) -> bool:
+             try:
+                 validate_path_within_base(path, input_resolved)
+             except ValueError:
+                 logger.warning(f"Skipping file outside input directory: {path}")
+                 return False
+             return path.is_file()
+
+         for ext in extensions:
+             # Search both lowercase and uppercase variants (Linux glob is case-sensitive)
+             ext_variants = [ext, ext.upper()]
+             candidates = []
+
+             for ext_variant in ext_variants:
+                 if max_depth == 0:
+                     candidates.extend(input_path.glob(f"*{ext_variant}"))
+                 else:
+                     # Use rglob for recursive search, then filter by depth
+                     for f in input_path.rglob(f"*{ext_variant}"):
+                         # Calculate relative depth
+                         try:
+                             rel_path = f.relative_to(input_path)
+                             depth = len(rel_path.parts) - 1  # -1 for filename itself
+                             if depth <= max_depth:
+                                 candidates.append(f)
+                         except ValueError:
+                             continue
+
+             for f in candidates:
+                 if len(files) >= max_files:
+                     logger.warning(
+                         f"Reached scan_max_files={max_files}, stopping file discovery"
+                     )
+                     return sorted(set(files))
+                 if should_include(f):
+                     files.append(f)
+
+         return sorted(set(files))
+
+     def load_state(self) -> BatchState | None:
+         """Load state from state file if exists (for resume capability)."""
+         from markitai.constants import MAX_STATE_FILE_SIZE
+         from markitai.security import validate_file_size
+
+         if not self.state_file.exists():
+             return None
+
+         try:
+             # Validate file size to prevent DoS
+             validate_file_size(self.state_file, MAX_STATE_FILE_SIZE)
+             data = json.loads(self.state_file.read_text(encoding="utf-8"))
+             return BatchState.from_dict(data)
+         except Exception as e:
+             logger.warning(f"Failed to load state file for resume: {e}")
+             return None
+
+     def save_state(self, force: bool = False, log: bool = False) -> None:
+         """Save current state to state file for resume capability.
+
+         State file is saved to: states/markitai.<hash>.state.json
+
+         Optimized with interval-based throttling:
+         - Checks interval BEFORE serialization to avoid unnecessary work
+         - Uses minimal serialization when possible
+
+         Args:
+             force: Force save even if interval hasn't passed
+             log: Whether to log the save operation
+         """
+         if self.state is None:
+             return
+
+         now = datetime.now().astimezone()
+         interval = getattr(self.config, "state_flush_interval_seconds", 0) or 0
+
+         # Check interval BEFORE any serialization work (optimization)
+         if not force and interval > 0:
+             last_saved = getattr(self, "_last_state_save", None)
+             if last_saved and (now - last_saved).total_seconds() < interval:
+                 return  # Skip: interval not passed, no work done
+
+         self.state.updated_at = now.isoformat()
+
+         # Build minimal state document (only what's needed for resume)
+         state_data = self.state.to_minimal_dict()
+
+         # Ensure states directory exists
+         self.state_file.parent.mkdir(parents=True, exist_ok=True)
+
+         atomic_write_json(self.state_file, state_data, order_func=order_state)
+         self._last_state_save = now
+
+         if log:
+             logger.info(f"State file saved: {self.state_file.resolve()}")
+
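Because the state path is derived from the task hash, a rerun with the same inputs finds the previous file automatically; the resume check then reduces to a sketch like:

    prev = proc.load_state()  # proc: a BatchProcessor built for the same task
    if prev is not None:
        remaining = prev.get_pending_files()  # PENDING, IN_PROGRESS, FAILED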
+     def _compute_summary(self) -> dict[str, Any]:
+         """Compute summary statistics for report."""
+         if self.state is None:
+             return {}
+
+         # Calculate wall-clock duration
+         wall_duration = 0.0
+         if self.state.started_at and self.state.updated_at:
+             try:
+                 start = datetime.fromisoformat(self.state.started_at)
+                 end = datetime.fromisoformat(self.state.updated_at)
+                 wall_duration = (end - start).total_seconds()
+             except ValueError:
+                 wall_duration = 0.0
+
+         # Calculate cumulative processing time (files + URLs)
+         file_duration = sum(f.duration or 0 for f in self.state.files.values())
+         url_duration = sum(u.duration or 0 for u in self.state.urls.values())
+         processing_time = file_duration + url_duration
+
+         # URL cache hits count
+         url_cache_hits = sum(
+             1
+             for u in self.state.urls.values()
+             if u.status == FileStatus.COMPLETED and u.cache_hit
+         )
+
+         return {
+             "total_documents": self.state.total,
+             "completed_documents": self.state.completed_count,
+             "failed_documents": self.state.failed_count,
+             "pending_documents": self.state.pending_count,
+             "total_urls": self.state.total_urls,
+             "completed_urls": self.state.completed_urls_count,
+             "failed_urls": self.state.failed_urls_count,
+             "pending_urls": self.state.pending_urls_count,
+             "url_cache_hits": url_cache_hits,
+             "url_sources": len(self.state.url_sources),
+             "duration": wall_duration,
+             "processing_time": processing_time,
+         }
+
+     def _compute_llm_usage(self) -> dict[str, Any]:
+         """Compute aggregated LLM usage statistics for report."""
+         if self.state is None:
+             return {}
+
+         # Aggregate LLM usage by model (files and URLs share one loop)
+         models_usage: dict[str, dict[str, Any]] = {}
+
+         for record in (*self.state.files.values(), *self.state.urls.values()):
+             for model, usage in record.llm_usage.items():
+                 if model not in models_usage:
+                     models_usage[model] = {
+                         "requests": 0,
+                         "input_tokens": 0,
+                         "output_tokens": 0,
+                         "cost_usd": 0.0,
+                     }
+                 models_usage[model]["requests"] += usage.get("requests", 0)
+                 models_usage[model]["input_tokens"] += usage.get("input_tokens", 0)
+                 models_usage[model]["output_tokens"] += usage.get("output_tokens", 0)
+                 models_usage[model]["cost_usd"] += usage.get("cost_usd", 0.0)
+
+         # Calculate totals (files + URLs)
+         total_cost = sum(f.cost_usd for f in self.state.files.values()) + sum(
+             u.cost_usd for u in self.state.urls.values()
+         )
+         input_tokens = sum(m["input_tokens"] for m in models_usage.values())
+         output_tokens = sum(m["output_tokens"] for m in models_usage.values())
+         requests = sum(m["requests"] for m in models_usage.values())
+
+         return {
+             "models": models_usage,
+             "requests": requests,
+             "input_tokens": input_tokens,
+             "output_tokens": output_tokens,
+             "cost_usd": total_cost,
+         }
+
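A representative llm_usage block in the final report, with purely illustrative numbers:

    {
        "models": {
            "gpt-4o-mini": {
                "requests": 12,
                "input_tokens": 48210,
                "output_tokens": 9105,
                "cost_usd": 0.0132
            }
        },
        "requests": 12,
        "input_tokens": 48210,
        "output_tokens": 9105,
        "cost_usd": 0.0132
    }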
+     def init_state(
+         self,
+         input_dir: Path,
+         files: list[Path],
+         options: dict[str, Any],
+         started_at: str | None = None,
+     ) -> BatchState:
+         """
+         Initialize a new batch state.
+
+         Args:
+             input_dir: Input directory
+             files: List of files to process
+             options: Processing options (will be updated with absolute paths)
+             started_at: ISO timestamp when processing started (defaults to now)
+
+         Returns:
+             New BatchState
+         """
+         # Resolve absolute paths
+         abs_input_dir = str(input_dir.resolve())
+         abs_output_dir = str(self.output_dir.resolve())
+         abs_log_file = None
+         if self.log_file:
+             log_path = Path(self.log_file)
+             abs_log_file = (
+                 str(log_path.resolve()) if log_path.exists() else self.log_file
+             )
+
+         # Update options with absolute paths
+         options["input_dir"] = abs_input_dir
+         options["output_dir"] = abs_output_dir
+
+         now = datetime.now().astimezone().isoformat()
+         state = BatchState(
+             started_at=started_at or now,
+             updated_at=now,
+             input_dir=abs_input_dir,
+             output_dir=abs_output_dir,
+             log_file=abs_log_file,
+             options=options,
+         )
+
+         for file_path in files:
+             state.files[str(file_path)] = FileState(path=str(file_path))
+
+         return state
+
+     async def process_batch(
+         self,
+         files: list[Path],
+         process_func: ProcessFunc,
+         resume: bool = False,
+         options: dict[str, Any] | None = None,
+         verbose: bool = False,
+         console_handler_id: int | None = None,
+         started_at: str | None = None,
+     ) -> BatchState:
+         """
+         Process files in batch with concurrency control.
+
+         Args:
+             files: List of files to process
+             process_func: Async function to process each file
+             resume: Whether to resume from previous state
+             options: Task options to record in report
+             verbose: Whether to show log panel during processing
+             console_handler_id: Loguru console handler ID for temporary disable
+                 (ignored if start_live_display was already called)
+             started_at: ISO timestamp when processing started (for accurate duration)
+
+         Returns:
+             Final batch state
+         """
+         # Use provided started_at or default to now
+         actual_started_at = started_at or datetime.now().astimezone().isoformat()
+
+         # Initialize or load state
+         if resume:
+             self.state = self.load_state()
+             if self.state:
+                 files = self.state.get_pending_files()
+                 logger.info(
+                     f"Resuming batch: {self.state.completed_count} completed, "
+                     f"{len(files)} remaining"
+                 )
+                 # Reset started_at for accurate duration calculation in this session
+                 self.state.started_at = actual_started_at
+
+         if self.state is None:
+             self.state = self.init_state(
+                 input_dir=files[0].parent if files else Path("."),
+                 files=files,
+                 options=options or {},
+                 started_at=actual_started_at,
+             )
+             self.save_state(force=True)
+
+         if not files:
+             logger.info("No files to process")
+             self.save_state(force=True)
+             return self.state
+
+         # Create semaphore for concurrency control
+         semaphore = asyncio.Semaphore(self.config.concurrency)
+
+         # Check if Live display was already started by caller
+         live_already_started = self._live is not None
+
+         # Use existing progress or create new one
+         if live_already_started and self._progress is not None:
+             progress = self._progress
+             overall_task = self._overall_task_id
+             assert overall_task is not None  # Guaranteed when _progress is set
+             # Update total in case it changed
+             progress.update(overall_task, total=len(files))
+             log_panel = self._log_panel
+         else:
+             # Create progress display (legacy path for backwards compatibility)
+             progress = Progress(
+                 SpinnerColumn(),
+                 TextColumn("[bold blue]{task.fields[filename]:<30}"),
+                 BarColumn(),
+                 TaskProgressColumn(),
+                 TimeElapsedColumn(),
+             )
+             overall_task = progress.add_task(
+                 "Overall",
+                 total=len(files),
+                 filename="[Overall Progress]",
+             )
+             log_panel = None
+
+             # Create log panel for verbose mode (if not already created)
+             if verbose:
+                 log_panel = LogPanel()
+
+                 def panel_sink(message: Any) -> None:
+                     """Sink function to write logs to the panel."""
+                     if log_panel is not None:
+                         log_panel.add(message.record["message"])
+
+                 # Add a handler that writes to the log panel
+                 self._panel_handler_id = logger.add(
+                     panel_sink,
+                     level="INFO",
+                     format="{message}",
+                     filter=lambda record: record["level"].no >= 20,  # INFO and above
+                 )
+
+         async def process_with_limit(file_path: Path) -> None:
+             """Process a file with semaphore limit.
+
+             State saving is performed outside the semaphore to avoid blocking
+             concurrent file processing.
+             """
+             assert self.state is not None  # Guaranteed by init_state() above
+             file_key = str(file_path)
+             file_state = self.state.files.get(file_key)
+
+             if file_state is None:
+                 file_state = FileState(path=file_key)
+                 self.state.files[file_key] = file_state
+
+             # Update state to in_progress
+             file_state.status = FileStatus.IN_PROGRESS
+             file_state.started_at = datetime.now().astimezone().isoformat()
+
+             start_time = asyncio.get_event_loop().time()
+
+             try:
+                 # Process file within semaphore
+                 async with semaphore:
+                     result = await process_func(file_path)
+
+                 if result.success:
+                     file_state.status = FileStatus.COMPLETED
+                     file_state.output = result.output_path
+                     file_state.images = result.images
+                     file_state.screenshots = result.screenshots
+                     file_state.cost_usd = result.cost_usd
+                     file_state.llm_usage = result.llm_usage
+                     file_state.cache_hit = result.cache_hit
+                     # Collect image analysis result for JSON aggregation
+                     if result.image_analysis_result is not None:
+                         self.image_analysis_results.append(
+                             result.image_analysis_result
+                         )
+                 else:
+                     file_state.status = FileStatus.FAILED
+                     file_state.error = result.error
+
+             except Exception as e:
+                 file_state.status = FileStatus.FAILED
+                 file_state.error = str(e)
+                 logger.error(f"Failed to process {file_path.name}: {e}")
+
+             finally:
+                 end_time = asyncio.get_event_loop().time()
+                 file_state.completed_at = datetime.now().astimezone().isoformat()
+                 file_state.duration = end_time - start_time
+
+                 # Update progress
+                 progress.advance(overall_task)
+
+                 # Save state outside semaphore (non-blocking, throttled)
+                 # Use asyncio.to_thread to avoid blocking the event loop
+                 await asyncio.to_thread(self.save_state)
+
+         # If Live display was already started, just run the tasks without creating new Live
+         if live_already_started:
+             tasks = [process_with_limit(f) for f in files]
+             await asyncio.gather(*tasks, return_exceptions=True)
+         else:
+             # No external Live display provided - create one here
+             # Disable console handler to avoid conflict with progress bar
+             if console_handler_id is not None:
+                 try:
+                     logger.remove(console_handler_id)
+                 except ValueError:
+                     pass  # Handler already removed
+
+             try:
+                 if verbose and log_panel is not None:
+                     # Verbose mode: show progress + log panel
+                     display = Group(progress, log_panel)
+                     with Live(display, console=self.console, refresh_per_second=4):
+                         tasks = [process_with_limit(f) for f in files]
+                         await asyncio.gather(*tasks, return_exceptions=True)
+                 else:
+                     # Normal mode: progress bar only
+                     with Live(progress, console=self.console, refresh_per_second=4):
+                         tasks = [process_with_limit(f) for f in files]
+                         await asyncio.gather(*tasks, return_exceptions=True)
+             finally:
+                 # Remove panel handler if added
+                 if self._panel_handler_id is not None:
+                     try:
+                         logger.remove(self._panel_handler_id)
+                     except ValueError:
+                         pass
+                     self._panel_handler_id = None
+
+                 # Re-add console handler (restore original state)
+                 if console_handler_id is not None:
+                     import sys
+
+                     new_handler_id = logger.add(
+                         sys.stderr,
+                         level="DEBUG" if verbose else "INFO",
+                         format="<green>{time:HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{message}</cyan>",
+                     )
+                     self._restored_console_handler_id = new_handler_id
+
+         # Final save
+         self.save_state(force=True)
+
+         return self.state
+
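Taken together, a minimal driver for the happy path could look like this sketch; batch_config stands in for a real BatchConfig and convert_one is the toy ProcessFunc shown earlier, so treat it as orientation rather than the CLI's actual wiring:

    import asyncio
    from pathlib import Path

    async def main() -> None:
        proc = BatchProcessor(
            config=batch_config, output_dir=Path("out"), input_path=Path("docs")
        )
        files = proc.discover_files(Path("docs"), {".pdf", ".docx"})
        proc.start_live_display(total_files=len(files))
        try:
            await proc.process_batch(files, convert_one, resume=False)
        finally:
            proc.stop_live_display()
        proc.save_report()
        proc.print_summary()

    asyncio.run(main())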
+     def generate_report(self) -> dict[str, Any]:
+         """
+         Generate final processing report.
+
+         Returns:
+             Report dictionary (same structure as saved report file)
+         """
+         if self.state is None:
+             return {}
+
+         report = self.state.to_dict()
+         report["summary"] = self._compute_summary()
+         report["llm_usage"] = self._compute_llm_usage()
+         report["generated_at"] = datetime.now().astimezone().isoformat()
+
+         return report
+
+     def save_report(self) -> Path:
+         """Finalize and save report to file.
+
+         Report file is saved to: reports/markitai.<hash>.report.json
+         Respects on_conflict strategy (skip/overwrite/rename).
+
+         Returns:
+             Path to the report file
+         """
+         # First, ensure state is saved for resume capability
+         self.save_state(force=True, log=True)
+
+         # Generate and save the report
+         report = self.generate_report()
+
+         # Ensure reports directory exists
+         self.report_file.parent.mkdir(parents=True, exist_ok=True)
+
+         atomic_write_json(self.report_file, report, order_func=order_report)
+         logger.info(f"Report saved: {self.report_file.resolve()}")
+
+         return self.report_file
+
+     def print_summary(
+         self,
+         url_completed: int = 0,
+         url_failed: int = 0,
+         url_cache_hits: int = 0,
+         url_sources: int = 0,
+     ) -> None:
+         """Print summary to console.
+
+         Args:
+             url_completed: Number of URLs successfully processed
+             url_failed: Number of URLs that failed
+             url_cache_hits: Number of URLs that hit LLM cache
+             url_sources: Number of .urls source files processed
+         """
+         if self.state is None:
+             return
+
+         table = Table(title="Batch Processing Summary")
+         table.add_column("Metric", style="cyan")
+         table.add_column("Value", style="green")
+
+         total_urls = url_completed + url_failed
+
+         # Local Files section
+         if self.state.total > 0:
+             table.add_row("Local Files", str(self.state.total))
+             table.add_row("Completed", str(self.state.completed_count))
+             if self.state.failed_count > 0:
+                 table.add_row("Failed", str(self.state.failed_count))
+
+             # File cache hits
+             completed_files = [
+                 f
+                 for f in self.state.files.values()
+                 if f.status == FileStatus.COMPLETED
+             ]
+             file_cache_hits = sum(1 for f in completed_files if f.cache_hit)
+             if completed_files:
+                 table.add_row("Cache Hits", f"{file_cache_hits}/{len(completed_files)}")
+
+             # Add separator if URLs follow
+             if total_urls > 0:
+                 table.add_row("", "")  # Empty row as separator
+
+         # URL Files section
+         if total_urls > 0:
+             if url_sources > 0:
+                 table.add_row("URL Files", str(url_sources))
+             table.add_row("URLs", str(total_urls))
+             table.add_row("Completed", str(url_completed))
+             if url_failed > 0:
+                 table.add_row("Failed", str(url_failed))
+             if url_completed > 0:
+                 table.add_row("Cache Hits", f"{url_cache_hits}/{url_completed}")
+
+         # Add separator before duration
+         if self.state.total > 0 or total_urls > 0:
+             table.add_row("", "")  # Empty row as separator
+
+         # Calculate wall-clock duration from started_at to updated_at
+         wall_duration = 0.0
+         if self.state.started_at and self.state.updated_at:
+             try:
+                 start = datetime.fromisoformat(self.state.started_at)
+                 end = datetime.fromisoformat(self.state.updated_at)
+                 wall_duration = (end - start).total_seconds()
+             except ValueError:
+                 # Fallback to sum of individual durations
+                 wall_duration = sum(f.duration or 0 for f in self.state.files.values())
+         table.add_row("Duration", f"{wall_duration:.1f}s")
+
+         # LLM cost
+         total_cost = sum(f.cost_usd for f in self.state.files.values())
+         if total_cost > 0:
+             table.add_row("LLM Cost", f"${total_cost:.4f}")
+
+         self.console.print(table)
+
+         # Print failed files if any
+         failed = [f for f in self.state.files.values() if f.status == FileStatus.FAILED]
+         if failed:
+             self.console.print("\n[red]Failed files:[/red]")
+             for f in failed:
+                 self.console.print(f"  - {Path(f.path).name}: {f.error}")