markitai 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- markitai/__init__.py +3 -0
- markitai/batch.py +1316 -0
- markitai/cli.py +3979 -0
- markitai/config.py +602 -0
- markitai/config.schema.json +748 -0
- markitai/constants.py +222 -0
- markitai/converter/__init__.py +49 -0
- markitai/converter/_patches.py +98 -0
- markitai/converter/base.py +164 -0
- markitai/converter/image.py +181 -0
- markitai/converter/legacy.py +606 -0
- markitai/converter/office.py +526 -0
- markitai/converter/pdf.py +679 -0
- markitai/converter/text.py +63 -0
- markitai/fetch.py +1725 -0
- markitai/image.py +1335 -0
- markitai/json_order.py +550 -0
- markitai/llm.py +4339 -0
- markitai/ocr.py +347 -0
- markitai/prompts/__init__.py +159 -0
- markitai/prompts/cleaner.md +93 -0
- markitai/prompts/document_enhance.md +77 -0
- markitai/prompts/document_enhance_complete.md +65 -0
- markitai/prompts/document_process.md +60 -0
- markitai/prompts/frontmatter.md +28 -0
- markitai/prompts/image_analysis.md +21 -0
- markitai/prompts/image_caption.md +8 -0
- markitai/prompts/image_description.md +13 -0
- markitai/prompts/page_content.md +17 -0
- markitai/prompts/url_enhance.md +78 -0
- markitai/security.py +286 -0
- markitai/types.py +30 -0
- markitai/urls.py +187 -0
- markitai/utils/__init__.py +33 -0
- markitai/utils/executor.py +69 -0
- markitai/utils/mime.py +85 -0
- markitai/utils/office.py +262 -0
- markitai/utils/output.py +53 -0
- markitai/utils/paths.py +81 -0
- markitai/utils/text.py +359 -0
- markitai/workflow/__init__.py +37 -0
- markitai/workflow/core.py +760 -0
- markitai/workflow/helpers.py +509 -0
- markitai/workflow/single.py +369 -0
- markitai-0.3.0.dist-info/METADATA +159 -0
- markitai-0.3.0.dist-info/RECORD +48 -0
- markitai-0.3.0.dist-info/WHEEL +4 -0
- markitai-0.3.0.dist-info/entry_points.txt +2 -0
markitai/batch.py
ADDED
@@ -0,0 +1,1316 @@

"""Batch processing module with resume capability."""

from __future__ import annotations

import asyncio
import json
from collections import deque
from collections.abc import Callable, Coroutine
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from pathlib import Path
from typing import TYPE_CHECKING, Any

from loguru import logger
from rich.console import Console, Group
from rich.live import Live
from rich.panel import Panel
from rich.progress import (
    BarColumn,
    Progress,
    SpinnerColumn,
    TaskID,
    TaskProgressColumn,
    TextColumn,
    TimeElapsedColumn,
)
from rich.table import Table
from rich.text import Text

from markitai.constants import DEFAULT_LOG_PANEL_MAX_LINES
from markitai.json_order import order_report, order_state
from markitai.security import atomic_write_json

if TYPE_CHECKING:
    from markitai.config import BatchConfig
    from markitai.workflow.single import ImageAnalysisResult


class FileStatus(str, Enum):
    """Status of a file in batch processing.

    State transitions:
        PENDING -> IN_PROGRESS -> COMPLETED
                                -> FAILED

    On resume: IN_PROGRESS files are treated as FAILED (re-processed).
    """

    PENDING = "pending"
    IN_PROGRESS = "in_progress"
    COMPLETED = "completed"
    FAILED = "failed"


@dataclass
class FileState:
    """State of a single file in batch processing.

    Attributes:
        path: Relative path to source file from input_dir
        status: Current processing status
        output: Relative path to output .md file from output_dir
        error: Error message if status is FAILED
        started_at: ISO timestamp when processing started
        completed_at: ISO timestamp when processing completed
        duration: Total processing time in seconds
        images: Count of embedded images extracted from document content
        screenshots: Count of page/slide screenshots rendered for OCR/LLM
        cost_usd: Total LLM API cost for this file
        llm_usage: Per-model usage stats {model: {requests, input_tokens, output_tokens, cost_usd}}
        cache_hit: Whether LLM results were served from cache (no API calls made)
    """

    path: str
    status: FileStatus = FileStatus.PENDING
    output: str | None = None
    error: str | None = None
    started_at: str | None = None
    completed_at: str | None = None
    duration: float | None = None
    images: int = 0
    screenshots: int = 0
    cost_usd: float = 0.0
    llm_usage: dict[str, dict[str, Any]] = field(default_factory=dict)
    cache_hit: bool = False


@dataclass
class UrlState:
    """State of a single URL in batch processing.

    Attributes:
        url: The URL being processed
        source_file: Path to the .urls file containing this URL
        status: Current processing status
        output: Relative path to output .md file from output_dir
        error: Error message if status is FAILED
        fetch_strategy: The fetch strategy that was used (static/browser/jina)
        images: Count of images downloaded from the URL
        started_at: ISO timestamp when processing started
        completed_at: ISO timestamp when processing completed
        duration: Total processing time in seconds
        cost_usd: Total LLM API cost for this URL
        llm_usage: Per-model usage stats {model: {requests, input_tokens, output_tokens, cost_usd}}
        cache_hit: Whether LLM results were served from cache (no API calls made)
    """

    url: str
    source_file: str
    status: FileStatus = FileStatus.PENDING
    output: str | None = None
    error: str | None = None
    fetch_strategy: str | None = None
    images: int = 0
    started_at: str | None = None
    completed_at: str | None = None
    duration: float | None = None
    cost_usd: float = 0.0
    llm_usage: dict[str, dict[str, Any]] = field(default_factory=dict)
    cache_hit: bool = False


@dataclass
class BatchState:
    """State of batch processing for resume capability."""

    version: str = "1.0"
    started_at: str = ""
    updated_at: str = ""
    input_dir: str = ""
    output_dir: str = ""
    log_file: str | None = None  # Path to log file for this run
    options: dict = field(default_factory=dict)
    files: dict[str, FileState] = field(default_factory=dict)
    urls: dict[str, UrlState] = field(default_factory=dict)  # key: URL string
    url_sources: list[str] = field(default_factory=list)  # .urls file paths

    @property
    def total(self) -> int:
        """Total number of files."""
        return len(self.files)

    @property
    def total_urls(self) -> int:
        """Total number of URLs."""
        return len(self.urls)

    @property
    def completed_count(self) -> int:
        """Number of completed files."""
        return sum(1 for f in self.files.values() if f.status == FileStatus.COMPLETED)

    @property
    def completed_urls_count(self) -> int:
        """Number of completed URLs."""
        return sum(1 for u in self.urls.values() if u.status == FileStatus.COMPLETED)

    @property
    def failed_count(self) -> int:
        """Number of failed files."""
        return sum(1 for f in self.files.values() if f.status == FileStatus.FAILED)

    @property
    def failed_urls_count(self) -> int:
        """Number of failed URLs."""
        return sum(1 for u in self.urls.values() if u.status == FileStatus.FAILED)

    @property
    def pending_count(self) -> int:
        """Number of pending files."""
        return sum(
            1
            for f in self.files.values()
            if f.status in (FileStatus.PENDING, FileStatus.FAILED)
        )

    @property
    def pending_urls_count(self) -> int:
        """Number of pending URLs."""
        return sum(
            1
            for u in self.urls.values()
            if u.status in (FileStatus.PENDING, FileStatus.FAILED)
        )

    def get_pending_files(self) -> list[Path]:
        """Get list of files that need processing."""
        return [
            Path(f.path)
            for f in self.files.values()
            if f.status in (FileStatus.PENDING, FileStatus.FAILED)
        ]

    def get_pending_urls(self) -> list[str]:
        """Get list of URLs that need processing."""
        return [
            u.url
            for u in self.urls.values()
            if u.status in (FileStatus.PENDING, FileStatus.FAILED)
        ]

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for JSON serialization.

        Note: input_dir/output_dir are stored in options with absolute paths.
        Files keys are stored as relative paths (relative to input_dir).
        """
        # Convert log_file to absolute path if it exists
        log_file_abs = None
        if self.log_file:
            log_path = Path(self.log_file)
            log_file_abs = (
                str(log_path.resolve()) if log_path.exists() else self.log_file
            )

        # Convert files keys to relative paths (relative to input_dir)
        input_dir_path = Path(self.input_dir).resolve()
        files_dict = {}
        for path, state in self.files.items():
            file_path = Path(path).resolve()
            try:
                rel_path = str(file_path.relative_to(input_dir_path))
            except ValueError:
                # File is not under input_dir, use filename only
                rel_path = file_path.name
            files_dict[rel_path] = {
                "status": state.status.value,
                "output": state.output,
                "error": state.error,
                "started_at": state.started_at,
                "completed_at": state.completed_at,
                "duration": state.duration,
                "images": state.images,
                "screenshots": state.screenshots,
                "cost_usd": state.cost_usd,
                "llm_usage": state.llm_usage,
                "cache_hit": state.cache_hit,
            }

        # Convert URLs to dict
        urls_dict = {}
        for url, state in self.urls.items():
            urls_dict[url] = {
                "source_file": state.source_file,
                "status": state.status.value,
                "output": state.output,
                "error": state.error,
                "fetch_strategy": state.fetch_strategy,
                "images": state.images,
                "started_at": state.started_at,
                "completed_at": state.completed_at,
                "duration": state.duration,
                "cost_usd": state.cost_usd,
                "llm_usage": state.llm_usage,
                "cache_hit": state.cache_hit,
            }

        return {
            "version": self.version,
            "started_at": self.started_at,
            "updated_at": self.updated_at,
            "log_file": log_file_abs,
            "options": self.options,
            "documents": files_dict,
            "urls": urls_dict,
            "url_sources": self.url_sources,
        }

    def to_minimal_dict(self) -> dict[str, Any]:
        """Convert to minimal dictionary for state file (resume capability).

        Only includes fields necessary for determining what needs to be reprocessed:
        - version: For compatibility checking
        - options: input_dir/output_dir needed to resolve paths
        - documents: status + output (completed) or error (failed)
        - urls: status + output + source_file
        """
        # Convert files keys to relative paths (relative to input_dir)
        input_dir_path = Path(self.input_dir).resolve()
        files_dict = {}
        for path, state in self.files.items():
            file_path = Path(path).resolve()
            try:
                rel_path = str(file_path.relative_to(input_dir_path))
            except ValueError:
                rel_path = file_path.name

            # Minimal state: only what's needed for resume
            entry: dict[str, Any] = {"status": state.status.value}
            if state.status == FileStatus.COMPLETED and state.output:
                entry["output"] = state.output
            elif state.status == FileStatus.FAILED and state.error:
                entry["error"] = state.error
            files_dict[rel_path] = entry

        # Convert URLs to minimal dict
        urls_dict = {}
        for url, state in self.urls.items():
            entry: dict[str, Any] = {
                "status": state.status.value,
                "source_file": state.source_file,
            }
            if state.status == FileStatus.COMPLETED and state.output:
                entry["output"] = state.output
            elif state.status == FileStatus.FAILED and state.error:
                entry["error"] = state.error
            urls_dict[url] = entry

        return {
            "version": self.version,
            "options": self.options,
            "documents": files_dict,
            "urls": urls_dict,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> BatchState:
        """Create from dictionary."""
        options = data.get("options", {})

        # Get input_dir/output_dir from options
        input_dir = options.get("input_dir", "")
        output_dir = options.get("output_dir", "")

        state = cls(
            version=data.get("version", "1.0"),
            started_at=data.get("started_at", ""),
            updated_at=data.get("updated_at", ""),
            input_dir=input_dir,
            output_dir=output_dir,
            log_file=data.get("log_file"),
            options=options,
            url_sources=data.get("url_sources", []),
        )

        documents_data = data.get("documents", {})

        # Reconstruct absolute file paths from relative paths
        input_dir_path = Path(input_dir) if input_dir else Path(".")
        for path, file_data in documents_data.items():
            # If path is relative, make it absolute relative to input_dir
            file_path = Path(path)
            if not file_path.is_absolute():
                abs_path = str(input_dir_path / path)
            else:
                abs_path = path

            state.files[abs_path] = FileState(
                path=abs_path,
                status=FileStatus(file_data.get("status", "pending")),
                output=file_data.get("output"),
                error=file_data.get("error"),
                started_at=file_data.get("started_at"),
                completed_at=file_data.get("completed_at"),
                duration=file_data.get("duration"),
                images=file_data.get("images", 0),
                screenshots=file_data.get("screenshots", 0),
                cost_usd=file_data.get("cost_usd", 0.0),
                llm_usage=file_data.get("llm_usage", {}),
                cache_hit=file_data.get("cache_hit", False),
            )

        # Reconstruct URL states
        for url, url_data in data.get("urls", {}).items():
            state.urls[url] = UrlState(
                url=url,
                source_file=url_data.get("source_file", ""),
                status=FileStatus(url_data.get("status", "pending")),
                output=url_data.get("output"),
                error=url_data.get("error"),
                fetch_strategy=url_data.get("fetch_strategy"),
                images=url_data.get("images", 0),
                started_at=url_data.get("started_at"),
                completed_at=url_data.get("completed_at"),
                duration=url_data.get("duration"),
                cost_usd=url_data.get("cost_usd", 0.0),
                llm_usage=url_data.get("llm_usage", {}),
                cache_hit=url_data.get("cache_hit", False),
            )

        return state


@dataclass
class ProcessResult:
    """Result of processing a single file.

    Attributes:
        success: Whether processing completed without errors
        output_path: Path to generated .md file (None if failed)
        error: Error message if success is False
        images: Count of embedded images extracted from document
        screenshots: Count of page/slide screenshots for OCR/LLM
        cost_usd: Total LLM API cost for this file
        llm_usage: Per-model usage {model: {requests, input_tokens, output_tokens, cost_usd}}
        image_analysis_result: Aggregated image analysis for JSON output (None if disabled)
        cache_hit: Whether LLM results were served entirely from cache
    """

    success: bool
    output_path: str | None = None
    error: str | None = None
    images: int = 0
    screenshots: int = 0
    cost_usd: float = 0.0
    llm_usage: dict[str, dict[str, Any]] = field(default_factory=dict)
    image_analysis_result: ImageAnalysisResult | None = None
    cache_hit: bool = False


# Type alias for process function
ProcessFunc = Callable[[Path], Coroutine[Any, Any, ProcessResult]]


class LogPanel:
    """Log panel for verbose mode, displays scrolling log messages."""

    def __init__(self, max_lines: int = DEFAULT_LOG_PANEL_MAX_LINES):
        self.logs: deque[str] = deque(maxlen=max_lines)

    def add(self, message: str) -> None:
        """Add a log message to the panel."""
        timestamp = datetime.now().strftime("%H:%M:%S")
        self.logs.append(f"{timestamp} | {message}")

    def __rich__(self) -> Panel:
        """Render the log panel."""
        content = "\n".join(self.logs) if self.logs else "(waiting for logs...)"
        # Use Text object to prevent markup parsing (paths like [/foo/bar] would be misinterpreted)
        return Panel(Text(content), title="Logs", border_style="dim")


class BatchProcessor:
    """Batch processor with concurrent execution and progress display."""

    def __init__(
        self,
        config: BatchConfig,
        output_dir: Path,
        input_path: Path | None = None,
        log_file: Path | str | None = None,
        on_conflict: str = "rename",
        task_options: dict[str, Any] | None = None,
    ) -> None:
        """
        Initialize batch processor.

        Args:
            config: Batch processing configuration
            output_dir: Output directory
            input_path: Input file or directory (used for report file naming)
            log_file: Path to the log file for this run
            on_conflict: Conflict resolution strategy ("skip", "overwrite", "rename")
            task_options: Task options dict (used for computing task hash)
        """
        self.config = config
        self.output_dir = Path(output_dir)
        self.input_path = Path(input_path) if input_path else None
        self.log_file = str(log_file) if log_file else None
        self.on_conflict = on_conflict
        self.task_options = task_options or {}
        self.task_hash = self._compute_task_hash()
        self.state_file = self._get_state_file_path()
        self.report_file = self._get_report_file_path()
        self.state: BatchState | None = None
        self.console = Console()
        # Collect image analysis results for JSON aggregation
        self.image_analysis_results: list[ImageAnalysisResult] = []

        # Live display state (managed by start_live_display/stop_live_display)
        self._live: Live | None = None
        self._log_panel: LogPanel | None = None
        self._panel_handler_id: int | None = None
        self._console_handler_id: int | None = None
        self._verbose: bool = False
        self._progress: Progress | None = None
        self._overall_task_id: TaskID | None = None
        self._url_task_id: TaskID | None = None
        self._total_urls: int = 0
        self._total_files: int = 0
        self._completed_urls: int = 0
        self._completed_files: int = 0

    def _compute_task_hash(self) -> str:
        """Compute hash from task input parameters.

        Hash is based on:
        - input_path (resolved)
        - output_dir (resolved)
        - key task options (llm_enabled, ocr_enabled, etc.)

        This ensures different parameter combinations produce different hashes,
        so resuming with different options creates a new state file.
        """
        import hashlib

        # Extract key options that affect output (exclude paths, they're added separately)
        key_options = {
            k: v
            for k, v in self.task_options.items()
            if k
            in (
                "llm_enabled",
                "ocr_enabled",
                "screenshot_enabled",
                "image_alt_enabled",
                "image_desc_enabled",
            )
        }

        hash_params = {
            "input": str(self.input_path.resolve()) if self.input_path else "",
            "output": str(self.output_dir.resolve()),
            "options": key_options,
        }
        hash_str = json.dumps(hash_params, sort_keys=True)
        return hashlib.md5(hash_str.encode()).hexdigest()[:6]

    def _get_state_file_path(self) -> Path:
        """Generate state file path for resume capability.

        Format: states/markitai.<hash>.state.json
        """
        states_dir = self.output_dir / "states"
        return states_dir / f"markitai.{self.task_hash}.state.json"

    def _get_report_file_path(self) -> Path:
        """Generate report file path based on task hash.

        Format: reports/markitai.<hash>.report.json
        Respects on_conflict strategy for rename.
        """
        reports_dir = self.output_dir / "reports"
        base_path = reports_dir / f"markitai.{self.task_hash}.report.json"

        if not base_path.exists():
            return base_path

        if self.on_conflict == "skip":
            return base_path  # Will be handled by caller
        elif self.on_conflict == "overwrite":
            return base_path
        else:  # rename
            seq = 2
            while True:
                new_path = reports_dir / f"markitai.{self.task_hash}.v{seq}.report.json"
                if not new_path.exists():
                    return new_path
                seq += 1

    def start_live_display(
        self,
        verbose: bool = False,
        console_handler_id: int | None = None,
        total_files: int = 0,
        total_urls: int = 0,
    ) -> None:
        """Start Live display with progress bar and optional log panel.

        Call this before any processing (including pre-conversion) to capture
        all logs in the panel instead of printing to console.

        Args:
            verbose: Whether to show log panel
            console_handler_id: Loguru console handler ID to disable
            total_files: Total number of files (for progress bar)
            total_urls: Total number of URLs to process
        """

        self._verbose = verbose
        self._console_handler_id = console_handler_id

        # Create progress display
        self._progress = Progress(
            SpinnerColumn(),
            TextColumn("[bold blue]{task.fields[filename]:<30}"),
            BarColumn(),
            TaskProgressColumn(),
            TimeElapsedColumn(),
        )

        # Store totals for progress display
        self._total_urls = total_urls
        self._total_files = total_files
        self._completed_urls = 0
        self._completed_files = 0

        # Add URL progress task if there are URLs to process
        if total_urls > 0:
            self._url_task_id = self._progress.add_task(
                "URLs",
                total=total_urls,
                filename=f"[URLs:0/{total_urls}]",
            )

        # Add file progress task (or overall if no URLs)
        self._overall_task_id = self._progress.add_task(
            "Overall",
            total=total_files,
            filename=f"[Files:0/{total_files}]"
            if total_urls > 0
            else "[Overall Progress]",
        )

        # Create log panel for verbose mode
        if verbose:
            self._log_panel = LogPanel()

            def panel_sink(message: Any) -> None:
                """Sink function to write logs to the panel."""
                if self._log_panel is not None:
                    self._log_panel.add(message.record["message"])

            # Add a handler that writes to the log panel
            self._panel_handler_id = logger.add(
                panel_sink,
                level="INFO",
                format="{message}",
                filter=lambda record: record["level"].no >= 20,  # INFO and above
            )

        # Disable console handler to avoid conflict with progress bar
        if console_handler_id is not None:
            try:
                logger.remove(console_handler_id)
            except ValueError:
                pass  # Handler already removed

        # Start Live display
        if verbose and self._log_panel is not None:
            display = Group(self._progress, self._log_panel)
            self._live = Live(display, console=self.console, refresh_per_second=4)
        else:
            self._live = Live(
                self._progress, console=self.console, refresh_per_second=4
            )

        self._live.start()

    def stop_live_display(self) -> None:
        """Stop Live display and restore console handler."""
        import sys

        # Stop Live display
        if self._live is not None:
            self._live.stop()
            self._live = None

        # Remove panel handler if added
        if self._panel_handler_id is not None:
            try:
                logger.remove(self._panel_handler_id)
            except ValueError:
                pass
            self._panel_handler_id = None

        # Re-add console handler (restore original state)
        if self._console_handler_id is not None:
            new_handler_id = logger.add(
                sys.stderr,
                level="DEBUG" if self._verbose else "INFO",
                format="<green>{time:HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{message}</cyan>",
            )
            self._restored_console_handler_id = new_handler_id
            self._console_handler_id = None

    def update_progress_total(self, total: int) -> None:
        """Update progress bar total after file discovery."""
        self._total_files = total
        if self._progress is not None and self._overall_task_id is not None:
            self._progress.update(self._overall_task_id, total=total)
            # Update filename with new total
            self._progress.update(
                self._overall_task_id,
                filename=f"[Files:{self._completed_files}/{total}]",
            )

    def advance_progress(self) -> None:
        """Advance progress bar by one."""
        if self._progress is not None and self._overall_task_id is not None:
            self._completed_files += 1
            self._progress.advance(self._overall_task_id)
            # Update filename with current count
            self._progress.update(
                self._overall_task_id,
                filename=f"[Files:{self._completed_files}/{self._total_files}]",
            )

    def update_url_status(self, url: str, completed: bool = False) -> None:
        """Update URL processing status in progress display.

        Args:
            url: The URL being processed (displayed in progress bar)
            completed: If True, advance the URL progress counter
        """
        if self._progress is not None and self._url_task_id is not None:
            if completed:
                self._completed_urls += 1
                self._progress.advance(self._url_task_id)
                # Update filename with current count
                self._progress.update(
                    self._url_task_id,
                    filename=f"[URLs:{self._completed_urls}/{self._total_urls}]",
                )

    def finish_url_processing(self, completed: int, failed: int) -> None:
        """Mark URL processing as complete.

        Args:
            completed: Number of URLs successfully processed
            failed: Number of URLs that failed
        """
        if self._progress is not None and self._url_task_id is not None:
            # Final status already shows count from update_url_status
            pass

    def discover_files(
        self,
        input_path: Path,
        extensions: set[str],
    ) -> list[Path]:
        """
        Discover files to process.

        Args:
            input_path: Input file or directory
            extensions: Set of valid file extensions (e.g., {".docx", ".pdf"})

        Returns:
            List of file paths

        Raises:
            ValueError: If any discovered file is outside the input directory
        """
        from markitai.security import validate_path_within_base

        if input_path.is_file():
            return [input_path]

        input_resolved = input_path.resolve()
        files: list[Path] = []
        max_depth = max(0, self.config.scan_max_depth)
        max_files = max(1, self.config.scan_max_files)

        def should_include(path: Path) -> bool:
            try:
                validate_path_within_base(path, input_resolved)
            except ValueError:
                logger.warning(f"Skipping file outside input directory: {path}")
                return False
            return path.is_file()

        for ext in extensions:
            # Search both lowercase and uppercase variants (Linux glob is case-sensitive)
            ext_variants = [ext, ext.upper()]
            candidates = []

            for ext_variant in ext_variants:
                if max_depth == 0:
                    candidates.extend(input_path.glob(f"*{ext_variant}"))
                else:
                    # Use rglob for recursive search, then filter by depth
                    for f in input_path.rglob(f"*{ext_variant}"):
                        # Calculate relative depth
                        try:
                            rel_path = f.relative_to(input_path)
                            depth = len(rel_path.parts) - 1  # -1 for filename itself
                            if depth <= max_depth:
                                candidates.append(f)
                        except ValueError:
                            continue

            for f in candidates:
                if len(files) >= max_files:
                    logger.warning(
                        f"Reached scan_max_files={max_files}, stopping file discovery"
                    )
                    return sorted(set(files))
                if should_include(f):
                    files.append(f)

        return sorted(set(files))

    def load_state(self) -> BatchState | None:
        """Load state from state file if exists (for resume capability)."""
        from markitai.constants import MAX_STATE_FILE_SIZE
        from markitai.security import validate_file_size

        if not self.state_file.exists():
            return None

        try:
            # Validate file size to prevent DoS
            validate_file_size(self.state_file, MAX_STATE_FILE_SIZE)
            data = json.loads(self.state_file.read_text(encoding="utf-8"))
            return BatchState.from_dict(data)
        except Exception as e:
            logger.warning(f"Failed to load state file for resume: {e}")
            return None

    def save_state(self, force: bool = False, log: bool = False) -> None:
        """Save current state to state file for resume capability.

        State file is saved to: states/markitai.<hash>.state.json

        Optimized with interval-based throttling:
        - Checks interval BEFORE serialization to avoid unnecessary work
        - Uses minimal serialization when possible

        Args:
            force: Force save even if interval hasn't passed
            log: Whether to log the save operation
        """
        if self.state is None:
            return

        now = datetime.now().astimezone()
        interval = getattr(self.config, "state_flush_interval_seconds", 0) or 0

        # Check interval BEFORE any serialization work (optimization)
        if not force and interval > 0:
            last_saved = getattr(self, "_last_state_save", None)
            if last_saved and (now - last_saved).total_seconds() < interval:
                return  # Skip: interval not passed, no work done

        self.state.updated_at = now.isoformat()

        # Build minimal state document (only what's needed for resume)
        state_data = self.state.to_minimal_dict()

        # Ensure states directory exists
        self.state_file.parent.mkdir(parents=True, exist_ok=True)

        atomic_write_json(self.state_file, state_data, order_func=order_state)
        self._last_state_save = now

        if log:
            logger.info(f"State file saved: {self.state_file.resolve()}")

    def _compute_summary(self) -> dict[str, Any]:
        """Compute summary statistics for report."""
        if self.state is None:
            return {}

        # Calculate wall-clock duration
        wall_duration = 0.0
        if self.state.started_at and self.state.updated_at:
            try:
                start = datetime.fromisoformat(self.state.started_at)
                end = datetime.fromisoformat(self.state.updated_at)
                wall_duration = (end - start).total_seconds()
            except ValueError:
                wall_duration = 0.0

        # Calculate cumulative processing time (files + URLs)
        file_duration = sum(f.duration or 0 for f in self.state.files.values())
        url_duration = sum(u.duration or 0 for u in self.state.urls.values())
        processing_time = file_duration + url_duration

        # URL cache hits count
        url_cache_hits = sum(
            1
            for u in self.state.urls.values()
            if u.status == FileStatus.COMPLETED and u.cache_hit
        )

        return {
            "total_documents": self.state.total,
            "completed_documents": self.state.completed_count,
            "failed_documents": self.state.failed_count,
            "pending_documents": self.state.pending_count,
            "total_urls": self.state.total_urls,
            "completed_urls": self.state.completed_urls_count,
            "failed_urls": self.state.failed_urls_count,
            "pending_urls": self.state.pending_urls_count,
            "url_cache_hits": url_cache_hits,
            "url_sources": len(self.state.url_sources),
            "duration": wall_duration,
            "processing_time": processing_time,
        }

    def _compute_llm_usage(self) -> dict[str, Any]:
        """Compute aggregated LLM usage statistics for report."""
        if self.state is None:
            return {}

        # Aggregate LLM usage by model (from both files and URLs)
        models_usage: dict[str, dict[str, Any]] = {}

        # Aggregate from files
        for f in self.state.files.values():
            for model, usage in f.llm_usage.items():
                if model not in models_usage:
                    models_usage[model] = {
                        "requests": 0,
                        "input_tokens": 0,
                        "output_tokens": 0,
                        "cost_usd": 0.0,
                    }
                models_usage[model]["requests"] += usage.get("requests", 0)
                models_usage[model]["input_tokens"] += usage.get("input_tokens", 0)
                models_usage[model]["output_tokens"] += usage.get("output_tokens", 0)
                models_usage[model]["cost_usd"] += usage.get("cost_usd", 0.0)

        # Aggregate from URLs
        for u in self.state.urls.values():
            for model, usage in u.llm_usage.items():
                if model not in models_usage:
                    models_usage[model] = {
                        "requests": 0,
                        "input_tokens": 0,
                        "output_tokens": 0,
                        "cost_usd": 0.0,
                    }
                models_usage[model]["requests"] += usage.get("requests", 0)
                models_usage[model]["input_tokens"] += usage.get("input_tokens", 0)
                models_usage[model]["output_tokens"] += usage.get("output_tokens", 0)
                models_usage[model]["cost_usd"] += usage.get("cost_usd", 0.0)

        # Calculate totals (files + URLs)
        total_cost = sum(f.cost_usd for f in self.state.files.values()) + sum(
            u.cost_usd for u in self.state.urls.values()
        )
        input_tokens = sum(m["input_tokens"] for m in models_usage.values())
        output_tokens = sum(m["output_tokens"] for m in models_usage.values())
        requests = sum(m["requests"] for m in models_usage.values())

        return {
            "models": models_usage,
            "requests": requests,
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "cost_usd": total_cost,
        }

    def init_state(
        self,
        input_dir: Path,
        files: list[Path],
        options: dict[str, Any],
        started_at: str | None = None,
    ) -> BatchState:
        """
        Initialize a new batch state.

        Args:
            input_dir: Input directory
            files: List of files to process
            options: Processing options (will be updated with absolute paths)
            started_at: ISO timestamp when processing started (defaults to now)

        Returns:
            New BatchState
        """
        # Resolve absolute paths
        abs_input_dir = str(input_dir.resolve())
        abs_output_dir = str(self.output_dir.resolve())
        abs_log_file = None
        if self.log_file:
            log_path = Path(self.log_file)
            abs_log_file = (
                str(log_path.resolve()) if log_path.exists() else self.log_file
            )

        # Update options with absolute paths
        options["input_dir"] = abs_input_dir
        options["output_dir"] = abs_output_dir

        now = datetime.now().astimezone().isoformat()
        state = BatchState(
            started_at=started_at or now,
            updated_at=now,
            input_dir=abs_input_dir,
            output_dir=abs_output_dir,
            log_file=abs_log_file,
            options=options,
        )

        for file_path in files:
            state.files[str(file_path)] = FileState(path=str(file_path))

        return state

    async def process_batch(
        self,
        files: list[Path],
        process_func: ProcessFunc,
        resume: bool = False,
        options: dict[str, Any] | None = None,
        verbose: bool = False,
        console_handler_id: int | None = None,
        started_at: str | None = None,
    ) -> BatchState:
        """
        Process files in batch with concurrency control.

        Args:
            files: List of files to process
            process_func: Async function to process each file
            resume: Whether to resume from previous state
            options: Task options to record in report
            verbose: Whether to show log panel during processing
            console_handler_id: Loguru console handler ID for temporary disable
                (ignored if start_live_display was already called)
            started_at: ISO timestamp when processing started (for accurate duration)

        Returns:
            Final batch state
        """
        # Use provided started_at or default to now
        actual_started_at = started_at or datetime.now().astimezone().isoformat()

        # Initialize or load state
        if resume:
            self.state = self.load_state()
            if self.state:
                files = self.state.get_pending_files()
                logger.info(
                    f"Resuming batch: {self.state.completed_count} completed, "
                    f"{len(files)} remaining"
                )
                # Reset started_at for accurate duration calculation in this session
                self.state.started_at = actual_started_at

        if self.state is None:
            self.state = self.init_state(
                input_dir=files[0].parent if files else Path("."),
                files=files,
                options=options or {},
                started_at=actual_started_at,
            )
            self.save_state(force=True)

        if not files:
            logger.info("No files to process")
            self.save_state(force=True)
            return self.state

        # Create semaphore for concurrency control
        semaphore = asyncio.Semaphore(self.config.concurrency)

        # Check if Live display was already started by caller
        live_already_started = self._live is not None

        # Use existing progress or create new one
        if live_already_started and self._progress is not None:
            progress = self._progress
            overall_task = self._overall_task_id
            assert overall_task is not None  # Guaranteed when _progress is set
            # Update total in case it changed
            progress.update(overall_task, total=len(files))
            log_panel = self._log_panel
        else:
            # Create progress display (legacy path for backwards compatibility)
            progress = Progress(
                SpinnerColumn(),
                TextColumn("[bold blue]{task.fields[filename]:<30}"),
                BarColumn(),
                TaskProgressColumn(),
                TimeElapsedColumn(),
            )
            overall_task = progress.add_task(
                "Overall",
                total=len(files),
                filename="[Overall Progress]",
            )
            log_panel = None

            # Create log panel for verbose mode (if not already created)
            if verbose:
                log_panel = LogPanel()

                def panel_sink(message: Any) -> None:
                    """Sink function to write logs to the panel."""
                    if log_panel is not None:
                        log_panel.add(message.record["message"])

                # Add a handler that writes to the log panel
                self._panel_handler_id = logger.add(
                    panel_sink,
                    level="INFO",
                    format="{message}",
                    filter=lambda record: record["level"].no >= 20,  # INFO and above
                )

        async def process_with_limit(file_path: Path) -> None:
            """Process a file with semaphore limit.

            State saving is performed outside the semaphore to avoid blocking
            concurrent file processing.
            """
            assert self.state is not None  # Guaranteed by _init_state() above
            file_key = str(file_path)
            file_state = self.state.files.get(file_key)

            if file_state is None:
                file_state = FileState(path=file_key)
                self.state.files[file_key] = file_state

            # Update state to in_progress
            file_state.status = FileStatus.IN_PROGRESS
            file_state.started_at = datetime.now().astimezone().isoformat()

            start_time = asyncio.get_event_loop().time()

            try:
                # Process file within semaphore
                async with semaphore:
                    result = await process_func(file_path)

                if result.success:
                    file_state.status = FileStatus.COMPLETED
                    file_state.output = result.output_path
                    file_state.images = result.images
                    file_state.screenshots = result.screenshots
                    file_state.cost_usd = result.cost_usd
                    file_state.llm_usage = result.llm_usage
                    file_state.cache_hit = result.cache_hit
                    # Collect image analysis result for JSON aggregation
                    if result.image_analysis_result is not None:
                        self.image_analysis_results.append(result.image_analysis_result)
                else:
                    file_state.status = FileStatus.FAILED
                    file_state.error = result.error

            except Exception as e:
                file_state.status = FileStatus.FAILED
                file_state.error = str(e)
                logger.error(f"Failed to process {file_path.name}: {e}")

            finally:
                end_time = asyncio.get_event_loop().time()
                file_state.completed_at = datetime.now().astimezone().isoformat()
                file_state.duration = end_time - start_time

                # Update progress
                progress.advance(overall_task)

            # Save state outside semaphore (non-blocking, throttled)
            # Use asyncio.to_thread to avoid blocking the event loop
            await asyncio.to_thread(self.save_state)

        # If Live display was already started, just run the tasks without creating new Live
        if live_already_started:
            tasks = [process_with_limit(f) for f in files]
            await asyncio.gather(*tasks, return_exceptions=True)
        else:
            # No external Live display provided - create one here
            # Disable console handler to avoid conflict with progress bar
            if console_handler_id is not None:
                try:
                    logger.remove(console_handler_id)
                except ValueError:
                    pass  # Handler already removed

            try:
                if verbose and log_panel is not None:
                    # Verbose mode: show progress + log panel
                    display = Group(progress, log_panel)
                    with Live(display, console=self.console, refresh_per_second=4):
                        tasks = [process_with_limit(f) for f in files]
                        await asyncio.gather(*tasks, return_exceptions=True)
                else:
                    # Normal mode: progress bar only
                    with Live(progress, console=self.console, refresh_per_second=4):
                        tasks = [process_with_limit(f) for f in files]
                        await asyncio.gather(*tasks, return_exceptions=True)
            finally:
                # Remove panel handler if added
                if self._panel_handler_id is not None:
                    try:
                        logger.remove(self._panel_handler_id)
                    except ValueError:
                        pass
                    self._panel_handler_id = None

                # Re-add console handler (restore original state)
                if console_handler_id is not None:
                    import sys

                    new_handler_id = logger.add(
                        sys.stderr,
                        level="DEBUG" if verbose else "INFO",
                        format="<green>{time:HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{message}</cyan>",
                    )
                    self._restored_console_handler_id = new_handler_id

        # Final save
        self.save_state(force=True)

        return self.state

    def generate_report(self) -> dict[str, Any]:
        """
        Generate final processing report.

        Returns:
            Report dictionary (same structure as saved report file)
        """
        if self.state is None:
            return {}

        report = self.state.to_dict()
        report["summary"] = self._compute_summary()
        report["llm_usage"] = self._compute_llm_usage()
        report["generated_at"] = datetime.now().astimezone().isoformat()

        return report

    def save_report(self) -> Path:
        """Finalize and save report to file.

        Report file is saved to: reports/markitai.<hash>.report.json
        Respects on_conflict strategy (skip/overwrite/rename).

        Returns:
            Path to the report file
        """
        # First, ensure state is saved for resume capability
        self.save_state(force=True, log=True)

        # Generate and save the report
        report = self.generate_report()

        # Ensure reports directory exists
        self.report_file.parent.mkdir(parents=True, exist_ok=True)

        atomic_write_json(self.report_file, report, order_func=order_report)
        logger.info(f"Report saved: {self.report_file.resolve()}")

        return self.report_file

    def print_summary(
        self,
        url_completed: int = 0,
        url_failed: int = 0,
        url_cache_hits: int = 0,
        url_sources: int = 0,
    ) -> None:
        """Print summary to console.

        Args:
            url_completed: Number of URLs successfully processed
            url_failed: Number of URLs that failed
            url_cache_hits: Number of URLs that hit LLM cache
            url_sources: Number of .urls source files processed
        """
        if self.state is None:
            return

        table = Table(title="Batch Processing Summary")
        table.add_column("Metric", style="cyan")
        table.add_column("Value", style="green")

        # Local Files section
        if self.state.total > 0:
            table.add_row("Local Files", str(self.state.total))
            table.add_row("Completed", str(self.state.completed_count))
            if self.state.failed_count > 0:
                table.add_row("Failed", str(self.state.failed_count))

            # File cache hits
            completed_files = [
                f for f in self.state.files.values() if f.status == FileStatus.COMPLETED
            ]
            file_cache_hits = sum(1 for f in completed_files if f.cache_hit)
            if completed_files:
                table.add_row("Cache Hits", f"{file_cache_hits}/{len(completed_files)}")

            # Add separator if URLs follow
            total_urls = url_completed + url_failed
            if total_urls > 0:
                table.add_row("", "")  # Empty row as separator

        # URL Files section
        total_urls = url_completed + url_failed
        if total_urls > 0:
            if url_sources > 0:
                table.add_row("URL Files", str(url_sources))
            table.add_row("URLs", str(total_urls))
            table.add_row("Completed", str(url_completed))
            if url_failed > 0:
                table.add_row("Failed", str(url_failed))
            if url_completed > 0:
                table.add_row("Cache Hits", f"{url_cache_hits}/{url_completed}")

        # Add separator before duration
        if self.state.total > 0 or total_urls > 0:
            table.add_row("", "")  # Empty row as separator

        # Calculate wall-clock duration from started_at to updated_at
        wall_duration = 0.0
        if self.state.started_at and self.state.updated_at:
            try:
                start = datetime.fromisoformat(self.state.started_at)
                end = datetime.fromisoformat(self.state.updated_at)
                wall_duration = (end - start).total_seconds()
            except ValueError:
                # Fallback to sum of individual durations
                wall_duration = sum(f.duration or 0 for f in self.state.files.values())
        table.add_row("Duration", f"{wall_duration:.1f}s")

        # LLM cost
        total_cost = sum(f.cost_usd for f in self.state.files.values())
        if total_cost > 0:
            table.add_row("LLM Cost", f"${total_cost:.4f}")

        self.console.print(table)

        # Print failed files if any
        failed = [f for f in self.state.files.values() if f.status == FileStatus.FAILED]
        if failed:
            self.console.print("\n[red]Failed files:[/red]")
            for f in failed:
                self.console.print(f"  - {Path(f.path).name}: {f.error}")
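
To illustrate how the pieces above fit together, here is a minimal driver sketch. It is not part of the package: it assumes `BatchConfig()` can be constructed with usable default values, and it substitutes a trivial `process_func` where markitai's own conversion pipeline would normally go. The calls to `discover_files`, `process_batch`, `save_report`, and `print_summary` follow the signatures shown in batch.py above.

import asyncio
from pathlib import Path

from markitai.batch import BatchProcessor, ProcessResult
from markitai.config import BatchConfig  # assumed: default constructor is usable


async def convert_one(path: Path) -> ProcessResult:
    # Stand-in process_func for a dry run; a real one would convert the
    # document to Markdown and fill in images/cost/llm_usage fields.
    out = Path("out") / f"{path.stem}.md"
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(f"# {path.name}\n", encoding="utf-8")
    return ProcessResult(success=True, output_path=str(out))


async def main() -> None:
    processor = BatchProcessor(
        config=BatchConfig(),
        output_dir=Path("out"),
        input_path=Path("docs"),
    )
    # Discover inputs, process them concurrently, then persist the report.
    files = processor.discover_files(Path("docs"), {".docx", ".pdf"})
    state = await processor.process_batch(files, convert_one)
    processor.save_report()
    processor.print_summary()
    print(f"{state.completed_count}/{state.total} files completed")


if __name__ == "__main__":
    asyncio.run(main())

Because the state file under states/markitai.&lt;hash&gt;.state.json is keyed by the task hash, re-running the same driver with resume=True on process_batch would pick up only the files still marked pending or failed.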