markitai-0.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- markitai/__init__.py +3 -0
- markitai/batch.py +1316 -0
- markitai/cli.py +3979 -0
- markitai/config.py +602 -0
- markitai/config.schema.json +748 -0
- markitai/constants.py +222 -0
- markitai/converter/__init__.py +49 -0
- markitai/converter/_patches.py +98 -0
- markitai/converter/base.py +164 -0
- markitai/converter/image.py +181 -0
- markitai/converter/legacy.py +606 -0
- markitai/converter/office.py +526 -0
- markitai/converter/pdf.py +679 -0
- markitai/converter/text.py +63 -0
- markitai/fetch.py +1725 -0
- markitai/image.py +1335 -0
- markitai/json_order.py +550 -0
- markitai/llm.py +4339 -0
- markitai/ocr.py +347 -0
- markitai/prompts/__init__.py +159 -0
- markitai/prompts/cleaner.md +93 -0
- markitai/prompts/document_enhance.md +77 -0
- markitai/prompts/document_enhance_complete.md +65 -0
- markitai/prompts/document_process.md +60 -0
- markitai/prompts/frontmatter.md +28 -0
- markitai/prompts/image_analysis.md +21 -0
- markitai/prompts/image_caption.md +8 -0
- markitai/prompts/image_description.md +13 -0
- markitai/prompts/page_content.md +17 -0
- markitai/prompts/url_enhance.md +78 -0
- markitai/security.py +286 -0
- markitai/types.py +30 -0
- markitai/urls.py +187 -0
- markitai/utils/__init__.py +33 -0
- markitai/utils/executor.py +69 -0
- markitai/utils/mime.py +85 -0
- markitai/utils/office.py +262 -0
- markitai/utils/output.py +53 -0
- markitai/utils/paths.py +81 -0
- markitai/utils/text.py +359 -0
- markitai/workflow/__init__.py +37 -0
- markitai/workflow/core.py +760 -0
- markitai/workflow/helpers.py +509 -0
- markitai/workflow/single.py +369 -0
- markitai-0.3.0.dist-info/METADATA +159 -0
- markitai-0.3.0.dist-info/RECORD +48 -0
- markitai-0.3.0.dist-info/WHEEL +4 -0
- markitai-0.3.0.dist-info/entry_points.txt +2 -0
markitai/cli.py
ADDED
@@ -0,0 +1,3979 @@
"""Command-line interface for Markitai."""

from __future__ import annotations

import asyncio
import json
import logging
import os
import re
import sys
import tempfile
import warnings
from collections.abc import Callable
from pathlib import Path
from typing import TYPE_CHECKING, Any
from urllib.parse import urlparse

if TYPE_CHECKING:
    from markitai.fetch import FetchCache, FetchStrategy
    from markitai.llm import ImageAnalysis, LLMProcessor

# Suppress noisy messages before imports
os.environ.setdefault("PYMUPDF_SUGGEST_LAYOUT_ANALYZER", "0")
warnings.filterwarnings("ignore", category=UserWarning, module="pydantic")
# Suppress litellm async client cleanup warning (harmless, occurs at exit)
warnings.filterwarnings(
    "ignore",
    message="coroutine 'close_litellm_async_clients' was never awaited",
    category=RuntimeWarning,
)

import click
from dotenv import load_dotenv

# Load .env file from current directory and parent directories
load_dotenv()

from click import Context
from loguru import logger
from rich.console import Console
from rich.panel import Panel
from rich.syntax import Syntax

from markitai import __version__
from markitai.config import ConfigManager, MarkitaiConfig
from markitai.constants import (
    DEFAULT_MAX_IMAGES_PER_BATCH,
    IMAGE_EXTENSIONS,
    MAX_DOCUMENT_SIZE,
)
from markitai.converter import FileFormat, detect_format
from markitai.converter.base import EXTENSION_MAP
from markitai.image import ImageProcessor
from markitai.json_order import order_report
from markitai.security import (
    atomic_write_json,
    atomic_write_text,
    validate_file_size,
)
from markitai.utils.output import resolve_output_path
from markitai.utils.paths import ensure_dir, ensure_screenshots_dir
from markitai.workflow.helpers import (
    add_basic_frontmatter as _add_basic_frontmatter,
)
from markitai.workflow.helpers import (
    create_llm_processor,
    write_images_json,
)
from markitai.workflow.helpers import (
    detect_language as _detect_language,
)
from markitai.workflow.helpers import (
    merge_llm_usage as _merge_llm_usage,
)
from markitai.workflow.single import ImageAnalysisResult

console = Console()
# Separate stderr console for status/progress (doesn't mix with stdout output)
stderr_console = Console(stderr=True)

class ProgressReporter:
    """Progress reporter for single file/URL conversion.

    In non-verbose mode, shows:
    1. Spinner during conversion/processing stages
    2. Completion messages after each stage
    3. Clears all output before final result

    In verbose mode, does nothing (logging handles feedback).
    """

    def __init__(self, enabled: bool = True):
        """Initialize progress reporter.

        Args:
            enabled: Whether to show progress (False in verbose mode)
        """
        self.enabled = enabled
        self._status = None
        self._messages: list[str] = []

    def start_spinner(self, message: str) -> None:
        """Start showing a spinner with message."""
        if not self.enabled:
            return
        self.stop_spinner()  # Stop any existing spinner
        self._status = stderr_console.status(f"[cyan]{message}[/cyan]", spinner="dots")
        self._status.start()

    def stop_spinner(self) -> None:
        """Stop the current spinner."""
        if self._status is not None:
            self._status.stop()
            self._status = None

    def log(self, message: str) -> None:
        """Print a progress message."""
        if not self.enabled:
            return
        self.stop_spinner()
        self._messages.append(message)
        stderr_console.print(f"[dim]{message}[/dim]")

    def clear_and_finish(self) -> None:
        """Clear all progress output before printing final result.

        Uses ANSI escape codes to move cursor up and clear lines.
        """
        if not self.enabled:
            return
        self.stop_spinner()

        # Clear previous messages by moving cursor up and clearing lines
        if self._messages:
            # Move cursor up N lines and clear each line
            for _ in self._messages:
                # Move up one line and clear it
                stderr_console.file.write("\033[A\033[2K")
            stderr_console.file.flush()
            self._messages.clear()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.stop_spinner()
        return False


# URL pattern for detecting URLs
_URL_PATTERN = re.compile(r"^https?://", re.IGNORECASE)


def is_url(s: str) -> bool:
    """Check if string is a URL (http:// or https://)."""
    return bool(_URL_PATTERN.match(s))


def url_to_filename(url: str) -> str:
    """Generate a safe filename from URL.

    Examples:
        https://example.com/page.html -> page.html.md
        https://example.com/path/to/doc -> doc.md
        https://example.com/ -> example_com.md
        https://youtube.com/watch?v=abc -> youtube_com_watch.md
    """
    parsed = urlparse(url)

    # Try to get filename from path
    path = parsed.path.rstrip("/")
    if path:
        # Get last segment of path
        filename = path.split("/")[-1]
        if filename:
            # Sanitize for cross-platform compatibility
            filename = _sanitize_filename(filename)
            return f"{filename}.md"

    # Fallback: use domain name
    domain = parsed.netloc.replace(".", "_").replace(":", "_")
    path_part = parsed.path.strip("/").replace("/", "_")[:50]  # limit length
    if path_part:
        return f"{_sanitize_filename(domain)}_{_sanitize_filename(path_part)}.md"
    return f"{_sanitize_filename(domain)}.md"


def _sanitize_filename(name: str) -> str:
    """Sanitize filename for cross-platform compatibility.

    Removes or replaces characters that are invalid on Windows/Linux/macOS.
    """
    # Characters invalid on Windows: \ / : * ? " < > |
    # Also replace other problematic characters
    invalid_chars = r'<>:"/\|?*'
    for char in invalid_chars:
        name = name.replace(char, "_")
    # Remove leading/trailing spaces and dots (Windows issue)
    name = name.strip(". ")
    # Limit length (255 is common max, but leave room for .md extension)
    if len(name) > 200:
        name = name[:200]
    return name or "unnamed"


# Import shared ThreadPoolExecutor shutdown function from utils.executor
# This module provides a global executor shared across all conversion operations
from markitai.utils.executor import shutdown_converter_executor

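# Illustrative sketch (not from the package source): how the ProgressReporter
# defined above is meant to be driven in non-verbose mode, per its docstring.
# The messages and call order here are assumptions for illustration only.
def _example_progress_reporter_usage() -> None:
    reporter = ProgressReporter(enabled=True)
    reporter.start_spinner("Converting document...")
    reporter.log("Converted to Markdown")  # stops the spinner, prints a dim status line
    reporter.log("Analyzed 3 images")
    reporter.clear_and_finish()  # wipes the status lines before the final result
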
def compute_task_hash(
    input_path: Path,
    output_dir: Path,
    options: dict[str, Any] | None = None,
) -> str:
    """Compute hash from task input parameters.

    Hash is based on:
    - input_path (resolved)
    - output_dir (resolved)
    - key task options (llm, ocr, etc.)

    This ensures different parameter combinations produce different hashes.

    Args:
        input_path: Input file or directory path
        output_dir: Output directory path
        options: Task options dict (llm, ocr, etc.)

    Returns:
        6-character hex hash string
    """
    import hashlib

    # Extract key options that affect output
    key_options = {}
    if options:
        key_options = {
            k: v
            for k, v in options.items()
            if k
            in (
                "llm",
                "ocr",
                "screenshot",
                "alt",
                "desc",
            )
        }

    hash_params = {
        "input": str(input_path.resolve()),
        "output": str(output_dir.resolve()),
        "options": key_options,
    }
    hash_str = json.dumps(hash_params, sort_keys=True)
    return hashlib.md5(hash_str.encode()).hexdigest()[:6]

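# Illustrative sketch (not from the package source): the digest above changes
# whenever the input path, output directory, or any key option changes, so each
# parameter combination gets its own report file. The paths are hypothetical.
def _example_task_hash() -> None:
    plain = compute_task_hash(Path("docs/report.pdf"), Path("output"))
    with_llm = compute_task_hash(
        Path("docs/report.pdf"), Path("output"), options={"llm": True, "ocr": False}
    )
    # Both are 6-character hex strings; differing options give different digests
    # (barring an MD5-prefix collision), so an --llm run never reuses a plain run's report.
    logger.debug(f"plain={plain} with_llm={with_llm}")
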
def get_report_file_path(
    output_dir: Path,
    task_hash: str,
    on_conflict: str = "rename",
) -> Path:
    """Generate report file path based on task hash.

    Format: reports/markitai.<hash>.report.json
    Respects on_conflict strategy for rename.

    Args:
        output_dir: Output directory
        task_hash: Task hash string
        on_conflict: Conflict resolution strategy

    Returns:
        Path to the report file
    """
    reports_dir = output_dir / "reports"
    base_path = reports_dir / f"markitai.{task_hash}.report.json"

    if not base_path.exists():
        return base_path

    if on_conflict == "skip":
        return base_path  # Will be handled by caller
    elif on_conflict == "overwrite":
        return base_path
    else:  # rename
        seq = 2
        while True:
            new_path = reports_dir / f"markitai.{task_hash}.v{seq}.report.json"
            if not new_path.exists():
                return new_path
            seq += 1

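# Illustrative sketch (not from the package source): with the default "rename"
# strategy an existing report is never overwritten; versioned names are
# generated instead. The task hash value here is hypothetical.
def _example_report_paths(output_dir: Path) -> None:
    first = get_report_file_path(output_dir, "a1b2c3")
    # -> reports/markitai.a1b2c3.report.json (when it does not exist yet)
    rerun = get_report_file_path(output_dir, "a1b2c3", on_conflict="rename")
    # -> reports/markitai.a1b2c3.v2.report.json once the first file exists
    logger.debug(f"first={first} rerun={rerun}")
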
# =============================================================================
# Custom CLI Group
# =============================================================================


class MarkitaiGroup(click.Group):
    """Custom Group that supports main command with arguments and subcommands.

    This allows:
        markitai document.docx --llm  # Convert file (main command)
        markitai urls.urls -o out     # URL list batch (.urls auto-detected)
        markitai config list          # Subcommand
    """

    # Options that take a path argument (so we skip their values when looking for INPUT)
    _PATH_OPTIONS = {"-o", "--output", "-c", "--config"}

    def parse_args(self, ctx: Context, args: list[str]) -> list[str]:
        """Parse arguments, detecting if first arg is a subcommand or file path."""
        # Find INPUT: first positional arg that's not:
        # - An option flag (starts with -)
        # - A subcommand
        # - A value for a path option
        ctx.ensure_object(dict)
        skip_next = False
        input_idx = None

        for i, arg in enumerate(args):
            if skip_next:
                skip_next = False
                continue

            # Check if this is an option that takes a value
            if arg in self._PATH_OPTIONS or arg.startswith(
                tuple(f"{opt}=" for opt in self._PATH_OPTIONS)
            ):
                if "=" not in arg:
                    skip_next = True  # Next arg is the option's value
                continue

            if arg.startswith("-"):
                # Other options (flags or with values)
                # For simplicity, assume they don't need skipping unless it's a known path option
                continue

            # First positional argument
            if arg in self.commands:
                # It's a subcommand - stop looking
                break
            else:
                # It's a file path - store for later use
                ctx.obj["_input_path"] = arg
                input_idx = i
                break

        # Remove INPUT from args so Group doesn't treat it as subcommand
        if input_idx is not None:
            args = args[:input_idx] + args[input_idx + 1 :]

        return super().parse_args(ctx, args)

    def format_usage(
        self,
        ctx: Context,
        formatter: click.HelpFormatter,
    ) -> None:
        """Custom usage line to show INPUT argument."""
        formatter.write_usage(
            ctx.command_path,
            "[OPTIONS] INPUT [COMMAND]",
        )

    def format_help(self, ctx: Context, formatter: click.HelpFormatter) -> None:
        """Custom help formatting to show INPUT argument."""
        # Usage
        self.format_usage(ctx, formatter)

        # Help text
        self.format_help_text(ctx, formatter)

        # Arguments section
        with formatter.section("Arguments"):
            formatter.write_dl(
                [
                    (
                        "INPUT",
                        "File, directory, URL, or .urls file to convert",
                    )
                ]
            )

        # Options (not format_options which may include epilog)
        opts = []
        for param in self.get_params(ctx):
            rv = param.get_help_record(ctx)
            if rv is not None:
                opts.append(rv)
        if opts:
            with formatter.section("Options"):
                formatter.write_dl(opts)

        # Commands
        commands = []
        for name in self.list_commands(ctx):
            cmd = self.get_command(ctx, name)
            if cmd is None or cmd.hidden:
                continue
            commands.append((name, cmd.get_short_help_str(limit=formatter.width)))
        if commands:
            with formatter.section("Commands"):
                formatter.write_dl(commands)


# =============================================================================
# Utility functions
# =============================================================================

class LoggingContext:
    """Context manager for temporarily disabling/re-enabling console logging.

    This provides a clean way to manage loguru console handler lifecycle,
    especially useful for batch processing with Rich progress bars.

    Usage:
        logging_ctx = LoggingContext(console_handler_id, verbose)
        with logging_ctx.suspend_console():
            # Rich progress bar here - no console log conflicts
            ...
        # Console logging automatically restored
    """

    def __init__(self, console_handler_id: int | None, verbose: bool = False) -> None:
        self.original_handler_id = console_handler_id
        self.verbose = verbose
        self._current_handler_id: int | None = console_handler_id
        self._suspended = False

    @property
    def current_handler_id(self) -> int | None:
        """Get the current console handler ID."""
        return self._current_handler_id

    def suspend_console(self) -> LoggingContext:
        """Return self as context manager for suspend/resume."""
        return self

    def __enter__(self) -> LoggingContext:
        """Suspend console logging."""
        if self._current_handler_id is not None and not self._suspended:
            try:
                logger.remove(self._current_handler_id)
                self._suspended = True
            except ValueError:
                pass  # Handler already removed
        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Resume console logging."""
        if self._suspended:
            console_level = "DEBUG" if self.verbose else "INFO"
            self._current_handler_id = logger.add(
                sys.stderr,
                level=console_level,
                format="<green>{time:HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{message}</cyan>",
            )
            self._suspended = False

class InterceptHandler(logging.Handler):
    """Intercept standard logging and forward to loguru.

    This allows capturing logs from dependencies (litellm, instructor, etc.)
    into our unified logging system.
    """

    def emit(self, record: logging.LogRecord) -> None:
        # Get corresponding loguru level
        try:
            level = logger.level(record.levelname).name
        except ValueError:
            level = record.levelno

        # Find caller from where the logged message originated
        frame, depth = logging.currentframe(), 2
        while frame and frame.f_code.co_filename == logging.__file__:
            frame = frame.f_back
            depth += 1

        logger.opt(depth=depth, exception=record.exc_info).log(
            level, record.getMessage()
        )

def setup_logging(
    verbose: bool,
    log_dir: str | None = None,
    log_level: str = "DEBUG",
    rotation: str = "10 MB",
    retention: str = "7 days",
    quiet: bool = False,
) -> tuple[int | None, Path | None]:
    """Configure logging based on configuration.

    Args:
        verbose: Enable DEBUG level for console output.
        log_dir: Directory for log files. Supports ~ expansion.
            Can be overridden by MARKITAI_LOG_DIR env var.
        log_level: Log level for file output.
        rotation: Log file rotation size.
        retention: Log file retention period.
        quiet: If True, disable console logging entirely (for single file mode).
            Logs will still be written to file if log_dir is configured.

    Returns:
        Tuple of (console_handler_id, log_file_path).
        Console handler ID can be used to temporarily disable console logging.
        Log file path is None if file logging is disabled.
    """
    from datetime import datetime

    logger.remove()

    # Console logging: disabled in quiet mode, otherwise based on verbose flag
    console_handler_id: int | None = None
    if not quiet:
        console_level = "DEBUG" if verbose else "INFO"
        console_handler_id = logger.add(
            sys.stderr,
            level=console_level,
            format="<green>{time:HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{message}</cyan>",
        )

    # Check environment variable override
    env_log_dir = os.environ.get("MARKITAI_LOG_DIR")
    if env_log_dir:
        log_dir = env_log_dir

    # Add file logging (independent handler, not affected by console disable)
    log_file_path: Path | None = None
    if log_dir:
        log_path = Path(log_dir).expanduser()
        log_path.mkdir(parents=True, exist_ok=True)
        # Generate log filename with current timestamp (matching loguru's format)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        log_file_path = log_path / f"markitai_{timestamp}.log"
        logger.add(
            log_file_path,
            level=log_level,
            rotation=rotation,
            retention=retention,
            serialize=True,
        )

    # Intercept standard logging from dependencies (litellm, instructor, etc.)
    # and route to loguru for unified log handling
    intercept_handler = InterceptHandler()
    for logger_name in ["LiteLLM", "LiteLLM Router", "LiteLLM Proxy", "httpx"]:
        stdlib_logger = logging.getLogger(logger_name)
        stdlib_logger.handlers.clear()  # Remove existing handlers (e.g., StreamHandler)
        stdlib_logger.addHandler(intercept_handler)
        stdlib_logger.propagate = False  # Don't propagate to root logger

    return console_handler_id, log_file_path

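# Illustrative sketch (not from the package source): typical wiring of
# setup_logging() together with LoggingContext from above. The log directory is
# a hypothetical literal; the CLI normally passes cfg.log.dir.
def _example_setup_logging() -> None:
    console_id, log_file = setup_logging(
        verbose=False,
        log_dir="~/.markitai/logs",  # assumption for illustration
    )
    logger.info(f"console handler: {console_id}, log file: {log_file}")
    with LoggingContext(console_id, verbose=False).suspend_console():
        pass  # e.g. render a Rich progress bar without interleaved log lines
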
def print_version(ctx: Context, param: Any, value: bool) -> None:
    """Print version and exit."""
    if not value or ctx.resilient_parsing:
        return
    console.print(f"markitai {__version__}")
    ctx.exit(0)


# =============================================================================
# Main CLI app
# =============================================================================

@click.group(
    cls=MarkitaiGroup,
    invoke_without_command=True,
    context_settings={"help_option_names": ["-h", "--help"]},
)
@click.option(
    "--output",
    "-o",
    type=click.Path(path_type=Path),
    default=Path("./output"),
    help="Output directory.",
)
@click.option(
    "--config",
    "-c",
    "config_path",
    type=click.Path(exists=True, path_type=Path),
    default=None,
    help="Path to configuration file.",
)
@click.option(
    "--preset",
    "-p",
    type=click.Choice(["rich", "standard", "minimal"], case_sensitive=False),
    default=None,
    help="Use a preset configuration (rich/standard/minimal).",
)
@click.option(
    "--llm/--no-llm",
    default=None,
    help="Enable/disable LLM processing.",
)
@click.option(
    "--alt/--no-alt",
    default=None,
    help="Enable/disable alt text generation for images.",
)
@click.option(
    "--desc/--no-desc",
    default=None,
    help="Enable/disable JSON description file for images.",
)
@click.option(
    "--ocr/--no-ocr",
    default=None,
    help="Enable/disable OCR for scanned documents.",
)
@click.option(
    "--screenshot/--no-screenshot",
    default=None,
    help="Enable/disable page screenshots for PDF/PPTX.",
)
@click.option(
    "--resume",
    is_flag=True,
    help="Resume interrupted batch processing.",
)
@click.option(
    "--no-compress",
    is_flag=True,
    help="Disable image compression.",
)
@click.option(
    "--no-cache",
    is_flag=True,
    help="Disable LLM result caching (force fresh API calls).",
)
@click.option(
    "--no-cache-for",
    type=str,
    default=None,
    help="Disable cache for specific files/patterns (comma-separated, supports glob). "
    "E.g., 'file.pdf', '*.docx', '**/reports/*.pdf'.",
)
@click.option(
    "--llm-concurrency",
    type=int,
    default=None,
    help="Number of concurrent LLM requests (default from config).",
)
@click.option(
    "--batch-concurrency",
    "-j",
    type=int,
    default=None,
    help="Number of concurrent batch tasks (default from config).",
)
@click.option(
    "--url-concurrency",
    type=int,
    default=None,
    help="Number of concurrent URL fetches (default from config, separate from file processing).",
)
@click.option(
    "--agent-browser",
    "use_agent_browser",
    is_flag=True,
    help="Force browser rendering for URLs via agent-browser.",
)
@click.option(
    "--jina",
    "use_jina",
    is_flag=True,
    help="Force Jina Reader API for URL fetching.",
)
@click.option(
    "--verbose",
    is_flag=True,
    help="Enable verbose output.",
)
@click.option(
    "--dry-run",
    is_flag=True,
    help="Preview conversion without writing files.",
)
@click.option(
    "--version",
    "-v",
    is_flag=True,
    callback=print_version,
    expose_value=False,
    is_eager=True,
    help="Show version and exit.",
)
@click.pass_context
def app(
    ctx: Context,
    output: Path,
    config_path: Path | None,
    preset: str | None,
    llm: bool | None,
    alt: bool | None,
    desc: bool | None,
    ocr: bool | None,
    screenshot: bool | None,
    resume: bool,
    no_compress: bool,
    no_cache: bool,
    no_cache_for: str | None,
    batch_concurrency: int | None,
    url_concurrency: int | None,
    llm_concurrency: int | None,
    use_agent_browser: bool,
    use_jina: bool,
    verbose: bool,
    dry_run: bool,
) -> None:
    """Markitai - Document to Markdown converter with LLM enhancement.

    Convert various document formats and URLs to Markdown with optional
    LLM-powered enhancement for format optimization and image analysis.

    \b
    Presets:
        rich     - LLM + alt + desc + screenshot (complex documents)
        standard - LLM + alt + desc (normal documents)
        minimal  - No enhancement (just convert)

    \b
    Examples:
        markitai document.docx                         # Convert single file
        markitai https://example.com/page              # Convert web page
        markitai urls.urls -o ./output/                # Batch URL processing
        markitai https://youtube.com/watch?v=abc       # Convert YouTube video
        markitai document.pdf --preset rich            # Use rich preset
        markitai document.pdf --preset rich --ocr      # Rich + OCR for scans
        markitai document.pdf --preset rich --no-desc  # Rich without desc
        markitai ./docs/ -o ./output/ --resume         # Batch conversion
        markitai config list                           # Show configuration
    """
    # If subcommand is invoked, let it handle
    if ctx.invoked_subcommand is not None:
        return

    # Get input path from context (set by MarkitaiGroup.parse_args)
    ctx.ensure_object(dict)
    input_path_str = ctx.obj.get("_input_path")

    if not input_path_str:
        click.echo(ctx.get_help())
        ctx.exit(0)

    # Check if input is a URL
    is_url_input = is_url(input_path_str)

    # Initialize URL list mode variables
    url_entries: list = []
    is_url_list_mode = False
    input_path: Path | None = None

    # For file/directory inputs, validate existence and check for .urls file
    if not is_url_input:
        input_path = Path(input_path_str)
        if not input_path.exists():
            console.print(f"[red]Error: Path '{input_path}' does not exist.[/red]")
            ctx.exit(1)

        # Auto-detect .urls file
        if input_path.is_file() and input_path.suffix == ".urls":
            from markitai.urls import UrlListParseError, parse_url_list

            try:
                url_entries = parse_url_list(input_path)
            except UrlListParseError as e:
                console.print(f"[red]Error parsing URL list: {e}[/red]")
                ctx.exit(1)

            if not url_entries:
                console.print(f"[yellow]No valid URLs found in {input_path}[/yellow]")
                ctx.exit(0)

            is_url_list_mode = True
            input_path = None  # Clear input_path for URL list mode

    # Load configuration first
    config_manager = ConfigManager()
    cfg = config_manager.load(config_path=config_path)

    # Determine if we're in single file/URL mode (not batch)
    # Single file/URL mode: quiet console unless --verbose is specified
    # URL list mode is batch mode
    is_single_mode = (
        is_url_input or (input_path is not None and input_path.is_file())
    ) and not is_url_list_mode
    quiet_console = is_single_mode and not verbose

    # Setup logging with configuration
    console_handler_id, log_file_path = setup_logging(
        verbose=verbose,
        log_dir=cfg.log.dir,
        log_level=cfg.log.level,
        rotation=cfg.log.rotation,
        retention=cfg.log.retention,
        quiet=quiet_console,
    )

    # Log configuration status after logging is set up
    if config_manager.config_path:
        logger.info(f"[Config] Loaded from: {config_manager.config_path}")
    else:
        logger.warning("[Config] No config file found, using defaults")

    # Warn if LLM is enabled but no models configured
    if cfg.llm.enabled and not cfg.llm.model_list:
        logger.warning(
            "[Config] LLM enabled but no models configured. "
            "Add models to llm.model_list in config file or specify -c <config_path>"
        )
    elif cfg.llm.enabled and cfg.llm.model_list:
        model_names = [m.litellm_params.model for m in cfg.llm.model_list]
        unique_models = set(model_names)
        logger.debug(
            f"[Config] LLM models configured: {len(model_names)} entries, "
            f"{len(unique_models)} unique models"
        )

    # Store handler ID, log file path and verbose in context for batch processing
    ctx.obj["_console_handler_id"] = console_handler_id
    ctx.obj["_log_file_path"] = log_file_path
    ctx.obj["_verbose"] = verbose

    # Apply preset first (if specified)
    from markitai.config import get_preset

    if preset:
        preset_config = get_preset(preset, cfg)
        if preset_config:
            # Apply preset values as base
            cfg.llm.enabled = preset_config.llm
            cfg.image.alt_enabled = preset_config.alt
            cfg.image.desc_enabled = preset_config.desc
            cfg.ocr.enabled = preset_config.ocr
            cfg.screenshot.enabled = preset_config.screenshot
            logger.debug(f"Applied preset: {preset}")
        else:
            console.print(f"[yellow]Warning: Unknown preset '{preset}'[/yellow]")

    # Override with explicit CLI options (--flag or --no-flag)
    # None means not specified, so we don't override
    if llm is not None:
        cfg.llm.enabled = llm
    if alt is not None:
        cfg.image.alt_enabled = alt
    if desc is not None:
        cfg.image.desc_enabled = desc
    if ocr is not None:
        cfg.ocr.enabled = ocr
    if screenshot is not None:
        cfg.screenshot.enabled = screenshot
    if no_compress:
        cfg.image.compress = False
    if no_cache:
        cfg.cache.no_cache = True
    if no_cache_for:
        # Parse comma-separated patterns
        cfg.cache.no_cache_patterns = [
            p.strip() for p in no_cache_for.split(",") if p.strip()
        ]
    if batch_concurrency is not None:
        cfg.batch.concurrency = batch_concurrency
    if url_concurrency is not None:
        cfg.batch.url_concurrency = url_concurrency
    if llm_concurrency is not None:
        cfg.llm.concurrency = llm_concurrency

    # Validate vision model configuration if image analysis is enabled
    _check_vision_model_config(cfg, console, verbose)

    # Validate fetch strategy flags (mutually exclusive)
    if use_agent_browser and use_jina:
        console.print(
            "[red]Error: --agent-browser and --jina are mutually exclusive.[/red]"
        )
        ctx.exit(1)

    # Determine fetch strategy
    from markitai.fetch import FetchStrategy

    if use_agent_browser:
        fetch_strategy = FetchStrategy.BROWSER
        explicit_fetch_strategy = True
    elif use_jina:
        fetch_strategy = FetchStrategy.JINA
        explicit_fetch_strategy = True
    else:
        # Use config default or auto
        fetch_strategy = FetchStrategy(cfg.fetch.strategy)
        explicit_fetch_strategy = False

    # Log input info
    if is_url_list_mode:
        logger.debug(f"Processing URL list: {len(url_entries)} URLs")
    elif is_url_input:
        logger.debug(f"Processing URL: {input_path_str}")
    else:
        assert input_path is not None  # Already validated above
        logger.debug(f"Processing: {input_path.resolve()}")
    logger.debug(f"Output directory: {output.resolve()}")

    async def run_workflow() -> None:
        # URL list batch mode
        if is_url_list_mode:
            await process_url_batch(
                url_entries,
                output,
                cfg,
                dry_run,
                verbose,
                log_file_path,
                concurrency=cfg.batch.url_concurrency,
                fetch_strategy=fetch_strategy,
                explicit_fetch_strategy=explicit_fetch_strategy,
            )
            return

        # Single URL mode
        if is_url_input:
            assert input_path_str is not None  # Guaranteed when is_url_input is True
            await process_url(
                input_path_str,
                output,
                cfg,
                dry_run,
                verbose,
                log_file_path,
                fetch_strategy=fetch_strategy,
                explicit_fetch_strategy=explicit_fetch_strategy,
            )
            return

        # File/directory mode
        assert input_path is not None  # Already validated above

        # Check if input is directory (batch mode)
        if input_path.is_dir():
            await process_batch(
                input_path,
                output,
                cfg,
                resume,
                dry_run,
                verbose=verbose,
                console_handler_id=console_handler_id,
                log_file_path=log_file_path,
                fetch_strategy=fetch_strategy,
                explicit_fetch_strategy=explicit_fetch_strategy,
            )
            return

        # Single file mode
        await process_single_file(
            input_path, output, cfg, dry_run, log_file_path, verbose
        )

    async def run_workflow_with_cleanup() -> None:
        """Run workflow with explicit resource cleanup on exit."""
        from markitai.fetch import close_shared_clients

        try:
            await run_workflow()
        finally:
            # Cleanup shared resources
            await close_shared_clients()  # Close httpx.AsyncClient for Jina
            shutdown_converter_executor()  # Shutdown ThreadPoolExecutor
            # Note: FetchCache cleanup happens automatically when process exits
            # as SQLite handles connection cleanup. For explicit cleanup, the
            # global _fetch_cache.close() could be called, but it's not critical.

    asyncio.run(run_workflow_with_cleanup())

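# Illustrative sketch (not from the package source): the option precedence
# implemented in app() above - a preset is applied first, then any explicit
# --flag/--no-flag overrides individual fields, so
# `markitai doc.pdf --preset rich --no-desc` keeps the rich preset except desc.
# A minimal stand-alone mirror of that rule:
def _example_flag_precedence(preset_value: bool, cli_flag: bool | None) -> bool:
    effective = preset_value  # 1) preset supplies the base value
    if cli_flag is not None:  # 2) an explicit CLI flag wins
        effective = cli_flag
    return effective  # _example_flag_precedence(True, False) -> False
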
# =============================================================================
# Config subcommands
# =============================================================================


@app.group()
def config() -> None:
    """Configuration management commands."""
    pass


@config.command("list")
def config_list() -> None:
    """Show current effective configuration."""
    manager = ConfigManager()
    cfg = manager.load()

    config_dict = cfg.model_dump(mode="json", exclude_none=True)
    config_json = json.dumps(config_dict, indent=2, ensure_ascii=False)

    syntax = Syntax(config_json, "json", theme="monokai", line_numbers=False)
    console.print(syntax)


@config.command("path")
def config_path_cmd() -> None:
    """Show configuration file paths."""
    manager = ConfigManager()
    manager.load()

    console.print("[bold]Configuration file search order:[/bold]")
    console.print("  1. --config CLI argument")
    console.print("  2. MARKITAI_CONFIG environment variable")
    console.print("  3. ./markitai.json (current directory)")
    console.print(f"  4. {manager.DEFAULT_USER_CONFIG_DIR / 'config.json'}")
    console.print()

    if manager.config_path:
        console.print(f"[green]Currently using:[/green] {manager.config_path}")
    else:
        console.print(
            "[yellow]Using default configuration (no config file found)[/yellow]"
        )


@config.command("init")
@click.option(
    "--output",
    "-o",
    "output_path",
    type=click.Path(path_type=Path),
    default=None,
    help="Output path for configuration file.",
)
def config_init(output_path: Path | None) -> None:
    """Initialize a configuration file with defaults."""
    manager = ConfigManager()

    if output_path is None:
        output_path = manager.DEFAULT_USER_CONFIG_DIR / "config.json"
    elif output_path.is_dir():
        # User passed a directory, append default filename
        output_path = output_path / "markitai.json"

    # Check if file exists (not directory)
    if output_path.exists() and output_path.is_file():
        if not click.confirm(f"{output_path} already exists. Overwrite?"):
            raise click.Abort()

    # Save minimal template config (essential fields only)
    saved_path = manager.save(output_path, minimal=True)
    console.print(f"[green]Configuration file created:[/green] {saved_path}")
    console.print("\nEdit this file to customize your settings.")
    console.print(
        "[dim]Note: max_tokens, supports_vision are auto-detected from litellm.[/dim]"
    )
    console.print("Run 'markitai config list' to see the current configuration.")


@config.command("validate")
@click.argument(
    "config_file",
    type=click.Path(exists=True, path_type=Path),
    required=False,
)
def config_validate(config_file: Path | None) -> None:
    """Validate a configuration file."""
    manager = ConfigManager()

    try:
        manager.load(config_path=config_file)

        console.print("[green]Configuration is valid![/green]")

        if manager.config_path:
            console.print(f"[dim]Validated: {manager.config_path}[/dim]")

    except Exception as e:
        console.print(f"[red]Configuration error:[/red] {e}")
        raise SystemExit(2)


@config.command("get")
@click.argument("key")
def config_get(key: str) -> None:
    """Get a configuration value."""
    manager = ConfigManager()
    manager.load()

    value = manager.get(key)
    if value is None:
        console.print(f"[yellow]Key not found:[/yellow] {key}")
        raise SystemExit(1)

    # Format output
    if isinstance(value, (dict, list)):
        console.print(json.dumps(value, indent=2, ensure_ascii=False))
    else:
        console.print(str(value))


@config.command("set")
@click.argument("key")
@click.argument("value")
def config_set(key: str, value: str) -> None:
    """Set a configuration value."""
    manager = ConfigManager()
    manager.load()

    # Parse value
    parsed_value: bool | int | float | str
    if value.lower() in ("true", "false"):
        parsed_value = value.lower() == "true"
    else:
        try:
            parsed_value = int(value)
        except ValueError:
            try:
                parsed_value = float(value)
            except ValueError:
                parsed_value = value

    try:
        manager.set(key, parsed_value)
        manager.save()
        console.print(f"[green]Set {key} = {parsed_value}[/green]")

    except Exception as e:
        console.print(f"[red]Error setting value:[/red] {e}")
        raise SystemExit(1)

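# Illustrative sketch (not from the package source): the value coercion used by
# `markitai config set` above - "true"/"false" become booleans, then int, then
# float, with the raw string as the fallback. Keys shown are assumptions.
def _example_parse_config_value(value: str) -> bool | int | float | str:
    if value.lower() in ("true", "false"):
        return value.lower() == "true"  # e.g. `markitai config set llm.enabled true`
    try:
        return int(value)  # e.g. `markitai config set llm.concurrency 4`
    except ValueError:
        try:
            return float(value)
        except ValueError:
            return value  # e.g. `markitai config set log.level DEBUG`
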
# =============================================================================
# Cache subcommands
# =============================================================================


@app.group()
def cache() -> None:
    """Cache management commands."""
    pass


@cache.command("stats")
@click.option(
    "--json",
    "as_json",
    is_flag=True,
    help="Output as JSON.",
)
@click.option(
    "-v",
    "--verbose",
    is_flag=True,
    help="Show detailed cache entries and model breakdown.",
)
@click.option(
    "--limit",
    default=20,
    type=int,
    help="Number of entries to show in verbose mode (default: 20).",
)
@click.option(
    "--scope",
    type=click.Choice(["project", "global", "all"]),
    default="all",
    help="Cache scope to display (default: all).",
)
def cache_stats(as_json: bool, verbose: bool, limit: int, scope: str) -> None:
    """Show cache statistics."""
    from rich.table import Table

    from markitai.constants import (
        DEFAULT_CACHE_DB_FILENAME,
        DEFAULT_PROJECT_CACHE_DIR,
    )
    from markitai.llm import SQLiteCache

    def format_size(size_bytes: int) -> str:
        """Format size in human-readable format."""
        if size_bytes < 1024:
            return f"{size_bytes} B"
        elif size_bytes < 1024 * 1024:
            return f"{size_bytes / 1024:.1f} KB"
        else:
            return f"{size_bytes / (1024 * 1024):.2f} MB"

    def print_verbose_details(
        cache: SQLiteCache, cache_name: str, limit: int, as_json: bool
    ) -> dict[str, Any]:
        """Collect and optionally print verbose cache details."""
        by_model = cache.stats_by_model()
        entries = cache.list_entries(limit)

        if not as_json:
            # Print By Model table
            if by_model:
                model_table = Table(title=f"{cache_name} - By Model")
                model_table.add_column("Model", style="cyan")
                model_table.add_column("Entries", justify="right")
                model_table.add_column("Size", justify="right")
                for model, data in by_model.items():
                    model_table.add_row(
                        model, str(data["count"]), format_size(data["size_bytes"])
                    )
                console.print(model_table)
                console.print()

            # Print Recent Entries table
            if entries:
                entry_table = Table(title=f"{cache_name} - Recent Entries")
                entry_table.add_column("Key", style="dim", max_width=18)
                entry_table.add_column("Model", max_width=30)
                entry_table.add_column("Size", justify="right")
                entry_table.add_column("Preview", max_width=40)
                for entry in entries:
                    key_display = (
                        entry["key"][:16] + "..."
                        if len(entry["key"]) > 16
                        else entry["key"]
                    )
                    entry_table.add_row(
                        key_display,
                        entry["model"],
                        format_size(entry["size_bytes"]),
                        entry["preview"],
                    )
                console.print(entry_table)

        return {"by_model": by_model, "entries": entries}

    manager = ConfigManager()
    cfg = manager.load()

    stats_data: dict[str, Any] = {
        "project": None,
        "global": None,
        "enabled": cfg.cache.enabled,
    }

    # Check project cache (current directory)
    project_cache: SQLiteCache | None = None
    if scope in ("project", "all"):
        project_cache_path = (
            Path.cwd() / DEFAULT_PROJECT_CACHE_DIR / DEFAULT_CACHE_DB_FILENAME
        )
        if project_cache_path.exists():
            try:
                project_cache = SQLiteCache(
                    project_cache_path, cfg.cache.max_size_bytes
                )
                stats_data["project"] = project_cache.stats()
            except Exception as e:
                stats_data["project"] = {"error": str(e)}

    # Check global cache
    global_cache: SQLiteCache | None = None
    if scope in ("global", "all"):
        global_cache_path = (
            Path(cfg.cache.global_dir).expanduser() / DEFAULT_CACHE_DB_FILENAME
        )
        if global_cache_path.exists():
            try:
                global_cache = SQLiteCache(global_cache_path, cfg.cache.max_size_bytes)
                stats_data["global"] = global_cache.stats()
            except Exception as e:
                stats_data["global"] = {"error": str(e)}

    # Collect verbose data if needed
    if verbose:
        if (
            project_cache
            and stats_data["project"]
            and "error" not in stats_data["project"]
        ):
            verbose_data = print_verbose_details(
                project_cache, "Project Cache", limit, as_json
            )
            stats_data["project"]["by_model"] = verbose_data["by_model"]
            stats_data["project"]["entries"] = verbose_data["entries"]

        if (
            global_cache
            and stats_data["global"]
            and "error" not in stats_data["global"]
        ):
            verbose_data = print_verbose_details(
                global_cache, "Global Cache", limit, as_json
            )
            stats_data["global"]["by_model"] = verbose_data["by_model"]
            stats_data["global"]["entries"] = verbose_data["entries"]

    if as_json:
        # Use soft_wrap=True to prevent rich from breaking long lines
        console.print(
            json.dumps(stats_data, indent=2, ensure_ascii=False), soft_wrap=True
        )
    else:
        console.print("[bold]Cache Statistics[/bold]")
        console.print(f"Enabled: {cfg.cache.enabled}")
        console.print()

        if scope in ("project", "all"):
            if stats_data["project"]:
                p = stats_data["project"]
                if "error" in p:
                    console.print(f"[red]Project cache error:[/red] {p['error']}")
                else:
                    console.print("[bold]Project Cache[/bold]")
                    console.print(f"  Path: {p['db_path']}")
                    console.print(f"  Entries: {p['count']}")
                    console.print(f"  Size: {p['size_mb']} MB / {p['max_size_mb']} MB")
                    console.print()
            else:
                console.print("[dim]No project cache found in current directory[/dim]")
                console.print()

        if scope in ("global", "all"):
            if stats_data["global"]:
                g = stats_data["global"]
                if "error" in g:
                    console.print(f"[red]Global cache error:[/red] {g['error']}")
                else:
                    console.print("[bold]Global Cache[/bold]")
                    console.print(f"  Path: {g['db_path']}")
                    console.print(f"  Entries: {g['count']}")
                    console.print(f"  Size: {g['size_mb']} MB / {g['max_size_mb']} MB")
                    console.print()
            else:
                console.print("[dim]No global cache found[/dim]")

@cache.command("clear")
@click.option(
    "--scope",
    type=click.Choice(["project", "global", "all"]),
    default="project",
    help="Which cache to clear (default: project).",
)
@click.option(
    "--yes",
    "-y",
    is_flag=True,
    help="Skip confirmation prompt.",
)
def cache_clear(scope: str, yes: bool) -> None:
    """Clear cache entries."""
    from markitai.constants import (
        DEFAULT_CACHE_DB_FILENAME,
        DEFAULT_PROJECT_CACHE_DIR,
    )
    from markitai.llm import SQLiteCache

    manager = ConfigManager()
    cfg = manager.load()

    # Confirm if not --yes
    if not yes:
        scope_desc = {
            "project": "project cache (current directory)",
            "global": "global cache (~/.markitai)",
            "all": "ALL caches (project + global)",
        }
        if not click.confirm(f"Clear {scope_desc[scope]}?"):
            console.print("[yellow]Aborted[/yellow]")
            return

    result = {"project": 0, "global": 0}

    # Clear project cache
    if scope in ("project", "all"):
        project_cache_path = (
            Path.cwd() / DEFAULT_PROJECT_CACHE_DIR / DEFAULT_CACHE_DB_FILENAME
        )
        if project_cache_path.exists():
            try:
                project_cache = SQLiteCache(
                    project_cache_path, cfg.cache.max_size_bytes
                )
                result["project"] = project_cache.clear()
            except Exception as e:
                console.print(f"[red]Failed to clear project cache:[/red] {e}")

    # Clear global cache
    if scope in ("global", "all"):
        global_cache_path = (
            Path(cfg.cache.global_dir).expanduser() / DEFAULT_CACHE_DB_FILENAME
        )
        if global_cache_path.exists():
            try:
                global_cache = SQLiteCache(global_cache_path, cfg.cache.max_size_bytes)
                result["global"] = global_cache.clear()
            except Exception as e:
                console.print(f"[red]Failed to clear global cache:[/red] {e}")

    # Report results
    total = result["project"] + result["global"]
    if total > 0:
        console.print(f"[green]Cleared {total} cache entries[/green]")
        if result["project"] > 0:
            console.print(f"  Project: {result['project']}")
        if result["global"] > 0:
            console.print(f"  Global: {result['global']}")
    else:
        console.print("[dim]No cache entries to clear[/dim]")


# =============================================================================
# Check dependencies command
# =============================================================================

@app.command("check-deps")
|
|
1421
|
+
@click.option(
|
|
1422
|
+
"--json",
|
|
1423
|
+
"as_json",
|
|
1424
|
+
is_flag=True,
|
|
1425
|
+
help="Output as JSON.",
|
|
1426
|
+
)
|
|
1427
|
+
def check_deps(as_json: bool) -> None:
|
|
1428
|
+
"""Check all optional dependencies and their status.
|
|
1429
|
+
|
|
1430
|
+
This command helps diagnose setup issues by verifying:
|
|
1431
|
+
- agent-browser (for dynamic URL fetching)
|
|
1432
|
+
- LibreOffice (for Office document conversion)
|
|
1433
|
+
- Tesseract OCR (for scanned document processing)
|
|
1434
|
+
- LLM API configuration (for content enhancement)
|
|
1435
|
+
"""
|
|
1436
|
+
import json
|
|
1437
|
+
import shutil
|
|
1438
|
+
import subprocess
|
|
1439
|
+
|
|
1440
|
+
from rich.panel import Panel
|
|
1441
|
+
from rich.table import Table
|
|
1442
|
+
|
|
1443
|
+
from markitai.fetch import verify_agent_browser_ready
|
|
1444
|
+
|
|
1445
|
+
manager = ConfigManager()
|
|
1446
|
+
cfg = manager.load()
|
|
1447
|
+
|
|
1448
|
+
results: dict[str, dict[str, Any]] = {}
|
|
1449
|
+
|
|
1450
|
+
# 1. Check agent-browser
|
|
1451
|
+
is_ready, message = verify_agent_browser_ready(use_cache=False)
|
|
1452
|
+
results["agent-browser"] = {
|
|
1453
|
+
"name": "agent-browser",
|
|
1454
|
+
"description": "Browser automation for dynamic URLs",
|
|
1455
|
+
"status": "ok" if is_ready else "missing",
|
|
1456
|
+
"message": message,
|
|
1457
|
+
"install_hint": "npm install -g agent-browser && npx playwright install chromium",
|
|
1458
|
+
}
|
|
1459
|
+
|
|
1460
|
+
# 2. Check LibreOffice
|
|
1461
|
+
soffice_path = shutil.which("soffice") or shutil.which("libreoffice")
|
|
1462
|
+
if soffice_path:
|
|
1463
|
+
try:
|
|
1464
|
+
proc = subprocess.run(
|
|
1465
|
+
[soffice_path, "--version"],
|
|
1466
|
+
capture_output=True,
|
|
1467
|
+
text=True,
|
|
1468
|
+
timeout=10,
|
|
1469
|
+
)
|
|
1470
|
+
version = (
|
|
1471
|
+
proc.stdout.strip().split("\n")[0]
|
|
1472
|
+
if proc.returncode == 0
|
|
1473
|
+
else "unknown"
|
|
1474
|
+
)
|
|
1475
|
+
results["libreoffice"] = {
|
|
1476
|
+
"name": "LibreOffice",
|
|
1477
|
+
"description": "Office document conversion (doc, docx, xls, xlsx, ppt, pptx)",
|
|
1478
|
+
"status": "ok",
|
|
1479
|
+
"message": f"Found at {soffice_path} ({version})",
|
|
1480
|
+
"install_hint": "",
|
|
1481
|
+
}
|
|
1482
|
+
except Exception as e:
|
|
1483
|
+
results["libreoffice"] = {
|
|
1484
|
+
"name": "LibreOffice",
|
|
1485
|
+
"description": "Office document conversion (doc, docx, xls, xlsx, ppt, pptx)",
|
|
1486
|
+
"status": "error",
|
|
1487
|
+
"message": f"Found but failed to run: {e}",
|
|
1488
|
+
"install_hint": "Reinstall LibreOffice",
|
|
1489
|
+
}
|
|
1490
|
+
else:
|
|
1491
|
+
results["libreoffice"] = {
|
|
1492
|
+
"name": "LibreOffice",
|
|
1493
|
+
"description": "Office document conversion (doc, docx, xls, xlsx, ppt, pptx)",
|
|
1494
|
+
"status": "missing",
|
|
1495
|
+
"message": "soffice/libreoffice command not found",
|
|
1496
|
+
"install_hint": "apt install libreoffice (Linux) / brew install libreoffice (macOS)",
|
|
1497
|
+
}
|
|
1498
|
+
|
|
1499
|
+
# 3. Check Tesseract OCR
|
|
1500
|
+
tesseract_path = shutil.which("tesseract")
|
|
1501
|
+
if tesseract_path:
|
|
1502
|
+
try:
|
|
1503
|
+
proc = subprocess.run(
|
|
1504
|
+
["tesseract", "--version"],
|
|
1505
|
+
capture_output=True,
|
|
1506
|
+
text=True,
|
|
1507
|
+
timeout=10,
|
|
1508
|
+
)
|
|
1509
|
+
version = (
|
|
1510
|
+
proc.stdout.strip().split("\n")[0]
|
|
1511
|
+
if proc.returncode == 0
|
|
1512
|
+
else "unknown"
|
|
1513
|
+
)
|
|
1514
|
+
results["tesseract"] = {
|
|
1515
|
+
"name": "Tesseract OCR",
|
|
1516
|
+
"description": "OCR for scanned documents",
|
|
1517
|
+
"status": "ok",
|
|
1518
|
+
"message": f"Found at {tesseract_path} ({version})",
|
|
1519
|
+
"install_hint": "",
|
|
1520
|
+
}
|
|
1521
|
+
except Exception as e:
|
|
1522
|
+
results["tesseract"] = {
|
|
1523
|
+
"name": "Tesseract OCR",
|
|
1524
|
+
"description": "OCR for scanned documents",
|
|
1525
|
+
"status": "error",
|
|
1526
|
+
"message": f"Found but failed to run: {e}",
|
|
1527
|
+
"install_hint": "Reinstall tesseract",
|
|
1528
|
+
}
|
|
1529
|
+
else:
|
|
1530
|
+
results["tesseract"] = {
|
|
1531
|
+
"name": "Tesseract OCR",
|
|
1532
|
+
"description": "OCR for scanned documents",
|
|
1533
|
+
"status": "missing",
|
|
1534
|
+
"message": "tesseract command not found",
|
|
1535
|
+
"install_hint": "apt install tesseract-ocr (Linux) / brew install tesseract (macOS)",
|
|
1536
|
+
}
|
|
1537
|
+
|
|
1538
|
+
# 4. Check LLM API configuration (check model_list for configured models)
|
|
1539
|
+
configured_models = cfg.llm.model_list if cfg.llm.model_list else []
|
|
1540
|
+
if configured_models:
|
|
1541
|
+
# Find first model with api_key to determine provider
|
|
1542
|
+
first_model = configured_models[0].litellm_params.model
|
|
1543
|
+
provider = first_model.split("/")[0] if "/" in first_model else "openai"
|
|
1544
|
+
results["llm-api"] = {
|
|
1545
|
+
"name": f"LLM API ({provider})",
|
|
1546
|
+
"description": "Content enhancement and image analysis",
|
|
1547
|
+
"status": "ok",
|
|
1548
|
+
"message": f"{len(configured_models)} model(s) configured",
|
|
1549
|
+
"install_hint": "",
|
|
1550
|
+
}
|
|
1551
|
+
else:
|
|
1552
|
+
results["llm-api"] = {
|
|
1553
|
+
"name": "LLM API",
|
|
1554
|
+
"description": "Content enhancement and image analysis",
|
|
1555
|
+
"status": "missing",
|
|
1556
|
+
"message": "No models configured in llm.model_list",
|
|
1557
|
+
"install_hint": "Configure llm.model_list in markitai.json",
|
|
1558
|
+
}
|
|
1559
|
+
|
|
1560
|
+
# 5. Check vision model configuration (models with supports_vision=true)
|
|
1561
|
+
vision_models = [
|
|
1562
|
+
m for m in configured_models if m.model_info and m.model_info.supports_vision
|
|
1563
|
+
]
|
|
1564
|
+
if vision_models:
|
|
1565
|
+
vision_model_names = [m.litellm_params.model for m in vision_models]
|
|
1566
|
+
results["vision-model"] = {
|
|
1567
|
+
"name": "Vision Model",
|
|
1568
|
+
"description": "Image analysis (alt text, descriptions)",
|
|
1569
|
+
"status": "ok",
|
|
1570
|
+
"message": f"Configured: {', '.join(vision_model_names[:2])}{'...' if len(vision_model_names) > 2 else ''}",
|
|
1571
|
+
"install_hint": "",
|
|
1572
|
+
}
|
|
1573
|
+
else:
|
|
1574
|
+
results["vision-model"] = {
|
|
1575
|
+
"name": "Vision Model",
|
|
1576
|
+
"description": "Image analysis (alt text, descriptions)",
|
|
1577
|
+
"status": "warning",
|
|
1578
|
+
"message": "No vision model configured (set model_info.supports_vision=true)",
|
|
1579
|
+
"install_hint": "Add supports_vision: true to model_info in model_list",
|
|
1580
|
+
}
|
|
1581
|
+
|
|
1582
|
+
# Output results
|
|
1583
|
+
if as_json:
|
|
1584
|
+
# Use click.echo for raw JSON (avoid Rich formatting which breaks JSON)
|
|
1585
|
+
click.echo(json.dumps(results, indent=2))
|
|
1586
|
+
return
|
|
1587
|
+
|
|
1588
|
+
# Rich table output
|
|
1589
|
+
table = Table(title="Dependency Status")
|
|
1590
|
+
table.add_column("Component", style="cyan")
|
|
1591
|
+
table.add_column("Status", justify="center")
|
|
1592
|
+
table.add_column("Description")
|
|
1593
|
+
table.add_column("Details")
|
|
1594
|
+
|
|
1595
|
+
status_icons = {
|
|
1596
|
+
"ok": "[green]✓[/green]",
|
|
1597
|
+
"warning": "[yellow]⚠[/yellow]",
|
|
1598
|
+
"missing": "[red]✗[/red]",
|
|
1599
|
+
"error": "[red]![/red]",
|
|
1600
|
+
}
|
|
1601
|
+
|
|
1602
|
+
for _key, info in results.items():
|
|
1603
|
+
status_icon = status_icons.get(info["status"], "?")
|
|
1604
|
+
table.add_row(
|
|
1605
|
+
info["name"],
|
|
1606
|
+
status_icon,
|
|
1607
|
+
info["description"],
|
|
1608
|
+
info["message"],
|
|
1609
|
+
)
|
|
1610
|
+
|
|
1611
|
+
console.print(table)
|
|
1612
|
+
console.print()
|
|
1613
|
+
|
|
1614
|
+
# Show install hints for missing/error items
|
|
1615
|
+
hints = [
|
|
1616
|
+
(info["name"], info["install_hint"])
|
|
1617
|
+
for info in results.values()
|
|
1618
|
+
if info["status"] in ("missing", "error") and info["install_hint"]
|
|
1619
|
+
]
|
|
1620
|
+
|
|
1621
|
+
if hints:
|
|
1622
|
+
hint_text = "\n".join([f" • {name}: {hint}" for name, hint in hints])
|
|
1623
|
+
console.print(
|
|
1624
|
+
Panel(
|
|
1625
|
+
f"[yellow]To fix missing dependencies:[/yellow]\n{hint_text}",
|
|
1626
|
+
title="Installation Hints",
|
|
1627
|
+
border_style="yellow",
|
|
1628
|
+
)
|
|
1629
|
+
)
|
|
1630
|
+
else:
|
|
1631
|
+
console.print("[green]All dependencies are properly configured![/green]")
|
|
1632
|
+
|
|
1633
|
+
|
|
1634
|
+
# =============================================================================
|
|
1635
|
+
# Processing functions
|
|
1636
|
+
# =============================================================================
|
|
1637
|
+
|
|
1638
|
+
|
|
1639
|
+
async def process_single_file(
|
|
1640
|
+
input_path: Path,
|
|
1641
|
+
output_dir: Path,
|
|
1642
|
+
cfg: MarkitaiConfig,
|
|
1643
|
+
dry_run: bool,
|
|
1644
|
+
log_file_path: Path | None = None,
|
|
1645
|
+
verbose: bool = False,
|
|
1646
|
+
) -> None:
|
|
1647
|
+
"""Process a single file using workflow/core pipeline.
|
|
1648
|
+
|
|
1649
|
+
After conversion completes, outputs the final markdown to stdout.
|
|
1650
|
+
If LLM is enabled, outputs .llm.md content; otherwise outputs .md content.
|
|
1651
|
+
"""
|
|
1652
|
+
from datetime import datetime
|
|
1653
|
+
|
|
1654
|
+
from markitai.workflow.core import (
|
|
1655
|
+
ConversionContext,
|
|
1656
|
+
convert_document_core,
|
|
1657
|
+
)
|
|
1658
|
+
|
|
1659
|
+
# Validate file size to prevent DoS
|
|
1660
|
+
try:
|
|
1661
|
+
validate_file_size(input_path, MAX_DOCUMENT_SIZE)
|
|
1662
|
+
except ValueError as e:
|
|
1663
|
+
console.print(Panel(f"[red]{e}[/red]", title="Error"))
|
|
1664
|
+
raise SystemExit(1)
|
|
1665
|
+
|
|
1666
|
+
# Detect file format for dry-run display
|
|
1667
|
+
fmt = detect_format(input_path)
|
|
1668
|
+
if fmt == FileFormat.UNKNOWN:
|
|
1669
|
+
console.print(
|
|
1670
|
+
Panel(
|
|
1671
|
+
f"[red]Unsupported file format: {input_path.suffix}[/red]",
|
|
1672
|
+
title="Error",
|
|
1673
|
+
)
|
|
1674
|
+
)
|
|
1675
|
+
raise SystemExit(1)
|
|
1676
|
+
|
|
1677
|
+
# Handle dry-run
|
|
1678
|
+
if dry_run:
|
|
1679
|
+
cache_status = "enabled" if cfg.cache.enabled else "disabled"
|
|
1680
|
+
dry_run_msg = (
|
|
1681
|
+
f"[yellow]Would convert:[/yellow] {input_path}\n"
|
|
1682
|
+
f"[yellow]Format:[/yellow] {fmt.value.upper()}\n"
|
|
1683
|
+
f"[yellow]Output:[/yellow] {output_dir / (input_path.name + '.md')}\n"
|
|
1684
|
+
f"[yellow]Cache:[/yellow] {cache_status}"
|
|
1685
|
+
)
|
|
1686
|
+
console.print(Panel(dry_run_msg, title="Dry Run"))
|
|
1687
|
+
if cfg.cache.enabled:
|
|
1688
|
+
console.print(
|
|
1689
|
+
"[dim]Tip: Use 'markitai cache stats -v' to view cached entries[/dim]"
|
|
1690
|
+
)
|
|
1691
|
+
raise SystemExit(0)
|
|
1692
|
+
|
|
1693
|
+
# Progress reporter for non-verbose mode feedback
|
|
1694
|
+
progress = ProgressReporter(enabled=not verbose)
|
|
1695
|
+
started_at = datetime.now()
|
|
1696
|
+
error_msg = None
|
|
1697
|
+
|
|
1698
|
+
try:
|
|
1699
|
+
progress.start_spinner(f"Converting {input_path.name}...")
|
|
1700
|
+
|
|
1701
|
+
# Create conversion context
|
|
1702
|
+
ctx = ConversionContext(
|
|
1703
|
+
input_path=input_path,
|
|
1704
|
+
output_dir=output_dir,
|
|
1705
|
+
config=cfg,
|
|
1706
|
+
project_dir=output_dir.parent,
|
|
1707
|
+
)
|
|
1708
|
+
|
|
1709
|
+
# Run core conversion pipeline
|
|
1710
|
+
result = await convert_document_core(ctx, MAX_DOCUMENT_SIZE)
|
|
1711
|
+
|
|
1712
|
+
if not result.success:
|
|
1713
|
+
if result.error:
|
|
1714
|
+
raise RuntimeError(result.error)
|
|
1715
|
+
raise RuntimeError("Unknown conversion error")
|
|
1716
|
+
|
|
1717
|
+
if result.skip_reason == "exists":
|
|
1718
|
+
progress.stop_spinner()
|
|
1719
|
+
base_output_file = output_dir / f"{input_path.name}.md"
|
|
1720
|
+
console.print(f"[yellow]Skipped (exists):[/yellow] {base_output_file}")
|
|
1721
|
+
return
|
|
1722
|
+
|
|
1723
|
+
# Show conversion complete message
|
|
1724
|
+
progress.log(f"Converted: {input_path.name}")
|
|
1725
|
+
|
|
1726
|
+
# Write image descriptions (single file)
|
|
1727
|
+
if ctx.image_analysis and cfg.image.desc_enabled:
|
|
1728
|
+
write_images_json(output_dir, [ctx.image_analysis])
|
|
1729
|
+
|
|
1730
|
+
# Generate report
|
|
1731
|
+
finished_at = datetime.now()
|
|
1732
|
+
duration = (finished_at - started_at).total_seconds()
|
|
1733
|
+
|
|
1734
|
+
input_tokens = sum(u.get("input_tokens", 0) for u in ctx.llm_usage.values())
|
|
1735
|
+
output_tokens = sum(u.get("output_tokens", 0) for u in ctx.llm_usage.values())
|
|
1736
|
+
requests = sum(u.get("requests", 0) for u in ctx.llm_usage.values())
|
|
1737
|
+
|
|
1738
|
+
report = {
|
|
1739
|
+
"version": "1.0",
|
|
1740
|
+
"generated_at": datetime.now().astimezone().isoformat(),
|
|
1741
|
+
"log_file": str(log_file_path) if log_file_path else None,
|
|
1742
|
+
"summary": {
|
|
1743
|
+
"total_documents": 1,
|
|
1744
|
+
"completed_documents": 1,
|
|
1745
|
+
"failed_documents": 0,
|
|
1746
|
+
"duration": duration,
|
|
1747
|
+
},
|
|
1748
|
+
"llm_usage": {
|
|
1749
|
+
"models": ctx.llm_usage,
|
|
1750
|
+
"requests": requests,
|
|
1751
|
+
"input_tokens": input_tokens,
|
|
1752
|
+
"output_tokens": output_tokens,
|
|
1753
|
+
"cost_usd": ctx.llm_cost,
|
|
1754
|
+
},
|
|
1755
|
+
"documents": {
|
|
1756
|
+
input_path.name: {
|
|
1757
|
+
"status": "completed",
|
|
1758
|
+
"error": None,
|
|
1759
|
+
"output": str(
|
|
1760
|
+
ctx.output_file.with_suffix(".llm.md")
|
|
1761
|
+
if cfg.llm.enabled and ctx.output_file
|
|
1762
|
+
else ctx.output_file
|
|
1763
|
+
),
|
|
1764
|
+
"images": ctx.embedded_images_count,
|
|
1765
|
+
"screenshots": ctx.screenshots_count,
|
|
1766
|
+
"duration": duration,
|
|
1767
|
+
"llm_usage": {
|
|
1768
|
+
"input_tokens": input_tokens,
|
|
1769
|
+
"output_tokens": output_tokens,
|
|
1770
|
+
"cost_usd": ctx.llm_cost,
|
|
1771
|
+
},
|
|
1772
|
+
}
|
|
1773
|
+
},
|
|
1774
|
+
}
|
|
1775
|
+
|
|
1776
|
+
# Generate report file path
|
|
1777
|
+
task_options = {
|
|
1778
|
+
"llm": cfg.llm.enabled,
|
|
1779
|
+
"ocr": cfg.ocr.enabled,
|
|
1780
|
+
"screenshot": cfg.screenshot.enabled,
|
|
1781
|
+
"alt": cfg.image.alt_enabled,
|
|
1782
|
+
"desc": cfg.image.desc_enabled,
|
|
1783
|
+
}
|
|
1784
|
+
task_hash = compute_task_hash(input_path, output_dir, task_options)
|
|
1785
|
+
report_path = get_report_file_path(
|
|
1786
|
+
output_dir, task_hash, cfg.output.on_conflict
|
|
1787
|
+
)
|
|
1788
|
+
report_path.parent.mkdir(parents=True, exist_ok=True)
|
|
1789
|
+
|
|
1790
|
+
atomic_write_json(report_path, report, order_func=order_report)
|
|
1791
|
+
logger.info(f"Report saved: {report_path}")
|
|
1792
|
+
|
|
1793
|
+
# Clear progress output before printing final result
|
|
1794
|
+
progress.clear_and_finish()
|
|
1795
|
+
|
|
1796
|
+
# Output final markdown to stdout
|
|
1797
|
+
if ctx.output_file:
|
|
1798
|
+
final_output_file = (
|
|
1799
|
+
ctx.output_file.with_suffix(".llm.md")
|
|
1800
|
+
if cfg.llm.enabled
|
|
1801
|
+
else ctx.output_file
|
|
1802
|
+
)
|
|
1803
|
+
if final_output_file.exists():
|
|
1804
|
+
final_content = final_output_file.read_text(encoding="utf-8")
|
|
1805
|
+
print(final_content)
|
|
1806
|
+
|
|
1807
|
+
except Exception as e:
|
|
1808
|
+
error_msg = str(e)
|
|
1809
|
+
console.print(Panel(f"[red]{error_msg}[/red]", title="Error"))
|
|
1810
|
+
sys.exit(1)
|
|
1811
|
+
|
|
1812
|
+
finally:
|
|
1813
|
+
if error_msg:
|
|
1814
|
+
logger.warning(f"Failed to process {input_path.name}: {error_msg}")
|
|
1815
|
+
|
|
1816
|
+
|
|
1817
|
+
async def process_url(
|
|
1818
|
+
url: str,
|
|
1819
|
+
output_dir: Path,
|
|
1820
|
+
cfg: MarkitaiConfig,
|
|
1821
|
+
dry_run: bool,
|
|
1822
|
+
verbose: bool,
|
|
1823
|
+
log_file_path: Path | None = None,
|
|
1824
|
+
fetch_strategy: FetchStrategy | None = None,
|
|
1825
|
+
explicit_fetch_strategy: bool = False,
|
|
1826
|
+
) -> None:
|
|
1827
|
+
"""Process a URL and convert to Markdown.
|
|
1828
|
+
|
|
1829
|
+
Supports multiple fetch strategies:
|
|
1830
|
+
- auto: Detect JS-required pages and fallback automatically
|
|
1831
|
+
- static: Direct HTTP request via markitdown (fastest)
|
|
1832
|
+
- browser: Headless browser via agent-browser (for JS-rendered pages)
|
|
1833
|
+
- jina: Jina Reader API (cloud-based, no local dependencies)
|
|
1834
|
+
|
|
1835
|
+
Also supports:
|
|
1836
|
+
- LLM enhancement via --llm flag for document cleaning and frontmatter
|
|
1837
|
+
- Image downloading and analysis via --alt/--desc flags
|
|
1838
|
+
|
|
1839
|
+
Note: --screenshot and --ocr are not supported for URLs.
|
|
1840
|
+
|
|
1841
|
+
Args:
|
|
1842
|
+
url: URL to convert (http:// or https://)
|
|
1843
|
+
output_dir: Output directory for the markdown file
|
|
1844
|
+
cfg: Configuration
|
|
1845
|
+
dry_run: If True, only show what would be done
|
|
1846
|
+
verbose: If True, print logs before output
|
|
1847
|
+
log_file_path: Path to log file (for report)
|
|
1848
|
+
fetch_strategy: Strategy to use for fetching URL content
|
|
1849
|
+
explicit_fetch_strategy: If True, strategy was explicitly set via CLI flag
|
|
1850
|
+
"""
|
|
1851
|
+
from markitai.fetch import (
|
|
1852
|
+
AgentBrowserNotFoundError,
|
|
1853
|
+
FetchError,
|
|
1854
|
+
FetchStrategy,
|
|
1855
|
+
JinaRateLimitError,
|
|
1856
|
+
fetch_url,
|
|
1857
|
+
)
|
|
1858
|
+
from markitai.image import download_url_images
|
|
1859
|
+
|
|
1860
|
+
# Default to auto strategy if not specified
|
|
1861
|
+
if fetch_strategy is None:
|
|
1862
|
+
fetch_strategy = FetchStrategy(cfg.fetch.strategy)
|
|
1863
|
+
# At this point fetch_strategy is guaranteed to be non-None
|
|
1864
|
+
assert fetch_strategy is not None # for type checker
|
|
1865
|
+
|
|
1866
|
+
# Warn about unsupported/ignored options for URL mode
|
|
1867
|
+
# Note: --alt and --desc are now supported (images will be downloaded)
|
|
1868
|
+
# --screenshot is now supported for URLs (captures full-page screenshot via browser)
|
|
1869
|
+
# --ocr is not applicable for URLs
|
|
1870
|
+
if cfg.ocr.enabled:
|
|
1871
|
+
logger.warning("[URL] --ocr is not supported for URL conversion, ignored")
|
|
1872
|
+
|
|
1873
|
+
# Generate output filename from URL
|
|
1874
|
+
filename = url_to_filename(url)
|
|
1875
|
+
|
|
1876
|
+
if dry_run:
|
|
1877
|
+
llm_status = "enabled" if cfg.llm.enabled else "disabled"
|
|
1878
|
+
cache_status = "enabled" if cfg.cache.enabled else "disabled"
|
|
1879
|
+
fetch_strategy_str = fetch_strategy.value if fetch_strategy else "auto"
|
|
1880
|
+
dry_run_msg = (
|
|
1881
|
+
f"[yellow]Would convert URL:[/yellow] {url}\n"
|
|
1882
|
+
f"[yellow]Output:[/yellow] {output_dir / filename}\n"
|
|
1883
|
+
f"[yellow]Fetch strategy:[/yellow] {fetch_strategy_str}\n"
|
|
1884
|
+
f"[yellow]LLM:[/yellow] {llm_status}\n"
|
|
1885
|
+
f"[yellow]Cache:[/yellow] {cache_status}"
|
|
1886
|
+
)
|
|
1887
|
+
console.print(Panel(dry_run_msg, title="Dry Run"))
|
|
1888
|
+
if cfg.cache.enabled:
|
|
1889
|
+
console.print(
|
|
1890
|
+
"[dim]Tip: Use 'markitai cache stats -v' to view cached entries[/dim]"
|
|
1891
|
+
)
|
|
1892
|
+
raise SystemExit(0)
|
|
1893
|
+
|
|
1894
|
+
# Create output directory
|
|
1895
|
+
from markitai.security import check_symlink_safety
|
|
1896
|
+
|
|
1897
|
+
check_symlink_safety(output_dir, allow_symlinks=cfg.output.allow_symlinks)
|
|
1898
|
+
ensure_dir(output_dir)
|
|
1899
|
+
|
|
1900
|
+
from datetime import datetime
|
|
1901
|
+
|
|
1902
|
+
started_at = datetime.now()
|
|
1903
|
+
llm_cost = 0.0
|
|
1904
|
+
llm_usage: dict[str, dict[str, Any]] = {}
|
|
1905
|
+
|
|
1906
|
+
# Progress reporter for non-verbose mode feedback
|
|
1907
|
+
progress = ProgressReporter(enabled=not verbose)
|
|
1908
|
+
|
|
1909
|
+
# Track cache hit for reporting
|
|
1910
|
+
fetch_cache_hit = False
|
|
1911
|
+
|
|
1912
|
+
# Initialize fetch cache if caching is enabled
|
|
1913
|
+
fetch_cache: FetchCache | None = None
|
|
1914
|
+
if cfg.cache.enabled:
|
|
1915
|
+
from markitai.fetch import get_fetch_cache
|
|
1916
|
+
|
|
1917
|
+
cache_dir = output_dir.parent / ".markitai"
|
|
1918
|
+
fetch_cache = get_fetch_cache(cache_dir, cfg.cache.max_size_bytes)
|
|
1919
|
+
|
|
1920
|
+
try:
|
|
1921
|
+
logger.info(f"Fetching URL: {url} (strategy: {fetch_strategy.value})")
|
|
1922
|
+
progress.start_spinner(f"Fetching {url}...")
|
|
1923
|
+
|
|
1924
|
+
# Fetch URL using the configured strategy
|
|
1925
|
+
# Prepare screenshot options if enabled
|
|
1926
|
+
screenshot_dir = (
|
|
1927
|
+
ensure_screenshots_dir(output_dir) if cfg.screenshot.enabled else None
|
|
1928
|
+
)
|
|
1929
|
+
|
|
1930
|
+
try:
|
|
1931
|
+
fetch_result = await fetch_url(
|
|
1932
|
+
url,
|
|
1933
|
+
fetch_strategy,
|
|
1934
|
+
cfg.fetch,
|
|
1935
|
+
explicit_strategy=explicit_fetch_strategy,
|
|
1936
|
+
cache=fetch_cache,
|
|
1937
|
+
skip_read_cache=cfg.cache.no_cache,
|
|
1938
|
+
screenshot=cfg.screenshot.enabled,
|
|
1939
|
+
screenshot_dir=screenshot_dir,
|
|
1940
|
+
screenshot_config=cfg.screenshot if cfg.screenshot.enabled else None,
|
|
1941
|
+
)
|
|
1942
|
+
fetch_cache_hit = fetch_result.cache_hit
|
|
1943
|
+
used_strategy = fetch_result.strategy_used
|
|
1944
|
+
original_markdown = fetch_result.content
|
|
1945
|
+
screenshot_path = fetch_result.screenshot_path
|
|
1946
|
+
logger.info(f"Fetched via {used_strategy}: {url}")
|
|
1947
|
+
except AgentBrowserNotFoundError:
|
|
1948
|
+
console.print(
|
|
1949
|
+
Panel(
|
|
1950
|
+
"[red]agent-browser is not installed.[/red]\n\n"
|
|
1951
|
+
"Install with:\n"
|
|
1952
|
+
" npm install -g agent-browser\n"
|
|
1953
|
+
" agent-browser install\n\n"
|
|
1954
|
+
"[dim]Or use --jina for cloud-based rendering.[/dim]",
|
|
1955
|
+
title="Error",
|
|
1956
|
+
)
|
|
1957
|
+
)
|
|
1958
|
+
raise SystemExit(1)
|
|
1959
|
+
except JinaRateLimitError:
|
|
1960
|
+
console.print(
|
|
1961
|
+
Panel(
|
|
1962
|
+
"[red]Jina Reader rate limit exceeded (free tier: 20 RPM).[/red]\n\n"
|
|
1963
|
+
"[dim]Try again later or use --agent-browser for local rendering.[/dim]",
|
|
1964
|
+
title="Error",
|
|
1965
|
+
)
|
|
1966
|
+
)
|
|
1967
|
+
raise SystemExit(1)
|
|
1968
|
+
except FetchError as e:
|
|
1969
|
+
console.print(Panel(f"[red]{e}[/red]", title="Error"))
|
|
1970
|
+
raise SystemExit(1)
|
|
1971
|
+
|
|
1972
|
+
if not original_markdown.strip():
|
|
1973
|
+
console.print(
|
|
1974
|
+
Panel(
|
|
1975
|
+
f"[red]No content extracted from URL: {url}[/red]\n"
|
|
1976
|
+
"[dim]The page may be empty, require JavaScript, or use an unsupported format.[/dim]",
|
|
1977
|
+
title="Error",
|
|
1978
|
+
)
|
|
1979
|
+
)
|
|
1980
|
+
raise SystemExit(1)
|
|
1981
|
+
|
|
1982
|
+
# Generate output path with conflict resolution
|
|
1983
|
+
base_output_file = output_dir / filename
|
|
1984
|
+
output_file = resolve_output_path(base_output_file, cfg.output.on_conflict)
|
|
1985
|
+
|
|
1986
|
+
if output_file is None:
|
|
1987
|
+
logger.info(f"[SKIP] Output exists: {base_output_file}")
|
|
1988
|
+
console.print(f"[yellow]Skipped (exists):[/yellow] {base_output_file}")
|
|
1989
|
+
return
|
|
1990
|
+
|
|
1991
|
+
# original_markdown was already set from fetch_result.content above
|
|
1992
|
+
markdown_for_llm = original_markdown
|
|
1993
|
+
progress.log(f"Fetched via {used_strategy}: {url}")
|
|
1994
|
+
|
|
1995
|
+
# Download images from URLs if --alt or --desc is enabled
|
|
1996
|
+
# Only update markdown_for_llm, keep original_markdown unchanged
|
|
1997
|
+
downloaded_images: list[Path] = []
|
|
1998
|
+
images_count = 0
|
|
1999
|
+
screenshots_count = 1 if screenshot_path and screenshot_path.exists() else 0
|
|
2000
|
+
img_analysis: ImageAnalysisResult | None = None
|
|
2001
|
+
|
|
2002
|
+
# Log screenshot capture if successful
|
|
2003
|
+
if screenshot_path and screenshot_path.exists():
|
|
2004
|
+
progress.log(f"Screenshot captured: {screenshot_path.name}")
|
|
2005
|
+
logger.info(f"Screenshot saved: {screenshot_path}")
|
|
2006
|
+
|
|
2007
|
+
if cfg.image.alt_enabled or cfg.image.desc_enabled:
|
|
2008
|
+
progress.start_spinner("Downloading images...")
|
|
2009
|
+
download_result = await download_url_images(
|
|
2010
|
+
markdown=original_markdown,
|
|
2011
|
+
output_dir=output_dir,
|
|
2012
|
+
base_url=url,
|
|
2013
|
+
config=cfg.image,
|
|
2014
|
+
source_name=url_to_filename(url).replace(".md", ""),
|
|
2015
|
+
concurrency=5,
|
|
2016
|
+
timeout=30,
|
|
2017
|
+
)
|
|
2018
|
+
markdown_for_llm = download_result.updated_markdown
|
|
2019
|
+
downloaded_images = download_result.downloaded_paths
|
|
2020
|
+
images_count = len(downloaded_images)
|
|
2021
|
+
|
|
2022
|
+
if download_result.failed_urls:
|
|
2023
|
+
for failed_url in download_result.failed_urls:
|
|
2024
|
+
logger.warning(f"Failed to download image: {failed_url}")
|
|
2025
|
+
|
|
2026
|
+
if downloaded_images:
|
|
2027
|
+
progress.log(f"Downloaded {len(downloaded_images)} images")
|
|
2028
|
+
else:
|
|
2029
|
+
progress.log("No images to download")
|
|
2030
|
+
|
|
2031
|
+
# Write base .md file with original content (no image link replacement)
|
|
2032
|
+
base_content = _add_basic_frontmatter(
|
|
2033
|
+
original_markdown,
|
|
2034
|
+
url,
|
|
2035
|
+
fetch_strategy=used_strategy,
|
|
2036
|
+
screenshot_path=screenshot_path,
|
|
2037
|
+
output_dir=output_dir,
|
|
2038
|
+
)
|
|
2039
|
+
atomic_write_text(output_file, base_content)
|
|
2040
|
+
logger.info(f"Written output: {output_file}")
|
|
2041
|
+
|
|
2042
|
+
# LLM processing (if enabled) uses markdown with local image paths
|
|
2043
|
+
final_content = base_content
|
|
2044
|
+
if cfg.llm.enabled:
|
|
2045
|
+
logger.info(f"[LLM] Processing URL content: {url}")
|
|
2046
|
+
|
|
2047
|
+
# Check if image analysis should run
|
|
2048
|
+
should_analyze_images = (
|
|
2049
|
+
cfg.image.alt_enabled or cfg.image.desc_enabled
|
|
2050
|
+
) and downloaded_images
|
|
2051
|
+
|
|
2052
|
+
# Check for multi-source content (static + browser + screenshot)
|
|
2053
|
+
has_multi_source = (
|
|
2054
|
+
fetch_result.static_content is not None
|
|
2055
|
+
or fetch_result.browser_content is not None
|
|
2056
|
+
)
|
|
2057
|
+
has_screenshot = screenshot_path and screenshot_path.exists()
|
|
2058
|
+
use_vision_enhancement = has_multi_source and has_screenshot
|
|
2059
|
+
|
|
2060
|
+
if use_vision_enhancement and screenshot_path:
|
|
2061
|
+
# Multi-source URL with screenshot: use vision LLM
|
|
2062
|
+
progress.start_spinner("Processing with Vision LLM (multi-source)...")
|
|
2063
|
+
multi_source_content = _build_multi_source_content(
|
|
2064
|
+
fetch_result.static_content,
|
|
2065
|
+
fetch_result.browser_content,
|
|
2066
|
+
markdown_for_llm,
|
|
2067
|
+
)
|
|
2068
|
+
logger.info(
|
|
2069
|
+
f"[URL] Using vision enhancement for multi-source URL: {url}"
|
|
2070
|
+
)
|
|
2071
|
+
|
|
2072
|
+
_, doc_cost, doc_usage = await _process_url_with_vision(
|
|
2073
|
+
multi_source_content,
|
|
2074
|
+
screenshot_path,
|
|
2075
|
+
url,
|
|
2076
|
+
cfg,
|
|
2077
|
+
output_file,
|
|
2078
|
+
project_dir=output_dir.parent,
|
|
2079
|
+
)
|
|
2080
|
+
llm_cost += doc_cost
|
|
2081
|
+
_merge_llm_usage(llm_usage, doc_usage)
|
|
2082
|
+
|
|
2083
|
+
# Run image analysis if needed
|
|
2084
|
+
if should_analyze_images:
|
|
2085
|
+
(
|
|
2086
|
+
_,
|
|
2087
|
+
image_cost,
|
|
2088
|
+
image_usage,
|
|
2089
|
+
img_analysis,
|
|
2090
|
+
) = await analyze_images_with_llm(
|
|
2091
|
+
downloaded_images,
|
|
2092
|
+
multi_source_content,
|
|
2093
|
+
output_file,
|
|
2094
|
+
cfg,
|
|
2095
|
+
Path(url),
|
|
2096
|
+
concurrency_limit=cfg.llm.concurrency,
|
|
2097
|
+
project_dir=output_dir.parent,
|
|
2098
|
+
)
|
|
2099
|
+
llm_cost += image_cost
|
|
2100
|
+
_merge_llm_usage(llm_usage, image_usage)
|
|
2101
|
+
progress.log("LLM processing complete (vision enhanced)")
|
|
2102
|
+
elif should_analyze_images:
|
|
2103
|
+
# Standard processing with image analysis
|
|
2104
|
+
progress.start_spinner("Processing document and images with LLM...")
|
|
2105
|
+
|
|
2106
|
+
# Create parallel tasks
|
|
2107
|
+
doc_task = process_with_llm(
|
|
2108
|
+
markdown_for_llm,
|
|
2109
|
+
url, # Use URL as source identifier
|
|
2110
|
+
cfg,
|
|
2111
|
+
output_file,
|
|
2112
|
+
project_dir=output_dir.parent,
|
|
2113
|
+
)
|
|
2114
|
+
img_task = analyze_images_with_llm(
|
|
2115
|
+
downloaded_images,
|
|
2116
|
+
markdown_for_llm,
|
|
2117
|
+
output_file,
|
|
2118
|
+
cfg,
|
|
2119
|
+
Path(url), # Use URL as source path
|
|
2120
|
+
concurrency_limit=cfg.llm.concurrency,
|
|
2121
|
+
project_dir=output_dir.parent,
|
|
2122
|
+
)
|
|
2123
|
+
|
|
2124
|
+
# Execute in parallel
|
|
2125
|
+
doc_result, img_result = await asyncio.gather(doc_task, img_task)
|
|
2126
|
+
|
|
2127
|
+
# Unpack results
|
|
2128
|
+
_, doc_cost, doc_usage = doc_result
|
|
2129
|
+
_, image_cost, image_usage, img_analysis = img_result
|
|
2130
|
+
|
|
2131
|
+
llm_cost += doc_cost + image_cost
|
|
2132
|
+
_merge_llm_usage(llm_usage, doc_usage)
|
|
2133
|
+
_merge_llm_usage(llm_usage, image_usage)
|
|
2134
|
+
progress.log("LLM processing complete (document + images)")
|
|
2135
|
+
else:
|
|
2136
|
+
# Only document processing, no images to analyze
|
|
2137
|
+
progress.start_spinner("Processing with LLM...")
|
|
2138
|
+
_, doc_cost, doc_usage = await process_with_llm(
|
|
2139
|
+
markdown_for_llm,
|
|
2140
|
+
url, # Use URL as source identifier
|
|
2141
|
+
cfg,
|
|
2142
|
+
output_file,
|
|
2143
|
+
project_dir=output_dir.parent,
|
|
2144
|
+
)
|
|
2145
|
+
llm_cost += doc_cost
|
|
2146
|
+
_merge_llm_usage(llm_usage, doc_usage)
|
|
2147
|
+
progress.log("LLM processing complete")
|
|
2148
|
+
|
|
2149
|
+
# Read the LLM-processed content for stdout output
|
|
2150
|
+
llm_output_file = output_file.with_suffix(".llm.md")
|
|
2151
|
+
if llm_output_file.exists():
|
|
2152
|
+
final_content = llm_output_file.read_text(encoding="utf-8")
|
|
2153
|
+
|
|
2154
|
+
# Write image descriptions (if enabled and images were analyzed)
|
|
2155
|
+
if img_analysis and cfg.image.desc_enabled:
|
|
2156
|
+
write_images_json(output_dir, [img_analysis])
|
|
2157
|
+
|
|
2158
|
+
# Generate report before final output
|
|
2159
|
+
finished_at = datetime.now()
|
|
2160
|
+
duration = (finished_at - started_at).total_seconds()
|
|
2161
|
+
|
|
2162
|
+
input_tokens = sum(u.get("input_tokens", 0) for u in llm_usage.values())
|
|
2163
|
+
output_tokens = sum(u.get("output_tokens", 0) for u in llm_usage.values())
|
|
2164
|
+
requests = sum(u.get("requests", 0) for u in llm_usage.values())
|
|
2165
|
+
|
|
2166
|
+
task_options = {
|
|
2167
|
+
"llm": cfg.llm.enabled,
|
|
2168
|
+
"url": url,
|
|
2169
|
+
}
|
|
2170
|
+
task_hash = compute_task_hash(output_dir, output_dir, task_options)
|
|
2171
|
+
report_path = get_report_file_path(
|
|
2172
|
+
output_dir, task_hash, cfg.output.on_conflict
|
|
2173
|
+
)
|
|
2174
|
+
report_path.parent.mkdir(parents=True, exist_ok=True)
|
|
2175
|
+
|
|
2176
|
+
# Determine cache hit status (LLM was enabled but no new requests were made)
|
|
2177
|
+
llm_cache_hit = cfg.llm.enabled and requests == 0
|
|
2178
|
+
|
|
2179
|
+
report = {
|
|
2180
|
+
"version": "1.0",
|
|
2181
|
+
"generated_at": datetime.now().astimezone().isoformat(),
|
|
2182
|
+
"log_file": str(log_file_path) if log_file_path else None,
|
|
2183
|
+
"options": {
|
|
2184
|
+
"llm": cfg.llm.enabled,
|
|
2185
|
+
"cache": cfg.cache.enabled,
|
|
2186
|
+
"fetch_strategy": used_strategy,
|
|
2187
|
+
"alt": cfg.image.alt_enabled,
|
|
2188
|
+
"desc": cfg.image.desc_enabled,
|
|
2189
|
+
},
|
|
2190
|
+
"summary": {
|
|
2191
|
+
"total_documents": 0,
|
|
2192
|
+
"completed_documents": 0,
|
|
2193
|
+
"failed_documents": 0,
|
|
2194
|
+
"total_urls": 1,
|
|
2195
|
+
"completed_urls": 1,
|
|
2196
|
+
"failed_urls": 0,
|
|
2197
|
+
"duration": duration,
|
|
2198
|
+
},
|
|
2199
|
+
"llm_usage": {
|
|
2200
|
+
"models": llm_usage,
|
|
2201
|
+
"requests": requests,
|
|
2202
|
+
"input_tokens": input_tokens,
|
|
2203
|
+
"output_tokens": output_tokens,
|
|
2204
|
+
"cost_usd": llm_cost,
|
|
2205
|
+
},
|
|
2206
|
+
"urls": {
|
|
2207
|
+
url: {
|
|
2208
|
+
"status": "completed",
|
|
2209
|
+
"source_file": "cli",
|
|
2210
|
+
"error": None,
|
|
2211
|
+
"output": str(
|
|
2212
|
+
output_file.with_suffix(".llm.md")
|
|
2213
|
+
if cfg.llm.enabled
|
|
2214
|
+
else output_file
|
|
2215
|
+
),
|
|
2216
|
+
"fetch_strategy": used_strategy,
|
|
2217
|
+
"fetch_cache_hit": fetch_cache_hit,
|
|
2218
|
+
"llm_cache_hit": llm_cache_hit,
|
|
2219
|
+
"images": images_count,
|
|
2220
|
+
"screenshots": screenshots_count,
|
|
2221
|
+
"duration": duration,
|
|
2222
|
+
"llm_usage": {
|
|
2223
|
+
"input_tokens": input_tokens,
|
|
2224
|
+
"output_tokens": output_tokens,
|
|
2225
|
+
"cost_usd": llm_cost,
|
|
2226
|
+
},
|
|
2227
|
+
}
|
|
2228
|
+
},
|
|
2229
|
+
}
|
|
2230
|
+
|
|
2231
|
+
atomic_write_json(report_path, report, order_func=order_report)
|
|
2232
|
+
logger.info(f"Report saved: {report_path}")
|
|
2233
|
+
|
|
2234
|
+
# Clear progress output before printing final result
|
|
2235
|
+
progress.clear_and_finish()
|
|
2236
|
+
|
|
2237
|
+
# Output to stdout (single URL mode behavior, same as single file)
|
|
2238
|
+
print(final_content)
|
|
2239
|
+
|
|
2240
|
+
except SystemExit:
|
|
2241
|
+
raise
|
|
2242
|
+
except Exception as e:
|
|
2243
|
+
console.print(Panel(f"[red]{e}[/red]", title="Error"))
|
|
2244
|
+
raise SystemExit(1)
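# Illustrative sketch (not part of the original module): converting a single URL with
# the coroutine above; `cfg` is assumed to be a loaded MarkitaiConfig.
async def _url_example(cfg: MarkitaiConfig) -> None:
    await process_url(
        "https://example.com/article",  # hypothetical URL
        Path("out"),
        cfg,
        dry_run=False,
        verbose=False,
    )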
|
|
2245
|
+
|
|
2246
|
+
|
|
2247
|
+
async def process_url_batch(
|
|
2248
|
+
url_entries: list, # list[UrlEntry] but imported dynamically
|
|
2249
|
+
output_dir: Path,
|
|
2250
|
+
cfg: MarkitaiConfig,
|
|
2251
|
+
dry_run: bool,
|
|
2252
|
+
verbose: bool,
|
|
2253
|
+
log_file_path: Path | None = None,
|
|
2254
|
+
concurrency: int = 3,
|
|
2255
|
+
fetch_strategy: FetchStrategy | None = None,
|
|
2256
|
+
explicit_fetch_strategy: bool = False,
|
|
2257
|
+
) -> None:
|
|
2258
|
+
"""Batch process multiple URLs from a URL list file.
|
|
2259
|
+
|
|
2260
|
+
Shows progress bar similar to file batch processing.
|
|
2261
|
+
Each URL is processed concurrently up to the concurrency limit.
|
|
2262
|
+
|
|
2263
|
+
Args:
|
|
2264
|
+
url_entries: List of UrlEntry objects from parse_url_list()
|
|
2265
|
+
output_dir: Output directory for all markdown files
|
|
2266
|
+
cfg: Configuration
|
|
2267
|
+
dry_run: If True, only show what would be done
|
|
2268
|
+
verbose: If True, enable verbose logging
|
|
2269
|
+
log_file_path: Path to log file (for report)
|
|
2270
|
+
concurrency: Max concurrent URL processing (default 3)
|
|
2271
|
+
fetch_strategy: Strategy to use for fetching URL content
|
|
2272
|
+
explicit_fetch_strategy: If True, strategy was explicitly set via CLI flag
|
|
2273
|
+
"""
|
|
2274
|
+
from datetime import datetime
|
|
2275
|
+
|
|
2276
|
+
from rich.progress import (
|
|
2277
|
+
BarColumn,
|
|
2278
|
+
MofNCompleteColumn,
|
|
2279
|
+
Progress,
|
|
2280
|
+
SpinnerColumn,
|
|
2281
|
+
TextColumn,
|
|
2282
|
+
TimeElapsedColumn,
|
|
2283
|
+
)
|
|
2284
|
+
|
|
2285
|
+
from markitai.fetch import (
|
|
2286
|
+
AgentBrowserNotFoundError,
|
|
2287
|
+
FetchError,
|
|
2288
|
+
FetchStrategy,
|
|
2289
|
+
JinaRateLimitError,
|
|
2290
|
+
fetch_url,
|
|
2291
|
+
get_fetch_cache,
|
|
2292
|
+
)
|
|
2293
|
+
from markitai.image import download_url_images
|
|
2294
|
+
from markitai.security import check_symlink_safety
|
|
2295
|
+
|
|
2296
|
+
# Default to auto strategy if not specified
|
|
2297
|
+
if fetch_strategy is None:
|
|
2298
|
+
fetch_strategy = FetchStrategy(cfg.fetch.strategy)
|
|
2299
|
+
assert fetch_strategy is not None # for type checker
|
|
2300
|
+
|
|
2301
|
+
# Dry run: just show what would be done
|
|
2302
|
+
if dry_run:
|
|
2303
|
+
console.print(
|
|
2304
|
+
Panel(
|
|
2305
|
+
f"[yellow]Would process {len(url_entries)} URLs[/yellow]\n"
|
|
2306
|
+
f"[yellow]Output directory:[/yellow] {output_dir}",
|
|
2307
|
+
title="Dry Run - URL Batch",
|
|
2308
|
+
)
|
|
2309
|
+
)
|
|
2310
|
+
for entry in url_entries[:10]:
|
|
2311
|
+
filename = entry.output_name or url_to_filename(entry.url).replace(
|
|
2312
|
+
".md", ""
|
|
2313
|
+
)
|
|
2314
|
+
console.print(f" - {entry.url} -> {filename}.md")
|
|
2315
|
+
if len(url_entries) > 10:
|
|
2316
|
+
console.print(f" ... and {len(url_entries) - 10} more")
|
|
2317
|
+
raise SystemExit(0)
|
|
2318
|
+
|
|
2319
|
+
# Create output directory
|
|
2320
|
+
check_symlink_safety(output_dir, allow_symlinks=cfg.output.allow_symlinks)
|
|
2321
|
+
ensure_dir(output_dir)
|
|
2322
|
+
|
|
2323
|
+
# Initialize fetch cache if caching is enabled
|
|
2324
|
+
fetch_cache = None
|
|
2325
|
+
if cfg.cache.enabled:
|
|
2326
|
+
cache_dir = output_dir.parent / ".markitai"
|
|
2327
|
+
fetch_cache = get_fetch_cache(cache_dir, cfg.cache.max_size_bytes)
|
|
2328
|
+
|
|
2329
|
+
started_at = datetime.now()
|
|
2330
|
+
total_llm_cost = 0.0
|
|
2331
|
+
total_llm_usage: dict[str, dict[str, Any]] = {}
|
|
2332
|
+
completed = 0
|
|
2333
|
+
failed = 0
|
|
2334
|
+
results: dict[str, dict] = {}
|
|
2335
|
+
|
|
2336
|
+
semaphore = asyncio.Semaphore(concurrency)
|
|
2337
|
+
|
|
2338
|
+
async def process_single_url(entry, progress_task, progress_obj) -> None:
|
|
2339
|
+
"""Process a single URL."""
|
|
2340
|
+
nonlocal completed, failed, total_llm_cost
|
|
2341
|
+
|
|
2342
|
+
url = entry.url
|
|
2343
|
+
custom_name = entry.output_name
|
|
2344
|
+
url_fetch_strategy = "unknown"
|
|
2345
|
+
|
|
2346
|
+
async with semaphore:
|
|
2347
|
+
try:
|
|
2348
|
+
# Generate filename
|
|
2349
|
+
if custom_name:
|
|
2350
|
+
filename = f"{custom_name}.md"
|
|
2351
|
+
else:
|
|
2352
|
+
filename = url_to_filename(url)
|
|
2353
|
+
|
|
2354
|
+
logger.info(f"Processing URL: {url} (strategy: {fetch_strategy.value})")
|
|
2355
|
+
progress_obj.update(progress_task, description=f"[cyan]{url[:50]}...")
|
|
2356
|
+
|
|
2357
|
+
# Fetch URL using the configured strategy
|
|
2358
|
+
try:
|
|
2359
|
+
fetch_result = await fetch_url(
|
|
2360
|
+
url,
|
|
2361
|
+
fetch_strategy,
|
|
2362
|
+
cfg.fetch,
|
|
2363
|
+
explicit_strategy=explicit_fetch_strategy,
|
|
2364
|
+
cache=fetch_cache,
|
|
2365
|
+
skip_read_cache=cfg.cache.no_cache,
|
|
2366
|
+
)
|
|
2367
|
+
url_fetch_strategy = fetch_result.strategy_used
|
|
2368
|
+
markdown_content = fetch_result.content
|
|
2369
|
+
cache_status = " [cache]" if fetch_result.cache_hit else ""
|
|
2370
|
+
logger.info(
|
|
2371
|
+
f"Fetched via {url_fetch_strategy}{cache_status}: {url}"
|
|
2372
|
+
)
|
|
2373
|
+
except AgentBrowserNotFoundError:
|
|
2374
|
+
logger.error(f"agent-browser not installed for: {url}")
|
|
2375
|
+
results[url] = {
|
|
2376
|
+
"status": "failed",
|
|
2377
|
+
"error": "agent-browser not installed",
|
|
2378
|
+
}
|
|
2379
|
+
failed += 1
|
|
2380
|
+
return
|
|
2381
|
+
except JinaRateLimitError:
|
|
2382
|
+
logger.error(f"Jina Reader rate limit exceeded for: {url}")
|
|
2383
|
+
results[url] = {
|
|
2384
|
+
"status": "failed",
|
|
2385
|
+
"error": "Jina Reader rate limit exceeded (20 RPM)",
|
|
2386
|
+
}
|
|
2387
|
+
failed += 1
|
|
2388
|
+
return
|
|
2389
|
+
except FetchError as e:
|
|
2390
|
+
logger.error(f"Failed to fetch {url}: {e}")
|
|
2391
|
+
results[url] = {"status": "failed", "error": str(e)}
|
|
2392
|
+
failed += 1
|
|
2393
|
+
return
|
|
2394
|
+
|
|
2395
|
+
if not markdown_content.strip():
|
|
2396
|
+
logger.warning(f"No content extracted from URL: {url}")
|
|
2397
|
+
results[url] = {
|
|
2398
|
+
"status": "failed",
|
|
2399
|
+
"error": "No content extracted",
|
|
2400
|
+
}
|
|
2401
|
+
failed += 1
|
|
2402
|
+
return
|
|
2403
|
+
|
|
2404
|
+
# Download images if --alt or --desc is enabled
|
|
2405
|
+
images_count = 0
|
|
2406
|
+
if cfg.image.alt_enabled or cfg.image.desc_enabled:
|
|
2407
|
+
download_result = await download_url_images(
|
|
2408
|
+
markdown=markdown_content,
|
|
2409
|
+
output_dir=output_dir,
|
|
2410
|
+
base_url=url,
|
|
2411
|
+
config=cfg.image,
|
|
2412
|
+
source_name=filename.replace(".md", ""),
|
|
2413
|
+
concurrency=5,
|
|
2414
|
+
timeout=30,
|
|
2415
|
+
)
|
|
2416
|
+
markdown_content = download_result.updated_markdown
|
|
2417
|
+
images_count = len(download_result.downloaded_paths)
|
|
2418
|
+
|
|
2419
|
+
# Generate output path with conflict resolution
|
|
2420
|
+
base_output_file = output_dir / filename
|
|
2421
|
+
output_file = resolve_output_path(
|
|
2422
|
+
base_output_file, cfg.output.on_conflict
|
|
2423
|
+
)
|
|
2424
|
+
|
|
2425
|
+
if output_file is None:
|
|
2426
|
+
logger.info(f"[SKIP] Output exists: {base_output_file}")
|
|
2427
|
+
results[url] = {"status": "skipped", "error": "Output exists"}
|
|
2428
|
+
return
|
|
2429
|
+
|
|
2430
|
+
# Write base .md file with frontmatter
|
|
2431
|
+
base_content = _add_basic_frontmatter(
|
|
2432
|
+
markdown_content,
|
|
2433
|
+
url,
|
|
2434
|
+
fetch_strategy=url_fetch_strategy,
|
|
2435
|
+
output_dir=output_dir,
|
|
2436
|
+
)
|
|
2437
|
+
atomic_write_text(output_file, base_content)
|
|
2438
|
+
|
|
2439
|
+
llm_cost = 0.0
|
|
2440
|
+
llm_usage: dict[str, dict[str, Any]] = {}
|
|
2441
|
+
|
|
2442
|
+
# LLM processing (if enabled)
|
|
2443
|
+
if cfg.llm.enabled:
|
|
2444
|
+
_, doc_cost, doc_usage = await process_with_llm(
|
|
2445
|
+
markdown_content,
|
|
2446
|
+
url,
|
|
2447
|
+
cfg,
|
|
2448
|
+
output_file,
|
|
2449
|
+
project_dir=output_dir.parent,
|
|
2450
|
+
)
|
|
2451
|
+
llm_cost += doc_cost
|
|
2452
|
+
_merge_llm_usage(llm_usage, doc_usage)
|
|
2453
|
+
|
|
2454
|
+
total_llm_cost += llm_cost
|
|
2455
|
+
_merge_llm_usage(total_llm_usage, llm_usage)
|
|
2456
|
+
|
|
2457
|
+
results[url] = {
|
|
2458
|
+
"status": "completed",
|
|
2459
|
+
"error": None,
|
|
2460
|
+
"output": str(
|
|
2461
|
+
output_file.with_suffix(".llm.md")
|
|
2462
|
+
if cfg.llm.enabled
|
|
2463
|
+
else output_file
|
|
2464
|
+
),
|
|
2465
|
+
"fetch_strategy": url_fetch_strategy,
|
|
2466
|
+
"images": images_count,
|
|
2467
|
+
}
|
|
2468
|
+
completed += 1
|
|
2469
|
+
logger.info(f"Completed via {url_fetch_strategy}: {url}")
|
|
2470
|
+
|
|
2471
|
+
except Exception as e:
|
|
2472
|
+
logger.error(f"Failed to process {url}: {e}")
|
|
2473
|
+
results[url] = {"status": "failed", "error": str(e)}
|
|
2474
|
+
failed += 1
|
|
2475
|
+
|
|
2476
|
+
finally:
|
|
2477
|
+
progress_obj.advance(progress_task)
|
|
2478
|
+
|
|
2479
|
+
# Process all URLs with progress bar
|
|
2480
|
+
with Progress(
|
|
2481
|
+
SpinnerColumn(),
|
|
2482
|
+
TextColumn("[progress.description]{task.description}"),
|
|
2483
|
+
BarColumn(),
|
|
2484
|
+
MofNCompleteColumn(),
|
|
2485
|
+
TimeElapsedColumn(),
|
|
2486
|
+
console=console,
|
|
2487
|
+
) as progress:
|
|
2488
|
+
task = progress.add_task("[cyan]Processing URLs...", total=len(url_entries))
|
|
2489
|
+
|
|
2490
|
+
tasks = [process_single_url(entry, task, progress) for entry in url_entries]
|
|
2491
|
+
await asyncio.gather(*tasks)
|
|
2492
|
+
|
|
2493
|
+
# Generate report
|
|
2494
|
+
finished_at = datetime.now()
|
|
2495
|
+
duration = (finished_at - started_at).total_seconds()
|
|
2496
|
+
|
|
2497
|
+
input_tokens = sum(u.get("input_tokens", 0) for u in total_llm_usage.values())
|
|
2498
|
+
output_tokens = sum(u.get("output_tokens", 0) for u in total_llm_usage.values())
|
|
2499
|
+
requests = sum(u.get("requests", 0) for u in total_llm_usage.values())
|
|
2500
|
+
|
|
2501
|
+
task_options = {
|
|
2502
|
+
"llm": cfg.llm.enabled,
|
|
2503
|
+
"alt": cfg.image.alt_enabled,
|
|
2504
|
+
"desc": cfg.image.desc_enabled,
|
|
2505
|
+
}
|
|
2506
|
+
task_hash = compute_task_hash(output_dir, output_dir, task_options)
|
|
2507
|
+
report_path = get_report_file_path(output_dir, task_hash, cfg.output.on_conflict)
|
|
2508
|
+
report_path.parent.mkdir(parents=True, exist_ok=True)
|
|
2509
|
+
|
|
2510
|
+
report = {
|
|
2511
|
+
"version": "1.0",
|
|
2512
|
+
"generated_at": datetime.now().astimezone().isoformat(),
|
|
2513
|
+
"log_file": str(log_file_path) if log_file_path else None,
|
|
2514
|
+
"summary": {
|
|
2515
|
+
"total_documents": 0,
|
|
2516
|
+
"completed_documents": 0,
|
|
2517
|
+
"failed_documents": 0,
|
|
2518
|
+
"total_urls": len(url_entries),
|
|
2519
|
+
"completed_urls": completed,
|
|
2520
|
+
"failed_urls": failed,
|
|
2521
|
+
"duration": duration,
|
|
2522
|
+
},
|
|
2523
|
+
"llm_usage": {
|
|
2524
|
+
"models": total_llm_usage,
|
|
2525
|
+
"requests": requests,
|
|
2526
|
+
"input_tokens": input_tokens,
|
|
2527
|
+
"output_tokens": output_tokens,
|
|
2528
|
+
"cost_usd": total_llm_cost,
|
|
2529
|
+
},
|
|
2530
|
+
"urls": results,
|
|
2531
|
+
}
|
|
2532
|
+
|
|
2533
|
+
atomic_write_json(report_path, report, order_func=order_report)
|
|
2534
|
+
logger.info(f"Report saved: {report_path}")
|
|
2535
|
+
|
|
2536
|
+
# Print summary
|
|
2537
|
+
console.print()
|
|
2538
|
+
console.print(
|
|
2539
|
+
Panel(
|
|
2540
|
+
f"[green]Completed:[/green] {completed}\n"
|
|
2541
|
+
f"[red]Failed:[/red] {failed}\n"
|
|
2542
|
+
f"[dim]Duration:[/dim] {duration:.1f}s\n"
|
|
2543
|
+
f"[dim]Report:[/dim] {report_path}",
|
|
2544
|
+
title="URL Batch Complete",
|
|
2545
|
+
)
|
|
2546
|
+
)
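# Illustrative sketch (not part of the original module): driving the batch helper
# above with a hand-built entry list. `parse_url_list` is the helper named in the
# docstring; its exact signature and the loaded `cfg` are assumptions.
async def _batch_example(cfg: MarkitaiConfig) -> None:
    entries = parse_url_list(Path("urls.txt"))  # hypothetical URL list file
    await process_url_batch(
        entries,
        Path("out"),
        cfg,
        dry_run=False,
        verbose=False,
        concurrency=3,
    )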
|
|
2547
|
+
|
|
2548
|
+
|
|
2549
|
+
def _build_multi_source_content(
|
|
2550
|
+
static_content: str | None,
|
|
2551
|
+
browser_content: str | None,
|
|
2552
|
+
fallback_content: str,
|
|
2553
|
+
) -> str:
|
|
2554
|
+
"""Build content from URL fetch result (single-source strategy).
|
|
2555
|
+
|
|
2556
|
+
With the static-first + browser-fallback strategy, we only have one
|
|
2557
|
+
valid source at a time. This function simply returns the primary content
|
|
2558
|
+
without adding any source labels (which would leak into the final output).
|
|
2559
|
+
|
|
2560
|
+
Args:
|
|
2561
|
+
static_content: Content from static/jina fetch (may be None)
|
|
2562
|
+
browser_content: Content from browser fetch (may be None)
|
|
2563
|
+
fallback_content: Primary content from FetchResult.content
|
|
2564
|
+
|
|
2565
|
+
Returns:
|
|
2566
|
+
Single-source content without labels
|
|
2567
|
+
"""
|
|
2568
|
+
# With single-source strategy, fallback_content is already the best source
|
|
2569
|
+
# No need to merge or add labels - just return the primary content
|
|
2570
|
+
return fallback_content.strip() if fallback_content else ""
|
|
2571
|
+
|
|
2572
|
+
|
|
2573
|
+
async def _process_url_with_vision(
|
|
2574
|
+
content: str,
|
|
2575
|
+
screenshot_path: Path,
|
|
2576
|
+
url: str,
|
|
2577
|
+
cfg: MarkitaiConfig,
|
|
2578
|
+
output_file: Path,
|
|
2579
|
+
processor: LLMProcessor | None = None,
|
|
2580
|
+
project_dir: Path | None = None,
|
|
2581
|
+
) -> tuple[str, float, dict[str, dict[str, Any]]]:
|
|
2582
|
+
"""Process URL content with vision enhancement using screenshot.
|
|
2583
|
+
|
|
2584
|
+
This provides similar functionality to PDF/PPTX vision enhancement,
|
|
2585
|
+
using the page screenshot as visual reference for content extraction.
|
|
2586
|
+
|
|
2587
|
+
Args:
|
|
2588
|
+
content: Markdown content (may be multi-source combined)
|
|
2589
|
+
screenshot_path: Path to the URL screenshot
|
|
2590
|
+
url: Original URL (used as source identifier)
|
|
2591
|
+
cfg: Configuration
|
|
2592
|
+
output_file: Output file path
|
|
2593
|
+
processor: Optional shared LLMProcessor
|
|
2594
|
+
project_dir: Project directory for cache
|
|
2595
|
+
|
|
2596
|
+
Returns:
|
|
2597
|
+
Tuple of (original_content, cost_usd, llm_usage)
|
|
2598
|
+
"""
|
|
2599
|
+
from markitai.workflow.helpers import create_llm_processor
|
|
2600
|
+
|
|
2601
|
+
try:
|
|
2602
|
+
if processor is None:
|
|
2603
|
+
processor = create_llm_processor(cfg, project_dir=project_dir)
|
|
2604
|
+
|
|
2605
|
+
# Use URL-specific vision enhancement (no slide/page marker protection)
|
|
2606
|
+
cleaned_content, frontmatter = await processor.enhance_url_with_vision(
|
|
2607
|
+
content, screenshot_path, context=url
|
|
2608
|
+
)
|
|
2609
|
+
|
|
2610
|
+
# Format and write LLM output
|
|
2611
|
+
llm_output = output_file.with_suffix(".llm.md")
|
|
2612
|
+
llm_content = processor.format_llm_output(cleaned_content, frontmatter)
|
|
2613
|
+
|
|
2614
|
+
# Add screenshot reference as comment
|
|
2615
|
+
screenshot_comment = (
|
|
2616
|
+
f"\n\n<!-- Screenshot for reference -->\n"
|
|
2617
|
+
f"<!--  -->"
|
|
2618
|
+
)
|
|
2619
|
+
llm_content += screenshot_comment
|
|
2620
|
+
|
|
2621
|
+
atomic_write_text(llm_output, llm_content)
|
|
2622
|
+
logger.info(f"Written LLM version with vision: {llm_output}")
|
|
2623
|
+
|
|
2624
|
+
# Get usage for this URL
|
|
2625
|
+
cost = processor.get_context_cost(url)
|
|
2626
|
+
usage = processor.get_context_usage(url)
|
|
2627
|
+
return content, cost, usage
|
|
2628
|
+
|
|
2629
|
+
except Exception as e:
|
|
2630
|
+
logger.warning(
|
|
2631
|
+
f"Vision enhancement failed for {url}: {e}, falling back to standard processing"
|
|
2632
|
+
)
|
|
2633
|
+
# Fallback to standard processing
|
|
2634
|
+
return await process_with_llm(
|
|
2635
|
+
content,
|
|
2636
|
+
url,
|
|
2637
|
+
cfg,
|
|
2638
|
+
output_file,
|
|
2639
|
+
processor=processor,
|
|
2640
|
+
project_dir=project_dir,
|
|
2641
|
+
)
|
|
2642
|
+
|
|
2643
|
+
|
|
2644
|
+
async def process_with_llm(
|
|
2645
|
+
markdown: str,
|
|
2646
|
+
source: str,
|
|
2647
|
+
cfg: MarkitaiConfig,
|
|
2648
|
+
output_file: Path,
|
|
2649
|
+
page_images: list[dict] | None = None,
|
|
2650
|
+
processor: LLMProcessor | None = None,
|
|
2651
|
+
original_markdown: str | None = None,
|
|
2652
|
+
project_dir: Path | None = None,
|
|
2653
|
+
) -> tuple[str, float, dict[str, dict[str, Any]]]:
|
|
2654
|
+
"""Process markdown with LLM and write enhanced version to .llm.md file.
|
|
2655
|
+
|
|
2656
|
+
The LLM-enhanced content is written to output_file with .llm.md suffix.
|
|
2657
|
+
Returns the original markdown unchanged for use in base .md file.
|
|
2658
|
+
|
|
2659
|
+
Args:
|
|
2660
|
+
markdown: Markdown content to process
|
|
2661
|
+
source: Source file name (used as LLM context identifier)
|
|
2662
|
+
cfg: Configuration with LLM and prompt settings
|
|
2663
|
+
output_file: Base output file path (.llm.md suffix added automatically)
|
|
2664
|
+
page_images: Optional page image info for adding commented references
|
|
2665
|
+
processor: Optional shared LLMProcessor (created if not provided)
|
|
2666
|
+
original_markdown: Original markdown for detecting hallucinated images
|
|
2667
|
+
project_dir: Project directory for cache isolation
|
|
2668
|
+
|
|
2669
|
+
Returns:
|
|
2670
|
+
Tuple of (original_markdown, cost_usd, llm_usage):
|
|
2671
|
+
- original_markdown: Input markdown unchanged (for base .md file)
|
|
2672
|
+
- cost_usd: LLM API cost for this file
|
|
2673
|
+
- llm_usage: Per-model usage {model: {requests, input_tokens, output_tokens, cost_usd}}
|
|
2674
|
+
|
|
2675
|
+
Side Effects:
|
|
2676
|
+
Writes LLM-enhanced content to {output_file}.llm.md
|
|
2677
|
+
"""
|
|
2678
|
+
try:
|
|
2679
|
+
if processor is None:
|
|
2680
|
+
processor = create_llm_processor(cfg, project_dir=project_dir)
|
|
2681
|
+
|
|
2682
|
+
cleaned, frontmatter = await processor.process_document(markdown, source)
|
|
2683
|
+
|
|
2684
|
+
# Remove hallucinated image URLs (URLs that don't exist in original)
|
|
2685
|
+
original_for_comparison = original_markdown if original_markdown else markdown
|
|
2686
|
+
cleaned = ImageProcessor.remove_hallucinated_images(
|
|
2687
|
+
cleaned, original_for_comparison
|
|
2688
|
+
)
|
|
2689
|
+
|
|
2690
|
+
# Validate local image references - remove non-existent assets
|
|
2691
|
+
assets_dir = output_file.parent / "assets"
|
|
2692
|
+
if assets_dir.exists():
|
|
2693
|
+
cleaned = ImageProcessor.remove_nonexistent_images(cleaned, assets_dir)
|
|
2694
|
+
|
|
2695
|
+
# Write LLM version
|
|
2696
|
+
llm_output = output_file.with_suffix(".llm.md")
|
|
2697
|
+
llm_content = processor.format_llm_output(cleaned, frontmatter)
|
|
2698
|
+
|
|
2699
|
+
# Check if page_images comments already exist in content
|
|
2700
|
+
# process_document's placeholder protection should preserve them
|
|
2701
|
+
# Append missing page image comments
|
|
2702
|
+
if page_images:
|
|
2703
|
+
page_header = "<!-- Page images for reference -->"
|
|
2704
|
+
has_page_images_header = page_header in llm_content
|
|
2705
|
+
|
|
2706
|
+
# Build the complete page images section
|
|
2707
|
+
commented_images = [
|
|
2708
|
+
f"<!-- ![Page {img['page']}](screenshots/{img['name']}) -->"
|
|
2709
|
+
for img in sorted(page_images, key=lambda x: x.get("page", 0))
|
|
2710
|
+
]
|
|
2711
|
+
|
|
2712
|
+
if not has_page_images_header:
|
|
2713
|
+
# No header exists, add complete section
|
|
2714
|
+
llm_content += "\n\n" + page_header + "\n" + "\n".join(commented_images)
|
|
2715
|
+
else:
|
|
2716
|
+
# Header exists, check for missing page comments
|
|
2717
|
+
import re
|
|
2718
|
+
|
|
2719
|
+
for comment in commented_images:
|
|
2720
|
+
# Check if this specific page is already referenced
|
|
2721
|
+
page_match = re.search(r"!\[Page\s+(\d+)\]", comment)
|
|
2722
|
+
if page_match:
|
|
2723
|
+
page_num = page_match.group(1)
|
|
2724
|
+
# Look for this page number in any form (commented or not)
|
|
2725
|
+
if not re.search(rf"!\[Page\s+{page_num}\]", llm_content):
|
|
2726
|
+
# Append missing page comment
|
|
2727
|
+
llm_content = llm_content.rstrip() + "\n" + comment
|
|
2728
|
+
|
|
2729
|
+
atomic_write_text(llm_output, llm_content)
|
|
2730
|
+
logger.info(f"Written LLM version: {llm_output}")
|
|
2731
|
+
|
|
2732
|
+
# Get usage for THIS file only, not global cumulative usage
|
|
2733
|
+
cost = processor.get_context_cost(source)
|
|
2734
|
+
usage = processor.get_context_usage(source)
|
|
2735
|
+
return markdown, cost, usage # Return original for base .md file
|
|
2736
|
+
|
|
2737
|
+
except Exception as e:
|
|
2738
|
+
logger.warning(f"LLM processing failed: {e}")
|
|
2739
|
+
return markdown, 0.0, {}
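# Illustrative sketch (not part of the original module): how the helper above is
# typically driven; `cfg` is assumed to be a loaded MarkitaiConfig and `md_text`
# the already-converted markdown for the source document.
async def _enhance_example(md_text: str, cfg: MarkitaiConfig) -> float:
    out_file = Path("out/report.docx.md")  # hypothetical base output path
    _, cost, _usage = await process_with_llm(md_text, "report.docx", cfg, out_file)
    # Side effect: the enhanced copy is written next to it as out/report.docx.llm.md
    return cost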
|
|
2740
|
+
|
|
2741
|
+
|
|
2742
|
+
def _format_standalone_image_markdown(
|
|
2743
|
+
input_path: Path,
|
|
2744
|
+
analysis: ImageAnalysis,
|
|
2745
|
+
image_ref_path: str,
|
|
2746
|
+
include_frontmatter: bool = False,
|
|
2747
|
+
) -> str:
|
|
2748
|
+
"""Format analysis results for a standalone image file.
|
|
2749
|
+
|
|
2750
|
+
This is a wrapper that delegates to workflow/helpers.format_standalone_image_markdown.
|
|
2751
|
+
|
|
2752
|
+
Args:
|
|
2753
|
+
input_path: Original image file path
|
|
2754
|
+
analysis: ImageAnalysis result with caption, description, extracted_text
|
|
2755
|
+
image_ref_path: Relative path for image reference
|
|
2756
|
+
include_frontmatter: Whether to include YAML frontmatter
|
|
2757
|
+
|
|
2758
|
+
Returns:
|
|
2759
|
+
Formatted markdown string
|
|
2760
|
+
"""
|
|
2761
|
+
from markitai.workflow.helpers import format_standalone_image_markdown
|
|
2762
|
+
|
|
2763
|
+
return format_standalone_image_markdown(
|
|
2764
|
+
input_path, analysis, image_ref_path, include_frontmatter
|
|
2765
|
+
)
|
|
2766
|
+
|
|
2767
|
+
|
|
2768
|
+
async def analyze_images_with_llm(
|
|
2769
|
+
image_paths: list[Path],
|
|
2770
|
+
markdown: str,
|
|
2771
|
+
output_file: Path,
|
|
2772
|
+
cfg: MarkitaiConfig,
|
|
2773
|
+
input_path: Path | None = None,
|
|
2774
|
+
concurrency_limit: int | None = None,
|
|
2775
|
+
processor: LLMProcessor | None = None,
|
|
2776
|
+
project_dir: Path | None = None,
|
|
2777
|
+
) -> tuple[str, float, dict[str, dict[str, Any]], ImageAnalysisResult | None]:
|
|
2778
|
+
"""Analyze images with LLM Vision using batch processing.
|
|
2779
|
+
|
|
2780
|
+
Uses batch analysis to reduce LLM calls (10 images per call instead of 1).
|
|
2781
|
+
|
|
2782
|
+
Behavior controlled by config:
|
|
2783
|
+
- alt_enabled: Update alt text in markdown
|
|
2784
|
+
- desc_enabled: Collect asset descriptions (caller writes JSON)
|
|
2785
|
+
|
|
2786
|
+
Args:
|
|
2787
|
+
image_paths: List of image file paths
|
|
2788
|
+
markdown: Original markdown content
|
|
2789
|
+
output_file: Output markdown file path
|
|
2790
|
+
cfg: Configuration
|
|
2791
|
+
input_path: Source input file path (for absolute path in JSON)
|
|
2792
|
+
concurrency_limit: Max concurrent LLM requests (unused, kept for API compat)
|
|
2793
|
+
processor: Optional shared LLMProcessor (created if not provided)
|
|
2794
|
+
project_dir: Project directory for persistent cache scope
|
|
2795
|
+
|
|
2796
|
+
Returns:
|
|
2797
|
+
Tuple of (updated_markdown, cost_usd, llm_usage, image_analysis_result):
|
|
2798
|
+
- updated_markdown: Markdown with updated alt text (if alt_enabled)
|
|
2799
|
+
- cost_usd: LLM API cost for image analysis
|
|
2800
|
+
- llm_usage: Per-model usage {model: {requests, input_tokens, output_tokens, cost_usd}}
|
|
2801
|
+
- image_analysis_result: Analysis data for JSON output (None if desc_enabled=False)
|
|
2802
|
+
"""
|
|
2803
|
+
import re
|
|
2804
|
+
from datetime import datetime
|
|
2805
|
+
|
|
2806
|
+
alt_enabled = cfg.image.alt_enabled
|
|
2807
|
+
desc_enabled = cfg.image.desc_enabled
|
|
2808
|
+
|
|
2809
|
+
try:
|
|
2810
|
+
if processor is None:
|
|
2811
|
+
processor = create_llm_processor(cfg, project_dir=project_dir)
|
|
2812
|
+
|
|
2813
|
+
# Use unique context for image analysis to track usage separately from doc processing
|
|
2814
|
+
# Format: "full_path:images" ensures isolation even for files with same name in different dirs
|
|
2815
|
+
# This prevents usage from concurrent files being mixed together
|
|
2816
|
+
source_path = (
|
|
2817
|
+
str(input_path.resolve()) if input_path else str(output_file.resolve())
|
|
2818
|
+
)
|
|
2819
|
+
context = f"{source_path}:images"
|
|
2820
|
+
|
|
2821
|
+
# Detect document language from markdown content
|
|
2822
|
+
language = _detect_language(markdown)
|
|
2823
|
+
|
|
2824
|
+
# Use batch analysis
|
|
2825
|
+
logger.info(f"Analyzing {len(image_paths)} images in batches...")
|
|
2826
|
+
analyses = await processor.analyze_images_batch(
|
|
2827
|
+
image_paths,
|
|
2828
|
+
language=language,
|
|
2829
|
+
max_images_per_batch=DEFAULT_MAX_IMAGES_PER_BATCH,
|
|
2830
|
+
context=context,
|
|
2831
|
+
)
|
|
2832
|
+
|
|
2833
|
+
timestamp = datetime.now().astimezone().isoformat()
|
|
2834
|
+
|
|
2835
|
+
# Collect asset descriptions for JSON output
|
|
2836
|
+
asset_descriptions: list[dict[str, Any]] = []
|
|
2837
|
+
|
|
2838
|
+
# Check if this is a standalone image file
|
|
2839
|
+
is_standalone_image = (
|
|
2840
|
+
input_path is not None
|
|
2841
|
+
and input_path.suffix.lower() in IMAGE_EXTENSIONS
|
|
2842
|
+
and len(image_paths) == 1
|
|
2843
|
+
)
|
|
2844
|
+
|
|
2845
|
+
# Process results (analyses is in same order as image_paths)
|
|
2846
|
+
results: list[tuple[Path, ImageAnalysis | None, str]] = []
|
|
2847
|
+
for image_path, analysis in zip(image_paths, analyses):
|
|
2848
|
+
results.append((image_path, analysis, timestamp))
|
|
2849
|
+
|
|
2850
|
+
# Collect for JSON output (if desc_enabled)
|
|
2851
|
+
if desc_enabled:
|
|
2852
|
+
asset_descriptions.append(
|
|
2853
|
+
{
|
|
2854
|
+
"asset": str(image_path.resolve()),
|
|
2855
|
+
"alt": analysis.caption,
|
|
2856
|
+
"desc": analysis.description,
|
|
2857
|
+
"text": analysis.extracted_text or "",
|
|
2858
|
+
"llm_usage": analysis.llm_usage or {},
|
|
2859
|
+
"created": timestamp,
|
|
2860
|
+
}
|
|
2861
|
+
)
|
|
2862
|
+
|
|
2863
|
+
# Update alt text in markdown (if alt_enabled)
|
|
2864
|
+
            if alt_enabled and not is_standalone_image:
                old_pattern = rf"!\[[^\]]*\]\([^)]*{re.escape(image_path.name)}\)"
                new_ref = f"![{analysis.caption}](assets/{image_path.name})"
                markdown = re.sub(old_pattern, new_ref, markdown)

        # Update .llm.md file
        llm_output = output_file.with_suffix(".llm.md")
        if is_standalone_image and results and results[0][1] is not None:
            # For standalone images, write the rich formatted content with frontmatter
            assert input_path is not None
            _, analysis, _ = results[0]
            if analysis:
                rich_content = _format_standalone_image_markdown(
                    input_path,
                    analysis,
                    f"assets/{input_path.name}",
                    include_frontmatter=True,
                )
                # Normalize whitespace (ensure headers have blank lines before/after)
                from markitai.utils.text import normalize_markdown_whitespace

                rich_content = normalize_markdown_whitespace(rich_content)
                atomic_write_text(llm_output, rich_content)
        elif alt_enabled:
            # For other files, update alt text in .llm.md
            # Wait for .llm.md file to exist (it's written by parallel doc processing)
            max_wait_seconds = 120  # Max wait time
            poll_interval = 0.5  # Check every 0.5 seconds
            waited = 0.0
            while not llm_output.exists() and waited < max_wait_seconds:
                await asyncio.sleep(poll_interval)
                waited += poll_interval

            if llm_output.exists():
                llm_content = llm_output.read_text(encoding="utf-8")
                for image_path, analysis, _ in results:
                    if analysis is None:
                        continue
                    old_pattern = rf"!\[[^\]]*\]\([^)]*{re.escape(image_path.name)}\)"
                    new_ref = f"![{analysis.caption}](assets/{image_path.name})"
                    llm_content = re.sub(old_pattern, new_ref, llm_content)
                atomic_write_text(llm_output, llm_content)
            else:
                logger.warning(
                    f"Skipped alt text update: {llm_output} not created within {max_wait_seconds}s"
                )

        # Build analysis result for caller to aggregate
        analysis_result: ImageAnalysisResult | None = None
        if desc_enabled and asset_descriptions:
            source_path = str(input_path.resolve()) if input_path else output_file.stem
            analysis_result = ImageAnalysisResult(
                source_file=source_path,
                assets=asset_descriptions,
            )

        # Get usage for THIS file only using context-based tracking
        # This is concurrency-safe: only includes LLM calls tagged with this context
        incremental_usage = processor.get_context_usage(context)
        incremental_cost = processor.get_context_cost(context)

        return (
            markdown,
            incremental_cost,
            incremental_usage,
            analysis_result,
        )

    except Exception as e:
        logger.warning(f"Image analysis failed: {e}")
        return markdown, 0.0, {}, None
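

# Illustrative usage sketch (not part of the released module): driving the batch
# image analysis above for one converted document. Paths are hypothetical and the
# `cfg` object is assumed to come from the normal config loading path.
async def _example_analyze_images(cfg: MarkitaiConfig) -> None:
    images = [Path("out/assets/report_img_01.png")]
    updated_md, cost, usage, result = await analyze_images_with_llm(
        image_paths=images,
        markdown="![](assets/report_img_01.png)",
        output_file=Path("out/report.pdf.md"),
        cfg=cfg,
        input_path=Path("in/report.pdf"),
    )
    # `usage` is keyed by model name; `result` is None unless desc_enabled is set
    logger.debug(f"image analysis cost=${cost:.4f}, models={list(usage)}")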


async def enhance_document_with_vision(
    extracted_text: str,
    page_images: list[dict],
    cfg: MarkitaiConfig,
    source: str = "document",
    processor: LLMProcessor | None = None,
    project_dir: Path | None = None,
) -> tuple[str, str, float, dict[str, dict[str, Any]]]:
    """Enhance document by combining extracted text with page images.

    This is used for OCR+LLM mode where we have:
    1. Text extracted programmatically (pymupdf4llm/markitdown) - accurate content
    2. Page images - visual reference for layout/structure

    The LLM uses both to produce optimized markdown + frontmatter.

    Args:
        extracted_text: Text extracted by pymupdf4llm/markitdown
        page_images: List of page image info dicts with 'path' key
        cfg: Configuration
        source: Source file name for logging context
        processor: Optional shared LLMProcessor (created if not provided)
        project_dir: Project directory for persistent cache scope

    Returns:
        Tuple of (cleaned_markdown, frontmatter_yaml, cost_usd, llm_usage)
    """
    try:
        if processor is None:
            processor = create_llm_processor(cfg, project_dir=project_dir)

        # Sort images by page number
        def get_page_num(img_info: dict) -> int:
            return img_info.get("page", 0)

        sorted_images = sorted(page_images, key=get_page_num)

        # Convert to Path list
        image_paths = [Path(img["path"]) for img in sorted_images]

        logger.info(
            f"[START] {source}: Enhancing with {len(image_paths)} page images..."
        )

        # Call the combined enhancement method (clean + frontmatter)
        cleaned_content, frontmatter = await processor.enhance_document_complete(
            extracted_text, image_paths, source=source
        )

        # Get usage for THIS file only, not global cumulative usage
        return (
            cleaned_content,
            frontmatter,
            processor.get_context_cost(source),
            processor.get_context_usage(source),
        )

    except Exception as e:
        logger.warning(f"Document enhancement failed: {e}")
        # Return original text with basic frontmatter as fallback
        basic_frontmatter = f"title: {source}\nsource: {source}"
        return extracted_text, basic_frontmatter, 0.0, {}
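

# Illustrative usage sketch (not part of the released module): OCR+LLM enhancement
# for a two-page PDF whose pages were rendered to PNG. The dict keys mirror the
# "page"/"path" keys read above; file names and `cfg` are hypothetical.
async def _example_enhance_with_vision(cfg: MarkitaiConfig) -> None:
    pages = [
        {"page": 1, "path": "out/assets/report_page_1.png"},
        {"page": 2, "path": "out/assets/report_page_2.png"},
    ]
    cleaned, frontmatter, cost, usage = await enhance_document_with_vision(
        extracted_text="# Report\n\nExtracted body text...",
        page_images=pages,
        cfg=cfg,
        source="report.pdf",
    )
    logger.debug(f"enhanced {len(pages)} pages for ${cost:.4f}")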


def _check_vision_model_config(cfg: Any, console: Any, verbose: bool = False) -> None:
    """Check vision model configuration when image analysis is enabled.

    Args:
        cfg: Configuration object
        console: Rich console for output
        verbose: Whether to show extra details
    """
    # Only check if image analysis is enabled
    if not (cfg.image.alt_enabled or cfg.image.desc_enabled):
        return

    # Check if LLM is enabled
    if not cfg.llm.enabled:
        from rich.panel import Panel

        warning_text = (
            "[yellow]⚠ Image analysis (--alt/--desc) requires LLM to be enabled.[/yellow]\n\n"
            "[dim]Image alt text and descriptions will be skipped without LLM.[/dim]\n\n"
            "To enable LLM processing:\n"
            " [cyan]markitai --llm ...[/cyan] or use [cyan]--preset rich/standard[/cyan]"
        )
        console.print(Panel(warning_text, title="LLM Required", border_style="yellow"))
        return

    # Check if vision-capable models are configured (auto-detect from litellm)
    from markitai.llm import get_model_info_cached

    def is_vision_model(model_config: Any) -> bool:
        """Check if model supports vision (config override or auto-detect)."""
        if (
            model_config.model_info
            and model_config.model_info.supports_vision is not None
        ):
            return model_config.model_info.supports_vision
        info = get_model_info_cached(model_config.litellm_params.model)
        return info.get("supports_vision", False)

    vision_models = [m for m in cfg.llm.model_list if is_vision_model(m)]

    if not vision_models and cfg.llm.model_list:
        from rich.panel import Panel

        # List configured models
        configured_models = ", ".join(
            [m.litellm_params.model for m in cfg.llm.model_list[:3]]
        )
        if len(cfg.llm.model_list) > 3:
            configured_models += f" (+{len(cfg.llm.model_list) - 3} more)"

        warning_text = (
            "[yellow]⚠ No vision-capable models detected.[/yellow]\n\n"
            f"[dim]Current models: {configured_models}[/dim]\n"
            "[dim]Vision models are auto-detected from litellm. "
            "Add `supports_vision: true` in config to override.[/dim]"
        )
        console.print(
            Panel(warning_text, title="Vision Model Recommended", border_style="yellow")
        )
    elif verbose and vision_models:
        # In verbose mode, show which vision models are configured
        model_names = [m.litellm_params.model for m in vision_models]
        count = len(model_names)
        if count <= 3:
            logger.debug(
                f"Vision models configured: {count} ({', '.join(model_names)})"
            )
        else:
            preview = ", ".join(model_names[:3])
            logger.debug(f"Vision models configured: {count} ({preview}, ...)")


def _check_agent_browser_for_urls(cfg: Any, console: Any) -> None:
    """Check agent-browser availability and warn if not ready for URL processing.

    Args:
        cfg: Configuration object
        console: Rich console for output
    """
    from markitai.fetch import FetchStrategy, verify_agent_browser_ready

    # Only check if strategy might use browser
    strategy = (
        cfg.fetch.strategy if hasattr(cfg.fetch, "strategy") else FetchStrategy.AUTO
    )
    if strategy == FetchStrategy.STATIC or strategy == FetchStrategy.JINA:
        return  # No browser needed

    # Get command from config
    command = "agent-browser"
    if hasattr(cfg, "agent_browser") and hasattr(cfg.agent_browser, "command"):
        command = cfg.agent_browser.command

    is_ready, message = verify_agent_browser_ready(command, use_cache=True)

    if not is_ready:
        from rich.panel import Panel

        warning_text = (
            f"[yellow]⚠ {message}[/yellow]\n\n"
            "[dim]URL processing will fall back to static fetch strategy.\n"
            "For JavaScript-rendered pages (Twitter/X, etc.), browser support is recommended.\n\n"
            "To install browser support:[/dim]\n"
            " [cyan]agent-browser install[/cyan] [dim]or[/dim] [cyan]npx playwright install chromium[/cyan]"
        )
        console.print(
            Panel(warning_text, title="Browser Not Available", border_style="yellow")
        )
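

# Illustrative sketch (not part of the released module): which strategies skip the
# browser check above. Only static and Jina Reader fetches never need agent-browser;
# the auto and browser strategies may fall back to it.
def _example_strategy_needs_browser(strategy: FetchStrategy) -> bool:
    from markitai.fetch import FetchStrategy

    return strategy not in (FetchStrategy.STATIC, FetchStrategy.JINA)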


def _warn_case_sensitivity_mismatches(
    files: list[Path],
    input_dir: Path,
    patterns: list[str],
) -> None:
    """Warn about files that would match patterns if case-insensitive.

    This helps users catch cases where e.g., '*.jpg' doesn't match 'IMAGE.JPG'
    because pattern matching is case-sensitive on most platforms.

    Args:
        files: List of files discovered for processing
        input_dir: Base input directory for relative path calculation
        patterns: List of --no-cache-for patterns
    """
    import fnmatch

    # Collect potential case mismatches
    mismatches: list[tuple[str, str]] = []  # (file_path, pattern)

    for f in files:
        try:
            rel_path = f.relative_to(input_dir).as_posix()
        except ValueError:
            rel_path = f.name

        for pattern in patterns:
            # Normalize pattern
            norm_pattern = pattern.replace("\\", "/")

            # Check if it would match case-insensitively but not case-sensitively
            if not fnmatch.fnmatch(rel_path, norm_pattern):
                if fnmatch.fnmatch(rel_path.lower(), norm_pattern.lower()):
                    mismatches.append((rel_path, pattern))

    if mismatches:
        # Group by pattern for cleaner output
        by_pattern: dict[str, list[str]] = {}
        for file_path, pattern in mismatches:
            by_pattern.setdefault(pattern, []).append(file_path)

        # Log warning
        logger.warning(
            f"[Cache] Case-sensitivity: {len(mismatches)} file(s) would match "
            "--no-cache-for patterns if case-insensitive"
        )

        # Show details in console
        console.print(
            f"[yellow]Warning: {len(mismatches)} file(s) have case mismatches "
            "with --no-cache-for patterns[/yellow]"
        )
        for pattern, file_paths in by_pattern.items():
            console.print(f" Pattern: [cyan]{pattern}[/cyan]")
            for fp in file_paths[:3]:  # Show max 3 examples
                console.print(f" - {fp}")
            if len(file_paths) > 3:
                console.print(f" ... and {len(file_paths) - 3} more")
        console.print(
            "[dim]Hint: Pattern matching is case-sensitive. "
            "Use exact case or patterns like '*.[jJ][pP][gG]'[/dim]"
        )
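

# Illustrative sketch (not part of the released module): the mismatch test above in
# isolation. On a case-sensitive platform, '*.jpg' does not match 'scans/IMAGE.JPG'
# as-is but does once both sides are lower-cased, which is what triggers the warning.
def _example_is_case_mismatch(rel_path: str = "scans/IMAGE.JPG", pattern: str = "*.jpg") -> bool:
    import fnmatch

    return not fnmatch.fnmatch(rel_path, pattern) and fnmatch.fnmatch(
        rel_path.lower(), pattern.lower()
    )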


def _create_process_file(
    cfg: MarkitaiConfig,
    input_dir: Path,
    output_dir: Path,
    preconverted_map: dict[Path, Path],
    shared_processor: LLMProcessor | None,
):
    """Create a process_file function using workflow/core pipeline.

    This factory function creates a closure that captures the batch processing
    context for conversion.

    Args:
        cfg: Markitai configuration
        input_dir: Input directory for relative path calculation
        output_dir: Output directory
        preconverted_map: Map of pre-converted legacy Office files
        shared_processor: Shared LLM processor for batch mode

    Returns:
        An async function that processes a single file and returns ProcessResult
    """
    from markitai.batch import ProcessResult
    from markitai.workflow.core import ConversionContext, convert_document_core

    async def process_file(file_path: Path) -> ProcessResult:
        """Process a single file using workflow/core pipeline."""
        import time

        start_time = time.perf_counter()
        logger.info(f"[START] {file_path.name}")

        try:
            # Calculate relative path to preserve directory structure
            try:
                rel_path = file_path.parent.relative_to(input_dir)
                file_output_dir = output_dir / rel_path
            except ValueError:
                file_output_dir = output_dir

            # Create conversion context
            ctx = ConversionContext(
                input_path=file_path,
                output_dir=file_output_dir,
                config=cfg,
                actual_file=preconverted_map.get(file_path),
                shared_processor=shared_processor,
                project_dir=output_dir.parent,
                use_multiprocess_images=True,
                input_base_dir=input_dir,
            )

            # Run core conversion pipeline
            result = await convert_document_core(ctx, MAX_DOCUMENT_SIZE)

            total_time = time.perf_counter() - start_time

            if not result.success:
                logger.error(
                    f"[FAIL] {file_path.name}: {result.error} ({total_time:.2f}s)"
                )
                return ProcessResult(success=False, error=result.error)

            if result.skip_reason == "exists":
                logger.info(
                    f"[SKIP] Output exists: {file_output_dir / f'{file_path.name}.md'}"
                )
                return ProcessResult(
                    success=True,
                    output_path=str(file_output_dir / f"{file_path.name}.md"),
                    error="skipped (exists)",
                )

            # Determine cache hit
            cache_hit = cfg.llm.enabled and not ctx.llm_usage

            logger.info(
                f"[DONE] {file_path.name}: {total_time:.2f}s "
                f"(images={ctx.embedded_images_count}, screenshots={ctx.screenshots_count}, cost=${ctx.llm_cost:.4f})"
                + (" [cache]" if cache_hit else "")
            )

            return ProcessResult(
                success=True,
                output_path=str(
                    ctx.output_file.with_suffix(".llm.md")
                    if cfg.llm.enabled and ctx.output_file
                    else ctx.output_file
                ),
                images=ctx.embedded_images_count,
                screenshots=ctx.screenshots_count,
                cost_usd=ctx.llm_cost,
                llm_usage=ctx.llm_usage,
                image_analysis_result=ctx.image_analysis,
                cache_hit=cache_hit,
            )

        except Exception as e:
            total_time = time.perf_counter() - start_time
            logger.error(f"[FAIL] {file_path.name}: {e} ({total_time:.2f}s)")
            return ProcessResult(success=False, error=str(e))

    return process_file
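

# Illustrative usage sketch (not part of the released module): building and invoking
# the closure for one file outside of batch mode. Directories are hypothetical; in the
# real flow process_batch wires this up and wraps each call in process_file_with_state.
async def _example_process_one_file(cfg: MarkitaiConfig) -> None:
    process_file = _create_process_file(
        cfg=cfg,
        input_dir=Path("docs"),
        output_dir=Path("out"),
        preconverted_map={},  # no legacy Office pre-conversion
        shared_processor=None,  # let the pipeline create its own LLMProcessor
    )
    result = await process_file(Path("docs/handbook.pdf"))
    logger.debug(f"success={result.success} output={result.output_path}")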


def _create_url_processor(
    cfg: MarkitaiConfig,
    output_dir: Path,
    fetch_strategy: FetchStrategy | None,
    explicit_fetch_strategy: bool,
    shared_processor: LLMProcessor | None = None,
) -> Callable:
    """Create a URL processing function for batch processing.

    Args:
        cfg: Configuration
        output_dir: Output directory
        fetch_strategy: Fetch strategy to use
        explicit_fetch_strategy: Whether strategy was explicitly specified
        shared_processor: Optional shared LLMProcessor

    Returns:
        Async function that processes a single URL and returns ProcessResult
    """
    from markitai.batch import ProcessResult
    from markitai.fetch import (
        AgentBrowserNotFoundError,
        FetchError,
        FetchStrategy,
        JinaRateLimitError,
        fetch_url,
        get_fetch_cache,
    )
    from markitai.image import download_url_images

    # Determine fetch strategy (use config default if not specified)
    _fetch_strategy = fetch_strategy
    if _fetch_strategy is None:
        _fetch_strategy = FetchStrategy(cfg.fetch.strategy)

    # Initialize fetch cache for URL processing
    url_fetch_cache = None
    if cfg.cache.enabled:
        url_cache_dir = output_dir.parent / ".markitai"
        url_fetch_cache = get_fetch_cache(url_cache_dir, cfg.cache.max_size_bytes)

    # Prepare screenshot directory if enabled
    url_screenshot_dir = (
        ensure_screenshots_dir(output_dir) if cfg.screenshot.enabled else None
    )

    async def process_url(
        url: str,
        source_file: Path,
        custom_name: str | None = None,
    ) -> tuple[ProcessResult, dict[str, Any]]:
        """Process a single URL.

        Args:
            url: URL to process
            source_file: Path to the .urls file containing this URL
            custom_name: Optional custom output name

        Returns:
            Tuple of (ProcessResult, extra_info dict with fetch_strategy)
        """
        import time

        start_time = time.perf_counter()
        extra_info: dict[str, Any] = {
            "fetch_strategy": "unknown",
        }

        try:
            # Generate filename
            if custom_name:
                filename = f"{custom_name}.md"
            else:
                filename = url_to_filename(url)

            logger.info(f"[URL] Processing: {url} (strategy: {_fetch_strategy.value})")

            # Fetch URL using the configured strategy
            try:
                fetch_result = await fetch_url(
                    url,
                    _fetch_strategy,
                    cfg.fetch,
                    explicit_strategy=explicit_fetch_strategy,
                    cache=url_fetch_cache,
                    skip_read_cache=cfg.cache.no_cache,
                    screenshot=cfg.screenshot.enabled,
                    screenshot_dir=url_screenshot_dir,
                    screenshot_config=cfg.screenshot
                    if cfg.screenshot.enabled
                    else None,
                )
                extra_info["fetch_strategy"] = fetch_result.strategy_used
                original_markdown = fetch_result.content
                screenshot_path = fetch_result.screenshot_path
                cache_status = " [cache]" if fetch_result.cache_hit else ""
                logger.debug(
                    f"[URL] Fetched via {fetch_result.strategy_used}{cache_status}: {url}"
                )
            except AgentBrowserNotFoundError:
                logger.error(f"[URL] agent-browser not installed for: {url}")
                return ProcessResult(
                    success=False,
                    error="agent-browser not installed",
                ), extra_info
            except JinaRateLimitError:
                logger.error(f"[URL] Jina rate limit exceeded for: {url}")
                return ProcessResult(
                    success=False,
                    error="Jina Reader rate limit exceeded (20 RPM)",
                ), extra_info
            except FetchError as e:
                logger.error(f"[URL] Fetch failed {url}: {e}")
                return ProcessResult(success=False, error=str(e)), extra_info

            if not original_markdown.strip():
                logger.warning(f"[URL] No content: {url}")
                return ProcessResult(
                    success=False,
                    error="No content extracted",
                ), extra_info

            markdown_for_llm = original_markdown

            # Check for multi-source content (static + browser + screenshot)
            has_multi_source = (
                fetch_result.static_content is not None
                or fetch_result.browser_content is not None
            )
            has_screenshot = screenshot_path and screenshot_path.exists()

            logger.debug(
                f"[URL] Multi-source check: static={fetch_result.static_content is not None}, "
                f"browser={fetch_result.browser_content is not None}, "
                f"has_multi_source={has_multi_source}, has_screenshot={has_screenshot}"
            )

            # Download images if --alt or --desc is enabled
            images_count = 0
            screenshots_count = 1 if has_screenshot else 0
            downloaded_images: list[Path] = []

            if has_screenshot and screenshot_path:
                logger.debug(f"[URL] Screenshot captured: {screenshot_path.name}")
            if cfg.image.alt_enabled or cfg.image.desc_enabled:
                download_result = await download_url_images(
                    markdown=original_markdown,
                    output_dir=output_dir,
                    base_url=url,
                    config=cfg.image,
                    source_name=filename.replace(".md", ""),
                    concurrency=5,
                    timeout=30,
                )
                markdown_for_llm = download_result.updated_markdown
                downloaded_images = download_result.downloaded_paths
                images_count = len(downloaded_images)

            # Generate output path
            base_output_file = output_dir / filename
            output_file = resolve_output_path(base_output_file, cfg.output.on_conflict)

            if output_file is None:
                logger.info(f"[URL] Skipped (exists): {base_output_file}")
                return ProcessResult(
                    success=True,
                    output_path=str(base_output_file),
                    error="skipped (exists)",
                ), extra_info

            # Write base .md file with original content
            base_content = _add_basic_frontmatter(
                original_markdown,
                url,
                fetch_strategy=fetch_result.strategy_used if fetch_result else None,
                screenshot_path=screenshot_path,
                output_dir=output_dir,
            )
            atomic_write_text(output_file, base_content)

            # LLM processing uses markdown with local image paths
            url_llm_usage: dict[str, dict[str, Any]] = {}
            llm_cost = 0.0
            img_analysis = None

            if cfg.llm.enabled:
                # Check if image analysis should run
                should_analyze_images = (
                    cfg.image.alt_enabled or cfg.image.desc_enabled
                ) and downloaded_images

                # Check if we should use vision enhancement (multi-source + screenshot)
                use_vision_enhancement = (
                    has_multi_source and has_screenshot and screenshot_path
                )

                if use_vision_enhancement:
                    # Multi-source URL with screenshot: use vision LLM for better content extraction
                    # Build multi-source markdown content for LLM
                    multi_source_content = _build_multi_source_content(
                        fetch_result.static_content,
                        fetch_result.browser_content,
                        markdown_for_llm,  # Fallback primary content
                    )

                    logger.info(
                        f"[URL] Using vision enhancement for multi-source URL: {url}"
                    )

                    # Use vision enhancement with screenshot
                    assert (
                        screenshot_path is not None
                    )  # Guaranteed by use_vision_enhancement check
                    _, cost, url_llm_usage = await _process_url_with_vision(
                        multi_source_content,
                        screenshot_path,
                        url,
                        cfg,
                        output_file,
                        processor=shared_processor,
                        project_dir=output_dir.parent,
                    )
                    llm_cost = cost

                    # Run image analysis in parallel if needed
                    if should_analyze_images:
                        (
                            _,
                            image_cost,
                            image_usage,
                            img_analysis,
                        ) = await analyze_images_with_llm(
                            downloaded_images,
                            multi_source_content,
                            output_file,
                            cfg,
                            Path(url),
                            concurrency_limit=cfg.llm.concurrency,
                            processor=shared_processor,
                            project_dir=output_dir.parent,
                        )
                        _merge_llm_usage(url_llm_usage, image_usage)
                        llm_cost += image_cost
                elif should_analyze_images:
                    # Standard processing with image analysis
                    doc_task = process_with_llm(
                        markdown_for_llm,
                        url,
                        cfg,
                        output_file,
                        processor=shared_processor,
                        project_dir=output_dir.parent,
                    )
                    img_task = analyze_images_with_llm(
                        downloaded_images,
                        markdown_for_llm,
                        output_file,
                        cfg,
                        Path(url),  # Use URL as source path
                        concurrency_limit=cfg.llm.concurrency,
                        processor=shared_processor,
                        project_dir=output_dir.parent,
                    )

                    # Execute in parallel
                    doc_result, img_result = await asyncio.gather(doc_task, img_task)

                    # Unpack results
                    _, cost, url_llm_usage = doc_result
                    _, image_cost, image_usage, img_analysis = img_result

                    _merge_llm_usage(url_llm_usage, image_usage)
                    llm_cost = cost + image_cost
                else:
                    # Only document processing
                    _, cost, url_llm_usage = await process_with_llm(
                        markdown_for_llm,
                        url,
                        cfg,
                        output_file,
                        processor=shared_processor,
                        project_dir=output_dir.parent,
                    )
                    llm_cost = cost

            # Track cache hit: LLM enabled but no usage means cache hit
            is_cache_hit = cfg.llm.enabled and not url_llm_usage

            total_time = time.perf_counter() - start_time
            logger.info(
                f"[URL] Completed via {extra_info['fetch_strategy']}: {url} "
                f"({total_time:.2f}s)" + (" [cache]" if is_cache_hit else "")
            )

            return ProcessResult(
                success=True,
                output_path=str(
                    output_file.with_suffix(".llm.md")
                    if cfg.llm.enabled
                    else output_file
                ),
                images=images_count,
                screenshots=screenshots_count,
                cost_usd=llm_cost,
                llm_usage=url_llm_usage,
                image_analysis_result=img_analysis,
                cache_hit=is_cache_hit,
            ), extra_info

        except Exception as e:
            total_time = time.perf_counter() - start_time
            logger.error(f"[URL] Failed {url}: {e} ({total_time:.2f}s)")
            return ProcessResult(success=False, error=str(e)), extra_info

    return process_url
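

# Illustrative usage sketch (not part of the released module): fetching a single URL
# through the closure above. The URL and .urls source path are hypothetical; passing
# fetch_strategy=None falls back to cfg.fetch.strategy.
async def _example_process_one_url(cfg: MarkitaiConfig) -> None:
    process_url = _create_url_processor(
        cfg=cfg,
        output_dir=Path("out"),
        fetch_strategy=None,
        explicit_fetch_strategy=False,
        shared_processor=None,
    )
    result, extra = await process_url("https://example.com/post", Path("links.urls"))
    logger.debug(f"fetched via {extra['fetch_strategy']}: success={result.success}")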


async def process_batch(
    input_dir: Path,
    output_dir: Path,
    cfg: MarkitaiConfig,
    resume: bool,
    dry_run: bool,
    verbose: bool = False,
    console_handler_id: int | None = None,
    log_file_path: Path | None = None,
    fetch_strategy: FetchStrategy | None = None,
    explicit_fetch_strategy: bool = False,
) -> None:
    """Process directory in batch mode."""
    from markitai.batch import BatchProcessor

    # Supported extensions
    extensions = set(EXTENSION_MAP.keys())

    # Build task options for report (before BatchProcessor init for hash calculation)
    # Note: input_dir and output_dir will be converted to absolute paths by init_state()
    task_options: dict[str, Any] = {
        "concurrency": cfg.batch.concurrency,
        "llm": cfg.llm.enabled,
        "ocr": cfg.ocr.enabled,
        "screenshot": cfg.screenshot.enabled,
        "alt": cfg.image.alt_enabled,
        "desc": cfg.image.desc_enabled,
    }
    if cfg.llm.enabled and cfg.llm.model_list:
        task_options["models"] = [m.litellm_params.model for m in cfg.llm.model_list]

    batch = BatchProcessor(
        cfg.batch,
        output_dir,
        input_path=input_dir,
        log_file=log_file_path,
        on_conflict=cfg.output.on_conflict,
        task_options=task_options,
    )
    files = batch.discover_files(input_dir, extensions)

    # Discover .urls files for URL batch processing
    from markitai.urls import find_url_list_files, parse_url_list

    url_list_files = find_url_list_files(input_dir)
    url_entries_from_files: list = []  # List of (source_file, UrlEntry)

    for url_file in url_list_files:
        try:
            entries = parse_url_list(url_file)
            for entry in entries:
                url_entries_from_files.append((url_file, entry))
            if entries:
                logger.info(f"Found {len(entries)} URLs in {url_file.name}")
        except Exception as e:
            logger.warning(f"Failed to parse URL list {url_file}: {e}")

    # Check agent-browser availability if URLs will be processed
    if url_entries_from_files:
        _check_agent_browser_for_urls(cfg, console)

    if not files and not url_entries_from_files:
        console.print("[yellow]No supported files or URL lists found.[/yellow]")
        raise SystemExit(0)

    # Warn about potential case-sensitivity mismatches in --no-cache-for patterns
    if cfg.cache.no_cache_patterns:
        _warn_case_sensitivity_mismatches(files, input_dir, cfg.cache.no_cache_patterns)

    from markitai.security import check_symlink_safety

    check_symlink_safety(output_dir, allow_symlinks=cfg.output.allow_symlinks)
    ensure_dir(output_dir)

    if dry_run:
        # Build dry run message
        cache_status = "enabled" if cfg.cache.enabled else "disabled"
        dry_run_msg = f"[yellow]Would process {len(files)} files[/yellow]"
        if url_entries_from_files:
            dry_run_msg += f"\n[yellow]Would process {len(url_entries_from_files)} URLs from {len(url_list_files)} .urls files[/yellow]"
        dry_run_msg += f"\n[yellow]Input:[/yellow] {input_dir}\n[yellow]Output:[/yellow] {output_dir}"
        dry_run_msg += f"\n[yellow]Cache:[/yellow] {cache_status}"

        console.print(Panel(dry_run_msg, title="Dry Run"))
        for f in files[:10]:
            console.print(f" - {f.name}")
        if len(files) > 10:
            console.print(f" ... and {len(files) - 10} more files")
        if url_entries_from_files:
            console.print("[dim]URL list files:[/dim]")
            for url_file in url_list_files[:5]:
                console.print(f" - {url_file.name}")
            if len(url_list_files) > 5:
                console.print(f" ... and {len(url_list_files) - 5} more .urls files")
        if cfg.cache.enabled:
            console.print(
                "[dim]Tip: Use 'markitai cache stats -v' to view cached entries[/dim]"
            )
        raise SystemExit(0)

    # Record batch start time before any processing (including pre-conversion)
    from datetime import datetime

    batch_started_at = datetime.now().astimezone().isoformat()

    # Start Live display early to capture all logs (including URL processing)
    # This ensures all INFO+ logs go to the panel instead of console
    batch.start_live_display(
        verbose=verbose,
        console_handler_id=console_handler_id,
        total_files=len(files),
        total_urls=len(url_entries_from_files),
    )

    # Pre-convert legacy Office files using batch COM (Windows only)
    # This reduces overhead by starting each Office app only once
    legacy_suffixes = {".doc", ".ppt", ".xls"}
    legacy_files = [f for f in files if f.suffix.lower() in legacy_suffixes]
    preconverted_map: dict[Path, Path] = {}
    preconvert_temp_dir: tempfile.TemporaryDirectory | None = None

    if legacy_files:
        import platform

        if platform.system() == "Windows":
            from markitai.converter.legacy import batch_convert_legacy_files

            # Create temp directory for pre-converted files
            preconvert_temp_dir = tempfile.TemporaryDirectory(
                prefix="markitai_preconv_"
            )
            preconvert_path = Path(preconvert_temp_dir.name)

            logger.info(f"Pre-converting {len(legacy_files)} legacy files...")
            preconverted_map = batch_convert_legacy_files(legacy_files, preconvert_path)
            if preconverted_map:
                logger.info(
                    f"Pre-converted {len(preconverted_map)}/{len(legacy_files)} files with MS Office COM"
                )

    # Create shared LLM runtime and processor for batch mode
    shared_processor = None
    if cfg.llm.enabled:
        from markitai.llm import LLMRuntime

        runtime = LLMRuntime(concurrency=cfg.llm.concurrency)
        # Use output directory's parent as project dir for project-level cache
        project_dir = output_dir.parent if output_dir else Path.cwd()
        shared_processor = create_llm_processor(
            cfg, project_dir=project_dir, runtime=runtime
        )
        logger.info(
            f"Created shared LLMProcessor with concurrency={cfg.llm.concurrency}"
        )

    # Create process_file using workflow/core implementation
    process_file = _create_process_file(
        cfg=cfg,
        input_dir=input_dir,
        output_dir=output_dir,
        preconverted_map=preconverted_map,
        shared_processor=shared_processor,
    )
    logger.debug("Using workflow/core implementation for batch processing")

    # Initialize state for URL tracking
    from markitai.batch import FileStatus, UrlState

    # Group URL entries by source file and collect source file list
    url_sources_set: set[str] = set()
    if url_entries_from_files:
        for source_file, _entry in url_entries_from_files:
            url_sources_set.add(str(source_file))

    # Initialize batch state with files
    if files or url_entries_from_files:
        batch.state = batch.init_state(
            input_dir=input_dir,
            files=files,
            options=task_options,
            started_at=batch_started_at,
        )
        # Add URL source files to state
        batch.state.url_sources = list(url_sources_set)

        # Initialize URL states in batch state
        for source_file, entry in url_entries_from_files:
            batch.state.urls[entry.url] = UrlState(
                url=entry.url,
                source_file=str(source_file),
                status=FileStatus.PENDING,
            )

    # Create URL processor function
    url_processor = None
    if url_entries_from_files:
        url_processor = _create_url_processor(
            cfg=cfg,
            output_dir=output_dir,
            fetch_strategy=fetch_strategy,
            explicit_fetch_strategy=explicit_fetch_strategy,
            shared_processor=shared_processor,
        )

    # Create separate semaphores for file and URL processing
    # This allows file processing and URL fetching to run at their own concurrency levels
    file_semaphore = asyncio.Semaphore(cfg.batch.concurrency)
    url_semaphore = asyncio.Semaphore(cfg.batch.url_concurrency)

    async def process_url_with_state(
        url: str,
        source_file: Path,
        custom_name: str | None,
    ) -> None:
        """Process a URL and update batch state."""
        assert batch.state is not None
        assert url_processor is not None

        url_state = batch.state.urls.get(url)
        if url_state is None:
            return

        # Update state to in_progress
        url_state.status = FileStatus.IN_PROGRESS
        url_state.started_at = datetime.now().astimezone().isoformat()

        start_time = asyncio.get_event_loop().time()

        try:
            async with url_semaphore:
                result, extra_info = await url_processor(url, source_file, custom_name)

            if result.success:
                url_state.status = FileStatus.COMPLETED
                url_state.output = result.output_path
                url_state.fetch_strategy = extra_info.get("fetch_strategy")
                url_state.images = result.images
                url_state.cost_usd = result.cost_usd
                url_state.llm_usage = result.llm_usage
                url_state.cache_hit = result.cache_hit
                # Collect image analysis for JSON output
                if result.image_analysis_result is not None:
                    batch.image_analysis_results.append(result.image_analysis_result)
            else:
                url_state.status = FileStatus.FAILED
                url_state.error = result.error

        except Exception as e:
            url_state.status = FileStatus.FAILED
            url_state.error = str(e)
            logger.error(f"[URL] Failed {url}: {e}")

        finally:
            end_time = asyncio.get_event_loop().time()
            url_state.completed_at = datetime.now().astimezone().isoformat()
            url_state.duration = end_time - start_time

            # Update progress
            batch.update_url_status(url, completed=True)

            # Save state (non-blocking, throttled)
            await asyncio.to_thread(batch.save_state)

    async def process_file_with_state(file_path: Path) -> None:
        """Process a file and update batch state."""
        assert batch.state is not None

        file_key = str(file_path)
        file_state = batch.state.files.get(file_key)

        if file_state is None:
            return

        # Update state to in_progress
        file_state.status = FileStatus.IN_PROGRESS
        file_state.started_at = datetime.now().astimezone().isoformat()

        start_time = asyncio.get_event_loop().time()

        try:
            async with file_semaphore:
                result = await process_file(file_path)

            if result.success:
                file_state.status = FileStatus.COMPLETED
                file_state.output = result.output_path
                file_state.images = result.images
                file_state.screenshots = result.screenshots
                file_state.cost_usd = result.cost_usd
                file_state.llm_usage = result.llm_usage
                file_state.cache_hit = result.cache_hit
                # Collect image analysis for JSON output
                if result.image_analysis_result is not None:
                    batch.image_analysis_results.append(result.image_analysis_result)
            else:
                file_state.status = FileStatus.FAILED
                file_state.error = result.error

        except Exception as e:
            file_state.status = FileStatus.FAILED
            file_state.error = str(e)
            logger.error(f"[FAIL] {file_path.name}: {e}")

        finally:
            end_time = asyncio.get_event_loop().time()
            file_state.completed_at = datetime.now().astimezone().isoformat()
            file_state.duration = end_time - start_time

            # Update progress
            batch.advance_progress()

            # Save state (non-blocking, throttled)
            await asyncio.to_thread(batch.save_state)

    # Run all tasks in parallel (URLs + files)
    state = batch.state
    try:
        if files or url_entries_from_files:
            # Build task list
            all_tasks = []

            # Add URL tasks
            for source_file, entry in url_entries_from_files:
                all_tasks.append(
                    process_url_with_state(entry.url, source_file, entry.output_name)
                )

            # Add file tasks
            for file_path in files:
                all_tasks.append(process_file_with_state(file_path))

            if all_tasks:
                logger.info(
                    f"Processing {len(files)} files and {len(url_entries_from_files)} URLs "
                    f"with concurrency {cfg.batch.concurrency}"
                )

                # Run all tasks in parallel
                await asyncio.gather(*all_tasks, return_exceptions=True)

    finally:
        # Stop Live display and restore console handler
        # This must be done before printing summary
        batch.stop_live_display()

        # Clean up pre-conversion temp directory
        if preconvert_temp_dir is not None:
            preconvert_temp_dir.cleanup()

    if state:
        # Update state timestamp
        state.updated_at = datetime.now().astimezone().isoformat()
        batch.save_state(force=True)

        # Print summary (uses state for URL stats)
        batch.print_summary(
            url_completed=state.completed_urls_count,
            url_failed=state.failed_urls_count,
            url_cache_hits=sum(
                1
                for u in state.urls.values()
                if u.status == FileStatus.COMPLETED and u.cache_hit
            ),
            url_sources=len(state.url_sources),
        )

    # Write aggregated image analysis JSON (if any)
    if batch.image_analysis_results and cfg.image.desc_enabled:
        write_images_json(output_dir, batch.image_analysis_results)

    # Save report (logging is done inside save_report)
    batch.save_report()

    # Exit with appropriate code
    total_failed = (state.failed_count if state else 0) + (
        state.failed_urls_count if state else 0
    )
    if total_failed > 0:
        raise SystemExit(10)  # PARTIAL_FAILURE
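

# Illustrative usage sketch (not part of the released module): driving the batch
# pipeline directly from Python instead of the click entry point. Paths and the
# config object are hypothetical; the CLI normally resolves them from flags.
def _example_run_batch(cfg: MarkitaiConfig) -> None:
    asyncio.run(
        process_batch(
            input_dir=Path("docs"),
            output_dir=Path("out"),
            cfg=cfg,
            resume=False,
            dry_run=False,
        )
    )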


if __name__ == "__main__":
    app()