markitai-0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. markitai/__init__.py +3 -0
  2. markitai/batch.py +1316 -0
  3. markitai/cli.py +3979 -0
  4. markitai/config.py +602 -0
  5. markitai/config.schema.json +748 -0
  6. markitai/constants.py +222 -0
  7. markitai/converter/__init__.py +49 -0
  8. markitai/converter/_patches.py +98 -0
  9. markitai/converter/base.py +164 -0
  10. markitai/converter/image.py +181 -0
  11. markitai/converter/legacy.py +606 -0
  12. markitai/converter/office.py +526 -0
  13. markitai/converter/pdf.py +679 -0
  14. markitai/converter/text.py +63 -0
  15. markitai/fetch.py +1725 -0
  16. markitai/image.py +1335 -0
  17. markitai/json_order.py +550 -0
  18. markitai/llm.py +4339 -0
  19. markitai/ocr.py +347 -0
  20. markitai/prompts/__init__.py +159 -0
  21. markitai/prompts/cleaner.md +93 -0
  22. markitai/prompts/document_enhance.md +77 -0
  23. markitai/prompts/document_enhance_complete.md +65 -0
  24. markitai/prompts/document_process.md +60 -0
  25. markitai/prompts/frontmatter.md +28 -0
  26. markitai/prompts/image_analysis.md +21 -0
  27. markitai/prompts/image_caption.md +8 -0
  28. markitai/prompts/image_description.md +13 -0
  29. markitai/prompts/page_content.md +17 -0
  30. markitai/prompts/url_enhance.md +78 -0
  31. markitai/security.py +286 -0
  32. markitai/types.py +30 -0
  33. markitai/urls.py +187 -0
  34. markitai/utils/__init__.py +33 -0
  35. markitai/utils/executor.py +69 -0
  36. markitai/utils/mime.py +85 -0
  37. markitai/utils/office.py +262 -0
  38. markitai/utils/output.py +53 -0
  39. markitai/utils/paths.py +81 -0
  40. markitai/utils/text.py +359 -0
  41. markitai/workflow/__init__.py +37 -0
  42. markitai/workflow/core.py +760 -0
  43. markitai/workflow/helpers.py +509 -0
  44. markitai/workflow/single.py +369 -0
  45. markitai-0.3.0.dist-info/METADATA +159 -0
  46. markitai-0.3.0.dist-info/RECORD +48 -0
  47. markitai-0.3.0.dist-info/WHEEL +4 -0
  48. markitai-0.3.0.dist-info/entry_points.txt +2 -0
markitai/fetch.py ADDED
@@ -0,0 +1,1725 @@
"""URL fetch module for handling static and JS-rendered pages.

This module provides a unified interface for fetching web pages using different
strategies:
- static: Direct HTTP request via markitdown (default, fastest)
- browser: Headless browser via agent-browser (for JS-rendered pages)
- jina: Jina Reader API (cloud-based, no local dependencies)
- auto: Auto-detect and fallback (tries static first, then browser/jina)

Example usage:
    from markitai.fetch import fetch_url, FetchStrategy

    # Auto-detect strategy
    result = await fetch_url("https://example.com", FetchStrategy.AUTO, config.fetch)

    # Force browser rendering
    result = await fetch_url("https://x.com/...", FetchStrategy.BROWSER, config.fetch)
"""

from __future__ import annotations

import asyncio
import hashlib
import json
import re
import shutil
import sqlite3
import time
import uuid
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
from typing import TYPE_CHECKING, Any
from urllib.parse import urlparse

from loguru import logger

from markitai.constants import (
    DEFAULT_JINA_BASE_URL,
    JS_REQUIRED_PATTERNS,
)

if TYPE_CHECKING:
    from markitai.config import FetchConfig, ScreenshotConfig


class FetchStrategy(Enum):
    """URL fetch strategy."""

    AUTO = "auto"
    STATIC = "static"
    BROWSER = "browser"
    JINA = "jina"


class FetchError(Exception):
    """Base exception for fetch errors."""

    pass


class AgentBrowserNotFoundError(FetchError):
    """Raised when agent-browser is not installed."""

    def __init__(self) -> None:
        super().__init__(
            "agent-browser is not installed. Install with: npm install -g agent-browser && agent-browser install"
        )


class JinaRateLimitError(FetchError):
    """Raised when Jina Reader API rate limit is exceeded."""

    def __init__(self) -> None:
        super().__init__(
            "Jina Reader rate limit exceeded (free tier: 20 RPM). "
            "Try again later or use --agent-browser for browser rendering."
        )


class JinaAPIError(FetchError):
    """Raised when Jina Reader API returns an error."""

    def __init__(self, status_code: int, message: str) -> None:
        self.status_code = status_code
        super().__init__(f"Jina Reader API error ({status_code}): {message}")


@dataclass
class FetchResult:
    """Result of a URL fetch operation.

    Supports multi-source content for URL fetching:
    - content: Primary markdown content (best available)
    - static_content: Content from static/jina fetch (pure text)
    - browser_content: Content from browser fetch (rendered page)
    - screenshot_path: Full-page screenshot (visual reference)

    For LLM processing, all three sources can be provided:
    1. static_content - Clean text, reliable but may miss JS content
    2. browser_content - Rendered content, includes JS but may have noise
    3. screenshot - Visual reference for layout/structure
    """

    content: str  # Primary markdown content (best available)
    strategy_used: str  # Actual strategy used (static/browser/jina)
    title: str | None = None  # Page title if available
    url: str = ""  # Original URL
    final_url: str | None = None  # Final URL after redirects
    metadata: dict = field(default_factory=dict)  # Additional metadata
    cache_hit: bool = False  # Whether result was served from cache
    screenshot_path: Path | None = None  # Path to captured screenshot (if any)
    # Multi-source content for enhanced LLM processing
    static_content: str | None = None  # Content from static fetch
    browser_content: str | None = None  # Content from browser fetch

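# How a caller might turn the three sources above into LLM inputs.
# A minimal sketch; build_llm_inputs is an illustrative helper, not part of
# this module's API.
def build_llm_inputs(result: FetchResult) -> dict[str, str]:
    inputs: dict[str, str] = {"primary": result.content}
    if result.static_content and result.static_content != result.content:
        inputs["static"] = result.static_content  # clean text source
    if result.browser_content and result.browser_content != result.content:
        inputs["browser"] = result.browser_content  # JS-rendered source
    if result.screenshot_path is not None:
        inputs["screenshot"] = str(result.screenshot_path)  # visual reference
    return inputs
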
118
+ class FetchCache:
119
+ """SQLite-based cache for fetch results.
120
+
121
+ Caches the fetched content by URL to avoid repeated network requests.
122
+ Uses the same LRU eviction strategy as LLM cache.
123
+
124
+ Connection reuse: A single connection is reused for all operations
125
+ within the same FetchCache instance to reduce connection overhead.
126
+ Thread safety is ensured by a lock protecting all database operations.
127
+ """
128
+
129
+ def __init__(self, db_path: Path, max_size_bytes: int = 100 * 1024 * 1024) -> None:
130
+ """Initialize fetch cache.
131
+
132
+ Args:
133
+ db_path: Path to SQLite database file
134
+ max_size_bytes: Maximum cache size in bytes (default 100MB)
135
+ """
136
+ import threading
137
+
138
+ self._db_path = db_path
139
+ self._max_size_bytes = max_size_bytes
140
+ self._connection: sqlite3.Connection | None = None
141
+ self._lock = threading.Lock() # Protect database operations
142
+ self._db_path.parent.mkdir(parents=True, exist_ok=True)
143
+ self._init_db()
144
+
145
+ def _get_connection(self) -> sqlite3.Connection:
146
+ """Get or create a reusable database connection.
147
+
148
+ Connection is created on first use and reused for subsequent calls.
149
+ Uses check_same_thread=False to allow cross-thread usage in async context.
150
+ Note: Callers must hold self._lock when calling this method.
151
+ """
152
+ if self._connection is None:
153
+ self._connection = sqlite3.connect(
154
+ str(self._db_path),
155
+ timeout=30.0,
156
+ check_same_thread=False, # Allow cross-thread usage for async
157
+ )
158
+ self._connection.execute("PRAGMA journal_mode=WAL")
159
+ self._connection.execute("PRAGMA synchronous=NORMAL")
160
+ self._connection.row_factory = sqlite3.Row
161
+ return self._connection
162
+
163
+ def close(self) -> None:
164
+ """Close the database connection.
165
+
166
+ Call this during cleanup to release resources.
167
+ """
168
+ if self._connection is not None:
169
+ self._connection.close()
170
+ self._connection = None
171
+
172
+ def _init_db(self) -> None:
173
+ """Initialize database schema."""
174
+ with self._lock:
175
+ conn = self._get_connection()
176
+ conn.execute("""
177
+ CREATE TABLE IF NOT EXISTS fetch_cache (
178
+ key TEXT PRIMARY KEY,
179
+ url TEXT NOT NULL,
180
+ content TEXT NOT NULL,
181
+ strategy_used TEXT NOT NULL,
182
+ title TEXT,
183
+ final_url TEXT,
184
+ metadata TEXT,
185
+ created_at INTEGER NOT NULL,
186
+ accessed_at INTEGER NOT NULL,
187
+ size_bytes INTEGER NOT NULL
188
+ )
189
+ """)
190
+ conn.execute(
191
+ "CREATE INDEX IF NOT EXISTS idx_fetch_accessed ON fetch_cache(accessed_at)"
192
+ )
193
+ conn.execute("CREATE INDEX IF NOT EXISTS idx_fetch_url ON fetch_cache(url)")
194
+ conn.commit()
195
+
196
+ def _compute_hash(self, url: str) -> str:
197
+ """Compute hash key from URL."""
198
+ return hashlib.sha256(url.encode()).hexdigest()[:32]
199
+
200
+ def get(self, url: str) -> FetchResult | None:
201
+ """Get cached fetch result if exists.
202
+
203
+ Args:
204
+ url: URL to look up
205
+
206
+ Returns:
207
+ Cached FetchResult or None if not found
208
+ """
209
+ key = self._compute_hash(url)
210
+ now = int(time.time())
211
+
212
+ with self._lock:
213
+ conn = self._get_connection()
214
+ row = conn.execute(
215
+ "SELECT * FROM fetch_cache WHERE key = ?", (key,)
216
+ ).fetchone()
217
+
218
+ if row:
219
+ # Update accessed_at for LRU tracking
220
+ conn.execute(
221
+ "UPDATE fetch_cache SET accessed_at = ? WHERE key = ?", (now, key)
222
+ )
223
+ conn.commit()
224
+
225
+ metadata = json.loads(row["metadata"]) if row["metadata"] else {}
226
+ logger.debug(f"[FetchCache] Cache hit for URL: {url}")
227
+ return FetchResult(
228
+ content=row["content"],
229
+ strategy_used=row["strategy_used"],
230
+ title=row["title"],
231
+ url=row["url"],
232
+ final_url=row["final_url"],
233
+ metadata=metadata,
234
+ cache_hit=True,
235
+ )
236
+
237
+ return None
238
+
239
+ def set(self, url: str, result: FetchResult) -> None:
240
+ """Cache a fetch result.
241
+
242
+ Args:
243
+ url: URL that was fetched
244
+ result: FetchResult to cache
245
+ """
246
+ key = self._compute_hash(url)
247
+ now = int(time.time())
248
+ metadata_json = json.dumps(result.metadata) if result.metadata else None
249
+ size_bytes = len(result.content.encode("utf-8"))
250
+
251
+ with self._lock:
252
+ conn = self._get_connection()
253
+ # Check current total size
254
+ total_size = conn.execute(
255
+ "SELECT COALESCE(SUM(size_bytes), 0) as total FROM fetch_cache"
256
+ ).fetchone()["total"]
257
+
258
+ # Evict LRU entries if needed
259
+ while total_size + size_bytes > self._max_size_bytes:
260
+ oldest = conn.execute(
261
+ "SELECT key, size_bytes FROM fetch_cache ORDER BY accessed_at ASC LIMIT 1"
262
+ ).fetchone()
263
+
264
+ if oldest is None:
265
+ break
266
+
267
+ conn.execute("DELETE FROM fetch_cache WHERE key = ?", (oldest["key"],))
268
+ total_size -= oldest["size_bytes"]
269
+ logger.debug(f"[FetchCache] Evicted LRU entry: {oldest['key'][:8]}...")
270
+
271
+ # Insert or replace
272
+ conn.execute(
273
+ """
274
+ INSERT OR REPLACE INTO fetch_cache
275
+ (key, url, content, strategy_used, title, final_url, metadata, created_at, accessed_at, size_bytes)
276
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
277
+ """,
278
+ (
279
+ key,
280
+ url,
281
+ result.content,
282
+ result.strategy_used,
283
+ result.title,
284
+ result.final_url,
285
+ metadata_json,
286
+ now,
287
+ now,
288
+ size_bytes,
289
+ ),
290
+ )
291
+ conn.commit()
292
+ logger.debug(f"[FetchCache] Cached URL: {url} ({size_bytes} bytes)")
293
+
294
+ def stats(self) -> dict[str, Any]:
295
+ """Return cache statistics."""
296
+ with self._lock:
297
+ conn = self._get_connection()
298
+ row = conn.execute(
299
+ """
300
+ SELECT COUNT(*) as count, COALESCE(SUM(size_bytes), 0) as size_bytes
301
+ FROM fetch_cache
302
+ """
303
+ ).fetchone()
304
+
305
+ return {
306
+ "count": row["count"],
307
+ "size_bytes": row["size_bytes"],
308
+ "size_mb": round(row["size_bytes"] / (1024 * 1024), 2),
309
+ "max_size_mb": round(self._max_size_bytes / (1024 * 1024), 2),
310
+ "db_path": str(self._db_path),
311
+ }
312
+
313
+ def clear(self) -> int:
314
+ """Clear all entries.
315
+
316
+ Returns:
317
+ Number of entries deleted
318
+ """
319
+ with self._lock:
320
+ conn = self._get_connection()
321
+ count = conn.execute("SELECT COUNT(*) as cnt FROM fetch_cache").fetchone()[
322
+ "cnt"
323
+ ]
324
+ conn.execute("DELETE FROM fetch_cache")
325
+ conn.commit()
326
+ return count
327
+
328
+
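# A round-trip sketch for the cache above; the temp directory stands in for a
# real cache location.
def _demo_fetch_cache() -> None:
    import tempfile

    db_path = Path(tempfile.mkdtemp()) / "fetch_cache.db"
    cache = FetchCache(db_path, max_size_bytes=10 * 1024 * 1024)
    cache.set(
        "https://example.com",
        FetchResult(content="# Example", strategy_used="static", url="https://example.com"),
    )
    hit = cache.get("https://example.com")
    assert hit is not None and hit.cache_hit  # re-reads are served from SQLite
    print(cache.stats())  # e.g. {'count': 1, 'size_bytes': 9, ...}
    cache.clear()
    cache.close()
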
# Global fetch cache instance (initialized lazily)
_fetch_cache: FetchCache | None = None


def get_fetch_cache(
    cache_dir: Path, max_size_bytes: int = 100 * 1024 * 1024
) -> FetchCache:
    """Get or create the global fetch cache instance.

    Args:
        cache_dir: Directory to store cache database
        max_size_bytes: Maximum cache size

    Returns:
        FetchCache instance
    """
    global _fetch_cache
    if _fetch_cache is None:
        db_path = cache_dir / "fetch_cache.db"
        _fetch_cache = FetchCache(db_path, max_size_bytes)
    return _fetch_cache


# Global MarkItDown instance (reused for static fetching)
# Note: MarkItDown's requests.Session is NOT thread-safe. However, since
# fetch_with_static runs in the asyncio event loop (not in a thread pool),
# only one md.convert() call executes at a time, avoiding thread safety issues.
# If fetch_with_static is ever moved to run_in_executor with threads, this
# should be changed to use threading.local() for thread-local instances.
_markitdown_instance: Any = None

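# A sketch of the thread-local variant mentioned above, in case
# fetch_with_static is ever moved onto a thread pool (an assumption, not
# current behavior):
import threading

_markitdown_local = threading.local()


def _get_markitdown_threadsafe() -> Any:
    """Per-thread MarkItDown instances, safe under run_in_executor."""
    if getattr(_markitdown_local, "instance", None) is None:
        from markitdown import MarkItDown

        _markitdown_local.instance = MarkItDown()
    return _markitdown_local.instance
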
# Global httpx.AsyncClient for Jina fetching (reused to avoid connection overhead)
_jina_client: Any = None


def _get_markitdown() -> Any:
    """Get or create the shared MarkItDown instance.

    Reusing a single instance avoids repeated initialization overhead.
    """
    global _markitdown_instance
    if _markitdown_instance is None:
        from markitdown import MarkItDown

        _markitdown_instance = MarkItDown()
    return _markitdown_instance


def _get_jina_client(timeout: int = 30) -> Any:
    """Get or create the shared httpx.AsyncClient for Jina fetching.

    Reusing a single client instance avoids repeated connection setup overhead.
    The client uses connection pooling for better performance.

    Args:
        timeout: Request timeout in seconds (used on first creation only)

    Returns:
        httpx.AsyncClient instance
    """
    global _jina_client
    if _jina_client is None:
        import httpx

        _jina_client = httpx.AsyncClient(
            timeout=timeout,
            limits=httpx.Limits(max_connections=10, max_keepalive_connections=5),
        )
    return _jina_client


async def close_shared_clients() -> None:
    """Close shared client instances.

    Call this during cleanup to release resources.
    """
    global _jina_client, _fetch_cache
    if _jina_client is not None:
        await _jina_client.aclose()
        _jina_client = None
    if _fetch_cache is not None:
        _fetch_cache.close()
        _fetch_cache = None


def detect_js_required(content: str) -> bool:
    """Detect if content indicates JavaScript rendering is required.

    Args:
        content: HTML or Markdown content to check

    Returns:
        True if content suggests JavaScript is needed
    """
    if not content:
        return True  # Empty content likely means JS-rendered

    content_lower = content.lower()
    for pattern in JS_REQUIRED_PATTERNS:
        if pattern.lower() in content_lower:
            logger.debug(f"JS required pattern detected: {pattern}")
            return True

    # Check for very short content (likely a JS-only page)
    # Strip markdown formatting for length check
    text_only = re.sub(r"[#*_\[\]()>`-]", "", content).strip()
    if len(text_only) < 100:
        logger.debug(f"Content too short ({len(text_only)} chars), likely JS-rendered")
        return True

    return False

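# A hand-traced sketch of the heuristic above. The first two cases follow from
# the emptiness and length checks; the third assumes none of
# JS_REQUIRED_PATTERNS occurs in ordinary prose.
def _demo_detect_js_required() -> None:
    assert detect_js_required("") is True  # empty -> likely JS-rendered
    assert detect_js_required("Loading...") is True  # far under 100 chars
    assert detect_js_required("A plain paragraph of text. " * 10) is False
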
def should_use_browser_for_domain(url: str, fallback_patterns: list[str]) -> bool:
    """Check if URL domain matches fallback patterns that need browser rendering.

    Args:
        url: URL to check
        fallback_patterns: List of domain patterns (e.g., ["twitter.com", "x.com"])

    Returns:
        True if domain matches any pattern
    """
    try:
        parsed = urlparse(url)
        domain = parsed.netloc.lower()

        for pattern in fallback_patterns:
            pattern_lower = pattern.lower()
            # Match exact domain or subdomain
            if domain == pattern_lower or domain.endswith("." + pattern_lower):
                logger.debug(f"Domain {domain} matches fallback pattern {pattern}")
                return True
    except Exception:
        pass

    return False

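# Subdomains match by suffix, but unrelated domains that merely end with a
# pattern string do not (note the "." prefix in the endswith check):
def _demo_domain_matching() -> None:
    patterns = ["x.com", "twitter.com"]
    assert should_use_browser_for_domain("https://x.com/user/status/1", patterns)
    assert should_use_browser_for_domain("https://mobile.twitter.com/home", patterns)
    assert not should_use_browser_for_domain("https://matrix.com/", patterns)
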
def is_agent_browser_available(command: str = "agent-browser") -> bool:
    """Check if agent-browser CLI is installed and available.

    Args:
        command: Command name or path to check

    Returns:
        True if agent-browser is available
    """
    return shutil.which(command) is not None


# Cache for agent-browser readiness check
_agent_browser_ready_cache: dict[str, tuple[bool, str]] = {}


def verify_agent_browser_ready(
    command: str = "agent-browser", use_cache: bool = True
) -> tuple[bool, str]:
    """Verify that agent-browser is fully ready (command exists + browser installed).

    This performs a more thorough check than is_agent_browser_available() by
    actually running agent-browser to verify it works.

    Args:
        command: Command name or path to check
        use_cache: Whether to use cached result (default True)

    Returns:
        Tuple of (is_ready, message)
        - (True, "agent-browser is ready") if fully functional
        - (False, "error message") if not ready
    """
    import subprocess

    # Check cache first
    if use_cache and command in _agent_browser_ready_cache:
        return _agent_browser_ready_cache[command]

    # Step 1: Check if command exists
    if not shutil.which(command):
        result = (
            False,
            f"'{command}' command not found. Install with: npm install -g agent-browser",
        )
        _agent_browser_ready_cache[command] = result
        return result

    # Step 2: Check if agent-browser responds to --help
    try:
        proc = subprocess.run(
            [command, "--help"],
            capture_output=True,
            text=True,
            timeout=10,
        )
        if proc.returncode != 0:
            result = (False, f"'{command}' command failed: {proc.stderr.strip()}")
            _agent_browser_ready_cache[command] = result
            return result
    except subprocess.TimeoutExpired:
        result = (False, f"'{command}' command timed out")
        _agent_browser_ready_cache[command] = result
        return result
    except Exception as e:
        result = (False, f"'{command}' command error: {e}")
        _agent_browser_ready_cache[command] = result
        return result

    # Step 3: Try a simple operation to verify the browser is installed.
    # We run 'agent-browser open' on about:blank, which should fail fast if the
    # browser is not installed.
    try:
        proc = subprocess.run(
            [command, "open", "about:blank"],
            capture_output=True,
            text=True,
            timeout=30,
        )
        # Check for known error patterns
        if proc.returncode != 0:
            stderr_lower = proc.stderr.lower()
            stderr_orig = proc.stderr.strip()
            # Check for Playwright browser not installed error
            if (
                "executable doesn't exist" in stderr_lower
                or "browsertype.launch" in stderr_lower
            ):
                result = (
                    False,
                    "Playwright browser not installed. Run: agent-browser install "
                    "OR npx playwright install chromium",
                )
                _agent_browser_ready_cache[command] = result
                return result
            # Check for daemon not found error (global install needs AGENT_BROWSER_HOME)
            if "daemon not found" in stderr_lower:
                result = (
                    False,
                    "agent-browser daemon not found. "
                    "Set AGENT_BROWSER_HOME environment variable to the agent-browser package directory. "
                    "For pnpm global install: AGENT_BROWSER_HOME=$(pnpm list -g agent-browser --parseable)/node_modules/agent-browser "
                    "For npm global install: AGENT_BROWSER_HOME=$(npm root -g)/agent-browser",
                )
                _agent_browser_ready_cache[command] = result
                return result
            # Other errors might be transient, still mark as ready
            logger.debug(
                f"agent-browser test returned non-zero but may still work: {stderr_orig}"
            )
    except subprocess.TimeoutExpired:
        # Timeout on about:blank is suspicious but not fatal
        logger.debug("agent-browser test timed out, may still work for real pages")
    except Exception as e:
        logger.debug(f"agent-browser test error (may still work): {e}")

    # Close browser if opened
    try:
        subprocess.run([command, "close"], capture_output=True, timeout=5)
    except Exception:
        pass

    result = (True, "agent-browser is ready")
    _agent_browser_ready_cache[command] = result
    return result


def clear_agent_browser_cache() -> None:
    """Clear the agent-browser readiness cache."""
    _agent_browser_ready_cache.clear()

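# A guard-then-fetch sketch using the readiness probe above; fetch_with_browser
# is defined later in this module.
async def _demo_browser_guard(url: str) -> FetchResult:
    ready, message = verify_agent_browser_ready()
    if not ready:
        raise FetchError(message)
    return await fetch_with_browser(url)
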

def _url_to_screenshot_filename(url: str) -> str:
    """Generate a safe filename for URL screenshot.

    Examples:
        https://example.com/path → example.com_path.full.jpg
        https://x.com/user/status/123 → x.com_user_status_123.full.jpg

    Args:
        url: URL to convert

    Returns:
        Safe filename with .full.jpg extension
    """
    try:
        parsed = urlparse(url)
        # Start with domain
        parts = [parsed.netloc] if parsed.netloc else []
        # Add path parts
        if parsed.path and parsed.path != "/":
            path_parts = parsed.path.strip("/").split("/")
            parts.extend(path_parts)

        # If no parts, fall back to hash
        if not parts or not any(parts):
            url_hash = hashlib.sha256(url.encode()).hexdigest()[:16]
            return f"screenshot_{url_hash}.full.jpg"

        # Join with underscores
        name = "_".join(p for p in parts if p)

        # Sanitize for filesystem (remove/replace unsafe chars)
        # Windows-unsafe: < > : " / \ | ? *
        # Also remove other problematic chars
        unsafe_chars = r'<>:"/\\|?*\x00-\x1f'
        name = re.sub(f"[{unsafe_chars}]", "_", name)

        # Collapse multiple underscores
        name = re.sub(r"_+", "_", name)

        # Strip leading/trailing underscores
        name = name.strip("_")

        # Limit length (leave room for extension)
        max_length = 200
        if len(name) > max_length:
            name = name[:max_length]

        # Final check for empty name
        if not name:
            url_hash = hashlib.sha256(url.encode()).hexdigest()[:16]
            return f"screenshot_{url_hash}.full.jpg"

        return f"{name}.full.jpg"
    except Exception:
        # Fallback: hash the URL
        url_hash = hashlib.sha256(url.encode()).hexdigest()[:16]
        return f"screenshot_{url_hash}.full.jpg"


def _compress_screenshot(
    screenshot_path: Path,
    quality: int = 85,
    max_height: int = 10000,
) -> None:
    """Compress a screenshot to JPEG with quality and size limits.

    Args:
        screenshot_path: Path to screenshot file (will be overwritten)
        quality: JPEG quality (1-100)
        max_height: Maximum height in pixels (will resize if exceeded)
    """
    try:
        from PIL import Image

        with Image.open(screenshot_path) as img:
            # Convert to RGB if necessary (for JPEG)
            if img.mode in ("RGBA", "P"):
                img = img.convert("RGB")

            # Resize if too tall
            width, height = img.size
            if height > max_height:
                ratio = max_height / height
                new_width = int(width * ratio)
                img = img.resize((new_width, max_height), Image.Resampling.LANCZOS)
                logger.debug(
                    f"Resized screenshot from {width}x{height} to {new_width}x{max_height}"
                )

            # Save with compression
            img.save(screenshot_path, "JPEG", quality=quality, optimize=True)
            logger.debug(
                f"Compressed screenshot to quality={quality}: {screenshot_path}"
            )
    except ImportError:
        logger.warning("Pillow not installed, skipping screenshot compression")
    except Exception as e:
        logger.warning(f"Failed to compress screenshot: {e}")


def _html_to_text(html: str) -> str:
    """Extract clean text from HTML content.

    Args:
        html: Raw HTML content

    Returns:
        Extracted text content formatted as markdown
    """
    try:
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(html, "html.parser")

        # Remove script and style elements
        for element in soup(["script", "style", "noscript", "nav", "footer", "header"]):
            element.decompose()

        # Extract text from main content areas
        lines = []

        # Try to find main content area
        main = soup.find("main") or soup.find("article") or soup.find("body")
        if not main:
            return ""

        for element in main.find_all(
            ["h1", "h2", "h3", "h4", "h5", "h6", "p", "li", "blockquote", "pre", "code"]
        ):
            text = element.get_text(strip=True)
            if not text:
                continue

            tag = element.name
            if tag == "h1":
                lines.append(f"# {text}")
            elif tag == "h2":
                lines.append(f"## {text}")
            elif tag == "h3":
                lines.append(f"### {text}")
            elif tag == "h4":
                lines.append(f"#### {text}")
            elif tag == "h5":
                lines.append(f"##### {text}")
            elif tag == "h6":
                lines.append(f"###### {text}")
            elif tag == "p":
                lines.append(text)
            elif tag == "li":
                lines.append(f"- {text}")
            elif tag == "blockquote":
                lines.append(f"> {text}")
            elif tag == "pre" or tag == "code":
                lines.append(f"```\n{text}\n```")

            lines.append("")

        return "\n".join(lines).strip()

    except ImportError:
        logger.debug("BeautifulSoup not installed, using simple text extraction")
        # Fallback: simple regex-based extraction
        import re

        # Remove tags
        text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL)
        text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL)
        text = re.sub(r"<[^>]+>", " ", text)
        # Normalize whitespace
        text = re.sub(r"\s+", " ", text).strip()
        return text
    except Exception as e:
        logger.debug(f"HTML to text extraction failed: {e}")
        return ""


async def fetch_with_static(url: str) -> FetchResult:
    """Fetch URL using markitdown (direct HTTP request).

    Args:
        url: URL to fetch

    Returns:
        FetchResult with markdown content

    Raises:
        FetchError: If fetch fails
    """
    logger.debug(f"Fetching URL with static strategy: {url}")

    try:
        md = _get_markitdown()
        result = md.convert(url)

        if not result.text_content:
            raise FetchError(f"No content extracted from URL: {url}")

        return FetchResult(
            content=result.text_content,
            strategy_used="static",
            title=result.title,
            url=url,
            metadata={"converter": "markitdown"},
        )
    except Exception as e:
        if "No content extracted" in str(e):
            raise
        raise FetchError(f"Failed to fetch URL: {e}")


async def fetch_with_browser(
    url: str,
    command: str = "agent-browser",
    timeout: int = 30000,
    wait_for: str = "domcontentloaded",
    extra_wait_ms: int = 2000,
    session: str | None = None,
    *,
    screenshot: bool = False,
    screenshot_dir: Path | None = None,
    screenshot_config: ScreenshotConfig | None = None,
) -> FetchResult:
    """Fetch URL using agent-browser (headless browser).

    Args:
        url: URL to fetch
        command: agent-browser command name or path
        timeout: Page load timeout in milliseconds
        wait_for: Wait condition (load/domcontentloaded/networkidle)
        extra_wait_ms: Extra wait time after load state (for JS rendering)
        session: Optional session name for isolated browser
        screenshot: If True, capture full-page screenshot
        screenshot_dir: Directory to save screenshot (required if screenshot=True)
        screenshot_config: Screenshot settings (viewport, quality, etc.)

    Returns:
        FetchResult with rendered page content and optional screenshot path

    Raises:
        AgentBrowserNotFoundError: If agent-browser is not installed
        FetchError: If fetch fails
    """
    if not is_agent_browser_available(command):
        raise AgentBrowserNotFoundError()

    logger.debug(f"Fetching URL with browser strategy: {url}")

    # Generate unique session ID to avoid conflicts with concurrent browser fetches
    # Each fetch_with_browser call gets its own isolated browser session
    effective_session = (
        session if session else f"markitai-fetch-{uuid.uuid4().hex[:12]}"
    )

    try:
        # Build command args
        base_args = [command, "--session", effective_session]

        # Step 1: Open URL and wait for page load
        open_args = [*base_args, "open", url]
        logger.debug(f"Running: {' '.join(open_args)}")

        proc = await asyncio.create_subprocess_exec(
            *open_args,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        stdout, stderr = await asyncio.wait_for(
            proc.communicate(), timeout=timeout / 1000 + 10
        )

        if proc.returncode != 0:
            error_msg = stderr.decode() if stderr else "Unknown error"
            raise FetchError(f"agent-browser open failed: {error_msg}")

        # Step 2: Wait for load state
        wait_args = [*base_args, "wait", "--load", wait_for]
        logger.debug(f"Running: {' '.join(wait_args)}")

        proc = await asyncio.create_subprocess_exec(
            *wait_args,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        await asyncio.wait_for(proc.communicate(), timeout=timeout / 1000 + 10)

        # Step 2.5: Extra wait for JS rendering (especially for SPAs)
        if extra_wait_ms > 0:
            extra_wait_args = [*base_args, "wait", str(extra_wait_ms)]
            logger.debug(f"Running: {' '.join(extra_wait_args)}")
            proc = await asyncio.create_subprocess_exec(
                *extra_wait_args,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
            await asyncio.wait_for(proc.communicate(), timeout=extra_wait_ms / 1000 + 5)

        # Step 3: Get page content via snapshot (accessibility tree with text)
        # Using snapshot -c (compact) to get clean text structure
        snapshot_args = [*base_args, "snapshot", "-c", "--json"]
        logger.debug(f"Running: {' '.join(snapshot_args)}")

        proc = await asyncio.create_subprocess_exec(
            *snapshot_args,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        stdout, stderr = await asyncio.wait_for(
            proc.communicate(), timeout=timeout / 1000 + 10
        )

        if proc.returncode != 0:
            error_msg = stderr.decode() if stderr else "Unknown error"
            raise FetchError(f"agent-browser snapshot failed: {error_msg}")

        # Parse snapshot JSON
        try:
            snapshot_data = json.loads(stdout.decode())
            if snapshot_data.get("success"):
                snapshot_text = snapshot_data.get("data", {}).get("snapshot", "")
            else:
                snapshot_text = stdout.decode()
        except json.JSONDecodeError:
            snapshot_text = stdout.decode()

        # Steps 4-6: Get page title, final URL and HTML body in parallel
        async def get_title() -> str | None:
            title_args = [*base_args, "get", "title"]
            proc = await asyncio.create_subprocess_exec(
                *title_args,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
            stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=10)
            if proc.returncode == 0 and stdout:
                return stdout.decode().strip()
            return None

        async def get_final_url() -> str | None:
            url_args = [*base_args, "get", "url"]
            proc = await asyncio.create_subprocess_exec(
                *url_args,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
            stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=10)
            if proc.returncode == 0 and stdout:
                return stdout.decode().strip()
            return None

        async def get_html_body() -> str | None:
            """Get HTML body content for text extraction."""
            html_args = [*base_args, "get", "html", "body"]
            proc = await asyncio.create_subprocess_exec(
                *html_args,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
            stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=15)
            if proc.returncode == 0 and stdout:
                return stdout.decode()
            return None

        # Execute title, URL and HTML fetching in parallel
        title, final_url, html_body = await asyncio.gather(
            get_title(), get_final_url(), get_html_body()
        )

        # Convert snapshot to markdown format
        markdown_content = _snapshot_to_markdown(snapshot_text, title, url)

        # Also extract text from HTML as fallback/supplement
        html_text_content: str | None = None
        if html_body:
            html_text_content = _html_to_text(html_body)

        # Use HTML text if snapshot conversion failed or is too short
        if not markdown_content.strip() or len(markdown_content.strip()) < 100:
            if html_text_content and len(html_text_content.strip()) > len(
                markdown_content.strip()
            ):
                logger.debug("Using HTML text extraction as primary content")
                if title:
                    markdown_content = f"# {title}\n\n{html_text_content}"
                else:
                    markdown_content = html_text_content

        if not markdown_content.strip():
            raise FetchError(f"No content extracted from URL via browser: {url}")

        # Step 7: Capture full-page screenshot if requested
        screenshot_path: Path | None = None
        if screenshot and screenshot_dir:
            try:
                screenshot_dir.mkdir(parents=True, exist_ok=True)
                safe_filename = _url_to_screenshot_filename(url)
                screenshot_path = screenshot_dir / safe_filename

                # Check if screenshot already exists (simple cache)
                if not screenshot_path.exists():
                    # Set viewport if configured
                    if screenshot_config:
                        viewport_args = [
                            *base_args,
                            "set",
                            "viewport",
                            str(screenshot_config.viewport_width),
                            str(screenshot_config.viewport_height),
                        ]
                        logger.debug(f"Running: {' '.join(viewport_args)}")
                        proc = await asyncio.create_subprocess_exec(
                            *viewport_args,
                            stdout=asyncio.subprocess.PIPE,
                            stderr=asyncio.subprocess.PIPE,
                        )
                        await asyncio.wait_for(proc.communicate(), timeout=10)

                    # Capture full-page screenshot
                    screenshot_args = [
                        *base_args,
                        "screenshot",
                        "--full",
                        str(screenshot_path),
                    ]
                    logger.debug(f"Running: {' '.join(screenshot_args)}")
                    proc = await asyncio.create_subprocess_exec(
                        *screenshot_args,
                        stdout=asyncio.subprocess.PIPE,
                        stderr=asyncio.subprocess.PIPE,
                    )
                    stdout, stderr = await asyncio.wait_for(
                        proc.communicate(), timeout=60
                    )

                    if proc.returncode != 0:
                        error_msg = stderr.decode() if stderr else "Unknown error"
                        logger.warning(f"Screenshot capture failed: {error_msg}")
                        screenshot_path = None
                    elif screenshot_path.exists():
                        # Compress screenshot
                        quality = screenshot_config.quality if screenshot_config else 85
                        max_height = (
                            screenshot_config.max_height if screenshot_config else 10000
                        )
                        _compress_screenshot(screenshot_path, quality, max_height)
                        logger.debug(f"Screenshot saved: {screenshot_path}")
                else:
                    logger.debug(f"Screenshot exists, skipping: {screenshot_path}")
            except Exception as e:
                # Screenshot failure should not block the main fetch
                logger.warning(f"Screenshot failed for {url}: {e}")
                screenshot_path = None

        return FetchResult(
            content=markdown_content,
            strategy_used="browser",
            title=title,
            url=url,
            final_url=final_url,
            metadata={"renderer": "agent-browser", "wait_for": wait_for},
            screenshot_path=screenshot_path,
        )

    except TimeoutError:
        raise FetchError(f"Browser fetch timed out after {timeout}ms: {url}")
    except AgentBrowserNotFoundError:
        raise
    except FetchError:
        raise
    except Exception as e:
        raise FetchError(f"Browser fetch failed: {e}")
    finally:
        # Clean up the browser session to avoid resource leaks
        # Only close auto-generated sessions (not user-specified ones)
        if not session:
            try:
                close_args = [command, "--session", effective_session, "close"]
                proc = await asyncio.create_subprocess_exec(
                    *close_args,
                    stdout=asyncio.subprocess.PIPE,
                    stderr=asyncio.subprocess.PIPE,
                )
                await asyncio.wait_for(proc.communicate(), timeout=5)
                logger.debug(f"Closed browser session: {effective_session}")
            except Exception as e:
                logger.debug(
                    f"Failed to close browser session {effective_session}: {e}"
                )

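# An end-to-end browser-fetch sketch with a screenshot; the output directory is
# illustrative, and agent-browser must already be installed.
async def _demo_browser_fetch() -> None:
    result = await fetch_with_browser(
        "https://example.com",
        wait_for="networkidle",
        screenshot=True,
        screenshot_dir=Path("/tmp/markitai-shots"),
    )
    print(result.title, len(result.content), result.screenshot_path)
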
def _snapshot_to_markdown(snapshot: str, title: str | None, url: str) -> str:
    """Convert agent-browser snapshot to markdown format.

    The snapshot is an accessibility tree with various formats:
    - heading "Title" [ref=e1] [level=1]
    - paragraph: Text content here
    - link "Link text" [ref=e2]:
      - /url: /path
    - text: Some text

    Args:
        snapshot: Accessibility tree snapshot
        title: Page title
        url: Original URL

    Returns:
        Markdown formatted content
    """
    lines = []

    # Add title as H1 if available
    if title:
        lines.append(f"# {title}")
        lines.append("")

    # Track current link for multi-line link handling
    current_link_text: str | None = None
    current_link_url: str | None = None

    # Parse snapshot and convert to markdown
    for line in snapshot.split("\n"):
        stripped = line.lstrip()

        if not stripped:
            continue

        # Skip structure markers
        if stripped.startswith("- document:") or stripped.startswith("- navigation:"):
            continue
        if stripped.startswith("- main:") or stripped.startswith("- article:"):
            continue
        if stripped.startswith("- contentinfo:") or stripped.startswith("- list:"):
            continue
        if stripped.startswith("- listitem:"):
            continue

        # Remove leading "- " if present
        if stripped.startswith("- "):
            stripped = stripped[2:]

        # Handle URL lines (part of link)
        if stripped.startswith("/url:"):
            current_link_url = stripped[5:].strip()
            if current_link_text:
                lines.append(f"[{current_link_text}]({current_link_url})")
                lines.append("")
            current_link_text = None
            current_link_url = None
            continue

        # Pattern 1: role "content" [attrs] (with or without trailing colon)
        # e.g., heading "Title" [ref=e1] [level=1]
        # e.g., link "Text" [ref=e2]:
        match = re.match(
            r'(\w+)\s+"([^"]*)"(?:\s*\[([^\]]*(?:\]\s*\[[^\]]*)*)\])?:?$', stripped
        )
        if match:
            role, content, attrs_str = match.groups()
            attrs_dict = {}
            if attrs_str:
                # Parse multiple [key=value] attributes
                for attr_match in re.finditer(r"\[?([^=\]]+)=([^\]]+)\]?", attrs_str):
                    k, v = attr_match.groups()
                    attrs_dict[k.strip()] = v.strip()

            # Convert to markdown based on role
            if role == "heading":
                level = int(attrs_dict.get("level", "2"))
                lines.append(f"{'#' * level} {content}")
                lines.append("")
            elif role == "paragraph":
                if content:
                    lines.append(content)
                    lines.append("")
            elif role == "link":
                # Link URL might be on next line
                link_url = attrs_dict.get("url", "")
                if link_url:
                    lines.append(f"[{content}]({link_url})")
                    lines.append("")
                else:
                    # Wait for /url: line
                    current_link_text = content
            elif role == "image":
                alt = content or "image"
                src = attrs_dict.get("url", attrs_dict.get("src", ""))
                if src:
                    lines.append(f"![{alt}]({src})")
                    lines.append("")
            elif role == "listitem":
                lines.append(f"- {content}")
            elif role == "code":
                lines.append(f"`{content}`")
            elif role in ("text", "StaticText"):
                if content:
                    lines.append(content)
            elif role == "button":
                pass  # Skip buttons
            elif role == "textbox":
                pass  # Skip form inputs
            elif role == "switch":
                pass  # Skip toggles
            elif content:
                # Generic fallback - include content
                lines.append(content)
            continue

        # Pattern 2: role: content (no quotes)
        # e.g., paragraph: Text content here
        # e.g., text: Some text
        match2 = re.match(r"(\w+):\s*(.+)$", stripped)
        if match2:
            role, content = match2.groups()
            content = content.strip()

            if role == "paragraph":
                lines.append(content)
                lines.append("")
            elif role == "text":
                # Only add text if it's meaningful (not just punctuation)
                if content and len(content) > 2:
                    lines.append(content)
            elif role == "heading":
                lines.append(f"## {content}")
                lines.append("")
            elif role == "time":
                lines.append(f"*{content}*")
                lines.append("")
            elif role in ("separator",):
                lines.append("---")
                lines.append("")
            continue

        # Pattern 3: Plain text line (not a role definition)
        # Skip structural elements
        if stripped and not stripped.endswith(":"):
            # Check if it looks like content (not a role marker)
            if not re.match(r"^[a-z]+$", stripped):
                pass  # Don't add raw structural lines

    # Clean up: remove consecutive empty lines
    result_lines = []
    prev_empty = False
    for line in lines:
        is_empty = not line.strip()
        if is_empty and prev_empty:
            continue
        result_lines.append(line)
        prev_empty = is_empty

    return "\n".join(result_lines).strip()

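# A hand-traced example of the conversion, using the line formats listed in the
# docstring above. The snapshot
#
#   - heading "Release notes" [ref=e1] [level=1]
#   - paragraph: Bug fixes and performance work.
#   - link "Changelog" [ref=e2]:
#     - /url: /changelog
#
# becomes (with title=None):
#
#   # Release notes
#
#   Bug fixes and performance work.
#
#   [Changelog](/changelog)
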

async def fetch_with_jina(
    url: str,
    api_key: str | None = None,
    timeout: int = 30,
) -> FetchResult:
    """Fetch URL using Jina Reader API.

    Args:
        url: URL to fetch
        api_key: Optional Jina API key (for higher rate limits)
        timeout: Request timeout in seconds

    Returns:
        FetchResult with markdown content

    Raises:
        JinaRateLimitError: If rate limit exceeded
        JinaAPIError: If API returns error
        FetchError: If fetch fails
    """
    import httpx

    logger.debug(f"Fetching URL with Jina Reader: {url}")

    jina_url = f"{DEFAULT_JINA_BASE_URL}/{url}"
    headers = {}
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"

    try:
        client = _get_jina_client(timeout)
        response = await client.get(jina_url, headers=headers)

        if response.status_code == 429:
            raise JinaRateLimitError()
        elif response.status_code >= 400:
            raise JinaAPIError(response.status_code, response.text[:200])

        content = response.text

        if not content.strip():
            raise FetchError(f"No content returned from Jina Reader: {url}")

        # Extract title from first H1 if present
        title = None
        title_match = re.match(r"^#\s+(.+)$", content, re.MULTILINE)
        if title_match:
            title = title_match.group(1)

        return FetchResult(
            content=content,
            strategy_used="jina",
            title=title,
            url=url,
            metadata={"api": "jina-reader"},
        )

    except (JinaRateLimitError, JinaAPIError):
        raise
    except httpx.TimeoutException:
        raise FetchError(f"Jina Reader request timed out after {timeout}s: {url}")
    except Exception as e:
        raise FetchError(f"Jina Reader fetch failed: {e}")

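# For reference, the Reader call above simply prefixes the target URL, e.g.
# (assuming DEFAULT_JINA_BASE_URL is the public "https://r.jina.ai" endpoint):
#
#   https://r.jina.ai/https://example.com/docs
#
# and the markdown arrives as the response body.
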

async def fetch_url(
    url: str,
    strategy: FetchStrategy,
    config: FetchConfig,
    explicit_strategy: bool = False,
    cache: FetchCache | None = None,
    skip_read_cache: bool = False,
    *,
    screenshot: bool = False,
    screenshot_dir: Path | None = None,
    screenshot_config: ScreenshotConfig | None = None,
) -> FetchResult:
    """Fetch URL content using the specified strategy.

    Args:
        url: URL to fetch
        strategy: Fetch strategy to use
        config: Fetch configuration
        explicit_strategy: If True, don't fall back on error (user explicitly chose the strategy)
        cache: Optional FetchCache for caching results
        skip_read_cache: If True, skip reading from cache but still write results (--no-cache)
        screenshot: If True, capture full-page screenshot (requires browser strategy)
        screenshot_dir: Directory to save screenshot
        screenshot_config: Screenshot settings (viewport, quality, etc.)

    Returns:
        FetchResult with content and metadata

    Raises:
        FetchError: If fetch fails and no fallback available
        AgentBrowserNotFoundError: If --agent-browser used but not installed
        JinaRateLimitError: If --jina used and rate limit exceeded
    """
    # When screenshot is enabled, use multi-source fetching strategy
    # This captures both static content and browser-rendered content
    if screenshot:
        return await _fetch_multi_source(
            url,
            config,
            screenshot_dir=screenshot_dir,
            screenshot_config=screenshot_config,
            cache=cache,
            skip_read_cache=skip_read_cache,
        )

    # Check cache first (unless skip_read_cache is True)
    if cache is not None and not skip_read_cache:
        cached_result = cache.get(url)
        if cached_result is not None:
            logger.info(f"[FetchCache] Using cached content for: {url}")
            return cached_result

    # Screenshot kwargs for browser fetching
    screenshot_kwargs: dict[str, Any] = {}

    # Fetch the content
    result: FetchResult

    # Handle explicit strategy (no fallback)
    if explicit_strategy:
        if strategy == FetchStrategy.BROWSER:
            result = await fetch_with_browser(
                url,
                command=config.agent_browser.command,
                timeout=config.agent_browser.timeout,
                wait_for=config.agent_browser.wait_for,
                extra_wait_ms=config.agent_browser.extra_wait_ms,
                session=config.agent_browser.session,
                **screenshot_kwargs,
            )
        elif strategy == FetchStrategy.JINA:
            api_key = config.jina.get_resolved_api_key()
            result = await fetch_with_jina(url, api_key, config.jina.timeout)
        elif strategy == FetchStrategy.STATIC:
            result = await fetch_with_static(url)
        else:
            # AUTO with explicit=True shouldn't happen, but handle it
            strategy = FetchStrategy.AUTO
            result = await _fetch_with_fallback(
                url, config, start_with_browser=False, **screenshot_kwargs
            )
    elif strategy == FetchStrategy.AUTO:
        # Check if domain needs browser rendering
        if should_use_browser_for_domain(url, config.fallback_patterns):
            logger.info(f"Domain matches fallback pattern, using browser: {url}")
            result = await _fetch_with_fallback(
                url, config, start_with_browser=True, **screenshot_kwargs
            )
        else:
            # Try static first, fall back to browser/jina if JS required
            result = await _fetch_with_fallback(
                url, config, start_with_browser=False, **screenshot_kwargs
            )
    elif strategy == FetchStrategy.STATIC:
        result = await fetch_with_static(url)
    elif strategy == FetchStrategy.BROWSER:
        result = await fetch_with_browser(
            url,
            command=config.agent_browser.command,
            timeout=config.agent_browser.timeout,
            wait_for=config.agent_browser.wait_for,
            extra_wait_ms=config.agent_browser.extra_wait_ms,
            session=config.agent_browser.session,
            **screenshot_kwargs,
        )
    elif strategy == FetchStrategy.JINA:
        api_key = config.jina.get_resolved_api_key()
        result = await fetch_with_jina(url, api_key, config.jina.timeout)
    else:
        raise ValueError(f"Unknown fetch strategy: {strategy}")

    # Cache the result
    if cache is not None:
        cache.set(url, result)

    return result

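# A cached end-to-end sketch; the cache directory is illustrative and config is
# assumed to come from the application's loaded configuration.
async def _demo_fetch_url(config: FetchConfig) -> None:
    cache = get_fetch_cache(Path("~/.cache/markitai").expanduser())
    try:
        result = await fetch_url(
            "https://example.com", FetchStrategy.AUTO, config, cache=cache
        )
        print(result.strategy_used, result.cache_hit)
    finally:
        await close_shared_clients()  # also closes the global fetch cache
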
def _is_invalid_content(content: str) -> tuple[bool, str]:
    """Check if fetched content is invalid (JS error page, login prompt, etc.).

    Args:
        content: Fetched content to check

    Returns:
        Tuple of (is_invalid, reason)
    """
    if not content or not content.strip():
        return True, "empty"

    # Check for common invalid content patterns
    invalid_patterns = [
        (r"JavaScript is (not available|disabled)", "javascript_disabled"),
        (r"Please enable JavaScript", "javascript_required"),
        (r"switch to a supported browser", "unsupported_browser"),
        (r"Something went wrong.*let's give it another shot", "error_page"),
        (r"Log in.*Sign up.*to continue", "login_required"),
        (r"You must be logged in", "login_required"),
    ]

    for pattern, reason in invalid_patterns:
        if re.search(pattern, content, re.IGNORECASE | re.DOTALL):
            return True, reason

    # Check content length (after removing markdown links and images)
    clean_content = re.sub(r"!\[[^\]]*\]\([^)]+\)", "", content)  # Remove images
    clean_content = re.sub(r"\[[^\]]*\]\([^)]+\)", "", clean_content)  # Remove links
    clean_content = re.sub(
        r"[#\-*_>\[\]`|]", "", clean_content
    )  # Remove markdown syntax
    clean_content = " ".join(clean_content.split())  # Normalize whitespace

    if len(clean_content) < 100:
        return True, "too_short"

    return False, ""

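# Hand-traced cases for the validator above (pattern order matters: the
# JavaScript check fires before the length check):
def _demo_is_invalid_content() -> None:
    assert _is_invalid_content("") == (True, "empty")
    assert _is_invalid_content("Please enable JavaScript") == (True, "javascript_required")
    assert _is_invalid_content("Ordinary sentence content. " * 10) == (False, "")
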

async def _fetch_multi_source(
    url: str,
    config: FetchConfig,
    screenshot_dir: Path | None = None,
    screenshot_config: ScreenshotConfig | None = None,
    cache: FetchCache | None = None,
    skip_read_cache: bool = False,
) -> FetchResult:
    """Fetch URL using static-first strategy with browser fallback.

    Strategy:
    1. Fetch both static and browser in parallel
    2. Validate content quality using _is_invalid_content()
    3. If static is valid → use static only (ignore browser content)
    4. Else if browser is valid → use browser only
    5. Else → use browser content with warning (both invalid)

    Screenshot is always included when available.

    Args:
        url: URL to fetch
        config: Fetch configuration
        screenshot_dir: Directory to save screenshot
        screenshot_config: Screenshot settings
        cache: Optional FetchCache for caching results
        skip_read_cache: If True, skip reading from cache

    Returns:
        FetchResult with single-source content (no merging)
    """
    static_content: str | None = None
    browser_result: FetchResult | None = None

    # Task 1: Try static fetch (non-blocking)
    async def fetch_static() -> str | None:
        try:
            result = await fetch_with_static(url)
            logger.debug(f"[URL] Static fetch success: {len(result.content)} chars")
            return result.content
        except Exception as e:
            logger.debug(f"[URL] Static fetch failed: {e}")
            return None

    # Task 2: Browser fetch with screenshot
    async def fetch_browser() -> FetchResult | None:
        try:
            if not is_agent_browser_available(config.agent_browser.command):
                logger.debug("agent-browser not available")
                return None

            result = await fetch_with_browser(
                url,
                command=config.agent_browser.command,
                timeout=config.agent_browser.timeout,
                wait_for=config.agent_browser.wait_for,
                extra_wait_ms=config.agent_browser.extra_wait_ms,
                session=config.agent_browser.session,
                screenshot=True,
                screenshot_dir=screenshot_dir,
                screenshot_config=screenshot_config,
            )
            logger.debug(f"[URL] Browser fetch success: {len(result.content)} chars")
            return result
        except Exception as e:
            logger.debug(f"[URL] Browser fetch failed: {e}")
            return None

    # Execute both fetches in parallel
    static_content, browser_result = await asyncio.gather(
        fetch_static(), fetch_browser()
    )

    browser_content = browser_result.content if browser_result else None
    screenshot_path = browser_result.screenshot_path if browser_result else None

    # Validate content quality
    static_invalid, static_reason = (
        _is_invalid_content(static_content)
        if static_content
        else (True, "fetch_failed")
    )
    browser_invalid, browser_reason = (
        _is_invalid_content(browser_content)
        if browser_content
        else (True, "fetch_failed")
    )

    if static_invalid:
        logger.debug(f"[URL] Static content invalid: {static_reason}")
    if browser_invalid:
        logger.debug(f"[URL] Browser content invalid: {browser_reason}")

    # Determine which source to use (static-first strategy)
    primary_content = ""
    strategy_used = ""
    warning_message = ""
    final_static_content: str | None = None
    final_browser_content: str | None = None

    if not static_invalid:
        # Static is valid → use static only
        assert static_content is not None
        primary_content = static_content
        final_static_content = static_content
        strategy_used = "static"
        logger.info(f"[URL] Using static content (valid, {len(static_content)} chars)")
    elif not browser_invalid:
        # Static invalid but browser is valid → use browser
        assert browser_content is not None
        primary_content = browser_content
        final_browser_content = browser_content
        strategy_used = "browser"
        logger.info(
            f"[URL] Using browser content (static invalid: {static_reason}, "
            f"browser valid, {len(browser_content)} chars)"
        )
    elif browser_content:
        # Both invalid, but browser has content → use browser with warning
        primary_content = browser_content
        final_browser_content = browser_content
        strategy_used = "browser"
        warning_message = (
            f"Warning: Content may be incomplete. "
            f"Static: {static_reason}, Browser: {browser_reason}"
        )
        logger.warning(
            f"[URL] Both sources invalid, using browser content with warning: "
            f"static={static_reason}, browser={browser_reason}"
        )
    elif static_content:
        # Both invalid, no browser but has static → use static with warning
        primary_content = static_content
        final_static_content = static_content
        strategy_used = "static"
        warning_message = f"Warning: Content may be incomplete. Reason: {static_reason}"
        logger.warning(
            f"[URL] Both sources invalid, using static content with warning: {static_reason}"
        )
    else:
        raise FetchError(f"All fetch strategies failed for URL: {url}")

    # Extract title from browser result if available
    title = browser_result.title if browser_result else None
    final_url = browser_result.final_url if browser_result else None

    # If no title from browser, try to extract from primary content
    if not title and primary_content:
        title_match = re.match(r"^#\s+(.+)$", primary_content, re.MULTILINE)
        if title_match:
            title = title_match.group(1)

    metadata: dict[str, Any] = {"single_source": True, "source": strategy_used}
    if warning_message:
        metadata["warning"] = warning_message

    assert primary_content is not None  # Guaranteed by above branches
    result = FetchResult(
        content=primary_content,
        strategy_used=strategy_used,
        title=title,
        url=url,
        final_url=final_url,
        metadata=metadata,
        screenshot_path=screenshot_path,
        static_content=final_static_content,
        browser_content=final_browser_content,
    )

    # Cache the result
    if cache is not None:
        cache.set(url, result)

    return result


async def _fetch_with_fallback(
    url: str,
    config: FetchConfig,
    start_with_browser: bool = False,
    **screenshot_kwargs: Any,
) -> FetchResult:
    """Fetch URL with automatic fallback between strategies.

    Args:
        url: URL to fetch
        config: Fetch configuration
        start_with_browser: If True, try browser first (for known JS domains)
        **screenshot_kwargs: Screenshot options (screenshot, screenshot_dir, screenshot_config)

    Returns:
        FetchResult from first successful strategy
    """
    errors = []

    if start_with_browser:
        # Try browser first for known JS domains
        strategies = ["browser", "jina", "static"]
    else:
        # Normal order: static -> browser -> jina
        strategies = ["static", "browser", "jina"]

    for strat in strategies:
        try:
            if strat == "static":
                result = await fetch_with_static(url)
                # Check if JS is required
                if detect_js_required(result.content):
                    logger.info(
                        "Static content suggests JS required, trying browser..."
                    )
                    continue
                return result

            elif strat == "browser":
                if not is_agent_browser_available(config.agent_browser.command):
                    logger.debug("agent-browser not available, skipping")
                    continue
                return await fetch_with_browser(
                    url,
                    command=config.agent_browser.command,
                    timeout=config.agent_browser.timeout,
                    wait_for=config.agent_browser.wait_for,
                    extra_wait_ms=config.agent_browser.extra_wait_ms,
                    session=config.agent_browser.session,
                    **screenshot_kwargs,
                )

            elif strat == "jina":
                api_key = config.jina.get_resolved_api_key()
                return await fetch_with_jina(url, api_key, config.jina.timeout)

        except AgentBrowserNotFoundError:
            logger.debug("agent-browser not installed, trying next strategy")
            continue
        except JinaRateLimitError as e:
            errors.append(str(e))
            logger.warning(str(e))
            continue
        except FetchError as e:
            errors.append(f"{strat}: {e}")
            logger.debug(f"Strategy {strat} failed: {e}")
            continue
        except Exception as e:
            errors.append(f"{strat}: {e}")
            logger.debug(f"Strategy {strat} failed: {e}")
            continue

    # All strategies failed
    raise FetchError(
        f"All fetch strategies failed for {url}:\n"
        + "\n".join(f"  - {e}" for e in errors)
    )