markitai 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- markitai/__init__.py +3 -0
- markitai/batch.py +1316 -0
- markitai/cli.py +3979 -0
- markitai/config.py +602 -0
- markitai/config.schema.json +748 -0
- markitai/constants.py +222 -0
- markitai/converter/__init__.py +49 -0
- markitai/converter/_patches.py +98 -0
- markitai/converter/base.py +164 -0
- markitai/converter/image.py +181 -0
- markitai/converter/legacy.py +606 -0
- markitai/converter/office.py +526 -0
- markitai/converter/pdf.py +679 -0
- markitai/converter/text.py +63 -0
- markitai/fetch.py +1725 -0
- markitai/image.py +1335 -0
- markitai/json_order.py +550 -0
- markitai/llm.py +4339 -0
- markitai/ocr.py +347 -0
- markitai/prompts/__init__.py +159 -0
- markitai/prompts/cleaner.md +93 -0
- markitai/prompts/document_enhance.md +77 -0
- markitai/prompts/document_enhance_complete.md +65 -0
- markitai/prompts/document_process.md +60 -0
- markitai/prompts/frontmatter.md +28 -0
- markitai/prompts/image_analysis.md +21 -0
- markitai/prompts/image_caption.md +8 -0
- markitai/prompts/image_description.md +13 -0
- markitai/prompts/page_content.md +17 -0
- markitai/prompts/url_enhance.md +78 -0
- markitai/security.py +286 -0
- markitai/types.py +30 -0
- markitai/urls.py +187 -0
- markitai/utils/__init__.py +33 -0
- markitai/utils/executor.py +69 -0
- markitai/utils/mime.py +85 -0
- markitai/utils/office.py +262 -0
- markitai/utils/output.py +53 -0
- markitai/utils/paths.py +81 -0
- markitai/utils/text.py +359 -0
- markitai/workflow/__init__.py +37 -0
- markitai/workflow/core.py +760 -0
- markitai/workflow/helpers.py +509 -0
- markitai/workflow/single.py +369 -0
- markitai-0.3.0.dist-info/METADATA +159 -0
- markitai-0.3.0.dist-info/RECORD +48 -0
- markitai-0.3.0.dist-info/WHEEL +4 -0
- markitai-0.3.0.dist-info/entry_points.txt +2 -0
markitai/fetch.py
ADDED
@@ -0,0 +1,1725 @@
"""URL fetch module for handling static and JS-rendered pages.

This module provides a unified interface for fetching web pages using different
strategies:
- static: Direct HTTP request via markitdown (default, fastest)
- browser: Headless browser via agent-browser (for JS-rendered pages)
- jina: Jina Reader API (cloud-based, no local dependencies)
- auto: Auto-detect and fallback (tries static first, then browser/jina)

Example usage:
    from markitai.fetch import fetch_url, FetchStrategy

    # Auto-detect strategy
    result = await fetch_url("https://example.com", FetchStrategy.AUTO, config.fetch)

    # Force browser rendering
    result = await fetch_url("https://x.com/...", FetchStrategy.BROWSER, config.fetch)
"""

from __future__ import annotations

import asyncio
import hashlib
import json
import re
import shutil
import sqlite3
import time
import uuid
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
from typing import TYPE_CHECKING, Any
from urllib.parse import urlparse

from loguru import logger

from markitai.constants import (
    DEFAULT_JINA_BASE_URL,
    JS_REQUIRED_PATTERNS,
)

if TYPE_CHECKING:
    from markitai.config import FetchConfig, ScreenshotConfig


class FetchStrategy(Enum):
    """URL fetch strategy."""

    AUTO = "auto"
    STATIC = "static"
    BROWSER = "browser"
    JINA = "jina"


class FetchError(Exception):
    """Base exception for fetch errors."""

    pass


class AgentBrowserNotFoundError(FetchError):
    """Raised when agent-browser is not installed."""

    def __init__(self) -> None:
        super().__init__(
            "agent-browser is not installed. Install with: npm install -g agent-browser && agent-browser install"
        )


class JinaRateLimitError(FetchError):
    """Raised when Jina Reader API rate limit is exceeded."""

    def __init__(self) -> None:
        super().__init__(
            "Jina Reader rate limit exceeded (free tier: 20 RPM). "
            "Try again later or use --agent-browser for browser rendering."
        )


class JinaAPIError(FetchError):
    """Raised when Jina Reader API returns an error."""

    def __init__(self, status_code: int, message: str) -> None:
        self.status_code = status_code
        super().__init__(f"Jina Reader API error ({status_code}): {message}")


@dataclass
class FetchResult:
    """Result of a URL fetch operation.

    Supports multi-source content for URL fetching:
    - content: Primary markdown content (best available)
    - static_content: Content from static/jina fetch (pure text)
    - browser_content: Content from browser fetch (rendered page)
    - screenshot_path: Full-page screenshot (visual reference)

    For LLM processing, all three sources can be provided:
    1. static_content - Clean text, reliable but may miss JS content
    2. browser_content - Rendered content, includes JS but may have noise
    3. screenshot - Visual reference for layout/structure
    """

    content: str  # Primary markdown content (best available)
    strategy_used: str  # Actual strategy used (static/browser/jina)
    title: str | None = None  # Page title if available
    url: str = ""  # Original URL
    final_url: str | None = None  # Final URL after redirects
    metadata: dict = field(default_factory=dict)  # Additional metadata
    cache_hit: bool = False  # Whether result was served from cache
    screenshot_path: Path | None = None  # Path to captured screenshot (if any)
    # Multi-source content for enhanced LLM processing
    static_content: str | None = None  # Content from static fetch
    browser_content: str | None = None  # Content from browser fetch


class FetchCache:
    """SQLite-based cache for fetch results.

    Caches the fetched content by URL to avoid repeated network requests.
    Uses the same LRU eviction strategy as LLM cache.

    Connection reuse: A single connection is reused for all operations
    within the same FetchCache instance to reduce connection overhead.
    Thread safety is ensured by a lock protecting all database operations.
    """

    def __init__(self, db_path: Path, max_size_bytes: int = 100 * 1024 * 1024) -> None:
        """Initialize fetch cache.

        Args:
            db_path: Path to SQLite database file
            max_size_bytes: Maximum cache size in bytes (default 100MB)
        """
        import threading

        self._db_path = db_path
        self._max_size_bytes = max_size_bytes
        self._connection: sqlite3.Connection | None = None
        self._lock = threading.Lock()  # Protect database operations
        self._db_path.parent.mkdir(parents=True, exist_ok=True)
        self._init_db()

    def _get_connection(self) -> sqlite3.Connection:
        """Get or create a reusable database connection.

        Connection is created on first use and reused for subsequent calls.
        Uses check_same_thread=False to allow cross-thread usage in async context.
        Note: Callers must hold self._lock when calling this method.
        """
        if self._connection is None:
            self._connection = sqlite3.connect(
                str(self._db_path),
                timeout=30.0,
                check_same_thread=False,  # Allow cross-thread usage for async
            )
            self._connection.execute("PRAGMA journal_mode=WAL")
            self._connection.execute("PRAGMA synchronous=NORMAL")
            self._connection.row_factory = sqlite3.Row
        return self._connection

    def close(self) -> None:
        """Close the database connection.

        Call this during cleanup to release resources.
        """
        if self._connection is not None:
            self._connection.close()
            self._connection = None

    def _init_db(self) -> None:
        """Initialize database schema."""
        with self._lock:
            conn = self._get_connection()
            conn.execute("""
                CREATE TABLE IF NOT EXISTS fetch_cache (
                    key TEXT PRIMARY KEY,
                    url TEXT NOT NULL,
                    content TEXT NOT NULL,
                    strategy_used TEXT NOT NULL,
                    title TEXT,
                    final_url TEXT,
                    metadata TEXT,
                    created_at INTEGER NOT NULL,
                    accessed_at INTEGER NOT NULL,
                    size_bytes INTEGER NOT NULL
                )
            """)
            conn.execute(
                "CREATE INDEX IF NOT EXISTS idx_fetch_accessed ON fetch_cache(accessed_at)"
            )
            conn.execute("CREATE INDEX IF NOT EXISTS idx_fetch_url ON fetch_cache(url)")
            conn.commit()

    def _compute_hash(self, url: str) -> str:
        """Compute hash key from URL."""
        return hashlib.sha256(url.encode()).hexdigest()[:32]

    def get(self, url: str) -> FetchResult | None:
        """Get cached fetch result if exists.

        Args:
            url: URL to look up

        Returns:
            Cached FetchResult or None if not found
        """
        key = self._compute_hash(url)
        now = int(time.time())

        with self._lock:
            conn = self._get_connection()
            row = conn.execute(
                "SELECT * FROM fetch_cache WHERE key = ?", (key,)
            ).fetchone()

            if row:
                # Update accessed_at for LRU tracking
                conn.execute(
                    "UPDATE fetch_cache SET accessed_at = ? WHERE key = ?", (now, key)
                )
                conn.commit()

                metadata = json.loads(row["metadata"]) if row["metadata"] else {}
                logger.debug(f"[FetchCache] Cache hit for URL: {url}")
                return FetchResult(
                    content=row["content"],
                    strategy_used=row["strategy_used"],
                    title=row["title"],
                    url=row["url"],
                    final_url=row["final_url"],
                    metadata=metadata,
                    cache_hit=True,
                )

        return None

    def set(self, url: str, result: FetchResult) -> None:
        """Cache a fetch result.

        Args:
            url: URL that was fetched
            result: FetchResult to cache
        """
        key = self._compute_hash(url)
        now = int(time.time())
        metadata_json = json.dumps(result.metadata) if result.metadata else None
        size_bytes = len(result.content.encode("utf-8"))

        with self._lock:
            conn = self._get_connection()
            # Check current total size
            total_size = conn.execute(
                "SELECT COALESCE(SUM(size_bytes), 0) as total FROM fetch_cache"
            ).fetchone()["total"]

            # Evict LRU entries if needed
            while total_size + size_bytes > self._max_size_bytes:
                oldest = conn.execute(
                    "SELECT key, size_bytes FROM fetch_cache ORDER BY accessed_at ASC LIMIT 1"
                ).fetchone()

                if oldest is None:
                    break

                conn.execute("DELETE FROM fetch_cache WHERE key = ?", (oldest["key"],))
                total_size -= oldest["size_bytes"]
                logger.debug(f"[FetchCache] Evicted LRU entry: {oldest['key'][:8]}...")

            # Insert or replace
            conn.execute(
                """
                INSERT OR REPLACE INTO fetch_cache
                (key, url, content, strategy_used, title, final_url, metadata, created_at, accessed_at, size_bytes)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    key,
                    url,
                    result.content,
                    result.strategy_used,
                    result.title,
                    result.final_url,
                    metadata_json,
                    now,
                    now,
                    size_bytes,
                ),
            )
            conn.commit()
            logger.debug(f"[FetchCache] Cached URL: {url} ({size_bytes} bytes)")

    def stats(self) -> dict[str, Any]:
        """Return cache statistics."""
        with self._lock:
            conn = self._get_connection()
            row = conn.execute(
                """
                SELECT COUNT(*) as count, COALESCE(SUM(size_bytes), 0) as size_bytes
                FROM fetch_cache
                """
            ).fetchone()

            return {
                "count": row["count"],
                "size_bytes": row["size_bytes"],
                "size_mb": round(row["size_bytes"] / (1024 * 1024), 2),
                "max_size_mb": round(self._max_size_bytes / (1024 * 1024), 2),
                "db_path": str(self._db_path),
            }

    def clear(self) -> int:
        """Clear all entries.

        Returns:
            Number of entries deleted
        """
        with self._lock:
            conn = self._get_connection()
            count = conn.execute("SELECT COUNT(*) as cnt FROM fetch_cache").fetchone()[
                "cnt"
            ]
            conn.execute("DELETE FROM fetch_cache")
            conn.commit()
            return count


# Global fetch cache instance (initialized lazily)
_fetch_cache: FetchCache | None = None


def get_fetch_cache(
    cache_dir: Path, max_size_bytes: int = 100 * 1024 * 1024
) -> FetchCache:
    """Get or create the global fetch cache instance.

    Args:
        cache_dir: Directory to store cache database
        max_size_bytes: Maximum cache size

    Returns:
        FetchCache instance
    """
    global _fetch_cache
    if _fetch_cache is None:
        db_path = cache_dir / "fetch_cache.db"
        _fetch_cache = FetchCache(db_path, max_size_bytes)
    return _fetch_cache


# Global MarkItDown instance (reused for static fetching)
# Note: MarkItDown's requests.Session is NOT thread-safe. However, since
# fetch_with_static runs in the asyncio event loop (not in a thread pool),
# only one md.convert() call executes at a time, avoiding thread safety issues.
# If fetch_with_static is ever moved to run_in_executor with threads, this
# should be changed to use threading.local() for thread-local instances.
_markitdown_instance: Any = None

# Global httpx.AsyncClient for Jina fetching (reused to avoid connection overhead)
_jina_client: Any = None


def _get_markitdown() -> Any:
    """Get or create the shared MarkItDown instance.

    Reusing a single instance avoids repeated initialization overhead.
    """
    global _markitdown_instance
    if _markitdown_instance is None:
        from markitdown import MarkItDown

        _markitdown_instance = MarkItDown()
    return _markitdown_instance


def _get_jina_client(timeout: int = 30) -> Any:
    """Get or create the shared httpx.AsyncClient for Jina fetching.

    Reusing a single client instance avoids repeated connection setup overhead.
    The client uses connection pooling for better performance.

    Args:
        timeout: Request timeout in seconds (used on first creation only)

    Returns:
        httpx.AsyncClient instance
    """
    global _jina_client
    if _jina_client is None:
        import httpx

        _jina_client = httpx.AsyncClient(
            timeout=timeout,
            limits=httpx.Limits(max_connections=10, max_keepalive_connections=5),
        )
    return _jina_client


async def close_shared_clients() -> None:
    """Close shared client instances.

    Call this during cleanup to release resources.
    """
    global _jina_client, _fetch_cache
    if _jina_client is not None:
        await _jina_client.aclose()
        _jina_client = None
    if _fetch_cache is not None:
        _fetch_cache.close()
        _fetch_cache = None


def detect_js_required(content: str) -> bool:
    """Detect if content indicates JavaScript rendering is required.

    Args:
        content: HTML or Markdown content to check

    Returns:
        True if content suggests JavaScript is needed
    """
    if not content:
        return True  # Empty content likely means JS-rendered

    content_lower = content.lower()
    for pattern in JS_REQUIRED_PATTERNS:
        if pattern.lower() in content_lower:
            logger.debug(f"JS required pattern detected: {pattern}")
            return True

    # Check for very short content (likely a JS-only page)
    # Strip markdown formatting for length check
    text_only = re.sub(r"[#*_\[\]()>`-]", "", content).strip()
    if len(text_only) < 100:
        logger.debug(f"Content too short ({len(text_only)} chars), likely JS-rendered")
        return True

    return False


def should_use_browser_for_domain(url: str, fallback_patterns: list[str]) -> bool:
    """Check if URL domain matches fallback patterns that need browser rendering.

    Args:
        url: URL to check
        fallback_patterns: List of domain patterns (e.g., ["twitter.com", "x.com"])

    Returns:
        True if domain matches any pattern
    """
    try:
        parsed = urlparse(url)
        domain = parsed.netloc.lower()

        for pattern in fallback_patterns:
            pattern_lower = pattern.lower()
            # Match exact domain or subdomain
            if domain == pattern_lower or domain.endswith("." + pattern_lower):
                logger.debug(f"Domain {domain} matches fallback pattern {pattern}")
                return True
    except Exception:
        pass

    return False


def is_agent_browser_available(command: str = "agent-browser") -> bool:
    """Check if agent-browser CLI is installed and available.

    Args:
        command: Command name or path to check

    Returns:
        True if agent-browser is available
    """
    return shutil.which(command) is not None


# Cache for agent-browser readiness check
_agent_browser_ready_cache: dict[str, tuple[bool, str]] = {}


def verify_agent_browser_ready(
    command: str = "agent-browser", use_cache: bool = True
) -> tuple[bool, str]:
    """Verify that agent-browser is fully ready (command exists + browser installed).

    This performs a more thorough check than is_agent_browser_available() by
    actually running agent-browser to verify it works.

    Args:
        command: Command name or path to check
        use_cache: Whether to use cached result (default True)

    Returns:
        Tuple of (is_ready, message)
        - (True, "agent-browser is ready") if fully functional
        - (False, "error message") if not ready
    """
    import subprocess

    # Check cache first
    if use_cache and command in _agent_browser_ready_cache:
        return _agent_browser_ready_cache[command]

    # Step 1: Check if command exists
    if not shutil.which(command):
        result = (
            False,
            f"'{command}' command not found. Install with: npm install -g agent-browser",
        )
        _agent_browser_ready_cache[command] = result
        return result

    # Step 2: Check if agent-browser responds to --help
    try:
        proc = subprocess.run(
            [command, "--help"],
            capture_output=True,
            text=True,
            timeout=10,
        )
        if proc.returncode != 0:
            result = (False, f"'{command}' command failed: {proc.stderr.strip()}")
            _agent_browser_ready_cache[command] = result
            return result
    except subprocess.TimeoutExpired:
        result = (False, f"'{command}' command timed out")
        _agent_browser_ready_cache[command] = result
        return result
    except Exception as e:
        result = (False, f"'{command}' command error: {e}")
        _agent_browser_ready_cache[command] = result
        return result

    # Step 3: Try a simple operation to verify browser is installed
    # We use 'agent-browser snapshot' on about:blank which should fail fast if browser not installed
    try:
        proc = subprocess.run(
            [command, "open", "about:blank"],
            capture_output=True,
            text=True,
            timeout=30,
        )
        # Check for known error patterns
        if proc.returncode != 0:
            stderr_lower = proc.stderr.lower()
            stderr_orig = proc.stderr.strip()
            # Check for Playwright browser not installed error
            if (
                "executable doesn't exist" in stderr_lower
                or "browsertype.launch" in stderr_lower
            ):
                result = (
                    False,
                    "Playwright browser not installed. Run: agent-browser install "
                    "OR npx playwright install chromium",
                )
                _agent_browser_ready_cache[command] = result
                return result
            # Check for daemon not found error (global install needs AGENT_BROWSER_HOME)
            if "daemon not found" in stderr_lower:
                result = (
                    False,
                    "agent-browser daemon not found. "
                    "Set AGENT_BROWSER_HOME environment variable to the agent-browser package directory. "
                    "For pnpm global install: AGENT_BROWSER_HOME=$(pnpm list -g agent-browser --parseable)/node_modules/agent-browser "
                    "For npm global install: AGENT_BROWSER_HOME=$(npm root -g)/agent-browser",
                )
                _agent_browser_ready_cache[command] = result
                return result
            # Other errors might be transient, still mark as ready
            logger.debug(
                f"agent-browser test returned non-zero but may still work: {stderr_orig}"
            )
    except subprocess.TimeoutExpired:
        # Timeout on about:blank is suspicious but not fatal
        logger.debug("agent-browser test timed out, may still work for real pages")
    except Exception as e:
        logger.debug(f"agent-browser test error (may still work): {e}")

    # Close browser if opened
    try:
        subprocess.run([command, "close"], capture_output=True, timeout=5)
    except Exception:
        pass

    result = (True, "agent-browser is ready")
    _agent_browser_ready_cache[command] = result
    return result


def clear_agent_browser_cache() -> None:
    """Clear the agent-browser readiness cache."""
    _agent_browser_ready_cache.clear()


def _url_to_screenshot_filename(url: str) -> str:
    """Generate a safe filename for URL screenshot.

    Examples:
        https://example.com/path → example.com_path.full.jpg
        https://x.com/user/status/123 → x.com_user_status_123.full.jpg

    Args:
        url: URL to convert

    Returns:
        Safe filename with .full.jpg extension
    """
    try:
        parsed = urlparse(url)
        # Start with domain
        parts = [parsed.netloc] if parsed.netloc else []
        # Add path parts
        if parsed.path and parsed.path != "/":
            path_parts = parsed.path.strip("/").split("/")
            parts.extend(path_parts)

        # If no parts, fall back to hash
        if not parts or not any(parts):
            url_hash = hashlib.sha256(url.encode()).hexdigest()[:16]
            return f"screenshot_{url_hash}.full.jpg"

        # Join with underscores
        name = "_".join(p for p in parts if p)

        # Sanitize for filesystem (remove/replace unsafe chars)
        # Windows-unsafe: < > : " / \ | ? *
        # Also remove other problematic chars
        unsafe_chars = r'<>:"/\\|?*\x00-\x1f'
        name = re.sub(f"[{unsafe_chars}]", "_", name)

        # Collapse multiple underscores
        name = re.sub(r"_+", "_", name)

        # Strip leading/trailing underscores
        name = name.strip("_")

        # Limit length (leave room for extension)
        max_length = 200
        if len(name) > max_length:
            name = name[:max_length]

        # Final check for empty name
        if not name:
            url_hash = hashlib.sha256(url.encode()).hexdigest()[:16]
            return f"screenshot_{url_hash}.full.jpg"

        return f"{name}.full.jpg"
    except Exception:
        # Fallback: hash the URL
        url_hash = hashlib.sha256(url.encode()).hexdigest()[:16]
        return f"screenshot_{url_hash}.full.jpg"


def _compress_screenshot(
    screenshot_path: Path,
    quality: int = 85,
    max_height: int = 10000,
) -> None:
    """Compress a screenshot to JPEG with quality and size limits.

    Args:
        screenshot_path: Path to screenshot file (will be overwritten)
        quality: JPEG quality (1-100)
        max_height: Maximum height in pixels (will resize if exceeded)
    """
    try:
        from PIL import Image

        with Image.open(screenshot_path) as img:
            # Convert to RGB if necessary (for JPEG)
            if img.mode in ("RGBA", "P"):
                img = img.convert("RGB")

            # Resize if too tall
            width, height = img.size
            if height > max_height:
                ratio = max_height / height
                new_width = int(width * ratio)
                img = img.resize((new_width, max_height), Image.Resampling.LANCZOS)
                logger.debug(
                    f"Resized screenshot from {width}x{height} to {new_width}x{max_height}"
                )

            # Save with compression
            img.save(screenshot_path, "JPEG", quality=quality, optimize=True)
            logger.debug(
                f"Compressed screenshot to quality={quality}: {screenshot_path}"
            )
    except ImportError:
        logger.warning("Pillow not installed, skipping screenshot compression")
    except Exception as e:
        logger.warning(f"Failed to compress screenshot: {e}")


def _html_to_text(html: str) -> str:
    """Extract clean text from HTML content.

    Args:
        html: Raw HTML content

    Returns:
        Extracted text content formatted as markdown
    """
    try:
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(html, "html.parser")

        # Remove script and style elements
        for element in soup(["script", "style", "noscript", "nav", "footer", "header"]):
            element.decompose()

        # Extract text from main content areas
        lines = []

        # Try to find main content area
        main = soup.find("main") or soup.find("article") or soup.find("body")
        if not main:
            return ""

        for element in main.find_all(
            ["h1", "h2", "h3", "h4", "h5", "h6", "p", "li", "blockquote", "pre", "code"]
        ):
            text = element.get_text(strip=True)
            if not text:
                continue

            tag = element.name
            if tag == "h1":
                lines.append(f"# {text}")
            elif tag == "h2":
                lines.append(f"## {text}")
            elif tag == "h3":
                lines.append(f"### {text}")
            elif tag == "h4":
                lines.append(f"#### {text}")
            elif tag == "h5":
                lines.append(f"##### {text}")
            elif tag == "h6":
                lines.append(f"###### {text}")
            elif tag == "p":
                lines.append(text)
            elif tag == "li":
                lines.append(f"- {text}")
            elif tag == "blockquote":
                lines.append(f"> {text}")
            elif tag == "pre" or tag == "code":
                lines.append(f"```\n{text}\n```")

            lines.append("")

        return "\n".join(lines).strip()

    except ImportError:
        logger.debug("BeautifulSoup not installed, using simple text extraction")
        # Fallback: simple regex-based extraction
        import re

        # Remove tags
        text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL)
        text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL)
        text = re.sub(r"<[^>]+>", " ", text)
        # Normalize whitespace
        text = re.sub(r"\s+", " ", text).strip()
        return text
    except Exception as e:
        logger.debug(f"HTML to text extraction failed: {e}")
        return ""


async def fetch_with_static(url: str) -> FetchResult:
    """Fetch URL using markitdown (direct HTTP request).

    Args:
        url: URL to fetch

    Returns:
        FetchResult with markdown content

    Raises:
        FetchError: If fetch fails
    """
    logger.debug(f"Fetching URL with static strategy: {url}")

    try:
        md = _get_markitdown()
        result = md.convert(url)

        if not result.text_content:
            raise FetchError(f"No content extracted from URL: {url}")

        return FetchResult(
            content=result.text_content,
            strategy_used="static",
            title=result.title,
            url=url,
            metadata={"converter": "markitdown"},
        )
    except Exception as e:
        if "No content extracted" in str(e):
            raise
        raise FetchError(f"Failed to fetch URL: {e}")


async def fetch_with_browser(
    url: str,
    command: str = "agent-browser",
    timeout: int = 30000,
    wait_for: str = "domcontentloaded",
    extra_wait_ms: int = 2000,
    session: str | None = None,
    *,
    screenshot: bool = False,
    screenshot_dir: Path | None = None,
    screenshot_config: ScreenshotConfig | None = None,
) -> FetchResult:
    """Fetch URL using agent-browser (headless browser).

    Args:
        url: URL to fetch
        command: agent-browser command name or path
        timeout: Page load timeout in milliseconds
        wait_for: Wait condition (load/domcontentloaded/networkidle)
        extra_wait_ms: Extra wait time after load state (for JS rendering)
        session: Optional session name for isolated browser
        screenshot: If True, capture full-page screenshot
        screenshot_dir: Directory to save screenshot (required if screenshot=True)
        screenshot_config: Screenshot settings (viewport, quality, etc.)

    Returns:
        FetchResult with rendered page content and optional screenshot path

    Raises:
        AgentBrowserNotFoundError: If agent-browser is not installed
        FetchError: If fetch fails
    """
    if not is_agent_browser_available(command):
        raise AgentBrowserNotFoundError()

    logger.debug(f"Fetching URL with browser strategy: {url}")

    # Generate unique session ID to avoid conflicts with concurrent browser fetches
    # Each fetch_with_browser call gets its own isolated browser session
    effective_session = (
        session if session else f"markitai-fetch-{uuid.uuid4().hex[:12]}"
    )

    try:
        # Build command args
        base_args = [command, "--session", effective_session]

        # Step 1: Open URL and wait for page load
        open_args = [*base_args, "open", url]
        logger.debug(f"Running: {' '.join(open_args)}")

        proc = await asyncio.create_subprocess_exec(
            *open_args,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        stdout, stderr = await asyncio.wait_for(
            proc.communicate(), timeout=timeout / 1000 + 10
        )

        if proc.returncode != 0:
            error_msg = stderr.decode() if stderr else "Unknown error"
            raise FetchError(f"agent-browser open failed: {error_msg}")

        # Step 2: Wait for load state
        wait_args = [*base_args, "wait", "--load", wait_for]
        logger.debug(f"Running: {' '.join(wait_args)}")

        proc = await asyncio.create_subprocess_exec(
            *wait_args,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        await asyncio.wait_for(proc.communicate(), timeout=timeout / 1000 + 10)

        # Step 2.5: Extra wait for JS rendering (especially for SPAs)
        if extra_wait_ms > 0:
            extra_wait_args = [*base_args, "wait", str(extra_wait_ms)]
            logger.debug(f"Running: {' '.join(extra_wait_args)}")
            proc = await asyncio.create_subprocess_exec(
                *extra_wait_args,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
            await asyncio.wait_for(proc.communicate(), timeout=extra_wait_ms / 1000 + 5)

        # Step 3: Get page content via snapshot (accessibility tree with text)
        # Using snapshot -c (compact) to get clean text structure
        snapshot_args = [*base_args, "snapshot", "-c", "--json"]
        logger.debug(f"Running: {' '.join(snapshot_args)}")

        proc = await asyncio.create_subprocess_exec(
            *snapshot_args,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        stdout, stderr = await asyncio.wait_for(
            proc.communicate(), timeout=timeout / 1000 + 10
        )

        if proc.returncode != 0:
            error_msg = stderr.decode() if stderr else "Unknown error"
            raise FetchError(f"agent-browser snapshot failed: {error_msg}")

        # Parse snapshot JSON
        try:
            snapshot_data = json.loads(stdout.decode())
            if snapshot_data.get("success"):
                snapshot_text = snapshot_data.get("data", {}).get("snapshot", "")
            else:
                snapshot_text = stdout.decode()
        except json.JSONDecodeError:
            snapshot_text = stdout.decode()

        # Step 4, 5 & 6: Get page title, final URL and HTML body in parallel
        async def get_title() -> str | None:
            title_args = [*base_args, "get", "title"]
            proc = await asyncio.create_subprocess_exec(
                *title_args,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
            stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=10)
            if proc.returncode == 0 and stdout:
                return stdout.decode().strip()
            return None

        async def get_final_url() -> str | None:
            url_args = [*base_args, "get", "url"]
            proc = await asyncio.create_subprocess_exec(
                *url_args,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
            stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=10)
            if proc.returncode == 0 and stdout:
                return stdout.decode().strip()
            return None

        async def get_html_body() -> str | None:
            """Get HTML body content for text extraction."""
            html_args = [*base_args, "get", "html", "body"]
            proc = await asyncio.create_subprocess_exec(
                *html_args,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
            stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=15)
            if proc.returncode == 0 and stdout:
                return stdout.decode()
            return None

        # Execute title, URL and HTML fetching in parallel
        title, final_url, html_body = await asyncio.gather(
            get_title(), get_final_url(), get_html_body()
        )

        # Convert snapshot to markdown format
        markdown_content = _snapshot_to_markdown(snapshot_text, title, url)

        # Also extract text from HTML as fallback/supplement
        html_text_content: str | None = None
        if html_body:
            html_text_content = _html_to_text(html_body)

        # Use HTML text if snapshot conversion failed or is too short
        if not markdown_content.strip() or len(markdown_content.strip()) < 100:
            if html_text_content and len(html_text_content.strip()) > len(
                markdown_content.strip()
            ):
                logger.debug("Using HTML text extraction as primary content")
                if title:
                    markdown_content = f"# {title}\n\n{html_text_content}"
                else:
                    markdown_content = html_text_content

        if not markdown_content.strip():
            raise FetchError(f"No content extracted from URL via browser: {url}")

        # Step 6: Capture full-page screenshot if requested
        screenshot_path: Path | None = None
        if screenshot and screenshot_dir:
            try:
                screenshot_dir.mkdir(parents=True, exist_ok=True)
                safe_filename = _url_to_screenshot_filename(url)
                screenshot_path = screenshot_dir / safe_filename

                # Check if screenshot already exists (simple cache)
                if not screenshot_path.exists():
                    # Set viewport if configured
                    if screenshot_config:
                        viewport_args = [
                            *base_args,
                            "set",
                            "viewport",
                            str(screenshot_config.viewport_width),
                            str(screenshot_config.viewport_height),
                        ]
                        logger.debug(f"Running: {' '.join(viewport_args)}")
                        proc = await asyncio.create_subprocess_exec(
                            *viewport_args,
                            stdout=asyncio.subprocess.PIPE,
                            stderr=asyncio.subprocess.PIPE,
                        )
                        await asyncio.wait_for(proc.communicate(), timeout=10)

                    # Capture full-page screenshot
                    screenshot_args = [
                        *base_args,
                        "screenshot",
                        "--full",
                        str(screenshot_path),
                    ]
                    logger.debug(f"Running: {' '.join(screenshot_args)}")
                    proc = await asyncio.create_subprocess_exec(
                        *screenshot_args,
                        stdout=asyncio.subprocess.PIPE,
                        stderr=asyncio.subprocess.PIPE,
                    )
                    stdout, stderr = await asyncio.wait_for(
                        proc.communicate(), timeout=60
                    )

                    if proc.returncode != 0:
                        error_msg = stderr.decode() if stderr else "Unknown error"
                        logger.warning(f"Screenshot capture failed: {error_msg}")
                        screenshot_path = None
                    elif screenshot_path.exists():
                        # Compress screenshot
                        quality = screenshot_config.quality if screenshot_config else 85
                        max_height = (
                            screenshot_config.max_height if screenshot_config else 10000
                        )
                        _compress_screenshot(screenshot_path, quality, max_height)
                        logger.debug(f"Screenshot saved: {screenshot_path}")
                else:
                    logger.debug(f"Screenshot exists, skipping: {screenshot_path}")
            except Exception as e:
                # Screenshot failure should not block the main fetch
                logger.warning(f"Screenshot failed for {url}: {e}")
                screenshot_path = None

        return FetchResult(
            content=markdown_content,
            strategy_used="browser",
            title=title,
            url=url,
            final_url=final_url,
            metadata={"renderer": "agent-browser", "wait_for": wait_for},
            screenshot_path=screenshot_path,
        )

    except TimeoutError:
        raise FetchError(f"Browser fetch timed out after {timeout}ms: {url}")
    except AgentBrowserNotFoundError:
        raise
    except FetchError:
        raise
    except Exception as e:
        raise FetchError(f"Browser fetch failed: {e}")
    finally:
        # Clean up the browser session to avoid resource leaks
        # Only close auto-generated sessions (not user-specified ones)
        if not session:
            try:
                close_args = [command, "--session", effective_session, "close"]
                proc = await asyncio.create_subprocess_exec(
                    *close_args,
                    stdout=asyncio.subprocess.PIPE,
                    stderr=asyncio.subprocess.PIPE,
                )
                await asyncio.wait_for(proc.communicate(), timeout=5)
                logger.debug(f"Closed browser session: {effective_session}")
            except Exception as e:
                logger.debug(
                    f"Failed to close browser session {effective_session}: {e}"
                )


def _snapshot_to_markdown(snapshot: str, title: str | None, url: str) -> str:
    """Convert agent-browser snapshot to markdown format.

    The snapshot is an accessibility tree with various formats:
    - heading "Title" [ref=e1] [level=1]
    - paragraph: Text content here
    - link "Link text" [ref=e2]:
      - /url: /path
    - text: Some text

    Args:
        snapshot: Accessibility tree snapshot
        title: Page title
        url: Original URL

    Returns:
        Markdown formatted content
    """
    lines = []

    # Add title as H1 if available
    if title:
        lines.append(f"# {title}")
        lines.append("")

    # Track current link for multi-line link handling
    current_link_text: str | None = None
    current_link_url: str | None = None

    # Parse snapshot and convert to markdown
    for line in snapshot.split("\n"):
        stripped = line.lstrip()

        if not stripped:
            continue

        # Skip structure markers
        if stripped.startswith("- document:") or stripped.startswith("- navigation:"):
            continue
        if stripped.startswith("- main:") or stripped.startswith("- article:"):
            continue
        if stripped.startswith("- contentinfo:") or stripped.startswith("- list:"):
            continue
        if stripped.startswith("- listitem:"):
            continue

        # Remove leading "- " if present
        if stripped.startswith("- "):
            stripped = stripped[2:]

        # Handle URL lines (part of link)
        if stripped.startswith("/url:"):
            current_link_url = stripped[5:].strip()
            if current_link_text:
                lines.append(f"[{current_link_text}]({current_link_url})")
                lines.append("")
            current_link_text = None
            current_link_url = None
            continue

        # Pattern 1: role "content" [attrs] (with or without trailing colon)
        # e.g., heading "Title" [ref=e1] [level=1]
        # e.g., link "Text" [ref=e2]:
        match = re.match(
            r'(\w+)\s+"([^"]*)"(?:\s*\[([^\]]*(?:\]\s*\[[^\]]*)*)\])?:?$', stripped
        )
        if match:
            role, content, attrs_str = match.groups()
            attrs_dict = {}
            if attrs_str:
                # Parse multiple [key=value] attributes
                for attr_match in re.finditer(r"\[?([^=\]]+)=([^\]]+)\]?", attrs_str):
                    k, v = attr_match.groups()
                    attrs_dict[k.strip()] = v.strip()

            # Convert to markdown based on role
            if role == "heading":
                level = int(attrs_dict.get("level", "2"))
                lines.append(f"{'#' * level} {content}")
                lines.append("")
            elif role == "paragraph":
                if content:
                    lines.append(content)
                    lines.append("")
            elif role == "link":
                # Link URL might be on next line
                link_url = attrs_dict.get("url", "")
                if link_url:
                    lines.append(f"[{content}]({link_url})")
                    lines.append("")
                else:
                    # Wait for /url: line
                    current_link_text = content
            elif role == "image":
                alt = content or "image"
                src = attrs_dict.get("url", attrs_dict.get("src", ""))
                if src:
                    lines.append(f"![{alt}]({src})")
|
|
1186
|
+
lines.append("")
|
|
1187
|
+
elif role == "listitem":
|
|
1188
|
+
lines.append(f"- {content}")
|
|
1189
|
+
elif role == "code":
|
|
1190
|
+
lines.append(f"`{content}`")
|
|
1191
|
+
elif role in ("text", "StaticText"):
|
|
1192
|
+
if content:
|
|
1193
|
+
lines.append(content)
|
|
1194
|
+
elif role == "button":
|
|
1195
|
+
pass # Skip buttons
|
|
1196
|
+
elif role == "textbox":
|
|
1197
|
+
pass # Skip form inputs
|
|
1198
|
+
elif role == "switch":
|
|
1199
|
+
pass # Skip toggles
|
|
1200
|
+
elif content:
|
|
1201
|
+
# Generic fallback - include content
|
|
1202
|
+
lines.append(content)
|
|
1203
|
+
continue
|
|
1204
|
+
|
|
1205
|
+
# Pattern 2: role: content (no quotes)
|
|
1206
|
+
# e.g., paragraph: Text content here
|
|
1207
|
+
# e.g., text: Some text
|
|
1208
|
+
match2 = re.match(r"(\w+):\s*(.+)$", stripped)
|
|
1209
|
+
if match2:
|
|
1210
|
+
role, content = match2.groups()
|
|
1211
|
+
content = content.strip()
|
|
1212
|
+
|
|
1213
|
+
if role == "paragraph":
|
|
1214
|
+
lines.append(content)
|
|
1215
|
+
lines.append("")
|
|
1216
|
+
elif role == "text":
|
|
1217
|
+
# Only add text if it's meaningful (not just punctuation)
|
|
1218
|
+
if content and len(content) > 2:
|
|
1219
|
+
lines.append(content)
|
|
1220
|
+
elif role == "heading":
|
|
1221
|
+
lines.append(f"## {content}")
|
|
1222
|
+
lines.append("")
|
|
1223
|
+
elif role == "time":
|
|
1224
|
+
lines.append(f"*{content}*")
|
|
1225
|
+
lines.append("")
|
|
1226
|
+
elif role in ("separator",):
|
|
1227
|
+
lines.append("---")
|
|
1228
|
+
lines.append("")
|
|
1229
|
+
continue
|
|
1230
|
+
|
|
1231
|
+
# Pattern 3: Plain text line (not a role definition)
|
|
1232
|
+
# Skip structural elements
|
|
1233
|
+
if stripped and not stripped.endswith(":"):
|
|
1234
|
+
# Check if it looks like content (not a role marker)
|
|
1235
|
+
if not re.match(r"^[a-z]+$", stripped):
|
|
1236
|
+
pass # Don't add raw structural lines
|
|
1237
|
+
|
|
1238
|
+
# Clean up: remove consecutive empty lines
|
|
1239
|
+
result_lines = []
|
|
1240
|
+
prev_empty = False
|
|
1241
|
+
for line in lines:
|
|
1242
|
+
is_empty = not line.strip()
|
|
1243
|
+
if is_empty and prev_empty:
|
|
1244
|
+
continue
|
|
1245
|
+
result_lines.append(line)
|
|
1246
|
+
prev_empty = is_empty
|
|
1247
|
+
|
|
1248
|
+
return "\n".join(result_lines).strip()
|
|
1249
|
+
|
|
1250
|
+
|
|
1251
|
+
async def fetch_with_jina(
|
|
1252
|
+
url: str,
|
|
1253
|
+
api_key: str | None = None,
|
|
1254
|
+
timeout: int = 30,
|
|
1255
|
+
) -> FetchResult:
|
|
1256
|
+
"""Fetch URL using Jina Reader API.
|
|
1257
|
+
|
|
1258
|
+
Args:
|
|
1259
|
+
url: URL to fetch
|
|
1260
|
+
api_key: Optional Jina API key (for higher rate limits)
|
|
1261
|
+
timeout: Request timeout in seconds
|
|
1262
|
+
|
|
1263
|
+
Returns:
|
|
1264
|
+
FetchResult with markdown content
|
|
1265
|
+
|
|
1266
|
+
Raises:
|
|
1267
|
+
JinaRateLimitError: If rate limit exceeded
|
|
1268
|
+
JinaAPIError: If API returns error
|
|
1269
|
+
FetchError: If fetch fails
|
|
1270
|
+
"""
|
|
1271
|
+
import httpx
|
|
1272
|
+
|
|
1273
|
+
logger.debug(f"Fetching URL with Jina Reader: {url}")
|
|
1274
|
+
|
|
1275
|
+
jina_url = f"{DEFAULT_JINA_BASE_URL}/{url}"
|
|
1276
|
+
headers = {}
|
|
1277
|
+
if api_key:
|
|
1278
|
+
headers["Authorization"] = f"Bearer {api_key}"
|
|
1279
|
+
|
|
1280
|
+
try:
|
|
1281
|
+
client = _get_jina_client(timeout)
|
|
1282
|
+
response = await client.get(jina_url, headers=headers)
|
|
1283
|
+
|
|
1284
|
+
if response.status_code == 429:
|
|
1285
|
+
raise JinaRateLimitError()
|
|
1286
|
+
elif response.status_code >= 400:
|
|
1287
|
+
raise JinaAPIError(response.status_code, response.text[:200])
|
|
1288
|
+
|
|
1289
|
+
content = response.text
|
|
1290
|
+
|
|
1291
|
+
if not content.strip():
|
|
1292
|
+
raise FetchError(f"No content returned from Jina Reader: {url}")
|
|
1293
|
+
|
|
1294
|
+
# Extract title from first H1 if present
|
|
1295
|
+
title = None
|
|
1296
|
+
title_match = re.match(r"^#\s+(.+)$", content, re.MULTILINE)
|
|
1297
|
+
if title_match:
|
|
1298
|
+
title = title_match.group(1)
|
|
1299
|
+
|
|
1300
|
+
return FetchResult(
|
|
1301
|
+
content=content,
|
|
1302
|
+
strategy_used="jina",
|
|
1303
|
+
title=title,
|
|
1304
|
+
url=url,
|
|
1305
|
+
metadata={"api": "jina-reader"},
|
|
1306
|
+
)
|
|
1307
|
+
|
|
1308
|
+
except (JinaRateLimitError, JinaAPIError):
|
|
1309
|
+
raise
|
|
1310
|
+
except httpx.TimeoutException:
|
|
1311
|
+
raise FetchError(f"Jina Reader request timed out after {timeout}s: {url}")
|
|
1312
|
+
except Exception as e:
|
|
1313
|
+
raise FetchError(f"Jina Reader fetch failed: {e}")
|
|
1314
|
+
|
|
1315
|
+
|
|
1316
|
+
async def fetch_url(
    url: str,
    strategy: FetchStrategy,
    config: FetchConfig,
    explicit_strategy: bool = False,
    cache: FetchCache | None = None,
    skip_read_cache: bool = False,
    *,
    screenshot: bool = False,
    screenshot_dir: Path | None = None,
    screenshot_config: ScreenshotConfig | None = None,
) -> FetchResult:
    """Fetch URL content using the specified strategy.

    Args:
        url: URL to fetch
        strategy: Fetch strategy to use
        config: Fetch configuration
        explicit_strategy: If True, don't fallback on error (user explicitly chose strategy)
        cache: Optional FetchCache for caching results
        skip_read_cache: If True, skip reading from cache but still write results (--no-cache)
        screenshot: If True, capture full-page screenshot (requires browser strategy)
        screenshot_dir: Directory to save screenshot
        screenshot_config: Screenshot settings (viewport, quality, etc.)

    Returns:
        FetchResult with content and metadata

    Raises:
        FetchError: If fetch fails and no fallback available
        AgentBrowserNotFoundError: If --agent-browser used but not installed
        JinaRateLimitError: If --jina used and rate limit exceeded
    """
    # When screenshot is enabled, use multi-source fetching strategy
    # This captures both static content and browser-rendered content
    if screenshot:
        return await _fetch_multi_source(
            url,
            config,
            screenshot_dir=screenshot_dir,
            screenshot_config=screenshot_config,
            cache=cache,
            skip_read_cache=skip_read_cache,
        )

    # Check cache first (unless skip_read_cache is True)
    if cache is not None and not skip_read_cache:
        cached_result = cache.get(url)
        if cached_result is not None:
            logger.info(f"[FetchCache] Using cached content for: {url}")
            return cached_result

    # Screenshot kwargs for browser fetching
    screenshot_kwargs: dict[str, Any] = {}

    # Fetch the content
    result: FetchResult

    # Handle explicit strategy (no fallback)
    if explicit_strategy:
        if strategy == FetchStrategy.BROWSER:
            result = await fetch_with_browser(
                url,
                command=config.agent_browser.command,
                timeout=config.agent_browser.timeout,
                wait_for=config.agent_browser.wait_for,
                extra_wait_ms=config.agent_browser.extra_wait_ms,
                session=config.agent_browser.session,
                **screenshot_kwargs,
            )
        elif strategy == FetchStrategy.JINA:
            api_key = config.jina.get_resolved_api_key()
            result = await fetch_with_jina(url, api_key, config.jina.timeout)
        elif strategy == FetchStrategy.STATIC:
            result = await fetch_with_static(url)
        else:
            # AUTO with explicit=True shouldn't happen, but handle it
            strategy = FetchStrategy.AUTO
            result = await _fetch_with_fallback(
                url, config, start_with_browser=False, **screenshot_kwargs
            )
    elif strategy == FetchStrategy.AUTO:
        # Check if domain needs browser rendering
        if should_use_browser_for_domain(url, config.fallback_patterns):
            logger.info(f"Domain matches fallback pattern, using browser: {url}")
            result = await _fetch_with_fallback(
                url, config, start_with_browser=True, **screenshot_kwargs
            )
        else:
            # Try static first, fallback to browser/jina if JS required
            result = await _fetch_with_fallback(
                url, config, start_with_browser=False, **screenshot_kwargs
            )
    elif strategy == FetchStrategy.STATIC:
        result = await fetch_with_static(url)
    elif strategy == FetchStrategy.BROWSER:
        result = await fetch_with_browser(
            url,
            command=config.agent_browser.command,
            timeout=config.agent_browser.timeout,
            wait_for=config.agent_browser.wait_for,
            extra_wait_ms=config.agent_browser.extra_wait_ms,
            session=config.agent_browser.session,
            **screenshot_kwargs,
        )
    elif strategy == FetchStrategy.JINA:
        api_key = config.jina.get_resolved_api_key()
        result = await fetch_with_jina(url, api_key, config.jina.timeout)
    else:
        raise ValueError(f"Unknown fetch strategy: {strategy}")

    # Cache the result
    if cache is not None:
        cache.set(url, result)

    return result


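# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the published module): a caller
# that forces the static strategy, bypasses the read cache, and still records
# the fresh result. Assumes `cfg` and `cache` were constructed elsewhere; the
# helper name below is made up for the example.
# ---------------------------------------------------------------------------
async def _example_force_static(
    url: str, cfg: FetchConfig, cache: FetchCache
) -> str:
    result = await fetch_url(
        url,
        FetchStrategy.STATIC,
        cfg,
        explicit_strategy=True,  # no fallback: fail instead of switching strategy
        cache=cache,
        skip_read_cache=True,  # ignore any cached copy, but write the new one
    )
    return result.content

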
def _is_invalid_content(content: str) -> tuple[bool, str]:
    """Check if fetched content is invalid (JS error page, login prompt, etc.).

    Args:
        content: Fetched content to check

    Returns:
        Tuple of (is_invalid, reason)
    """
    if not content or not content.strip():
        return True, "empty"

    # Check for common invalid content patterns
    invalid_patterns = [
        (r"JavaScript is (not available|disabled)", "javascript_disabled"),
        (r"Please enable JavaScript", "javascript_required"),
        (r"switch to a supported browser", "unsupported_browser"),
        (r"Something went wrong.*let's give it another shot", "error_page"),
        (r"Log in.*Sign up.*to continue", "login_required"),
        (r"You must be logged in", "login_required"),
    ]

    for pattern, reason in invalid_patterns:
        if re.search(pattern, content, re.IGNORECASE | re.DOTALL):
            return True, reason

    # Check content length (after removing markdown links and images)
    clean_content = re.sub(r"!\[[^\]]*\]\([^)]+\)", "", content)  # Remove images
    clean_content = re.sub(r"\[[^\]]*\]\([^)]+\)", "", clean_content)  # Remove links
    clean_content = re.sub(
        r"[#\-*_>\[\]`|]", "", clean_content
    )  # Remove markdown syntax
    clean_content = " ".join(clean_content.split())  # Normalize whitespace

    if len(clean_content) < 100:
        return True, "too_short"

    return False, ""


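# Rough behaviour of the validator above on typical inputs (illustrative notes,
# derived from the patterns and length check it applies):
#   _is_invalid_content("")                           -> (True, "empty")
#   _is_invalid_content("Please enable JavaScript")   -> (True, "javascript_required")
#   _is_invalid_content("[Home](/) [About](/about)")  -> (True, "too_short")
#   a page with more than ~100 chars of real prose    -> (False, "")

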
async def _fetch_multi_source(
    url: str,
    config: FetchConfig,
    screenshot_dir: Path | None = None,
    screenshot_config: ScreenshotConfig | None = None,
    cache: FetchCache | None = None,
    skip_read_cache: bool = False,
) -> FetchResult:
    """Fetch URL using static-first strategy with browser fallback.

    Strategy:
    1. Fetch both static and browser in parallel
    2. Validate content quality using _is_invalid_content()
    3. If static is valid → use static only (ignore browser content)
    4. Else if browser is valid → use browser only
    5. Else → use browser content with warning (both invalid)

    Screenshot is always included when available.

    Args:
        url: URL to fetch
        config: Fetch configuration
        screenshot_dir: Directory to save screenshot
        screenshot_config: Screenshot settings
        cache: Optional FetchCache for caching results
        skip_read_cache: If True, skip reading from cache

    Returns:
        FetchResult with single-source content (no merging)
    """
    static_content: str | None = None
    browser_result: FetchResult | None = None

    # Task 1: Try static fetch (non-blocking)
    async def fetch_static() -> str | None:
        try:
            result = await fetch_with_static(url)
            logger.debug(f"[URL] Static fetch success: {len(result.content)} chars")
            return result.content
        except Exception as e:
            logger.debug(f"[URL] Static fetch failed: {e}")
            return None

    # Task 2: Browser fetch with screenshot
    async def fetch_browser() -> FetchResult | None:
        try:
            if not is_agent_browser_available(config.agent_browser.command):
                logger.debug("agent-browser not available")
                return None

            result = await fetch_with_browser(
                url,
                command=config.agent_browser.command,
                timeout=config.agent_browser.timeout,
                wait_for=config.agent_browser.wait_for,
                extra_wait_ms=config.agent_browser.extra_wait_ms,
                session=config.agent_browser.session,
                screenshot=True,
                screenshot_dir=screenshot_dir,
                screenshot_config=screenshot_config,
            )
            logger.debug(f"[URL] Browser fetch success: {len(result.content)} chars")
            return result
        except Exception as e:
            logger.debug(f"[URL] Browser fetch failed: {e}")
            return None

    # Execute both fetches in parallel
    static_content, browser_result = await asyncio.gather(
        fetch_static(), fetch_browser()
    )

    browser_content = browser_result.content if browser_result else None
    screenshot_path = browser_result.screenshot_path if browser_result else None

    # Validate content quality
    static_invalid, static_reason = (
        _is_invalid_content(static_content)
        if static_content
        else (True, "fetch_failed")
    )
    browser_invalid, browser_reason = (
        _is_invalid_content(browser_content)
        if browser_content
        else (True, "fetch_failed")
    )

    if static_invalid:
        logger.debug(f"[URL] Static content invalid: {static_reason}")
    if browser_invalid:
        logger.debug(f"[URL] Browser content invalid: {browser_reason}")

    # Determine which source to use (static-first strategy)
    primary_content = ""
    strategy_used = ""
    warning_message = ""
    final_static_content: str | None = None
    final_browser_content: str | None = None

    if not static_invalid:
        # Static is valid → use static only
        assert static_content is not None
        primary_content = static_content
        final_static_content = static_content
        strategy_used = "static"
        logger.info(f"[URL] Using static content (valid, {len(static_content)} chars)")
    elif not browser_invalid:
        # Static invalid but browser is valid → use browser
        assert browser_content is not None
        primary_content = browser_content
        final_browser_content = browser_content
        strategy_used = "browser"
        logger.info(
            f"[URL] Using browser content (static invalid: {static_reason}, "
            f"browser valid, {len(browser_content)} chars)"
        )
    elif browser_content:
        # Both invalid, but browser has content → use browser with warning
        primary_content = browser_content
        final_browser_content = browser_content
        strategy_used = "browser"
        warning_message = (
            f"Warning: Content may be incomplete. "
            f"Static: {static_reason}, Browser: {browser_reason}"
        )
        logger.warning(
            f"[URL] Both sources invalid, using browser content with warning: "
            f"static={static_reason}, browser={browser_reason}"
        )
    elif static_content:
        # Both invalid, no browser but has static → use static with warning
        primary_content = static_content
        final_static_content = static_content
        strategy_used = "static"
        warning_message = f"Warning: Content may be incomplete. Reason: {static_reason}"
        logger.warning(
            f"[URL] Both sources invalid, using static content with warning: {static_reason}"
        )
    else:
        raise FetchError(f"All fetch strategies failed for URL: {url}")

    # Extract title from browser result if available
    title = browser_result.title if browser_result else None
    final_url = browser_result.final_url if browser_result else None

    # If no title from browser, try to extract from primary content
    if not title and primary_content:
        title_match = re.match(r"^#\s+(.+)$", primary_content, re.MULTILINE)
        if title_match:
            title = title_match.group(1)

    metadata: dict[str, Any] = {"single_source": True, "source": strategy_used}
    if warning_message:
        metadata["warning"] = warning_message

    assert primary_content is not None  # Guaranteed by above branches
    result = FetchResult(
        content=primary_content,
        strategy_used=strategy_used,
        title=title,
        url=url,
        final_url=final_url,
        metadata=metadata,
        screenshot_path=screenshot_path,
        static_content=final_static_content,
        browser_content=final_browser_content,
    )

    # Cache the result
    if cache is not None:
        cache.set(url, result)

    return result


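# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the published module): fetching
# with a screenshot routes through _fetch_multi_source(), so the result carries
# a single source plus optional warning metadata. `cfg` and the output
# directory are assumed to exist; the helper name is made up for the example.
# ---------------------------------------------------------------------------
async def _example_with_screenshot(url: str, cfg: FetchConfig) -> FetchResult:
    result = await fetch_url(
        url,
        FetchStrategy.AUTO,
        cfg,
        screenshot=True,
        screenshot_dir=Path("screenshots"),
    )
    logger.debug(
        f"source={result.metadata.get('source')} "
        f"warning={result.metadata.get('warning')} "
        f"screenshot={result.screenshot_path}"
    )
    return result

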
async def _fetch_with_fallback(
    url: str,
    config: FetchConfig,
    start_with_browser: bool = False,
    **screenshot_kwargs: Any,
) -> FetchResult:
    """Fetch URL with automatic fallback between strategies.

    Args:
        url: URL to fetch
        config: Fetch configuration
        start_with_browser: If True, try browser first (for known JS domains)
        **screenshot_kwargs: Screenshot options (screenshot, screenshot_dir, screenshot_config)

    Returns:
        FetchResult from first successful strategy
    """
    errors = []

    if start_with_browser:
        # Try browser first for known JS domains
        strategies = ["browser", "jina", "static"]
    else:
        # Normal order: static -> browser -> jina
        strategies = ["static", "browser", "jina"]

    for strat in strategies:
        try:
            if strat == "static":
                result = await fetch_with_static(url)
                # Check if JS is required
                if detect_js_required(result.content):
                    logger.info(
                        "Static content suggests JS required, trying browser..."
                    )
                    continue
                return result

            elif strat == "browser":
                if not is_agent_browser_available(config.agent_browser.command):
                    logger.debug("agent-browser not available, skipping")
                    continue
                return await fetch_with_browser(
                    url,
                    command=config.agent_browser.command,
                    timeout=config.agent_browser.timeout,
                    wait_for=config.agent_browser.wait_for,
                    extra_wait_ms=config.agent_browser.extra_wait_ms,
                    session=config.agent_browser.session,
                    **screenshot_kwargs,
                )

            elif strat == "jina":
                api_key = config.jina.get_resolved_api_key()
                return await fetch_with_jina(url, api_key, config.jina.timeout)

        except AgentBrowserNotFoundError:
            logger.debug("agent-browser not installed, trying next strategy")
            continue
        except JinaRateLimitError as e:
            errors.append(str(e))
            logger.warning(str(e))
            continue
        except FetchError as e:
            errors.append(f"{strat}: {e}")
            logger.debug(f"Strategy {strat} failed: {e}")
            continue
        except Exception as e:
            errors.append(f"{strat}: {e}")
            logger.debug(f"Strategy {strat} failed: {e}")
            continue

    # All strategies failed
    raise FetchError(
        f"All fetch strategies failed for {url}:\n"
        + "\n".join(f" - {e}" for e in errors)
    )
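
# Fallback behaviour at a glance (summary notes derived from the handlers above):
#   - agent-browser not installed       -> skip silently, try the next strategy
#   - Jina rate limit hit               -> log a warning, record the error, try the next strategy
#   - any other FetchError / Exception  -> record "<strategy>: <error>", try the next strategy
#   - every strategy exhausted          -> raise FetchError listing the recorded errors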