silkweb 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- silkweb/__init__.py +862 -0
- silkweb/cache/__init__.py +14 -0
- silkweb/cache/http.py +91 -0
- silkweb/cache/manager.py +71 -0
- silkweb/cache/page.py +284 -0
- silkweb/cache/selectors.py +139 -0
- silkweb/cli/__init__.py +1 -0
- silkweb/cli/main.py +443 -0
- silkweb/config.py +143 -0
- silkweb/crawl/__init__.py +6 -0
- silkweb/crawl/crawler.py +216 -0
- silkweb/crawl/dedup.py +105 -0
- silkweb/crawl/sitemap.py +154 -0
- silkweb/discover.py +275 -0
- silkweb/exceptions.py +112 -0
- silkweb/fetch/__init__.py +1 -0
- silkweb/fetch/orchestrator.py +566 -0
- silkweb/fetch/tiers/__init__.py +1 -0
- silkweb/fetch/tiers/curl_cffi_fetcher.py +175 -0
- silkweb/fetch/tiers/httpx_fetcher.py +199 -0
- silkweb/fetch/tiers/network_capture.py +58 -0
- silkweb/fetch/tiers/playwright_fetcher.py +307 -0
- silkweb/fetch/tiers/stealth_fetcher.py +397 -0
- silkweb/llm/__init__.py +1 -0
- silkweb/llm/chunking/__init__.py +6 -0
- silkweb/llm/chunking/bm25.py +53 -0
- silkweb/llm/chunking/budget.py +44 -0
- silkweb/llm/chunking/dispatcher.py +45 -0
- silkweb/llm/chunking/dom.py +106 -0
- silkweb/llm/chunking/semantic.py +80 -0
- silkweb/llm/chunking/token.py +67 -0
- silkweb/llm/constrained.py +196 -0
- silkweb/llm/pipelines/__init__.py +1 -0
- silkweb/llm/pipelines/clean.py +208 -0
- silkweb/llm/pipelines/extract.py +462 -0
- silkweb/llm/pipelines/heal.py +178 -0
- silkweb/llm/pipelines/orchestrator.py +286 -0
- silkweb/llm/pipelines/schema.py +162 -0
- silkweb/llm/pipelines/selectors.py +106 -0
- silkweb/llm/providers/__init__.py +6 -0
- silkweb/llm/providers/anthropic.py +104 -0
- silkweb/llm/providers/base.py +195 -0
- silkweb/llm/providers/llamacpp.py +112 -0
- silkweb/llm/providers/ollama.py +103 -0
- silkweb/llm/providers/openai.py +145 -0
- silkweb/llm/providers/registry.py +227 -0
- silkweb/observability/__init__.py +16 -0
- silkweb/observability/logging.py +69 -0
- silkweb/observability/metrics.py +121 -0
- silkweb/observability/replay.py +163 -0
- silkweb/output/__init__.py +25 -0
- silkweb/output/dataframe.py +59 -0
- silkweb/output/dataset.py +28 -0
- silkweb/output/files.py +144 -0
- silkweb/parse/__init__.py +1 -0
- silkweb/parse/page.py +383 -0
- silkweb/recipes/__init__.py +5 -0
- silkweb/recipes/amazon-product.yaml +17 -0
- silkweb/recipes/github-repo.yaml +18 -0
- silkweb/recipes/google-serp.yaml +17 -0
- silkweb/recipes/hacker-news.yaml +18 -0
- silkweb/recipes/news-article.yaml +15 -0
- silkweb/recipes/product-listing.yaml +17 -0
- silkweb/recipes/reddit-posts.yaml +19 -0
- silkweb/recipes/registry.py +244 -0
- silkweb/session/__init__.py +6 -0
- silkweb/session/recorder.py +115 -0
- silkweb/session/session.py +307 -0
- silkweb/silkql/__init__.py +13 -0
- silkweb/silkql/compiler.py +217 -0
- silkweb/silkql/executor.py +421 -0
- silkweb/silkql/parser.py +163 -0
- silkweb/stealth/__init__.py +1 -0
- silkweb/stealth/behavior.py +123 -0
- silkweb/stealth/proxy.py +143 -0
- silkweb/stealth/rate_limit.py +205 -0
- silkweb/watch.py +261 -0
- silkweb-0.1.0.dist-info/METADATA +1869 -0
- silkweb-0.1.0.dist-info/RECORD +82 -0
- silkweb-0.1.0.dist-info/WHEEL +4 -0
- silkweb-0.1.0.dist-info/entry_points.txt +2 -0
- silkweb-0.1.0.dist-info/licenses/LICENSE +22 -0
silkweb/__init__.py
ADDED
@@ -0,0 +1,862 @@
"""
Silkweb public API surface.

Project overview and design live in the repository README.md.
"""

from __future__ import annotations

try:
    from dotenv import load_dotenv

    load_dotenv()
except ImportError:
    pass

import asyncio
import contextlib
import json
import sys
import threading
from importlib.metadata import PackageNotFoundError, version
from typing import Any, cast

_SYNC_LOOP: asyncio.AbstractEventLoop | None = None
_SYNC_LOCK = threading.Lock()


def _get_sync_loop() -> asyncio.AbstractEventLoop:
    """Return a persistent event loop for sync wrappers.

    Using a single loop avoids the 'Event loop is closed' crash on Windows
    that occurs when ``anyio.run()`` repeatedly creates and destroys loops
    while httpx ``AsyncClient`` instances are cached from a prior loop.
    """
    global _SYNC_LOOP
    if _SYNC_LOOP is not None and not _SYNC_LOOP.is_closed():
        return _SYNC_LOOP
    with _SYNC_LOCK:
        if _SYNC_LOOP is not None and not _SYNC_LOOP.is_closed():
            return _SYNC_LOOP
        loop = asyncio.new_event_loop()
        _SYNC_LOOP = loop
        return loop


def _run_sync(coro):
    """Run a coroutine on the persistent sync event loop."""
    loop = _get_sync_loop()
    return loop.run_until_complete(coro)


from .cache.manager import CacheManager
from .cache.selectors import SelectorCache
from .config import SilkwebConfig, configure, get_config
from .crawl.crawler import AsyncCrawler
from .discover import discover_api as _async_discover_api
from .exceptions import (
    SilkwebBlockedError,
    SilkwebCacheError,
    SilkwebConfigError,
    SilkwebError,
    SilkwebExtractionError,
    SilkwebFetchError,
    SilkwebHTTPError,
    SilkwebLLMError,
    SilkwebRenderError,
    SilkwebSchemaError,
    SilkwebSelectorError,
    SilkwebSessionError,
    SilkwebSessionExpiredError,
    SilkwebTimeoutError,
)
from .fetch.orchestrator import fetch as _async_fetch
from .llm.pipelines.clean import CleanedContent, clean_html
from .llm.pipelines.heal import SelfHealer
from .llm.pipelines.orchestrator import extract_url as _extract_url
from .llm.pipelines.schema import synthesize_schema
from .llm.providers.registry import create_provider
from .observability.logging import log_event
from .observability.replay import ReplaySession as _ReplaySession
from .observability.replay import replay as _replay
from .recipes.registry import recipes
from .session.recorder import record as record_session
from .session.recorder import replay as replay_session
from .session.session import SilkSession
from .silkql.executor import QueryResult
from .silkql.executor import execute_query as _execute_query
from .silkql.executor import execute_query_from_html as _execute_query_from_html
from .watch import Watcher


class _CacheFacade:
    """
    Small convenience wrapper so docs can use `silkweb.cache.*`.

    This delegates to `CacheManager.from_config()` on each call so it always reflects
    current configuration.
    """

    def clear(self, *, layer: str | None = None, domain: str | None = None) -> None:
        CacheManager.from_config().clear(layer=layer, domain=domain) # type: ignore[arg-type]

    def stats(self) -> dict[str, Any]:
        return CacheManager.from_config().stats()


cache = _CacheFacade()

try:
    __version__ = version("silkweb")
except PackageNotFoundError: # pragma: no cover
    __version__ = "0.0.0"

__all__ = [
    "QueryResult",
    "SilkSession",
    "SilkwebBlockedError",
    "SilkwebCacheError",
    "SilkwebConfig",
    "SilkwebConfigError",
    "SilkwebError",
    "SilkwebExtractionError",
    "SilkwebFetchError",
    "SilkwebHTTPError",
    "SilkwebLLMError",
    "SilkwebRenderError",
    "SilkwebSchemaError",
    "SilkwebSelectorError",
    "SilkwebSessionError",
    "SilkwebSessionExpiredError",
    "SilkwebTimeoutError",
    "ask",
    "async_ask",
    "async_crawl",
    "async_crawl_sitemap",
    "async_extract",
    "async_extract_from_html",
    "async_fetch",
    "async_query",
    "cache",
    "configure",
    "crawl",
    "crawl_sitemap",
    "discover_api",
    "extract",
    "fetch",
    "get_config",
    "query",
    "query_from_html",
    "recipes",
    "record_session",
    "replay",
    "replay_session",
    "watch",
]


def replay(session_file: str) -> _ReplaySession:
    """
    Load an **HTTP fetch replay** bundle (JSON ``*.silkweb`` + HTML sibling) written when
    ``configure(replay_dir=...)`` is set. Returns :class:`observability.replay.ReplaySession`
    with ``.html`` / ``.ask()`` / ``.extract()`` / ``.query()`` helpers.

    This is **not** the same as :func:`replay_session`, which replays a **Playwright**
    recording from ``record_session`` (cookies and actions under ``~/.silkweb/sessions``).
    """
    return _replay(session_file)


async def _ask_from_html(
    url: str,
    html: str,
    *,
    prompt: str,
    cleaner_model: str | None = None,
    schema_model: str | None = None,
    extraction_model: str | None = None,
    selector_model: str | None = None,
    force_llm: bool | None = None,
    output: str = "auto",
    dataframe_engine: str = "auto",
):
    cfg = get_config()
    if force_llm is None:
        force_llm = bool(cfg.force_llm)
    cleaner_provider = create_provider(cleaner_model or cfg.cleaner_model)
    schema_provider = create_provider(schema_model or cfg.schema_model)
    extraction_provider = create_provider(extraction_model or cfg.extraction_model)
    selector_provider = create_provider(selector_model or cfg.selector_model)
    cleaned = await clean_html(html, provider=cleaner_provider, strategy="auto")
    schema = await synthesize_schema(cleaned, prompt=prompt, provider=schema_provider)
    from .cache.manager import CacheManager as _CM

    selector_cache = _CM.from_config().selectors
    healer = SelfHealer(max_attempts=max(1, int(cfg.max_retries)))
    items = await _extract_url(
        url=url,
        html=html,
        schema=schema,
        prompt=prompt,
        cleaner_provider=cleaner_provider,
        extraction_provider=extraction_provider,
        selector_provider=selector_provider,
        selector_cache=selector_cache,
        healer=healer,
        force_llm=bool(force_llm),
    )
    out_fmt = str(output or "auto").lower()
    if out_fmt in {"df", "dataframe"}:
        from .output.dataframe import to_dataframe

        df = to_dataframe(items, engine=cast(Any, dataframe_engine))
        return df if df is not None else items
    if out_fmt in {"python", "list", "dict"}:
        return items
    df = _maybe_to_dataframe(items)
    return df if df is not None else items


def ask_from_html(url: str, html: str, *, prompt: str, **kwargs: Any):
    return _run_sync(_ask_from_html(url, html, prompt=prompt, **kwargs))


async def async_extract_from_html(
    url: str,
    html: str,
    *,
    schema,
    prompt: str,
    output: str = "python",
    dataframe_engine: str = "auto",
    **kwargs: Any,
):
    """
    Same extraction contract as `async_extract`, but uses pre-fetched HTML (no network fetch).

    Returns `list[BaseModel]` by default, or a DataFrame when ``output="df"`` / ``"dataframe"``,
    or auto-converts like `async_extract` when ``output="auto"``.
    """
    from pydantic import BaseModel

    if not isinstance(schema, type) or not issubclass(schema, BaseModel):
        raise TypeError("schema must be a Pydantic BaseModel type")
    _normalize_extract_output(output)

    cfg = get_config()
    cleaner_model = cast(str, kwargs.pop("cleaner_model", cfg.cleaner_model))
    extraction_model = cast(str, kwargs.pop("extraction_model", cfg.extraction_model))
    selector_model = cast(str, kwargs.pop("selector_model", cfg.selector_model))
    force_llm = bool(kwargs.pop("force_llm", cfg.force_llm))

    selector_cache = CacheManager.from_config().selectors
    healer = SelfHealer(max_attempts=max(1, int(cfg.max_retries)))
    items = await _extract_url(
        url=url,
        html=html,
        schema=schema,
        prompt=prompt,
        cleaner_provider=create_provider(cleaner_model),
        extraction_provider=create_provider(extraction_model),
        selector_provider=create_provider(selector_model),
        selector_cache=selector_cache,
        healer=healer,
        force_llm=force_llm,
    )
    return _finalize_extract_output(
        items,
        schema,
        output=output,
        dataframe_engine=dataframe_engine,
    )


def extract_from_html(url: str, html: str, *, schema: Any, prompt: str, **kwargs: Any):
    """Sync wrapper around `async_extract_from_html` (same return contract as `extract`)."""
    return _run_sync(async_extract_from_html(url, html, schema=schema, prompt=prompt, **kwargs))


async def _query_from_html(
    url: str,
    html: str,
    *,
    silkql_string: str,
    provider=None,
    cache: SelectorCache | None = None,
    force_llm: bool | None = None,
    **kwargs: Any,
):
    """Run SilkQL on pre-fetched HTML (same pipeline as ``async_query`` for one page)."""
    cfg = get_config()
    prov = provider or create_provider(cfg.extraction_model)
    cleaner_model = cast(str, kwargs.pop("cleaner_model", cfg.cleaner_model))
    selector_model = cast(str, kwargs.pop("selector_model", cfg.selector_model))
    selector_cache = cache or CacheManager.from_config().selectors
    return await _execute_query_from_html(
        url,
        html,
        silkql_string,
        provider=prov,
        cache=selector_cache,
        cleaner_provider=create_provider(cleaner_model),
        selector_provider=create_provider(selector_model),
        force_llm=bool(cfg.force_llm if force_llm is None else force_llm),
        healer=SelfHealer(max_attempts=max(1, int(cfg.max_retries))),
    )


def query_from_html(url: str, html: str, *, silkql_string: str, **kwargs: Any):
    """Sync SilkQL on existing HTML. Same pipeline as :func:`async_query` for a single page; see :func:`async_query` for options."""
    return _run_sync(_query_from_html(url, html, silkql_string=silkql_string, **kwargs))


def fetch(url: str, *args, **kwargs):
    """Fetch a URL and return a `SilkPage`."""
    return _run_sync(_async_fetch(url, *args, **kwargs))


async def async_fetch(url: str, *args, **kwargs):
    """Async variant of `fetch`."""
    return await _async_fetch(url, *args, **kwargs)


def discover_api(url: str, session: SilkSession | None = None, *, output_path: str | None = None):
    """Discover JSON API endpoints for a URL."""
    return _run_sync(_async_discover_api(url, session, output_path))


def _maybe_to_dataframe(items: list[dict[str, Any]]):
    # Only auto-convert if user already imported a DF library.
    cfg = get_config()
    if not cfg.auto_detect_dataframe:
        return None
    if "pandas" in sys.modules:
        import pandas as pd  # type: ignore

        return pd.DataFrame(items)
    if "polars" in sys.modules:
        import polars as pl  # type: ignore

        return pl.DataFrame(items)
    return None


_EXTRACT_OUTPUTS = frozenset({"python", "list", "dict", "auto", "df", "dataframe"})


def _normalize_extract_output(output: str) -> str:
    out = str(output or "python").lower().strip()
    if out not in _EXTRACT_OUTPUTS:
        allowed = ", ".join(sorted(_EXTRACT_OUTPUTS))
        raise ValueError(f"Invalid extract output={output!r}. Use one of: {allowed}")
    return out


def _finalize_extract_output(
    items: list[dict[str, Any]],
    schema: type[Any],
    *,
    output: str,
    dataframe_engine: str,
) -> Any:
    """Validate dict rows to `schema`, attach meta, apply output / DataFrame rules."""
    from pydantic import BaseModel

    if not isinstance(schema, type) or not issubclass(schema, BaseModel):
        raise TypeError("schema must be a Pydantic BaseModel type")

    out_fmt = _normalize_extract_output(output)
    out_models: list[BaseModel] = []
    for it in items:
        payload = {k: v for k, v in it.items() if k in schema.model_fields}
        obj = schema.model_validate(payload)
        meta = it.get("__silk_meta__")
        if meta is not None:
            with contextlib.suppress(Exception):
                object.__setattr__(obj, "__silk_meta__", meta)
        out_models.append(obj)

    if out_fmt in {"df", "dataframe"}:
        from .output.dataframe import to_dataframe

        df = to_dataframe(out_models, engine=cast(Any, dataframe_engine))
        return df if df is not None else out_models

    if out_fmt in {"python", "list", "dict"}:
        return out_models

    if out_fmt == "auto":
        cfg = get_config()
        if cfg.auto_detect_dataframe and ("pandas" in sys.modules or "polars" in sys.modules):
            payload_rows = [o.model_dump() for o in out_models]
            df = _maybe_to_dataframe(payload_rows)
            if df is not None:
                return df
        return out_models

    return out_models


def _cleaned_from_hydration(hydration: Any, *, heading: str | None) -> CleanedContent:
    payload = json.dumps(hydration, ensure_ascii=False)
    flat = json.dumps({"heading": heading or "", "items": [payload]}, ensure_ascii=False)
    token_estimate = max(1, int(len(payload) / 4))
    return CleanedContent(flat_json=flat, markdown=payload, token_estimate=token_estimate)


def _best_effort_hydration_subset(hydration: dict[str, Any]) -> Any:
    """
    Try to pick the most stable, smallest subset of common SSR hydration payloads.
    Falls back to the full dict if no known structure is found.
    """
    # Next.js: __NEXT_DATA__ typically contains props.pageProps with the meaningful data.
    props = hydration.get("props")
    if isinstance(props, dict):
        page_props = props.get("pageProps")
        if page_props is not None:
            return page_props

    # Nuxt: extremely variable; keep it small when possible.
    state = hydration.get("state")
    if state is not None:
        return state

    data = hydration.get("data")
    if data is not None:
        return data

    return hydration


def ask(
    url: str,
    prompt: str,
    **fetch_kwargs: Any,
):
    """Sync wrapper around `async_ask`."""
    return _run_sync(async_ask(url, prompt, **fetch_kwargs))


async def async_ask(
    url: str,
    prompt: str,
    *,
    output: str = "auto",
    dataframe_engine: str = "auto",
    **fetch_kwargs: Any,
):
    """
    Ask a natural-language question of a URL.

    Pipeline:
    - fetch (auto tier)
    - hydration-first (optional: use hydration JSON as cleaned content)
    - otherwise clean → synthesize schema → extract → compile selectors → cache
    - output selection:
      - output="python": list[dict]
      - output="df": DataFrame (pandas/polars) if available
      - output="auto": backward-compatible auto-conversion when caller already imported pandas/polars
    """
    fetch_kwargs = dict(fetch_kwargs)

    cfg = get_config()

    cleaner_model = cast(str, fetch_kwargs.pop("cleaner_model", cfg.cleaner_model))
    schema_model = cast(str, fetch_kwargs.pop("schema_model", cfg.schema_model))
    extraction_model = cast(str, fetch_kwargs.pop("extraction_model", cfg.extraction_model))
    selector_model = cast(str, fetch_kwargs.pop("selector_model", cfg.selector_model))
    force_llm = bool(fetch_kwargs.pop("force_llm", cfg.force_llm))
    hydration_first = bool(fetch_kwargs.pop("hydration_first", cfg.hydration_first))
    hydration_subset = bool(fetch_kwargs.pop("hydration_subset", cfg.hydration_subset))
    hydration_max_chars = int(fetch_kwargs.pop("hydration_max_chars", cfg.hydration_max_chars))

    import time as _t

    _t0 = _t.time()
    page = await _async_fetch(url, tier="auto", **fetch_kwargs)
    _t_fetch = _t.time() - _t0
    log_event(
        "ask_fetch_done",
        url=url,
        tier=getattr(page, "fetch_tier", None),
        duration_ms=int(_t_fetch * 1000),
        html_chars=len(page.html or ""),
    )

    selector_cache = CacheManager.from_config().selectors

    cleaner_provider = create_provider(cleaner_model)
    schema_provider = create_provider(schema_model)
    extraction_provider = create_provider(extraction_model)
    selector_provider = create_provider(selector_model)
    healer = SelfHealer(max_attempts=max(1, int(cfg.max_retries)))

    hydration = page.hydration_data() if hydration_first else None
    if isinstance(hydration, dict) and hydration_subset:
        hydration_any: Any = _best_effort_hydration_subset(hydration)
    else:
        hydration_any = hydration

    hydration_payload = None
    if hydration_any is not None:
        with contextlib.suppress(Exception):
            hydration_payload = json.dumps(hydration_any, ensure_ascii=False)

    if (
        hydration_any is not None
        and hydration_payload is not None
        and 0 < hydration_max_chars < len(hydration_payload)
    ):
        log_event(
            "ask_hydration_skipped",
            url=url,
            tier=getattr(page, "fetch_tier", None),
            reason="too_large",
            hydration_chars=len(hydration_payload),
            max_chars=hydration_max_chars,
        )
        hydration_any = None

    if hydration_any is not None:
        cleaned = _cleaned_from_hydration(
            hydration_any, heading=str(page.metadata.get("title") or "")
        )
        log_event(
            "ask_clean_done",
            url=url,
            tier=getattr(page, "fetch_tier", None),
            method="hydration",
            hydration_chars=len(cleaned.markdown),
        )
    else:
        _t1 = _t.time()
        cleaned = await clean_html(page.html, provider=cleaner_provider, strategy="auto")
        log_event(
            "ask_clean_done",
            url=url,
            tier=getattr(page, "fetch_tier", None),
            method="clean_html",
            duration_ms=int((_t.time() - _t1) * 1000),
            token_estimate=getattr(cleaned, "token_estimate", None),
        )

    _t2 = _t.time()
    log_event("ask_schema_start", url=url, tier=getattr(page, "fetch_tier", None))
    schema = await synthesize_schema(cleaned, prompt=prompt, provider=schema_provider)
    log_event(
        "ask_schema_done",
        url=url,
        tier=getattr(page, "fetch_tier", None),
        duration_ms=int((_t.time() - _t2) * 1000),
        fields=list(getattr(schema, "model_fields", {}).keys()),
    )

    _t3 = _t.time()
    log_event("ask_extract_start", url=url, tier=getattr(page, "fetch_tier", None))
    items = await _extract_url(
        url=url,
        html=page.html,
        schema=schema,
        prompt=prompt,
        cleaner_provider=cleaner_provider,
        extraction_provider=extraction_provider,
        selector_provider=selector_provider,
        selector_cache=selector_cache,
        healer=healer,
        force_llm=force_llm,
    )
    log_event(
        "ask_extract_done",
        url=url,
        tier=getattr(page, "fetch_tier", None),
        duration_ms=int((_t.time() - _t3) * 1000),
        items=len(items),
    )

    # Scalar convenience: single item + single field
    if len(items) == 1 and len(schema.model_fields) == 1:
        only_key = next(iter(schema.model_fields.keys()))
        return items[0].get(only_key)

    out_fmt = str(output or "auto").lower()
    if out_fmt in {"df", "dataframe"}:
        from .output.dataframe import to_dataframe

        df = to_dataframe(items, engine=cast(Any, dataframe_engine))
        return df if df is not None else items

    if out_fmt in {"python", "list", "dict"}:
        return items

    # Backward-compatible "auto": only auto-convert if caller already imported pandas/polars.
    df = _maybe_to_dataframe(items)
    return df if df is not None else items


def extract(url: str, schema, prompt: str, **kwargs: Any):
    """Sync wrapper around `async_extract`."""
    return _run_sync(async_extract(url, schema, prompt, **kwargs))


async def async_extract(
    url: str,
    schema,
    prompt: str,
    *,
    output: str = "python",
    dataframe_engine: str = "auto",
    **kwargs: Any,
):
    """
    Extract typed data from a URL using a provided Pydantic schema.

    - selector cache fast-path
    - self-heal on validation failure
    - ``output`` controls the return shape:
      - ``"python"`` / ``"list"`` / ``"dict"``: ``list[BaseModel]`` with ``__silk_meta__`` when present
      - ``"df"`` / ``"dataframe"``: pandas or polars DataFrame (see ``dataframe_engine``), else falls back to list
      - ``"auto"``: same as historical behavior — DataFrame only if ``auto_detect_dataframe`` and pandas/polars already imported
    """
    from pydantic import BaseModel

    if not isinstance(schema, type) or not issubclass(schema, BaseModel):
        raise TypeError("schema must be a Pydantic BaseModel type")
    _normalize_extract_output(output)

    cfg = get_config()
    cleaner_model = cast(str, kwargs.pop("cleaner_model", cfg.cleaner_model))
    extraction_model = cast(str, kwargs.pop("extraction_model", cfg.extraction_model))
    selector_model = cast(str, kwargs.pop("selector_model", cfg.selector_model))
    force_llm = bool(kwargs.pop("force_llm", cfg.force_llm))

    import time as _t

    _t0 = _t.time()
    page = await _async_fetch(url, tier="auto", **kwargs)
    _t_fetch = _t.time() - _t0
    log_event(
        "extract_fetch_done",
        url=url,
        tier=getattr(page, "fetch_tier", None),
        duration_ms=int(_t_fetch * 1000),
        html_chars=len(page.html or ""),
    )

    selector_cache = CacheManager.from_config().selectors

    healer = SelfHealer(max_attempts=max(1, int(cfg.max_retries)))
    _t1 = _t.time()
    log_event("extract_llm_start", url=url, tier=getattr(page, "fetch_tier", None))
    items = await _extract_url(
        url=url,
        html=page.html,
        schema=schema,
        prompt=prompt,
        cleaner_provider=create_provider(cleaner_model),
        extraction_provider=create_provider(extraction_model),
        selector_provider=create_provider(selector_model),
        selector_cache=selector_cache,
        healer=healer,
        force_llm=force_llm,
    )
    log_event(
        "extract_llm_done",
        url=url,
        tier=getattr(page, "fetch_tier", None),
        duration_ms=int((_t.time() - _t1) * 1000),
        items=len(items),
    )

    return _finalize_extract_output(
        items,
        schema,
        output=output,
        dataframe_engine=dataframe_engine,
    )


def query(*args, **kwargs):
    """Compile and run a SilkQL query (sync). Arguments and return type match :func:`async_query`."""
    return _run_sync(async_query(*args, **kwargs))


async def async_query(
    url: str,
    silkql_string: str,
    *,
    provider=None,
    cache: SelectorCache | None = None,
    follow_pagination: bool = False,
    max_pages: int = 20,
    **fetch_kwargs: Any,
) -> QueryResult:
    """
    Compile and run a SilkQL query against ``url``.

    Fetches the page (tier ``"auto"`` by default; pass ``tier=`` like :func:`fetch`),
    extracts with the compiled schema, caches CSS/XPath selectors per domain, and returns
    a :class:`QueryResult` whose ``data`` is a one-element list containing the merged root
    model (list collections are merged across pages when ``follow_pagination`` is true).

    - ``provider``: extraction LLM; defaults to ``configure(extraction_model=...)``.
    - ``cleaner_model`` / ``selector_model``: optional model strings (popped from ``**fetch_kwargs``),
      defaulting to config — same split as :func:`extract`.
    - ``cache``: selector cache instance; defaults to ``CacheManager.from_config().selectors``.
    - ``follow_pagination``: when the SilkQL AST includes ``pagination { next_page_url }``, follow
      relative/absolute next links up to ``max_pages``.
    - ``force_llm``: skip selector cache (popped from ``fetch_kwargs``, default ``configure(force_llm=...)``).
    - ``cached`` on the result is true if **any** scraped page used a selector-cache hit.
    """
    cfg = get_config()
    prov = provider or create_provider(cfg.extraction_model)
    selector_cache = cache or CacheManager.from_config().selectors
    force_llm = bool(fetch_kwargs.pop("force_llm", cfg.force_llm))
    cleaner_model = cast(str, fetch_kwargs.pop("cleaner_model", cfg.cleaner_model))
    selector_model = cast(str, fetch_kwargs.pop("selector_model", cfg.selector_model))
    return await _execute_query(
        url,
        silkql_string,
        provider=prov,
        cache=selector_cache,
        cleaner_provider=create_provider(cleaner_model),
        selector_provider=create_provider(selector_model),
        follow_pagination=follow_pagination,
        max_pages=max_pages,
        force_llm=force_llm,
        **fetch_kwargs,
    )


def crawl(*args, **kwargs):
    """
    Multi-page crawl (sync). See :func:`async_crawl` for parameters and return value.

    Returns a list of extracted ``BaseModel`` instances when ``schema`` and ``prompt`` are
    both set; otherwise pages are still fetched (hooks only) and the result is an empty list.
    """
    return _run_sync(async_crawl(*args, **kwargs))


async def async_crawl(
    start_url: str,
    *,
    allowed_domains: set[str] | None = None,
    url_pattern: str | None = None,
    max_pages: int = 100,
    max_depth: int = 2,
    concurrency: int = 10,
    per_domain_concurrency: int = 2,
    max_pending_urls: int = 5000,
    schema=None,
    prompt: str | None = None,
    on_page=None,
    on_item=None,
    on_error=None,
    **fetch_kwargs: Any,
):
    """
    Breadth-first crawl from ``start_url`` with URL dedup, global and per-domain concurrency,
    and optional structured extraction on each page.

    - ``schema`` / ``prompt``: both required together for extraction; if both omitted, only
      ``on_page`` / link discovery run and the returned list is empty.
    - ``max_pages``: hard cap on fetched pages.
    - ``max_depth``: link-following depth from the start URL (0 = start page only).
    - ``max_pending_urls``: best-effort cap on the crawl work-queue size to limit memory.
    - ``on_page``, ``on_item``, ``on_error``: optional async callbacks (page after fetch, each
      extracted model, errors per URL).
    - Remaining keyword arguments are passed to the fetcher (same as :func:`fetch`).
    """
    from pydantic import BaseModel

    if (schema is not None) ^ (prompt is not None):
        raise ValueError("async_crawl requires both schema and prompt together, or neither")

    if schema is not None and (not isinstance(schema, type) or not issubclass(schema, BaseModel)):
        raise TypeError("schema must be a Pydantic BaseModel type")

    crawler = AsyncCrawler(
        start_url=start_url,
        allowed_domains=allowed_domains,
        url_pattern=url_pattern,
        max_pages=max_pages,
        max_depth=max_depth,
        concurrency=concurrency,
        per_domain_concurrency=per_domain_concurrency,
        max_pending_urls=max_pending_urls,
        schema=schema,
        prompt=prompt,
        on_page=on_page,
        on_item=on_item,
        on_error=on_error,
    )
    out: list[BaseModel] = []
    async for item in crawler.run(**fetch_kwargs):
        out.append(item)
    return out


def crawl_sitemap(*args, **kwargs):
    """Sync wrapper around :func:`async_crawl_sitemap`."""
    return _run_sync(async_crawl_sitemap(*args, **kwargs))


async def async_crawl_sitemap(
    sitemap_url: str,
    *,
    schema=None,
    prompt: str | None = None,
    max_pages: int = 100,
    max_sitemap_files: int = 20,
    concurrency: int = 10,
    per_domain_concurrency: int = 2,
    **fetch_kwargs: Any,
):
    """
    Fetch a sitemap (``urlset`` or ``sitemapindex``), collect page ``<loc>`` URLs via XML
    parsing, then run :func:`async_crawl` on each (``max_depth=0``, ``max_pages=1`` per URL).

    ``allowed_domains`` for each crawl defaults to the sitemap URL host. Pass ``max_sitemap_files``
    to cap nested sitemap documents when the root is an index.
    """
    from pydantic import BaseModel

    from .crawl.sitemap import collect_page_urls_from_sitemap, host_allowed_domains

    if (schema is not None) ^ (prompt is not None):
        raise ValueError("async_crawl_sitemap requires both schema and prompt together, or neither")

    if schema is not None and (not isinstance(schema, type) or not issubclass(schema, BaseModel)):
        raise TypeError("schema must be a Pydantic BaseModel type")

    allowed = host_allowed_domains(sitemap_url)
    locs = await collect_page_urls_from_sitemap(
        _async_fetch,
        sitemap_url,
        max_pages=max_pages,
        max_sitemap_files=max_sitemap_files,
        **fetch_kwargs,
    )
    results: list[Any] = []
    for loc in locs:
        results.extend(
            await async_crawl(
                loc,
                schema=schema,
                prompt=prompt,
                allowed_domains=allowed,
                max_pages=1,
                max_depth=0,
                concurrency=concurrency,
                per_domain_concurrency=per_domain_concurrency,
                **fetch_kwargs,
            )
        )
    return results


def watch(*args, **kwargs) -> Watcher:
    """
    Create a Watcher instance (use `await watcher.start()` to begin).
    """
    return Watcher(*args, **kwargs)