silkweb-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. silkweb/__init__.py +862 -0
  2. silkweb/cache/__init__.py +14 -0
  3. silkweb/cache/http.py +91 -0
  4. silkweb/cache/manager.py +71 -0
  5. silkweb/cache/page.py +284 -0
  6. silkweb/cache/selectors.py +139 -0
  7. silkweb/cli/__init__.py +1 -0
  8. silkweb/cli/main.py +443 -0
  9. silkweb/config.py +143 -0
  10. silkweb/crawl/__init__.py +6 -0
  11. silkweb/crawl/crawler.py +216 -0
  12. silkweb/crawl/dedup.py +105 -0
  13. silkweb/crawl/sitemap.py +154 -0
  14. silkweb/discover.py +275 -0
  15. silkweb/exceptions.py +112 -0
  16. silkweb/fetch/__init__.py +1 -0
  17. silkweb/fetch/orchestrator.py +566 -0
  18. silkweb/fetch/tiers/__init__.py +1 -0
  19. silkweb/fetch/tiers/curl_cffi_fetcher.py +175 -0
  20. silkweb/fetch/tiers/httpx_fetcher.py +199 -0
  21. silkweb/fetch/tiers/network_capture.py +58 -0
  22. silkweb/fetch/tiers/playwright_fetcher.py +307 -0
  23. silkweb/fetch/tiers/stealth_fetcher.py +397 -0
  24. silkweb/llm/__init__.py +1 -0
  25. silkweb/llm/chunking/__init__.py +6 -0
  26. silkweb/llm/chunking/bm25.py +53 -0
  27. silkweb/llm/chunking/budget.py +44 -0
  28. silkweb/llm/chunking/dispatcher.py +45 -0
  29. silkweb/llm/chunking/dom.py +106 -0
  30. silkweb/llm/chunking/semantic.py +80 -0
  31. silkweb/llm/chunking/token.py +67 -0
  32. silkweb/llm/constrained.py +196 -0
  33. silkweb/llm/pipelines/__init__.py +1 -0
  34. silkweb/llm/pipelines/clean.py +208 -0
  35. silkweb/llm/pipelines/extract.py +462 -0
  36. silkweb/llm/pipelines/heal.py +178 -0
  37. silkweb/llm/pipelines/orchestrator.py +286 -0
  38. silkweb/llm/pipelines/schema.py +162 -0
  39. silkweb/llm/pipelines/selectors.py +106 -0
  40. silkweb/llm/providers/__init__.py +6 -0
  41. silkweb/llm/providers/anthropic.py +104 -0
  42. silkweb/llm/providers/base.py +195 -0
  43. silkweb/llm/providers/llamacpp.py +112 -0
  44. silkweb/llm/providers/ollama.py +103 -0
  45. silkweb/llm/providers/openai.py +145 -0
  46. silkweb/llm/providers/registry.py +227 -0
  47. silkweb/observability/__init__.py +16 -0
  48. silkweb/observability/logging.py +69 -0
  49. silkweb/observability/metrics.py +121 -0
  50. silkweb/observability/replay.py +163 -0
  51. silkweb/output/__init__.py +25 -0
  52. silkweb/output/dataframe.py +59 -0
  53. silkweb/output/dataset.py +28 -0
  54. silkweb/output/files.py +144 -0
  55. silkweb/parse/__init__.py +1 -0
  56. silkweb/parse/page.py +383 -0
  57. silkweb/recipes/__init__.py +5 -0
  58. silkweb/recipes/amazon-product.yaml +17 -0
  59. silkweb/recipes/github-repo.yaml +18 -0
  60. silkweb/recipes/google-serp.yaml +17 -0
  61. silkweb/recipes/hacker-news.yaml +18 -0
  62. silkweb/recipes/news-article.yaml +15 -0
  63. silkweb/recipes/product-listing.yaml +17 -0
  64. silkweb/recipes/reddit-posts.yaml +19 -0
  65. silkweb/recipes/registry.py +244 -0
  66. silkweb/session/__init__.py +6 -0
  67. silkweb/session/recorder.py +115 -0
  68. silkweb/session/session.py +307 -0
  69. silkweb/silkql/__init__.py +13 -0
  70. silkweb/silkql/compiler.py +217 -0
  71. silkweb/silkql/executor.py +421 -0
  72. silkweb/silkql/parser.py +163 -0
  73. silkweb/stealth/__init__.py +1 -0
  74. silkweb/stealth/behavior.py +123 -0
  75. silkweb/stealth/proxy.py +143 -0
  76. silkweb/stealth/rate_limit.py +205 -0
  77. silkweb/watch.py +261 -0
  78. silkweb-0.1.0.dist-info/METADATA +1869 -0
  79. silkweb-0.1.0.dist-info/RECORD +82 -0
  80. silkweb-0.1.0.dist-info/WHEEL +4 -0
  81. silkweb-0.1.0.dist-info/entry_points.txt +2 -0
  82. silkweb-0.1.0.dist-info/licenses/LICENSE +22 -0
silkweb/__init__.py ADDED
@@ -0,0 +1,862 @@
+ """
+ Silkweb public API surface.
+
+ Project overview and design live in the repository README.md.
+ """
+
+ from __future__ import annotations
+
+ try:
+     from dotenv import load_dotenv
+
+     load_dotenv()
+ except ImportError:
+     pass
+
+ import asyncio
+ import contextlib
+ import json
+ import sys
+ import threading
+ from importlib.metadata import PackageNotFoundError, version
+ from typing import Any, cast
+
+ _SYNC_LOOP: asyncio.AbstractEventLoop | None = None
+ _SYNC_LOCK = threading.Lock()
+
+
+ def _get_sync_loop() -> asyncio.AbstractEventLoop:
+     """Return a persistent event loop for sync wrappers.
+
+     Using a single loop avoids the 'Event loop is closed' crash on Windows
+     that occurs when ``anyio.run()`` repeatedly creates and destroys loops
+     while httpx ``AsyncClient`` instances are cached from a prior loop.
+     """
+     global _SYNC_LOOP
+     if _SYNC_LOOP is not None and not _SYNC_LOOP.is_closed():
+         return _SYNC_LOOP
+     with _SYNC_LOCK:
+         if _SYNC_LOOP is not None and not _SYNC_LOOP.is_closed():
+             return _SYNC_LOOP
+         loop = asyncio.new_event_loop()
+         _SYNC_LOOP = loop
+         return loop
+
+
+ def _run_sync(coro):
+     """Run a coroutine on the persistent sync event loop."""
+     loop = _get_sync_loop()
+     return loop.run_until_complete(coro)
+
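
For context, a minimal sketch (not part of the package) of the failure mode the persistent loop avoids, assuming httpx is installed:

    import asyncio
    import httpx

    client = httpx.AsyncClient()  # cached once, bound to the first loop it runs on

    async def get(url: str) -> int:
        resp = await client.get(url)
        return resp.status_code

    # asyncio.run() creates and closes a fresh loop per call; the second call can
    # raise "RuntimeError: Event loop is closed" because the cached client's
    # transport still references the first, now-closed loop (notably on Windows).
    asyncio.run(get("https://example.com"))
    asyncio.run(get("https://example.com"))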
+
+ from .cache.manager import CacheManager
+ from .cache.selectors import SelectorCache
+ from .config import SilkwebConfig, configure, get_config
+ from .crawl.crawler import AsyncCrawler
+ from .discover import discover_api as _async_discover_api
+ from .exceptions import (
+     SilkwebBlockedError,
+     SilkwebCacheError,
+     SilkwebConfigError,
+     SilkwebError,
+     SilkwebExtractionError,
+     SilkwebFetchError,
+     SilkwebHTTPError,
+     SilkwebLLMError,
+     SilkwebRenderError,
+     SilkwebSchemaError,
+     SilkwebSelectorError,
+     SilkwebSessionError,
+     SilkwebSessionExpiredError,
+     SilkwebTimeoutError,
+ )
+ from .fetch.orchestrator import fetch as _async_fetch
+ from .llm.pipelines.clean import CleanedContent, clean_html
+ from .llm.pipelines.heal import SelfHealer
+ from .llm.pipelines.orchestrator import extract_url as _extract_url
+ from .llm.pipelines.schema import synthesize_schema
+ from .llm.providers.registry import create_provider
+ from .observability.logging import log_event
+ from .observability.replay import ReplaySession as _ReplaySession
+ from .observability.replay import replay as _replay
+ from .recipes.registry import recipes
+ from .session.recorder import record as record_session
+ from .session.recorder import replay as replay_session
+ from .session.session import SilkSession
+ from .silkql.executor import QueryResult
+ from .silkql.executor import execute_query as _execute_query
+ from .silkql.executor import execute_query_from_html as _execute_query_from_html
+ from .watch import Watcher
+
+
+ class _CacheFacade:
+     """
+     Small convenience wrapper so docs can use ``silkweb.cache.*``.
+
+     Delegates to ``CacheManager.from_config()`` on each call so it always reflects
+     the current configuration.
+     """
+
+     def clear(self, *, layer: str | None = None, domain: str | None = None) -> None:
+         CacheManager.from_config().clear(layer=layer, domain=domain)  # type: ignore[arg-type]
+
+     def stats(self) -> dict[str, Any]:
+         return CacheManager.from_config().stats()
+
+
+ cache = _CacheFacade()
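
A short usage sketch of the facade; the ``layer`` value shown is a guess based on the cache submodules (http, page, selectors):

    import silkweb

    print(silkweb.cache.stats())                 # aggregate cache statistics
    silkweb.cache.clear(domain="example.com")    # drop cached entries for one domain
    silkweb.cache.clear(layer="selectors")       # layer name assumed from silkweb/cache/selectors.py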
+
+ try:
+     __version__ = version("silkweb")
+ except PackageNotFoundError:  # pragma: no cover
+     __version__ = "0.0.0"
+
+ __all__ = [
+     "QueryResult",
+     "SilkSession",
+     "SilkwebBlockedError",
+     "SilkwebCacheError",
+     "SilkwebConfig",
+     "SilkwebConfigError",
+     "SilkwebError",
+     "SilkwebExtractionError",
+     "SilkwebFetchError",
+     "SilkwebHTTPError",
+     "SilkwebLLMError",
+     "SilkwebRenderError",
+     "SilkwebSchemaError",
+     "SilkwebSelectorError",
+     "SilkwebSessionError",
+     "SilkwebSessionExpiredError",
+     "SilkwebTimeoutError",
+     "ask",
+     "async_ask",
+     "async_crawl",
+     "async_crawl_sitemap",
+     "async_extract",
+     "async_extract_from_html",
+     "async_fetch",
+     "async_query",
+     "cache",
+     "configure",
+     "crawl",
+     "crawl_sitemap",
+     "discover_api",
+     "extract",
+     "fetch",
+     "get_config",
+     "query",
+     "query_from_html",
+     "recipes",
+     "record_session",
+     "replay",
+     "replay_session",
+     "watch",
+ ]
+
+
+ def replay(session_file: str) -> _ReplaySession:
+     """
+     Load an **HTTP fetch replay** bundle (a JSON ``*.silkweb`` file plus its HTML sibling)
+     written when ``configure(replay_dir=...)`` is set. Returns a
+     :class:`observability.replay.ReplaySession` with ``.html`` / ``.ask()`` /
+     ``.extract()`` / ``.query()`` helpers.
+
+     This is **not** the same as :func:`replay_session`, which replays a **Playwright**
+     recording made by ``record_session`` (cookies and actions under ``~/.silkweb/sessions``).
+     """
+     return _replay(session_file)
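
A sketch distinguishing the two replay entry points; the bundle path is illustrative:

    import silkweb

    silkweb.configure(replay_dir="./replays")      # record fetch bundles from now on
    silkweb.fetch("https://example.com")           # writes a *.silkweb bundle + HTML sibling

    session = silkweb.replay("./replays/example.silkweb")  # hypothetical bundle name
    print(session.html[:200])
    print(session.ask("What is the page heading?"))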
+
+
+ async def _ask_from_html(
+     url: str,
+     html: str,
+     *,
+     prompt: str,
+     cleaner_model: str | None = None,
+     schema_model: str | None = None,
+     extraction_model: str | None = None,
+     selector_model: str | None = None,
+     force_llm: bool | None = None,
+     output: str = "auto",
+     dataframe_engine: str = "auto",
+ ):
+     cfg = get_config()
+     if force_llm is None:
+         force_llm = bool(cfg.force_llm)
+     cleaner_provider = create_provider(cleaner_model or cfg.cleaner_model)
+     schema_provider = create_provider(schema_model or cfg.schema_model)
+     extraction_provider = create_provider(extraction_model or cfg.extraction_model)
+     selector_provider = create_provider(selector_model or cfg.selector_model)
+     cleaned = await clean_html(html, provider=cleaner_provider, strategy="auto")
+     schema = await synthesize_schema(cleaned, prompt=prompt, provider=schema_provider)
+     from .cache.manager import CacheManager as _CM
+
+     selector_cache = _CM.from_config().selectors
+     healer = SelfHealer(max_attempts=max(1, int(cfg.max_retries)))
+     items = await _extract_url(
+         url=url,
+         html=html,
+         schema=schema,
+         prompt=prompt,
+         cleaner_provider=cleaner_provider,
+         extraction_provider=extraction_provider,
+         selector_provider=selector_provider,
+         selector_cache=selector_cache,
+         healer=healer,
+         force_llm=bool(force_llm),
+     )
+     out_fmt = str(output or "auto").lower()
+     if out_fmt in {"df", "dataframe"}:
+         from .output.dataframe import to_dataframe
+
+         df = to_dataframe(items, engine=cast(Any, dataframe_engine))
+         return df if df is not None else items
+     if out_fmt in {"python", "list", "dict"}:
+         return items
+     df = _maybe_to_dataframe(items)
+     return df if df is not None else items
+
+
+ def ask_from_html(url: str, html: str, *, prompt: str, **kwargs: Any):
+     """Sync wrapper around :func:`_ask_from_html` (the ``ask`` pipeline on pre-fetched HTML)."""
+     return _run_sync(_ask_from_html(url, html, prompt=prompt, **kwargs))
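
A sketch of the ``ask`` pipeline over HTML you already hold; the file and URL are illustrative, and the URL is still used for selector caching:

    import silkweb

    with open("saved_page.html", encoding="utf-8") as f:   # hypothetical local copy
        html = f.read()

    rows = silkweb.ask_from_html(
        "https://example.com/products",
        html,
        prompt="List each product name and price",
        output="python",          # force list[dict], skip DataFrame auto-detection
    )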
+
+
+ async def async_extract_from_html(
+     url: str,
+     html: str,
+     *,
+     schema,
+     prompt: str,
+     output: str = "python",
+     dataframe_engine: str = "auto",
+     **kwargs: Any,
+ ):
+     """
+     Same extraction contract as :func:`async_extract`, but uses pre-fetched HTML
+     (no network fetch).
+
+     Returns ``list[BaseModel]`` by default, a DataFrame when ``output="df"`` /
+     ``"dataframe"``, or auto-converts like :func:`async_extract` when ``output="auto"``.
+     """
+     from pydantic import BaseModel
+
+     if not isinstance(schema, type) or not issubclass(schema, BaseModel):
+         raise TypeError("schema must be a Pydantic BaseModel type")
+     _normalize_extract_output(output)
+
+     cfg = get_config()
+     cleaner_model = cast(str, kwargs.pop("cleaner_model", cfg.cleaner_model))
+     extraction_model = cast(str, kwargs.pop("extraction_model", cfg.extraction_model))
+     selector_model = cast(str, kwargs.pop("selector_model", cfg.selector_model))
+     force_llm = bool(kwargs.pop("force_llm", cfg.force_llm))
+
+     selector_cache = CacheManager.from_config().selectors
+     healer = SelfHealer(max_attempts=max(1, int(cfg.max_retries)))
+     items = await _extract_url(
+         url=url,
+         html=html,
+         schema=schema,
+         prompt=prompt,
+         cleaner_provider=create_provider(cleaner_model),
+         extraction_provider=create_provider(extraction_model),
+         selector_provider=create_provider(selector_model),
+         selector_cache=selector_cache,
+         healer=healer,
+         force_llm=force_llm,
+     )
+     return _finalize_extract_output(
+         items,
+         schema,
+         output=output,
+         dataframe_engine=dataframe_engine,
+     )
+
+
+ def extract_from_html(url: str, html: str, *, schema: Any, prompt: str, **kwargs: Any):
+     """Sync wrapper around :func:`async_extract_from_html` (same return contract as :func:`extract`)."""
+     return _run_sync(async_extract_from_html(url, html, schema=schema, prompt=prompt, **kwargs))
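
The same idea with a caller-supplied schema; ``Product`` and the file are illustrative:

    from pydantic import BaseModel

    import silkweb

    class Product(BaseModel):
        name: str
        price: float

    with open("saved_page.html", encoding="utf-8") as f:   # hypothetical local copy
        html = f.read()

    items = silkweb.extract_from_html(
        "https://example.com/products",
        html,
        schema=Product,
        prompt="Extract every product",
    )
    for p in items:
        print(p.name, p.price)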
+
+
+ async def _query_from_html(
+     url: str,
+     html: str,
+     *,
+     silkql_string: str,
+     provider=None,
+     cache: SelectorCache | None = None,
+     force_llm: bool | None = None,
+     **kwargs: Any,
+ ):
+     """Run SilkQL on pre-fetched HTML (same pipeline as :func:`async_query` for one page)."""
+     cfg = get_config()
+     prov = provider or create_provider(cfg.extraction_model)
+     cleaner_model = cast(str, kwargs.pop("cleaner_model", cfg.cleaner_model))
+     selector_model = cast(str, kwargs.pop("selector_model", cfg.selector_model))
+     selector_cache = cache or CacheManager.from_config().selectors
+     return await _execute_query_from_html(
+         url,
+         html,
+         silkql_string,
+         provider=prov,
+         cache=selector_cache,
+         cleaner_provider=create_provider(cleaner_model),
+         selector_provider=create_provider(selector_model),
+         force_llm=bool(cfg.force_llm if force_llm is None else force_llm),
+         healer=SelfHealer(max_attempts=max(1, int(cfg.max_retries))),
+     )
+
+
+ def query_from_html(url: str, html: str, *, silkql_string: str, **kwargs: Any):
+     """Sync SilkQL on existing HTML. Same pipeline as :func:`async_query` for a single
+     page; see :func:`async_query` for options."""
+     return _run_sync(_query_from_html(url, html, silkql_string=silkql_string, **kwargs))
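
A sketch of SilkQL over pre-fetched HTML; the query text is illustrative, following the brace syntax the :func:`async_query` docstring implies (``pagination { next_page_url }``):

    import silkweb

    with open("saved_page.html", encoding="utf-8") as f:
        html = f.read()

    result = silkweb.query_from_html(
        "https://example.com/products",
        html,
        silkql_string="{ products { name price } }",  # illustrative; grammar lives in silkweb/silkql/parser.py
    )
    print(result.data)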
+
+
+ def fetch(url: str, *args, **kwargs):
+     """Fetch a URL and return a ``SilkPage``."""
+     return _run_sync(_async_fetch(url, *args, **kwargs))
+
+
+ async def async_fetch(url: str, *args, **kwargs):
+     """Async variant of :func:`fetch`."""
+     return await _async_fetch(url, *args, **kwargs)
+
+
+ def discover_api(url: str, session: SilkSession | None = None, *, output_path: str | None = None):
+     """Discover JSON API endpoints for a URL."""
+     return _run_sync(_async_discover_api(url, session, output_path))
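
Basic usage sketch; ``tier=`` is forwarded to the fetch orchestrator as the :func:`async_query` docstring describes, and the return shape of ``discover_api`` is defined in silkweb/discover.py:

    import silkweb

    page = silkweb.fetch("https://example.com", tier="auto")
    print(len(page.html), getattr(page, "fetch_tier", None))

    endpoints = silkweb.discover_api("https://example.com")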
+
+
+ def _maybe_to_dataframe(items: list[dict[str, Any]]):
+     # Only auto-convert if the user has already imported a DataFrame library.
+     cfg = get_config()
+     if not cfg.auto_detect_dataframe:
+         return None
+     if "pandas" in sys.modules:
+         import pandas as pd  # type: ignore
+
+         return pd.DataFrame(items)
+     if "polars" in sys.modules:
+         import polars as pl  # type: ignore
+
+         return pl.DataFrame(items)
+     return None
+
+
+ _EXTRACT_OUTPUTS = frozenset({"python", "list", "dict", "auto", "df", "dataframe"})
+
+
+ def _normalize_extract_output(output: str) -> str:
+     out = str(output or "python").lower().strip()
+     if out not in _EXTRACT_OUTPUTS:
+         allowed = ", ".join(sorted(_EXTRACT_OUTPUTS))
+         raise ValueError(f"Invalid extract output={output!r}. Use one of: {allowed}")
+     return out
+
+
+ def _finalize_extract_output(
+     items: list[dict[str, Any]],
+     schema: type[Any],
+     *,
+     output: str,
+     dataframe_engine: str,
+ ) -> Any:
+     """Validate dict rows against ``schema``, attach meta, and apply output / DataFrame rules."""
+     from pydantic import BaseModel
+
+     if not isinstance(schema, type) or not issubclass(schema, BaseModel):
+         raise TypeError("schema must be a Pydantic BaseModel type")
+
+     out_fmt = _normalize_extract_output(output)
+     out_models: list[BaseModel] = []
+     for it in items:
+         payload = {k: v for k, v in it.items() if k in schema.model_fields}
+         obj = schema.model_validate(payload)
+         meta = it.get("__silk_meta__")
+         if meta is not None:
+             with contextlib.suppress(Exception):
+                 object.__setattr__(obj, "__silk_meta__", meta)
+         out_models.append(obj)
+
+     if out_fmt in {"df", "dataframe"}:
+         from .output.dataframe import to_dataframe
+
+         df = to_dataframe(out_models, engine=cast(Any, dataframe_engine))
+         return df if df is not None else out_models
+
+     if out_fmt in {"python", "list", "dict"}:
+         return out_models
+
+     if out_fmt == "auto":
+         cfg = get_config()
+         if cfg.auto_detect_dataframe and ("pandas" in sys.modules or "polars" in sys.modules):
+             payload_rows = [o.model_dump() for o in out_models]
+             df = _maybe_to_dataframe(payload_rows)
+             if df is not None:
+                 return df
+         return out_models
+
+     return out_models
+
+
+ def _cleaned_from_hydration(hydration: Any, *, heading: str | None) -> CleanedContent:
+     payload = json.dumps(hydration, ensure_ascii=False)
+     flat = json.dumps({"heading": heading or "", "items": [payload]}, ensure_ascii=False)
+     token_estimate = max(1, int(len(payload) / 4))
+     return CleanedContent(flat_json=flat, markdown=payload, token_estimate=token_estimate)
+
+
+ def _best_effort_hydration_subset(hydration: dict[str, Any]) -> Any:
+     """
+     Try to pick the most stable, smallest subset of common SSR hydration payloads.
+     Falls back to the full dict if no known structure is found.
+     """
+     # Next.js: __NEXT_DATA__ typically contains props.pageProps with the meaningful data.
+     props = hydration.get("props")
+     if isinstance(props, dict):
+         page_props = props.get("pageProps")
+         if page_props is not None:
+             return page_props
+
+     # Nuxt: extremely variable; keep it small when possible.
+     state = hydration.get("state")
+     if state is not None:
+         return state
+
+     data = hydration.get("data")
+     if data is not None:
+         return data
+
+     return hydration
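
To make the heuristic concrete, a doctest-style sketch with a fabricated Next.js-shaped payload:

    next_data = {
        "props": {"pageProps": {"product": {"name": "Widget", "price": 9.99}}},
        "page": "/products/widget",
        "buildId": "abc123",
    }
    # props.pageProps wins for Next.js-shaped dicts:
    assert _best_effort_hydration_subset(next_data) == {
        "product": {"name": "Widget", "price": 9.99}
    }
    # No recognized key: the full dict comes back unchanged.
    assert _best_effort_hydration_subset({"foo": 1}) == {"foo": 1}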
+
+
+ def ask(
+     url: str,
+     prompt: str,
+     **fetch_kwargs: Any,
+ ):
+     """Sync wrapper around :func:`async_ask`."""
+     return _run_sync(async_ask(url, prompt, **fetch_kwargs))
+
+
+ async def async_ask(
+     url: str,
+     prompt: str,
+     *,
+     output: str = "auto",
+     dataframe_engine: str = "auto",
+     **fetch_kwargs: Any,
+ ):
+     """
+     Ask a natural-language question of a URL.
+
+     Pipeline:
+     - fetch (auto tier)
+     - hydration-first (optionally use hydration JSON as the cleaned content)
+     - otherwise clean → synthesize schema → extract → compile selectors → cache
+     - output selection:
+         - ``output="python"``: list[dict]
+         - ``output="df"``: DataFrame (pandas/polars) if available
+         - ``output="auto"``: backward-compatible auto-conversion when the caller has already imported pandas/polars
+     """
+     fetch_kwargs = dict(fetch_kwargs)
+
+     cfg = get_config()
+
+     cleaner_model = cast(str, fetch_kwargs.pop("cleaner_model", cfg.cleaner_model))
+     schema_model = cast(str, fetch_kwargs.pop("schema_model", cfg.schema_model))
+     extraction_model = cast(str, fetch_kwargs.pop("extraction_model", cfg.extraction_model))
+     selector_model = cast(str, fetch_kwargs.pop("selector_model", cfg.selector_model))
+     force_llm = bool(fetch_kwargs.pop("force_llm", cfg.force_llm))
+     hydration_first = bool(fetch_kwargs.pop("hydration_first", cfg.hydration_first))
+     hydration_subset = bool(fetch_kwargs.pop("hydration_subset", cfg.hydration_subset))
+     hydration_max_chars = int(fetch_kwargs.pop("hydration_max_chars", cfg.hydration_max_chars))
+
+     import time as _t
+
+     _t0 = _t.time()
+     page = await _async_fetch(url, tier="auto", **fetch_kwargs)
+     _t_fetch = _t.time() - _t0
+     log_event(
+         "ask_fetch_done",
+         url=url,
+         tier=getattr(page, "fetch_tier", None),
+         duration_ms=int(_t_fetch * 1000),
+         html_chars=len(page.html or ""),
+     )
+
+     selector_cache = CacheManager.from_config().selectors
+
+     cleaner_provider = create_provider(cleaner_model)
+     schema_provider = create_provider(schema_model)
+     extraction_provider = create_provider(extraction_model)
+     selector_provider = create_provider(selector_model)
+     healer = SelfHealer(max_attempts=max(1, int(cfg.max_retries)))
+
+     hydration = page.hydration_data() if hydration_first else None
+     if isinstance(hydration, dict) and hydration_subset:
+         hydration_any: Any = _best_effort_hydration_subset(hydration)
+     else:
+         hydration_any = hydration
+
+     hydration_payload = None
+     if hydration_any is not None:
+         with contextlib.suppress(Exception):
+             hydration_payload = json.dumps(hydration_any, ensure_ascii=False)
+
+     if (
+         hydration_any is not None
+         and hydration_payload is not None
+         and 0 < hydration_max_chars < len(hydration_payload)
+     ):
+         log_event(
+             "ask_hydration_skipped",
+             url=url,
+             tier=getattr(page, "fetch_tier", None),
+             reason="too_large",
+             hydration_chars=len(hydration_payload),
+             max_chars=hydration_max_chars,
+         )
+         hydration_any = None
+
+     if hydration_any is not None:
+         cleaned = _cleaned_from_hydration(
+             hydration_any, heading=str(page.metadata.get("title") or "")
+         )
+         log_event(
+             "ask_clean_done",
+             url=url,
+             tier=getattr(page, "fetch_tier", None),
+             method="hydration",
+             hydration_chars=len(cleaned.markdown),
+         )
+     else:
+         _t1 = _t.time()
+         cleaned = await clean_html(page.html, provider=cleaner_provider, strategy="auto")
+         log_event(
+             "ask_clean_done",
+             url=url,
+             tier=getattr(page, "fetch_tier", None),
+             method="clean_html",
+             duration_ms=int((_t.time() - _t1) * 1000),
+             token_estimate=getattr(cleaned, "token_estimate", None),
+         )
+
+     _t2 = _t.time()
+     log_event("ask_schema_start", url=url, tier=getattr(page, "fetch_tier", None))
+     schema = await synthesize_schema(cleaned, prompt=prompt, provider=schema_provider)
+     log_event(
+         "ask_schema_done",
+         url=url,
+         tier=getattr(page, "fetch_tier", None),
+         duration_ms=int((_t.time() - _t2) * 1000),
+         fields=list(getattr(schema, "model_fields", {}).keys()),
+     )
+
+     _t3 = _t.time()
+     log_event("ask_extract_start", url=url, tier=getattr(page, "fetch_tier", None))
+     items = await _extract_url(
+         url=url,
+         html=page.html,
+         schema=schema,
+         prompt=prompt,
+         cleaner_provider=cleaner_provider,
+         extraction_provider=extraction_provider,
+         selector_provider=selector_provider,
+         selector_cache=selector_cache,
+         healer=healer,
+         force_llm=force_llm,
+     )
+     log_event(
+         "ask_extract_done",
+         url=url,
+         tier=getattr(page, "fetch_tier", None),
+         duration_ms=int((_t.time() - _t3) * 1000),
+         items=len(items),
+     )
+
+     # Scalar convenience: a single item with a single field returns the bare value.
+     if len(items) == 1 and len(schema.model_fields) == 1:
+         only_key = next(iter(schema.model_fields.keys()))
+         return items[0].get(only_key)
+
+     out_fmt = str(output or "auto").lower()
+     if out_fmt in {"df", "dataframe"}:
+         from .output.dataframe import to_dataframe
+
+         df = to_dataframe(items, engine=cast(Any, dataframe_engine))
+         return df if df is not None else items
+
+     if out_fmt in {"python", "list", "dict"}:
+         return items
+
+     # Backward-compatible "auto": only convert if the caller has already imported pandas/polars.
+     df = _maybe_to_dataframe(items)
+     return df if df is not None else items
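
Usage sketch; the URLs and prompts are illustrative, and the scalar return depends on the synthesized schema ending up with a single field:

    import silkweb

    # Single item with a single field: the bare value comes back (scalar convenience).
    title = silkweb.ask("https://example.com", "What is the page title?")

    # Tabular question: list[dict] by default, or a DataFrame with output="df".
    df = silkweb.ask(
        "https://example.com/products",
        "List each product name and price",
        output="df",
        hydration_first=True,   # prefer SSR hydration JSON when the page embeds it
    )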
+
+
+ def extract(url: str, schema, prompt: str, **kwargs: Any):
+     """Sync wrapper around :func:`async_extract`."""
+     return _run_sync(async_extract(url, schema, prompt, **kwargs))
+
+
+ async def async_extract(
+     url: str,
+     schema,
+     prompt: str,
+     *,
+     output: str = "python",
+     dataframe_engine: str = "auto",
+     **kwargs: Any,
+ ):
+     """
+     Extract typed data from a URL using a provided Pydantic schema.
+
+     - selector-cache fast path
+     - self-heal on validation failure
+     - ``output`` controls the return shape:
+         - ``"python"`` / ``"list"`` / ``"dict"``: ``list[BaseModel]`` with ``__silk_meta__`` when present
+         - ``"df"`` / ``"dataframe"``: pandas or polars DataFrame (see ``dataframe_engine``), else falls back to the list
+         - ``"auto"``: historical behavior; a DataFrame only if ``auto_detect_dataframe`` is set and pandas/polars is already imported
+     """
+     from pydantic import BaseModel
+
+     if not isinstance(schema, type) or not issubclass(schema, BaseModel):
+         raise TypeError("schema must be a Pydantic BaseModel type")
+     _normalize_extract_output(output)
+
+     cfg = get_config()
+     cleaner_model = cast(str, kwargs.pop("cleaner_model", cfg.cleaner_model))
+     extraction_model = cast(str, kwargs.pop("extraction_model", cfg.extraction_model))
+     selector_model = cast(str, kwargs.pop("selector_model", cfg.selector_model))
+     force_llm = bool(kwargs.pop("force_llm", cfg.force_llm))
+
+     import time as _t
+
+     _t0 = _t.time()
+     page = await _async_fetch(url, tier="auto", **kwargs)
+     _t_fetch = _t.time() - _t0
+     log_event(
+         "extract_fetch_done",
+         url=url,
+         tier=getattr(page, "fetch_tier", None),
+         duration_ms=int(_t_fetch * 1000),
+         html_chars=len(page.html or ""),
+     )
+
+     selector_cache = CacheManager.from_config().selectors
+
+     healer = SelfHealer(max_attempts=max(1, int(cfg.max_retries)))
+     _t1 = _t.time()
+     log_event("extract_llm_start", url=url, tier=getattr(page, "fetch_tier", None))
+     items = await _extract_url(
+         url=url,
+         html=page.html,
+         schema=schema,
+         prompt=prompt,
+         cleaner_provider=create_provider(cleaner_model),
+         extraction_provider=create_provider(extraction_model),
+         selector_provider=create_provider(selector_model),
+         selector_cache=selector_cache,
+         healer=healer,
+         force_llm=force_llm,
+     )
+     log_event(
+         "extract_llm_done",
+         url=url,
+         tier=getattr(page, "fetch_tier", None),
+         duration_ms=int((_t.time() - _t1) * 1000),
+         items=len(items),
+     )
+
+     return _finalize_extract_output(
+         items,
+         schema,
+         output=output,
+         dataframe_engine=dataframe_engine,
+     )
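
A typed-extraction sketch; ``Repo`` and the URL are illustrative:

    from pydantic import BaseModel

    import silkweb

    class Repo(BaseModel):
        name: str
        stars: int

    repos = silkweb.extract(
        "https://example.com/trending",
        Repo,
        "Extract each repository's name and star count",
    )
    for r in repos:
        print(r.name, r.stars, getattr(r, "__silk_meta__", None))  # meta is attached when present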
+
+
+ def query(*args, **kwargs):
+     """Compile and run a SilkQL query (sync). Arguments and return type match :func:`async_query`."""
+     return _run_sync(async_query(*args, **kwargs))
+
+
+ async def async_query(
+     url: str,
+     silkql_string: str,
+     *,
+     provider=None,
+     cache: SelectorCache | None = None,
+     follow_pagination: bool = False,
+     max_pages: int = 20,
+     **fetch_kwargs: Any,
+ ) -> QueryResult:
+     """
+     Compile and run a SilkQL query against ``url``.
+
+     Fetches the page (tier ``"auto"`` by default; pass ``tier=`` as with :func:`fetch`),
+     extracts with the compiled schema, caches CSS/XPath selectors per domain, and returns
+     a :class:`QueryResult` whose ``data`` is a one-element list containing the merged root
+     model (list collections are merged across pages when ``follow_pagination`` is true).
+
+     - ``provider``: extraction LLM; defaults to ``configure(extraction_model=...)``.
+     - ``cleaner_model`` / ``selector_model``: optional model strings (popped from ``**fetch_kwargs``),
+       defaulting to config (the same split as :func:`extract`).
+     - ``cache``: selector-cache instance; defaults to ``CacheManager.from_config().selectors``.
+     - ``follow_pagination``: when the SilkQL AST includes ``pagination { next_page_url }``, follow
+       relative/absolute next links up to ``max_pages``.
+     - ``force_llm``: skip the selector cache (popped from ``fetch_kwargs``; defaults to ``configure(force_llm=...)``).
+     - ``cached`` on the result is true if **any** scraped page had a selector-cache hit.
+     """
+     cfg = get_config()
+     prov = provider or create_provider(cfg.extraction_model)
+     selector_cache = cache or CacheManager.from_config().selectors
+     force_llm = bool(fetch_kwargs.pop("force_llm", cfg.force_llm))
+     cleaner_model = cast(str, fetch_kwargs.pop("cleaner_model", cfg.cleaner_model))
+     selector_model = cast(str, fetch_kwargs.pop("selector_model", cfg.selector_model))
+     return await _execute_query(
+         url,
+         silkql_string,
+         provider=prov,
+         cache=selector_cache,
+         cleaner_provider=create_provider(cleaner_model),
+         selector_provider=create_provider(selector_model),
+         follow_pagination=follow_pagination,
+         max_pages=max_pages,
+         force_llm=force_llm,
+         **fetch_kwargs,
+     )
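
A pagination-following sketch; the query text is illustrative apart from the ``pagination { next_page_url }`` block named in the docstring:

    import silkweb

    result = silkweb.query(
        "https://example.com/blog",
        """
        {
          posts { title url }
          pagination { next_page_url }
        }
        """,
        follow_pagination=True,
        max_pages=5,
    )
    root = result.data[0]    # one-element list holding the merged root model
    print(result.cached)     # True if any page had a selector-cache hit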
+
+
+ def crawl(*args, **kwargs):
+     """
+     Multi-page crawl (sync). See :func:`async_crawl` for parameters and return value.
+
+     Returns a list of extracted ``BaseModel`` instances when ``schema`` and ``prompt`` are
+     both set; otherwise pages are still fetched (hooks only) and the result is an empty list.
+     """
+     return _run_sync(async_crawl(*args, **kwargs))
+
+
+ async def async_crawl(
+     start_url: str,
+     *,
+     allowed_domains: set[str] | None = None,
+     url_pattern: str | None = None,
+     max_pages: int = 100,
+     max_depth: int = 2,
+     concurrency: int = 10,
+     per_domain_concurrency: int = 2,
+     max_pending_urls: int = 5000,
+     schema=None,
+     prompt: str | None = None,
+     on_page=None,
+     on_item=None,
+     on_error=None,
+     **fetch_kwargs: Any,
+ ):
+     """
+     Breadth-first crawl from ``start_url`` with URL dedup, global and per-domain concurrency
+     limits, and optional structured extraction on each page.
+
+     - ``schema`` / ``prompt``: required together for extraction; if both are omitted, only
+       ``on_page`` / link discovery run and the returned list is empty.
+     - ``max_pages``: hard cap on fetched pages.
+     - ``max_depth``: link-following depth from the start URL (0 = start page only).
+     - ``max_pending_urls``: best-effort cap on the crawl work-queue size to limit memory.
+     - ``on_page``, ``on_item``, ``on_error``: optional async callbacks (page after fetch, each
+       extracted model, errors per URL).
+     - Remaining keyword arguments are passed to the fetcher (same as :func:`fetch`).
+     """
+     from pydantic import BaseModel
+
+     if (schema is not None) ^ (prompt is not None):
+         raise ValueError("async_crawl requires both schema and prompt together, or neither")
+
+     if schema is not None and (not isinstance(schema, type) or not issubclass(schema, BaseModel)):
+         raise TypeError("schema must be a Pydantic BaseModel type")
+
+     crawler = AsyncCrawler(
+         start_url=start_url,
+         allowed_domains=allowed_domains,
+         url_pattern=url_pattern,
+         max_pages=max_pages,
+         max_depth=max_depth,
+         concurrency=concurrency,
+         per_domain_concurrency=per_domain_concurrency,
+         max_pending_urls=max_pending_urls,
+         schema=schema,
+         prompt=prompt,
+         on_page=on_page,
+         on_item=on_item,
+         on_error=on_error,
+     )
+     out: list[BaseModel] = []
+     async for item in crawler.run(**fetch_kwargs):
+         out.append(item)
+     return out
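
A crawl sketch; the callback signature is an assumption (the crawler's contract lives in silkweb/crawl/crawler.py):

    from pydantic import BaseModel

    import silkweb

    class Article(BaseModel):
        title: str
        author: str

    async def on_error(url, exc):            # signature assumed
        print("failed:", url, exc)

    articles = silkweb.crawl(
        "https://example.com/news",
        schema=Article,
        prompt="Extract the article title and author",
        allowed_domains={"example.com"},
        url_pattern=r"/news/\d+",            # illustrative URL filter
        max_pages=50,
        max_depth=2,
        on_error=on_error,
    )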
+
+
+ def crawl_sitemap(*args, **kwargs):
+     """Sync wrapper around :func:`async_crawl_sitemap`."""
+     return _run_sync(async_crawl_sitemap(*args, **kwargs))
+
+
+ async def async_crawl_sitemap(
+     sitemap_url: str,
+     *,
+     schema=None,
+     prompt: str | None = None,
+     max_pages: int = 100,
+     max_sitemap_files: int = 20,
+     concurrency: int = 10,
+     per_domain_concurrency: int = 2,
+     **fetch_kwargs: Any,
+ ):
+     """
+     Fetch a sitemap (``urlset`` or ``sitemapindex``), collect page ``<loc>`` URLs via XML
+     parsing, then run :func:`async_crawl` on each (``max_depth=0``, ``max_pages=1`` per URL).
+
+     ``allowed_domains`` for each crawl defaults to the sitemap URL's host. Pass
+     ``max_sitemap_files`` to cap nested sitemap documents when the root is an index.
+     """
+     from pydantic import BaseModel
+
+     from .crawl.sitemap import collect_page_urls_from_sitemap, host_allowed_domains
+
+     if (schema is not None) ^ (prompt is not None):
+         raise ValueError("async_crawl_sitemap requires both schema and prompt together, or neither")
+
+     if schema is not None and (not isinstance(schema, type) or not issubclass(schema, BaseModel)):
+         raise TypeError("schema must be a Pydantic BaseModel type")
+
+     allowed = host_allowed_domains(sitemap_url)
+     locs = await collect_page_urls_from_sitemap(
+         _async_fetch,
+         sitemap_url,
+         max_pages=max_pages,
+         max_sitemap_files=max_sitemap_files,
+         **fetch_kwargs,
+     )
+     results: list[Any] = []
+     for loc in locs:
+         results.extend(
+             await async_crawl(
+                 loc,
+                 schema=schema,
+                 prompt=prompt,
+                 allowed_domains=allowed,
+                 max_pages=1,
+                 max_depth=0,
+                 concurrency=concurrency,
+                 per_domain_concurrency=per_domain_concurrency,
+                 **fetch_kwargs,
+             )
+         )
+     return results
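
A sitemap sketch; the sitemap URL and schema are illustrative:

    from pydantic import BaseModel

    import silkweb

    class Page(BaseModel):
        title: str

    pages = silkweb.crawl_sitemap(
        "https://example.com/sitemap.xml",
        schema=Page,
        prompt="Extract the page title",
        max_pages=200,           # cap on collected <loc> URLs
        max_sitemap_files=5,     # cap on nested sitemaps when the root is an index
    )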
+
+
+ def watch(*args, **kwargs) -> Watcher:
+     """
+     Create a :class:`Watcher` instance (use ``await watcher.start()`` to begin).
+     """
+     return Watcher(*args, **kwargs)
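
Finally, a watcher sketch; the ``Watcher`` constructor arguments are defined in silkweb/watch.py and assumed here:

    import asyncio

    import silkweb

    async def main() -> None:
        watcher = silkweb.watch("https://example.com/pricing")  # args assumed
        await watcher.start()

    asyncio.run(main())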