gnosisllm-knowledge 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. gnosisllm_knowledge/api/knowledge.py +225 -35
  2. gnosisllm_knowledge/backends/memory/indexer.py +27 -2
  3. gnosisllm_knowledge/backends/memory/searcher.py +111 -10
  4. gnosisllm_knowledge/backends/opensearch/agentic.py +14 -9
  5. gnosisllm_knowledge/backends/opensearch/indexer.py +48 -3
  6. gnosisllm_knowledge/backends/opensearch/mappings.py +12 -4
  7. gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
  8. gnosisllm_knowledge/backends/opensearch/searcher.py +9 -6
  9. gnosisllm_knowledge/cli/app.py +58 -19
  10. gnosisllm_knowledge/cli/commands/agentic.py +15 -9
  11. gnosisllm_knowledge/cli/commands/load.py +169 -19
  12. gnosisllm_knowledge/cli/commands/memory.py +10 -0
  13. gnosisllm_knowledge/cli/commands/search.py +9 -10
  14. gnosisllm_knowledge/cli/commands/setup.py +25 -1
  15. gnosisllm_knowledge/cli/utils/config.py +4 -4
  16. gnosisllm_knowledge/core/domain/__init__.py +13 -0
  17. gnosisllm_knowledge/core/domain/discovery.py +166 -0
  18. gnosisllm_knowledge/core/domain/document.py +14 -19
  19. gnosisllm_knowledge/core/domain/search.py +10 -25
  20. gnosisllm_knowledge/core/domain/source.py +11 -12
  21. gnosisllm_knowledge/core/events/__init__.py +8 -0
  22. gnosisllm_knowledge/core/events/types.py +122 -5
  23. gnosisllm_knowledge/core/exceptions.py +93 -0
  24. gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
  25. gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
  26. gnosisllm_knowledge/core/interfaces/searcher.py +10 -1
  27. gnosisllm_knowledge/core/interfaces/streaming.py +10 -4
  28. gnosisllm_knowledge/fetchers/__init__.py +8 -0
  29. gnosisllm_knowledge/fetchers/config.py +27 -0
  30. gnosisllm_knowledge/fetchers/neoreader.py +31 -3
  31. gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
  32. gnosisllm_knowledge/loaders/__init__.py +5 -1
  33. gnosisllm_knowledge/loaders/discovery.py +338 -0
  34. gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
  35. gnosisllm_knowledge/loaders/factory.py +46 -0
  36. gnosisllm_knowledge/services/indexing.py +35 -20
  37. gnosisllm_knowledge/services/search.py +37 -20
  38. gnosisllm_knowledge/services/streaming_pipeline.py +39 -7
  39. {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/METADATA +30 -10
  40. gnosisllm_knowledge-0.4.0.dist-info/RECORD +81 -0
  41. gnosisllm_knowledge-0.3.0.dist-info/RECORD +0 -77
  42. {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/WHEEL +0 -0
  43. {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/entry_points.txt +0 -0
gnosisllm_knowledge/loaders/discovery.py
@@ -0,0 +1,338 @@
+"""Discovery loader using Neo Reader Discovery API for website crawling."""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+from typing import TYPE_CHECKING, Any
+
+from gnosisllm_knowledge.core.domain.discovery import DiscoveryConfig, DiscoveryProgress
+from gnosisllm_knowledge.core.events.types import (
+    DiscoveryCompletedEvent,
+    DiscoveryFailedEvent,
+    DiscoveryProgressEvent,
+    DiscoveryStartedEvent,
+)
+from gnosisllm_knowledge.core.exceptions import DiscoveryError
+from gnosisllm_knowledge.loaders.base import BaseLoader
+
+if TYPE_CHECKING:
+    from gnosisllm_knowledge.core.events.emitter import EventEmitter
+    from gnosisllm_knowledge.core.interfaces.chunker import ITextChunker
+    from gnosisllm_knowledge.core.interfaces.fetcher import IContentFetcher
+    from gnosisllm_knowledge.fetchers.neoreader_discovery import NeoreaderDiscoveryClient
+
+
+class DiscoveryLoader(BaseLoader):
+    """Loader that discovers all URLs from a website using Neo Reader Discovery API.
+
+    Similar to SitemapLoader but uses Neo Reader's crawler instead of sitemap XML.
+    Supports full website crawling with configurable depth, filters, and limits.
+
+    The loader follows the Template Method Pattern from BaseLoader:
+    1. _get_urls() performs URL discovery via Neo Reader Discovery API
+    2. Returns all discovered URLs
+    3. BaseLoader handles parallel fetching of each URL's content
+
+    Features:
+    - Async website crawling via Neo Reader Discovery API
+    - Configurable crawl depth, page limits, and domain restrictions
+    - URL filtering with include/exclude patterns
+    - Progress events during discovery
+    - Job cancellation on interruption
+    - Partial failure handling (logs warnings, continues with available URLs)
+
+    Example:
+        ```python
+        from gnosisllm_knowledge.fetchers import NeoreaderContentFetcher
+        from gnosisllm_knowledge.fetchers.neoreader_discovery import NeoreaderDiscoveryClient
+        from gnosisllm_knowledge.chunking import SentenceChunker
+
+        fetcher = NeoreaderContentFetcher.from_env()
+        discovery_client = NeoreaderDiscoveryClient.from_env()
+        chunker = SentenceChunker()
+
+        loader = DiscoveryLoader(
+            fetcher=fetcher,
+            chunker=chunker,
+            discovery_client=discovery_client,
+        )
+
+        result = await loader.load(
+            "https://docs.example.com",
+            max_depth=3,
+            max_pages=100,
+        )
+        ```
+    """
+
+    def __init__(
+        self,
+        fetcher: IContentFetcher,
+        chunker: ITextChunker,
+        discovery_client: NeoreaderDiscoveryClient,
+        config: dict[str, Any] | None = None,
+        event_emitter: EventEmitter | None = None,
+    ) -> None:
+        """Initialize the discovery loader.
+
+        Args:
+            fetcher: Content fetcher for retrieving URL content.
+            chunker: Text chunker for splitting content into documents.
+            discovery_client: Neo Reader Discovery API client for URL discovery.
+            config: Optional configuration dictionary.
+            event_emitter: Optional event emitter for progress events.
+        """
+        super().__init__(fetcher, chunker, config, event_emitter)
+        self._discovery_client = discovery_client
+        self._discovery_logger = logging.getLogger(f"{__name__}.DiscoveryLoader")
+
+    @property
+    def name(self) -> str:
+        """Return the loader name for registry identification.
+
+        Returns:
+            The string "discovery" for factory registration.
+        """
+        return "discovery"
+
+    def supports(self, source: str) -> bool:
+        """Check if this loader supports the given source.
+
+        Supports any HTTP/HTTPS URL that is not a sitemap. Sitemap URLs
+        should be handled by SitemapLoader instead.
+
+        Args:
+            source: The source URL.
+
+        Returns:
+            True if source is HTTP/HTTPS and not a sitemap URL.
+        """
+        source_lower = source.lower()
+        is_http = source_lower.startswith(("http://", "https://"))
+        is_sitemap = "sitemap" in source_lower or source_lower.endswith(".xml")
+        return is_http and not is_sitemap
+
+    async def _get_urls(self, source: str, **options: Any) -> list[str]:
+        """Discover URLs using Neo Reader Discovery API.
+
+        Creates a discovery job, polls for completion with progress updates,
+        and returns the discovered URLs. Handles job cancellation if
+        interrupted.
+
+        Args:
+            source: The starting URL for discovery.
+            **options: Discovery options:
+                - max_depth: Maximum crawl depth (default: 3)
+                - max_pages: Maximum pages to crawl (default: 100)
+                - same_domain: Only crawl same domain (default: True)
+                - include_subdomains: Include subdomains (default: True)
+                - respect_robots: Respect robots.txt (default: True)
+                - parse_sitemap: Also parse sitemap if found (default: False)
+                - with_metadata: Include page metadata (default: True)
+                - crawl_timeout: Crawl timeout in seconds (default: 300)
+                - concurrent_requests: Concurrent crawl requests (default: 5)
+                - request_delay: Delay between requests in ms (default: 100)
+                - include_pattern: Regex pattern for URLs to include
+                - exclude_pattern: Regex pattern for URLs to exclude
+                - path_prefix: Only crawl URLs with this path prefix
+
+        Returns:
+            List of discovered URL strings.
+
+        Raises:
+            DiscoveryError: If discovery fails.
+        """
+        discovery_config = self._build_discovery_config(options)
+        job_id = await self._discovery_client.create_job(source, discovery_config)
+
+        self._emit_discovery_started(source, job_id, discovery_config)
+
+        try:
+            result = await self._wait_for_job_completion(job_id)
+        except (asyncio.CancelledError, Exception) as e:
+            await self._handle_job_cancellation(job_id, e)
+            raise
+
+        return self._process_discovery_result(job_id, result, source)
+
+    def _build_discovery_config(self, options: dict[str, Any]) -> DiscoveryConfig:
+        """Build DiscoveryConfig from options dictionary.
+
+        Args:
+            options: Discovery options passed to _get_urls.
+
+        Returns:
+            Configured DiscoveryConfig instance.
+        """
+        return DiscoveryConfig(
+            max_depth=options.get("max_depth", 3),
+            max_pages=options.get("max_pages", 100),
+            same_domain=options.get("same_domain", True),
+            include_subdomains=options.get("include_subdomains", True),
+            respect_robots=options.get("respect_robots", True),
+            parse_sitemap=options.get("parse_sitemap", False),
+            with_metadata=options.get("with_metadata", True),
+            crawl_timeout=options.get("crawl_timeout", 300),
+            concurrent_requests=options.get("concurrent_requests", 5),
+            request_delay=options.get("request_delay", 100),
+            include_pattern=options.get("include_pattern"),
+            exclude_pattern=options.get("exclude_pattern"),
+            path_prefix=options.get("path_prefix"),
+        )
+
+    def _emit_discovery_started(
+        self,
+        source: str,
+        job_id: str,
+        discovery_config: DiscoveryConfig,
+    ) -> None:
+        """Emit discovery started event.
+
+        Args:
+            source: The starting URL.
+            job_id: The discovery job ID.
+            discovery_config: The discovery configuration.
+        """
+        self._events.emit(
+            DiscoveryStartedEvent(
+                url=source,
+                job_id=job_id,
+                config={
+                    "max_depth": discovery_config.max_depth,
+                    "max_pages": discovery_config.max_pages,
+                    "same_domain": discovery_config.same_domain,
+                    "include_subdomains": discovery_config.include_subdomains,
+                    "respect_robots": discovery_config.respect_robots,
+                    "parse_sitemap": discovery_config.parse_sitemap,
+                },
+            )
+        )
+        self._discovery_logger.info(
+            "Started discovery job %s for %s (max_depth=%d, max_pages=%d)",
+            job_id,
+            source,
+            discovery_config.max_depth,
+            discovery_config.max_pages,
+        )
+
+    async def _wait_for_job_completion(self, job_id: str) -> Any:
+        """Wait for discovery job to complete with progress updates.
+
+        Args:
+            job_id: The discovery job ID.
+
+        Returns:
+            The final DiscoveryJobStatus.
+        """
+
+        async def on_progress(progress: DiscoveryProgress) -> None:
+            """Emit progress event for each update."""
+            self._events.emit(
+                DiscoveryProgressEvent(
+                    job_id=job_id,
+                    percent=progress.percent,
+                    pages_crawled=progress.pages_crawled,
+                    urls_discovered=progress.urls_discovered,
+                    current_depth=progress.current_depth,
+                    message=progress.message,
+                )
+            )
+            self._discovery_logger.debug(
+                "Discovery progress: %d%% (%d pages, %d URLs)",
+                progress.percent,
+                progress.pages_crawled,
+                progress.urls_discovered,
+            )
+
+        return await self._discovery_client.wait_for_completion(
+            job_id,
+            on_progress=on_progress,
+        )
+
+    async def _handle_job_cancellation(self, job_id: str, error: Exception) -> None:
+        """Handle job cancellation on error or interruption.
+
+        Args:
+            job_id: The discovery job ID to cancel.
+            error: The exception that caused cancellation.
+        """
+        self._discovery_logger.warning(
+            "Cancelling discovery job %s due to: %s",
+            job_id,
+            error,
+        )
+        try:
+            await self._discovery_client.cancel_job(job_id)
+            self._discovery_logger.info("Successfully cancelled job %s", job_id)
+        except Exception as cancel_err:
+            self._discovery_logger.error(
+                "Failed to cancel job %s: %s",
+                job_id,
+                cancel_err,
+            )
+
+    def _process_discovery_result(
+        self,
+        job_id: str,
+        result: Any,
+        source: str,
+    ) -> list[str]:
+        """Process discovery result and extract URLs.
+
+        Handles completed, failed, and partial failure cases.
+
+        Args:
+            job_id: The discovery job ID.
+            result: The DiscoveryJobStatus from wait_for_completion.
+            source: The original source URL.
+
+        Returns:
+            List of discovered URL strings.
+
+        Raises:
+            DiscoveryError: If the job failed or was cancelled.
+        """
+        if result.status != "completed":
+            error_msg = result.error or "Unknown error"
+            self._events.emit(
+                DiscoveryFailedEvent(
+                    job_id=job_id,
+                    error=error_msg,
+                )
+            )
+            raise DiscoveryError(
+                f"Discovery failed: {error_msg}",
+                job_id=job_id,
+                source=source,
+            )
+
+        # Handle partial failures (log warning if errors occurred)
+        if result.stats and result.stats.errors > 0:
+            self._discovery_logger.warning(
+                "Discovery completed with %d errors (%d URLs returned)",
+                result.stats.errors,
+                result.stats.urls_returned,
+            )
+
+        # Extract URLs
+        urls = [u.url for u in (result.urls or [])]
+
+        # Emit completion event
+        self._events.emit(
+            DiscoveryCompletedEvent(
+                job_id=job_id,
+                urls_count=len(urls),
+                pages_crawled=result.stats.pages_crawled if result.stats else 0,
+                duration_seconds=result.stats.duration_seconds if result.stats else 0.0,
+                errors=result.stats.errors if result.stats else 0,
+            )
+        )
+
+        self._discovery_logger.info(
+            "Discovery completed: %d URLs discovered in %.1fs",
+            len(urls),
+            result.stats.duration_seconds if result.stats else 0.0,
+        )
+
+        return urls
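
As a reading aid for the routing in `supports()` above (this sketch is not part of the package diff), the loader accepts plain HTTP/HTTPS URLs and defers anything that looks like a sitemap. The URLs below are hypothetical examples that restate the shown logic:

```python
# Illustration only: the supports() routing from DiscoveryLoader, restated inline.
cases = {
    "https://docs.example.com": True,           # plain HTTPS URL -> discovery
    "http://example.com/guide": True,           # plain HTTP URL -> discovery
    "https://example.com/sitemap.xml": False,   # "sitemap" and ".xml" -> SitemapLoader
    "https://example.com/feed.xml": False,      # any ".xml" URL is treated as a sitemap
    "ftp://example.com/docs": False,            # not HTTP/HTTPS
}

for source, expected in cases.items():
    s = source.lower()
    is_http = s.startswith(("http://", "https://"))
    is_sitemap = "sitemap" in s or s.endswith(".xml")
    assert (is_http and not is_sitemap) == expected
```
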
gnosisllm_knowledge/loaders/discovery_streaming.py
@@ -0,0 +1,343 @@
+"""Streaming discovery with bounded memory for large websites.
+
+This module provides streaming URL discovery from websites using the Neo Reader
+Discovery API, yielding batches of URLs as they're discovered rather than
+waiting for the full discovery to complete. This enables immediate processing
+and keeps memory bounded for sites with many URLs (>500).
+"""
+
+from __future__ import annotations
+
+import asyncio
+import contextlib
+import fnmatch
+import logging
+import re
+from collections.abc import AsyncIterator, Awaitable, Callable
+from typing import TYPE_CHECKING, Any
+
+from gnosisllm_knowledge.core.domain.discovery import DiscoveryConfig, DiscoveryProgress
+from gnosisllm_knowledge.core.streaming.pipeline import BoundedQueue, PipelineConfig
+
+if TYPE_CHECKING:
+    from gnosisllm_knowledge.fetchers.neoreader_discovery import NeoreaderDiscoveryClient
+
+
+class StreamingDiscoveryDiscoverer:
+    """Discovers website URLs in a streaming fashion using Neo Reader Discovery API.
+
+    Instead of collecting all URLs before processing, this yields batches of URLs
+    as they're discovered from the polling response. This enables immediate
+    processing while discovery continues in the background.
+
+    Key differences from DiscoveryLoader._get_urls():
+    - Yields batches instead of returning complete list
+    - Uses bounded queue for backpressure
+    - Memory usage is O(batch_size) not O(total_urls)
+    - Starts yielding URLs while discovery is still running
+
+    Recommended for sites with >500 expected URLs to start content loading
+    before full discovery completes.
+
+    Example:
+        ```python
+        from gnosisllm_knowledge.fetchers.neoreader_discovery import NeoreaderDiscoveryClient
+        from gnosisllm_knowledge.loaders.discovery_streaming import StreamingDiscoveryDiscoverer
+
+        client = NeoreaderDiscoveryClient.from_env()
+        discoverer = StreamingDiscoveryDiscoverer(discovery_client=client)

+        async for url_batch in discoverer.discover_urls_streaming(
+            source="https://docs.example.com",
+            batch_size=50,
+            max_pages=1000,
+        ):
+            # Process batch immediately - content loading can start
+            # while discovery continues
+            for url in url_batch:
+                await fetch_and_process(url)
+        ```
+    """
+
+    def __init__(
+        self,
+        discovery_client: NeoreaderDiscoveryClient,
+        config: PipelineConfig | None = None,
+    ) -> None:
+        """Initialize the streaming discovery discoverer.
+
+        Args:
+            discovery_client: Neo Reader Discovery API client.
+            config: Pipeline configuration with batch sizes and concurrency.
+        """
+        self._discovery_client = discovery_client
+        self._config = config or PipelineConfig()
+        self._logger = logging.getLogger(__name__)
+
+    async def discover_urls_streaming(
+        self,
+        source: str,
+        batch_size: int | None = None,
+        max_pages: int = 1000,
+        max_depth: int = 3,
+        allowed_patterns: list[str] | None = None,
+        blocked_patterns: list[str] | None = None,
+        poll_interval: float = 2.0,
+        discovery_timeout: float = 600.0,
+        on_progress: Callable[[DiscoveryProgress], Awaitable[None] | None] | None = None,
+        **options: Any,
+    ) -> AsyncIterator[list[str]]:
+        """Yield batches of URLs as they're discovered.
+
+        Starts a discovery job and polls for new URLs, yielding batches
+        as they become available. This allows content fetching to begin
+        while discovery is still running.
+
+        Args:
+            source: Starting URL for discovery.
+            batch_size: URLs per batch (default from config).
+            max_pages: Maximum pages to discover.
+            max_depth: Maximum crawl depth.
+            allowed_patterns: URL patterns to include (glob or regex).
+            blocked_patterns: URL patterns to exclude (glob or regex).
+            poll_interval: Seconds between status polls.
+            discovery_timeout: Maximum time for discovery in seconds.
+            on_progress: Optional callback for progress updates (sync or async).
+            **options: Additional discovery options passed to DiscoveryConfig.
+
+        Yields:
+            Lists of discovered URLs, batch_size at a time.
+        """
+        batch_size = batch_size or self._config.url_batch_size
+        allowed_patterns = allowed_patterns or []
+        blocked_patterns = blocked_patterns or []
+
+        # Use bounded queue for discovered URLs
+        # Queue size is 2x batch_size to allow producer to stay ahead
+        url_queue: BoundedQueue[str] = BoundedQueue(maxsize=batch_size * 2)
+
+        # Tracking state
+        seen_urls: set[str] = set()
+        last_yielded_count = 0
+
+        # Build discovery config from options
+        discovery_config = self._build_discovery_config(
+            max_pages=max_pages,
+            max_depth=max_depth,
+            **options,
+        )
+
+        async def poll_and_queue_urls() -> None:
+            """Poll discovery job and push new URLs to queue."""
+            nonlocal last_yielded_count
+
+            # Create discovery job
+            job_id = await self._discovery_client.create_job(source, discovery_config)
+            self._logger.info(
+                "Started streaming discovery job %s for %s",
+                job_id,
+                source,
+            )
+
+            try:
+                loop = asyncio.get_event_loop()
+                start_time = loop.time()
+
+                while True:
+                    # Get job status with URLs
+                    status = await self._discovery_client.get_job_status(
+                        job_id,
+                        include_urls=True,
+                    )
+
+                    # Process any new URLs from this poll
+                    if status.urls:
+                        new_urls_added = 0
+                        for discovered_url in status.urls:
+                            url = discovered_url.url
+
+                            if url in seen_urls:
+                                continue
+
+                            if not self._should_include_url(
+                                url, allowed_patterns, blocked_patterns
+                            ):
+                                continue
+
+                            seen_urls.add(url)
+                            await url_queue.put(url)  # Backpressure if queue full
+                            new_urls_added += 1
+
+                        if new_urls_added > 0:
+                            self._logger.debug(
+                                "Queued %d new URLs (total seen: %d)",
+                                new_urls_added,
+                                len(seen_urls),
+                            )
+
+                    # Check if job is complete
+                    if status.is_terminal():
+                        if status.status == "completed":
+                            self._logger.info(
+                                "Discovery completed: %d URLs discovered in %.1fs",
+                                len(seen_urls),
+                                status.stats.duration_seconds if status.stats else 0.0,
+                            )
+                        elif status.status == "failed":
+                            self._logger.error(
+                                "Discovery failed: %s",
+                                status.error or "Unknown error",
+                            )
+                        break
+
+                    # Check timeout
+                    elapsed = loop.time() - start_time
+                    if elapsed >= discovery_timeout:
+                        self._logger.warning(
+                            "Discovery timeout after %.1fs, cancelling job %s",
+                            elapsed,
+                            job_id,
+                        )
+                        await self._discovery_client.cancel_job(job_id)
+                        break
+
+                    # Call progress callback if provided and we have progress
+                    if status.progress and on_progress:
+                        result = on_progress(status.progress)
+                        # Handle async callbacks
+                        if asyncio.iscoroutine(result):
+                            await result
+
+                    # Log progress
+                    if status.progress:
+                        self._logger.debug(
+                            "Discovery progress: %d%% (%d pages, %d URLs)",
+                            status.progress.percent,
+                            status.progress.pages_crawled,
+                            status.progress.urls_discovered,
+                        )
+
+                    # Wait before next poll
+                    await asyncio.sleep(poll_interval)
+
+            except asyncio.CancelledError:
+                self._logger.warning("Discovery cancelled, cleaning up job %s", job_id)
+                try:
+                    await self._discovery_client.cancel_job(job_id)
+                except Exception as e:
+                    self._logger.error("Failed to cancel job %s: %s", job_id, e)
+                raise
+            except Exception as e:
+                self._logger.error("Discovery error: %s", e)
+                try:
+                    await self._discovery_client.cancel_job(job_id)
+                except Exception as cancel_e:
+                    self._logger.error("Failed to cancel job %s: %s", job_id, cancel_e)
+                raise
+            finally:
+                url_queue.close()
+
+        # Start discovery in background task
+        discovery_task = asyncio.create_task(poll_and_queue_urls())
+
+        # Yield batches from queue
+        batch: list[str] = []
+        try:
+            async for url in url_queue:
+                batch.append(url)
+                if len(batch) >= batch_size:
+                    yield batch
+                    batch = []
+
+            # Yield remaining URLs
+            if batch:
+                yield batch
+
+        finally:
+            # Ensure discovery task is complete or cancelled
+            if not discovery_task.done():
+                discovery_task.cancel()
+                with contextlib.suppress(asyncio.CancelledError):
+                    await discovery_task
+
+    def _build_discovery_config(
+        self,
+        max_pages: int,
+        max_depth: int,
+        **options: Any,
+    ) -> DiscoveryConfig:
+        """Build DiscoveryConfig from parameters and options.
+
+        Args:
+            max_pages: Maximum pages to crawl.
+            max_depth: Maximum crawl depth.
+            **options: Additional discovery options.
+
+        Returns:
+            Configured DiscoveryConfig instance.
+        """
+        return DiscoveryConfig(
+            max_depth=max_depth,
+            max_pages=max_pages,
+            same_domain=options.get("same_domain", True),
+            include_subdomains=options.get("include_subdomains", True),
+            respect_robots=options.get("respect_robots", True),
+            parse_sitemap=options.get("parse_sitemap", False),
+            with_metadata=options.get("with_metadata", True),
+            crawl_timeout=options.get("crawl_timeout", 300),
+            concurrent_requests=options.get("concurrent_requests", 5),
+            request_delay=options.get("request_delay", 100),
+            include_pattern=options.get("include_pattern"),
+            exclude_pattern=options.get("exclude_pattern"),
+            path_prefix=options.get("path_prefix"),
+        )
+
+    def _should_include_url(
+        self,
+        url: str,
+        allowed_patterns: list[str],
+        blocked_patterns: list[str],
+    ) -> bool:
+        """Check if a URL should be included based on patterns.
+
+        Args:
+            url: The URL to check.
+            allowed_patterns: Patterns that must match (if any).
+            blocked_patterns: Patterns that must not match.
+
+        Returns:
+            True if URL should be included.
+        """
+        # Check blocked patterns first
+        for pattern in blocked_patterns:
+            if self._matches_pattern(url, pattern):
+                return False
+
+        # If allowed patterns specified, at least one must match
+        if allowed_patterns:
+            return any(self._matches_pattern(url, p) for p in allowed_patterns)
+
+        return True
+
+    def _matches_pattern(self, url: str, pattern: str) -> bool:
+        """Check if URL matches a pattern.
+
+        Supports both glob patterns (with *) and regex patterns.
+
+        Args:
+            url: The URL to check.
+            pattern: The pattern to match against.
+
+        Returns:
+            True if URL matches the pattern.
+        """
+        # Try fnmatch for glob patterns
+        if "*" in pattern or "?" in pattern:
+            return fnmatch.fnmatch(url, pattern)
+
+        # Try regex
+        try:
+            return bool(re.search(pattern, url))
+        except re.error:
+            # Invalid regex, try substring match
+            return pattern in url
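
The include/exclude semantics implemented by `_should_include_url()` and `_matches_pattern()` at the end of this file can be restated as standalone code; this sketch is not part of the package diff, and the URLs and patterns below are hypothetical:

```python
# Illustration only: blocked patterns win, then any allowed pattern must match;
# glob patterns use fnmatch, other patterns are tried as regex with a substring fallback.
import fnmatch
import re


def matches(url: str, pattern: str) -> bool:
    if "*" in pattern or "?" in pattern:
        return fnmatch.fnmatch(url, pattern)
    try:
        return bool(re.search(pattern, url))
    except re.error:
        return pattern in url


def should_include(url: str, allowed: list[str], blocked: list[str]) -> bool:
    if any(matches(url, p) for p in blocked):
        return False
    if allowed:
        return any(matches(url, p) for p in allowed)
    return True


assert should_include("https://docs.example.com/api/auth", ["*/api/*"], [])
assert not should_include("https://docs.example.com/blog/post", ["*/api/*"], [])
assert not should_include("https://docs.example.com/api/internal", [], [r"/internal"])
```
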