gnosisllm-knowledge 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- gnosisllm_knowledge/api/knowledge.py +225 -35
- gnosisllm_knowledge/backends/memory/indexer.py +27 -2
- gnosisllm_knowledge/backends/memory/searcher.py +111 -10
- gnosisllm_knowledge/backends/opensearch/agentic.py +14 -9
- gnosisllm_knowledge/backends/opensearch/indexer.py +48 -3
- gnosisllm_knowledge/backends/opensearch/mappings.py +12 -4
- gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
- gnosisllm_knowledge/backends/opensearch/searcher.py +9 -6
- gnosisllm_knowledge/cli/app.py +58 -19
- gnosisllm_knowledge/cli/commands/agentic.py +15 -9
- gnosisllm_knowledge/cli/commands/load.py +169 -19
- gnosisllm_knowledge/cli/commands/memory.py +10 -0
- gnosisllm_knowledge/cli/commands/search.py +9 -10
- gnosisllm_knowledge/cli/commands/setup.py +25 -1
- gnosisllm_knowledge/cli/utils/config.py +4 -4
- gnosisllm_knowledge/core/domain/__init__.py +13 -0
- gnosisllm_knowledge/core/domain/discovery.py +166 -0
- gnosisllm_knowledge/core/domain/document.py +14 -19
- gnosisllm_knowledge/core/domain/search.py +10 -25
- gnosisllm_knowledge/core/domain/source.py +11 -12
- gnosisllm_knowledge/core/events/__init__.py +8 -0
- gnosisllm_knowledge/core/events/types.py +122 -5
- gnosisllm_knowledge/core/exceptions.py +93 -0
- gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
- gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
- gnosisllm_knowledge/core/interfaces/searcher.py +10 -1
- gnosisllm_knowledge/core/interfaces/streaming.py +10 -4
- gnosisllm_knowledge/fetchers/__init__.py +8 -0
- gnosisllm_knowledge/fetchers/config.py +27 -0
- gnosisllm_knowledge/fetchers/neoreader.py +31 -3
- gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
- gnosisllm_knowledge/loaders/__init__.py +5 -1
- gnosisllm_knowledge/loaders/discovery.py +338 -0
- gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
- gnosisllm_knowledge/loaders/factory.py +46 -0
- gnosisllm_knowledge/services/indexing.py +35 -20
- gnosisllm_knowledge/services/search.py +37 -20
- gnosisllm_knowledge/services/streaming_pipeline.py +39 -7
- {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/METADATA +30 -10
- gnosisllm_knowledge-0.4.0.dist-info/RECORD +81 -0
- gnosisllm_knowledge-0.3.0.dist-info/RECORD +0 -77
- {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/WHEEL +0 -0
- {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/entry_points.txt +0 -0
gnosisllm_knowledge/loaders/discovery.py
@@ -0,0 +1,338 @@
"""Discovery loader using Neo Reader Discovery API for website crawling."""

from __future__ import annotations

import asyncio
import logging
from typing import TYPE_CHECKING, Any

from gnosisllm_knowledge.core.domain.discovery import DiscoveryConfig, DiscoveryProgress
from gnosisllm_knowledge.core.events.types import (
    DiscoveryCompletedEvent,
    DiscoveryFailedEvent,
    DiscoveryProgressEvent,
    DiscoveryStartedEvent,
)
from gnosisllm_knowledge.core.exceptions import DiscoveryError
from gnosisllm_knowledge.loaders.base import BaseLoader

if TYPE_CHECKING:
    from gnosisllm_knowledge.core.events.emitter import EventEmitter
    from gnosisllm_knowledge.core.interfaces.chunker import ITextChunker
    from gnosisllm_knowledge.core.interfaces.fetcher import IContentFetcher
    from gnosisllm_knowledge.fetchers.neoreader_discovery import NeoreaderDiscoveryClient


class DiscoveryLoader(BaseLoader):
    """Loader that discovers all URLs from a website using Neo Reader Discovery API.

    Similar to SitemapLoader but uses Neo Reader's crawler instead of sitemap XML.
    Supports full website crawling with configurable depth, filters, and limits.

    The loader follows the Template Method Pattern from BaseLoader:
    1. _get_urls() performs URL discovery via Neo Reader Discovery API
    2. Returns all discovered URLs
    3. BaseLoader handles parallel fetching of each URL's content

    Features:
    - Async website crawling via Neo Reader Discovery API
    - Configurable crawl depth, page limits, and domain restrictions
    - URL filtering with include/exclude patterns
    - Progress events during discovery
    - Job cancellation on interruption
    - Partial failure handling (logs warnings, continues with available URLs)

    Example:
        ```python
        from gnosisllm_knowledge.fetchers import NeoreaderContentFetcher
        from gnosisllm_knowledge.fetchers.neoreader_discovery import NeoreaderDiscoveryClient
        from gnosisllm_knowledge.chunking import SentenceChunker

        fetcher = NeoreaderContentFetcher.from_env()
        discovery_client = NeoreaderDiscoveryClient.from_env()
        chunker = SentenceChunker()

        loader = DiscoveryLoader(
            fetcher=fetcher,
            chunker=chunker,
            discovery_client=discovery_client,
        )

        result = await loader.load(
            "https://docs.example.com",
            max_depth=3,
            max_pages=100,
        )
        ```
    """

    def __init__(
        self,
        fetcher: IContentFetcher,
        chunker: ITextChunker,
        discovery_client: NeoreaderDiscoveryClient,
        config: dict[str, Any] | None = None,
        event_emitter: EventEmitter | None = None,
    ) -> None:
        """Initialize the discovery loader.

        Args:
            fetcher: Content fetcher for retrieving URL content.
            chunker: Text chunker for splitting content into documents.
            discovery_client: Neo Reader Discovery API client for URL discovery.
            config: Optional configuration dictionary.
            event_emitter: Optional event emitter for progress events.
        """
        super().__init__(fetcher, chunker, config, event_emitter)
        self._discovery_client = discovery_client
        self._discovery_logger = logging.getLogger(f"{__name__}.DiscoveryLoader")

    @property
    def name(self) -> str:
        """Return the loader name for registry identification.

        Returns:
            The string "discovery" for factory registration.
        """
        return "discovery"

    def supports(self, source: str) -> bool:
        """Check if this loader supports the given source.

        Supports any HTTP/HTTPS URL that is not a sitemap. Sitemap URLs
        should be handled by SitemapLoader instead.

        Args:
            source: The source URL.

        Returns:
            True if source is HTTP/HTTPS and not a sitemap URL.
        """
        source_lower = source.lower()
        is_http = source_lower.startswith(("http://", "https://"))
        is_sitemap = "sitemap" in source_lower or source_lower.endswith(".xml")
        return is_http and not is_sitemap

    async def _get_urls(self, source: str, **options: Any) -> list[str]:
        """Discover URLs using Neo Reader Discovery API.

        Creates a discovery job, polls for completion with progress updates,
        and returns the discovered URLs. Handles job cancellation if
        interrupted.

        Args:
            source: The starting URL for discovery.
            **options: Discovery options:
                - max_depth: Maximum crawl depth (default: 3)
                - max_pages: Maximum pages to crawl (default: 100)
                - same_domain: Only crawl same domain (default: True)
                - include_subdomains: Include subdomains (default: True)
                - respect_robots: Respect robots.txt (default: True)
                - parse_sitemap: Also parse sitemap if found (default: False)
                - with_metadata: Include page metadata (default: True)
                - crawl_timeout: Crawl timeout in seconds (default: 300)
                - concurrent_requests: Concurrent crawl requests (default: 5)
                - request_delay: Delay between requests in ms (default: 100)
                - include_pattern: Regex pattern for URLs to include
                - exclude_pattern: Regex pattern for URLs to exclude
                - path_prefix: Only crawl URLs with this path prefix

        Returns:
            List of discovered URL strings.

        Raises:
            DiscoveryError: If discovery fails.
        """
        discovery_config = self._build_discovery_config(options)
        job_id = await self._discovery_client.create_job(source, discovery_config)

        self._emit_discovery_started(source, job_id, discovery_config)

        try:
            result = await self._wait_for_job_completion(job_id)
        except (asyncio.CancelledError, Exception) as e:
            await self._handle_job_cancellation(job_id, e)
            raise

        return self._process_discovery_result(job_id, result, source)

    def _build_discovery_config(self, options: dict[str, Any]) -> DiscoveryConfig:
        """Build DiscoveryConfig from options dictionary.

        Args:
            options: Discovery options passed to _get_urls.

        Returns:
            Configured DiscoveryConfig instance.
        """
        return DiscoveryConfig(
            max_depth=options.get("max_depth", 3),
            max_pages=options.get("max_pages", 100),
            same_domain=options.get("same_domain", True),
            include_subdomains=options.get("include_subdomains", True),
            respect_robots=options.get("respect_robots", True),
            parse_sitemap=options.get("parse_sitemap", False),
            with_metadata=options.get("with_metadata", True),
            crawl_timeout=options.get("crawl_timeout", 300),
            concurrent_requests=options.get("concurrent_requests", 5),
            request_delay=options.get("request_delay", 100),
            include_pattern=options.get("include_pattern"),
            exclude_pattern=options.get("exclude_pattern"),
            path_prefix=options.get("path_prefix"),
        )

    def _emit_discovery_started(
        self,
        source: str,
        job_id: str,
        discovery_config: DiscoveryConfig,
    ) -> None:
        """Emit discovery started event.

        Args:
            source: The starting URL.
            job_id: The discovery job ID.
            discovery_config: The discovery configuration.
        """
        self._events.emit(
            DiscoveryStartedEvent(
                url=source,
                job_id=job_id,
                config={
                    "max_depth": discovery_config.max_depth,
                    "max_pages": discovery_config.max_pages,
                    "same_domain": discovery_config.same_domain,
                    "include_subdomains": discovery_config.include_subdomains,
                    "respect_robots": discovery_config.respect_robots,
                    "parse_sitemap": discovery_config.parse_sitemap,
                },
            )
        )
        self._discovery_logger.info(
            "Started discovery job %s for %s (max_depth=%d, max_pages=%d)",
            job_id,
            source,
            discovery_config.max_depth,
            discovery_config.max_pages,
        )

    async def _wait_for_job_completion(self, job_id: str) -> Any:
        """Wait for discovery job to complete with progress updates.

        Args:
            job_id: The discovery job ID.

        Returns:
            The final DiscoveryJobStatus.
        """

        async def on_progress(progress: DiscoveryProgress) -> None:
            """Emit progress event for each update."""
            self._events.emit(
                DiscoveryProgressEvent(
                    job_id=job_id,
                    percent=progress.percent,
                    pages_crawled=progress.pages_crawled,
                    urls_discovered=progress.urls_discovered,
                    current_depth=progress.current_depth,
                    message=progress.message,
                )
            )
            self._discovery_logger.debug(
                "Discovery progress: %d%% (%d pages, %d URLs)",
                progress.percent,
                progress.pages_crawled,
                progress.urls_discovered,
            )

        return await self._discovery_client.wait_for_completion(
            job_id,
            on_progress=on_progress,
        )

    async def _handle_job_cancellation(self, job_id: str, error: Exception) -> None:
        """Handle job cancellation on error or interruption.

        Args:
            job_id: The discovery job ID to cancel.
            error: The exception that caused cancellation.
        """
        self._discovery_logger.warning(
            "Cancelling discovery job %s due to: %s",
            job_id,
            error,
        )
        try:
            await self._discovery_client.cancel_job(job_id)
            self._discovery_logger.info("Successfully cancelled job %s", job_id)
        except Exception as cancel_err:
            self._discovery_logger.error(
                "Failed to cancel job %s: %s",
                job_id,
                cancel_err,
            )

    def _process_discovery_result(
        self,
        job_id: str,
        result: Any,
        source: str,
    ) -> list[str]:
        """Process discovery result and extract URLs.

        Handles completed, failed, and partial failure cases.

        Args:
            job_id: The discovery job ID.
            result: The DiscoveryJobStatus from wait_for_completion.
            source: The original source URL.

        Returns:
            List of discovered URL strings.

        Raises:
            DiscoveryError: If the job failed or was cancelled.
        """
        if result.status != "completed":
            error_msg = result.error or "Unknown error"
            self._events.emit(
                DiscoveryFailedEvent(
                    job_id=job_id,
                    error=error_msg,
                )
            )
            raise DiscoveryError(
                f"Discovery failed: {error_msg}",
                job_id=job_id,
                source=source,
            )

        # Handle partial failures (log warning if errors occurred)
        if result.stats and result.stats.errors > 0:
            self._discovery_logger.warning(
                "Discovery completed with %d errors (%d URLs returned)",
                result.stats.errors,
                result.stats.urls_returned,
            )

        # Extract URLs
        urls = [u.url for u in (result.urls or [])]

        # Emit completion event
        self._events.emit(
            DiscoveryCompletedEvent(
                job_id=job_id,
                urls_count=len(urls),
                pages_crawled=result.stats.pages_crawled if result.stats else 0,
                duration_seconds=result.stats.duration_seconds if result.stats else 0.0,
                errors=result.stats.errors if result.stats else 0,
            )
        )

        self._discovery_logger.info(
            "Discovery completed: %d URLs discovered in %.1fs",
            len(urls),
            result.stats.duration_seconds if result.stats else 0.0,
        )

        return urls
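A hedged usage sketch for the new loader above, not taken from the package itself: the constructor wiring and the `load()` call follow the class docstring example, and the filter options (`include_pattern`, `exclude_pattern`, `path_prefix`) are the ones listed in the `_get_urls()` docstring; that `load()` forwards keyword options through to `_get_urls()` is an assumption based on the Template Method description.

```python
# Hedged sketch, not from the package: option names come from the _get_urls()
# docstring above; imports and constructors follow the class docstring example
# and are assumed to resolve in an installed environment.
import asyncio

from gnosisllm_knowledge.chunking import SentenceChunker
from gnosisllm_knowledge.fetchers import NeoreaderContentFetcher
from gnosisllm_knowledge.fetchers.neoreader_discovery import NeoreaderDiscoveryClient
from gnosisllm_knowledge.loaders.discovery import DiscoveryLoader


async def main() -> None:
    loader = DiscoveryLoader(
        fetcher=NeoreaderContentFetcher.from_env(),
        chunker=SentenceChunker(),
        discovery_client=NeoreaderDiscoveryClient.from_env(),
    )
    # Filter options are forwarded into DiscoveryConfig by
    # _build_discovery_config() and sent with the discovery job.
    result = await loader.load(
        "https://docs.example.com",
        max_depth=2,
        max_pages=50,
        include_pattern=r"^https://docs\.example\.com/guides/",
        exclude_pattern=r"\.(png|jpg|pdf)$",
    )
    print(result)


if __name__ == "__main__":
    asyncio.run(main())
```

Note that these patterns travel with the job via `DiscoveryConfig`, so they appear to be applied by the Discovery service during the crawl; the streaming discoverer below additionally filters client-side with its own `allowed_patterns`/`blocked_patterns` arguments.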
gnosisllm_knowledge/loaders/discovery_streaming.py
@@ -0,0 +1,343 @@
"""Streaming discovery with bounded memory for large websites.

This module provides streaming URL discovery from websites using the Neo Reader
Discovery API, yielding batches of URLs as they're discovered rather than
waiting for the full discovery to complete. This enables immediate processing
and keeps memory bounded for sites with many URLs (>500).
"""

from __future__ import annotations

import asyncio
import contextlib
import fnmatch
import logging
import re
from collections.abc import AsyncIterator, Awaitable, Callable
from typing import TYPE_CHECKING, Any

from gnosisllm_knowledge.core.domain.discovery import DiscoveryConfig, DiscoveryProgress
from gnosisllm_knowledge.core.streaming.pipeline import BoundedQueue, PipelineConfig

if TYPE_CHECKING:
    from gnosisllm_knowledge.fetchers.neoreader_discovery import NeoreaderDiscoveryClient


class StreamingDiscoveryDiscoverer:
    """Discovers website URLs in a streaming fashion using Neo Reader Discovery API.

    Instead of collecting all URLs before processing, this yields batches of URLs
    as they're discovered from the polling response. This enables immediate
    processing while discovery continues in the background.

    Key differences from DiscoveryLoader._get_urls():
    - Yields batches instead of returning complete list
    - Uses bounded queue for backpressure
    - Memory usage is O(batch_size) not O(total_urls)
    - Starts yielding URLs while discovery is still running

    Recommended for sites with >500 expected URLs to start content loading
    before full discovery completes.

    Example:
        ```python
        from gnosisllm_knowledge.fetchers.neoreader_discovery import NeoreaderDiscoveryClient
        from gnosisllm_knowledge.loaders.discovery_streaming import StreamingDiscoveryDiscoverer

        client = NeoreaderDiscoveryClient.from_env()
        discoverer = StreamingDiscoveryDiscoverer(discovery_client=client)

        async for url_batch in discoverer.discover_urls_streaming(
            source="https://docs.example.com",
            batch_size=50,
            max_pages=1000,
        ):
            # Process batch immediately - content loading can start
            # while discovery continues
            for url in url_batch:
                await fetch_and_process(url)
        ```
    """

    def __init__(
        self,
        discovery_client: NeoreaderDiscoveryClient,
        config: PipelineConfig | None = None,
    ) -> None:
        """Initialize the streaming discovery discoverer.

        Args:
            discovery_client: Neo Reader Discovery API client.
            config: Pipeline configuration with batch sizes and concurrency.
        """
        self._discovery_client = discovery_client
        self._config = config or PipelineConfig()
        self._logger = logging.getLogger(__name__)

    async def discover_urls_streaming(
        self,
        source: str,
        batch_size: int | None = None,
        max_pages: int = 1000,
        max_depth: int = 3,
        allowed_patterns: list[str] | None = None,
        blocked_patterns: list[str] | None = None,
        poll_interval: float = 2.0,
        discovery_timeout: float = 600.0,
        on_progress: Callable[[DiscoveryProgress], Awaitable[None] | None] | None = None,
        **options: Any,
    ) -> AsyncIterator[list[str]]:
        """Yield batches of URLs as they're discovered.

        Starts a discovery job and polls for new URLs, yielding batches
        as they become available. This allows content fetching to begin
        while discovery is still running.

        Args:
            source: Starting URL for discovery.
            batch_size: URLs per batch (default from config).
            max_pages: Maximum pages to discover.
            max_depth: Maximum crawl depth.
            allowed_patterns: URL patterns to include (glob or regex).
            blocked_patterns: URL patterns to exclude (glob or regex).
            poll_interval: Seconds between status polls.
            discovery_timeout: Maximum time for discovery in seconds.
            on_progress: Optional callback for progress updates (sync or async).
            **options: Additional discovery options passed to DiscoveryConfig.

        Yields:
            Lists of discovered URLs, batch_size at a time.
        """
        batch_size = batch_size or self._config.url_batch_size
        allowed_patterns = allowed_patterns or []
        blocked_patterns = blocked_patterns or []

        # Use bounded queue for discovered URLs
        # Queue size is 2x batch_size to allow producer to stay ahead
        url_queue: BoundedQueue[str] = BoundedQueue(maxsize=batch_size * 2)

        # Tracking state
        seen_urls: set[str] = set()
        last_yielded_count = 0

        # Build discovery config from options
        discovery_config = self._build_discovery_config(
            max_pages=max_pages,
            max_depth=max_depth,
            **options,
        )

        async def poll_and_queue_urls() -> None:
            """Poll discovery job and push new URLs to queue."""
            nonlocal last_yielded_count

            # Create discovery job
            job_id = await self._discovery_client.create_job(source, discovery_config)
            self._logger.info(
                "Started streaming discovery job %s for %s",
                job_id,
                source,
            )

            try:
                loop = asyncio.get_event_loop()
                start_time = loop.time()

                while True:
                    # Get job status with URLs
                    status = await self._discovery_client.get_job_status(
                        job_id,
                        include_urls=True,
                    )

                    # Process any new URLs from this poll
                    if status.urls:
                        new_urls_added = 0
                        for discovered_url in status.urls:
                            url = discovered_url.url

                            if url in seen_urls:
                                continue

                            if not self._should_include_url(
                                url, allowed_patterns, blocked_patterns
                            ):
                                continue

                            seen_urls.add(url)
                            await url_queue.put(url)  # Backpressure if queue full
                            new_urls_added += 1

                        if new_urls_added > 0:
                            self._logger.debug(
                                "Queued %d new URLs (total seen: %d)",
                                new_urls_added,
                                len(seen_urls),
                            )

                    # Check if job is complete
                    if status.is_terminal():
                        if status.status == "completed":
                            self._logger.info(
                                "Discovery completed: %d URLs discovered in %.1fs",
                                len(seen_urls),
                                status.stats.duration_seconds if status.stats else 0.0,
                            )
                        elif status.status == "failed":
                            self._logger.error(
                                "Discovery failed: %s",
                                status.error or "Unknown error",
                            )
                        break

                    # Check timeout
                    elapsed = loop.time() - start_time
                    if elapsed >= discovery_timeout:
                        self._logger.warning(
                            "Discovery timeout after %.1fs, cancelling job %s",
                            elapsed,
                            job_id,
                        )
                        await self._discovery_client.cancel_job(job_id)
                        break

                    # Call progress callback if provided and we have progress
                    if status.progress and on_progress:
                        result = on_progress(status.progress)
                        # Handle async callbacks
                        if asyncio.iscoroutine(result):
                            await result

                    # Log progress
                    if status.progress:
                        self._logger.debug(
                            "Discovery progress: %d%% (%d pages, %d URLs)",
                            status.progress.percent,
                            status.progress.pages_crawled,
                            status.progress.urls_discovered,
                        )

                    # Wait before next poll
                    await asyncio.sleep(poll_interval)

            except asyncio.CancelledError:
                self._logger.warning("Discovery cancelled, cleaning up job %s", job_id)
                try:
                    await self._discovery_client.cancel_job(job_id)
                except Exception as e:
                    self._logger.error("Failed to cancel job %s: %s", job_id, e)
                raise
            except Exception as e:
                self._logger.error("Discovery error: %s", e)
                try:
                    await self._discovery_client.cancel_job(job_id)
                except Exception as cancel_e:
                    self._logger.error("Failed to cancel job %s: %s", job_id, cancel_e)
                raise
            finally:
                url_queue.close()

        # Start discovery in background task
        discovery_task = asyncio.create_task(poll_and_queue_urls())

        # Yield batches from queue
        batch: list[str] = []
        try:
            async for url in url_queue:
                batch.append(url)
                if len(batch) >= batch_size:
                    yield batch
                    batch = []

            # Yield remaining URLs
            if batch:
                yield batch

        finally:
            # Ensure discovery task is complete or cancelled
            if not discovery_task.done():
                discovery_task.cancel()
                with contextlib.suppress(asyncio.CancelledError):
                    await discovery_task

    def _build_discovery_config(
        self,
        max_pages: int,
        max_depth: int,
        **options: Any,
    ) -> DiscoveryConfig:
        """Build DiscoveryConfig from parameters and options.

        Args:
            max_pages: Maximum pages to crawl.
            max_depth: Maximum crawl depth.
            **options: Additional discovery options.

        Returns:
            Configured DiscoveryConfig instance.
        """
        return DiscoveryConfig(
            max_depth=max_depth,
            max_pages=max_pages,
            same_domain=options.get("same_domain", True),
            include_subdomains=options.get("include_subdomains", True),
            respect_robots=options.get("respect_robots", True),
            parse_sitemap=options.get("parse_sitemap", False),
            with_metadata=options.get("with_metadata", True),
            crawl_timeout=options.get("crawl_timeout", 300),
            concurrent_requests=options.get("concurrent_requests", 5),
            request_delay=options.get("request_delay", 100),
            include_pattern=options.get("include_pattern"),
            exclude_pattern=options.get("exclude_pattern"),
            path_prefix=options.get("path_prefix"),
        )

    def _should_include_url(
        self,
        url: str,
        allowed_patterns: list[str],
        blocked_patterns: list[str],
    ) -> bool:
        """Check if a URL should be included based on patterns.

        Args:
            url: The URL to check.
            allowed_patterns: Patterns that must match (if any).
            blocked_patterns: Patterns that must not match.

        Returns:
            True if URL should be included.
        """
        # Check blocked patterns first
        for pattern in blocked_patterns:
            if self._matches_pattern(url, pattern):
                return False

        # If allowed patterns specified, at least one must match
        if allowed_patterns:
            return any(self._matches_pattern(url, p) for p in allowed_patterns)

        return True

    def _matches_pattern(self, url: str, pattern: str) -> bool:
        """Check if URL matches a pattern.

        Supports both glob patterns (with *) and regex patterns.

        Args:
            url: The URL to check.
            pattern: The pattern to match against.

        Returns:
            True if URL matches the pattern.
        """
        # Try fnmatch for glob patterns
        if "*" in pattern or "?" in pattern:
            return fnmatch.fnmatch(url, pattern)

        # Try regex
        try:
            return bool(re.search(pattern, url))
        except re.error:
            # Invalid regex, try substring match
            return pattern in url