gnosisllm-knowledge 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gnosisllm_knowledge/__init__.py +91 -39
- gnosisllm_knowledge/api/__init__.py +3 -2
- gnosisllm_knowledge/api/knowledge.py +502 -32
- gnosisllm_knowledge/api/memory.py +966 -0
- gnosisllm_knowledge/backends/__init__.py +14 -5
- gnosisllm_knowledge/backends/memory/indexer.py +27 -2
- gnosisllm_knowledge/backends/memory/searcher.py +111 -10
- gnosisllm_knowledge/backends/opensearch/agentic.py +355 -48
- gnosisllm_knowledge/backends/opensearch/config.py +49 -28
- gnosisllm_knowledge/backends/opensearch/indexer.py +49 -3
- gnosisllm_knowledge/backends/opensearch/mappings.py +14 -5
- gnosisllm_knowledge/backends/opensearch/memory/__init__.py +12 -0
- gnosisllm_knowledge/backends/opensearch/memory/client.py +1380 -0
- gnosisllm_knowledge/backends/opensearch/memory/config.py +127 -0
- gnosisllm_knowledge/backends/opensearch/memory/setup.py +322 -0
- gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
- gnosisllm_knowledge/backends/opensearch/searcher.py +238 -0
- gnosisllm_knowledge/backends/opensearch/setup.py +308 -148
- gnosisllm_knowledge/cli/app.py +436 -31
- gnosisllm_knowledge/cli/commands/agentic.py +26 -9
- gnosisllm_knowledge/cli/commands/load.py +169 -19
- gnosisllm_knowledge/cli/commands/memory.py +733 -0
- gnosisllm_knowledge/cli/commands/search.py +9 -10
- gnosisllm_knowledge/cli/commands/setup.py +49 -23
- gnosisllm_knowledge/cli/display/service.py +43 -0
- gnosisllm_knowledge/cli/utils/config.py +62 -4
- gnosisllm_knowledge/core/domain/__init__.py +54 -0
- gnosisllm_knowledge/core/domain/discovery.py +166 -0
- gnosisllm_knowledge/core/domain/document.py +19 -19
- gnosisllm_knowledge/core/domain/memory.py +440 -0
- gnosisllm_knowledge/core/domain/result.py +11 -3
- gnosisllm_knowledge/core/domain/search.py +12 -25
- gnosisllm_knowledge/core/domain/source.py +11 -12
- gnosisllm_knowledge/core/events/__init__.py +8 -0
- gnosisllm_knowledge/core/events/types.py +198 -5
- gnosisllm_knowledge/core/exceptions.py +227 -0
- gnosisllm_knowledge/core/interfaces/__init__.py +17 -0
- gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
- gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
- gnosisllm_knowledge/core/interfaces/memory.py +524 -0
- gnosisllm_knowledge/core/interfaces/searcher.py +10 -1
- gnosisllm_knowledge/core/interfaces/streaming.py +133 -0
- gnosisllm_knowledge/core/streaming/__init__.py +36 -0
- gnosisllm_knowledge/core/streaming/pipeline.py +228 -0
- gnosisllm_knowledge/fetchers/__init__.py +8 -0
- gnosisllm_knowledge/fetchers/config.py +27 -0
- gnosisllm_knowledge/fetchers/neoreader.py +31 -3
- gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
- gnosisllm_knowledge/loaders/__init__.py +5 -1
- gnosisllm_knowledge/loaders/base.py +3 -4
- gnosisllm_knowledge/loaders/discovery.py +338 -0
- gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
- gnosisllm_knowledge/loaders/factory.py +46 -0
- gnosisllm_knowledge/loaders/sitemap.py +129 -1
- gnosisllm_knowledge/loaders/sitemap_streaming.py +258 -0
- gnosisllm_knowledge/services/indexing.py +100 -93
- gnosisllm_knowledge/services/search.py +84 -31
- gnosisllm_knowledge/services/streaming_pipeline.py +334 -0
- {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/METADATA +73 -10
- gnosisllm_knowledge-0.4.0.dist-info/RECORD +81 -0
- gnosisllm_knowledge-0.2.0.dist-info/RECORD +0 -64
- {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/WHEEL +0 -0
- {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/entry_points.txt +0 -0
gnosisllm_knowledge/loaders/factory.py

@@ -9,7 +9,11 @@ from typing import Any
from gnosisllm_knowledge.core.events.emitter import EventEmitter
from gnosisllm_knowledge.core.interfaces.chunker import ITextChunker
from gnosisllm_knowledge.core.interfaces.fetcher import IContentFetcher
+from gnosisllm_knowledge.fetchers.config import NeoreaderConfig
+from gnosisllm_knowledge.fetchers.neoreader import NeoreaderContentFetcher
+from gnosisllm_knowledge.fetchers.neoreader_discovery import NeoreaderDiscoveryClient
from gnosisllm_knowledge.loaders.base import BaseLoader
+from gnosisllm_knowledge.loaders.discovery import DiscoveryLoader
from gnosisllm_knowledge.loaders.sitemap import SitemapLoader
from gnosisllm_knowledge.loaders.website import WebsiteLoader
@@ -20,6 +24,43 @@ LoaderCreator = Callable[
]


+def _create_discovery_loader(
+    fetcher: IContentFetcher,
+    chunker: ITextChunker,
+    config: dict[str, Any] | None,
+    event_emitter: EventEmitter | None,
+) -> DiscoveryLoader:
+    """Factory function for creating DiscoveryLoader instances.
+
+    Creates a DiscoveryLoader with a NeoreaderDiscoveryClient. If the fetcher
+    is a NeoreaderContentFetcher, reuses its config to ensure consistency.
+    Otherwise, creates config from environment variables.
+
+    Args:
+        fetcher: Content fetcher for retrieving URL content.
+        chunker: Text chunker for splitting content.
+        config: Optional configuration dictionary.
+        event_emitter: Optional event emitter for progress events.
+
+    Returns:
+        Configured DiscoveryLoader instance.
+    """
+    # Get config from fetcher if it's NeoreaderContentFetcher, otherwise use env
+    if isinstance(fetcher, NeoreaderContentFetcher):
+        neoreader_config = fetcher.config
+    else:
+        neoreader_config = NeoreaderConfig.from_env()
+
+    discovery_client = NeoreaderDiscoveryClient(neoreader_config)
+    return DiscoveryLoader(
+        fetcher=fetcher,
+        chunker=chunker,
+        discovery_client=discovery_client,
+        config=config,
+        event_emitter=event_emitter,
+    )
+
+
class LoaderFactory:
    """Factory for creating content loaders (Registry Pattern).

@@ -29,6 +70,7 @@ class LoaderFactory:
    Built-in loaders:
    - website: Single URL loading
    - sitemap: Sitemap XML with recursive discovery
+    - discovery: Website crawling via Neo Reader Discovery API

    Example:
        ```python
@@ -40,6 +82,9 @@ class LoaderFactory:
        # Explicit type
        loader = factory.create("sitemap", config={"max_urls": 500})

+        # Discovery loader for full website crawling
+        loader = factory.create("discovery", config={"max_depth": 3, "max_pages": 100})
+
        # Register custom loader
        factory.register("custom", MyCustomLoader)
        ```
@@ -76,6 +121,7 @@ class LoaderFactory:
        """Register built-in loader types."""
        self.register("website", lambda f, c, cfg, e: WebsiteLoader(f, c, cfg, e))
        self.register("sitemap", lambda f, c, cfg, e: SitemapLoader(f, c, cfg, e))
+        self.register("discovery", _create_discovery_loader)

    def register(self, name: str, creator: LoaderCreator) -> None:
        """Register a loader type.
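Note: a creator registered with the factory only needs to accept the same four arguments as the built-in lambdas above (fetcher, chunker, config dict, event emitter) and return a loader; the exact `LoaderCreator` alias is truncated in the hunk header. A minimal sketch of a custom registration under that assumption — the `"wrapped-website"` name and the commented-out wiring are illustrative, not part of the package:

```python
from typing import Any

from gnosisllm_knowledge.core.events.emitter import EventEmitter
from gnosisllm_knowledge.core.interfaces.chunker import ITextChunker
from gnosisllm_knowledge.core.interfaces.fetcher import IContentFetcher
from gnosisllm_knowledge.loaders.base import BaseLoader
from gnosisllm_knowledge.loaders.website import WebsiteLoader


def create_wrapped_website_loader(
    fetcher: IContentFetcher,
    chunker: ITextChunker,
    config: dict[str, Any] | None,
    event_emitter: EventEmitter | None,
) -> BaseLoader:
    # A creator accepts the same four arguments as the built-in lambdas and
    # returns a loader; here it simply delegates to WebsiteLoader.
    return WebsiteLoader(fetcher, chunker, config, event_emitter)


# Registration then mirrors the built-ins registered above:
#   factory.register("wrapped-website", create_wrapped_website_loader)
#   loader = factory.create("wrapped-website", config={"max_urls": 100})
```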
gnosisllm_knowledge/loaders/sitemap.py

@@ -6,13 +6,18 @@ import asyncio
import fnmatch
import logging
import re
+from collections.abc import Awaitable, Callable
from typing import Any
from xml.etree import ElementTree

import httpx

-from gnosisllm_knowledge.core.
+from gnosisllm_knowledge.core.domain.document import Document
+from gnosisllm_knowledge.core.domain.result import IndexResult
+from gnosisllm_knowledge.core.events.types import SitemapDiscoveryEvent, UrlBatchProcessedEvent
+from gnosisllm_knowledge.core.streaming.pipeline import PipelineConfig
from gnosisllm_knowledge.loaders.base import BaseLoader
+from gnosisllm_knowledge.loaders.sitemap_streaming import StreamingSitemapDiscoverer

# XML namespace for sitemaps
SITEMAP_NS = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
@@ -283,3 +288,126 @@ class SitemapLoader(BaseLoader):
        except re.error:
            # Invalid regex, try substring match
            return pattern in url
+
+    async def load_streaming_with_indexing(
+        self,
+        source: str,
+        index_callback: Callable[[list[Document]], Awaitable[IndexResult]],
+        url_batch_size: int = 50,
+        doc_batch_size: int = 100,
+        config: PipelineConfig | None = None,
+        **options: Any,
+    ) -> IndexResult:
+        """Load sitemap with streaming URL discovery and progressive indexing.
+
+        This method:
+        1. Discovers URLs in batches (not all at once)
+        2. Fetches content for each URL batch
+        3. Indexes documents immediately after fetching
+        4. Moves to next batch only after indexing completes
+
+        Memory usage is bounded by:
+        - url_batch_size * avg_url_length (URL strings)
+        - doc_batch_size * avg_doc_size (document content)
+        - fetch_concurrency * avg_page_size (in-flight fetches)
+
+        Args:
+            source: Sitemap URL.
+            index_callback: Called with each batch of documents to index.
+            url_batch_size: URLs to process per iteration.
+            doc_batch_size: Documents per index batch.
+            config: Pipeline configuration.
+            **options: Additional loader options.
+
+        Returns:
+            Aggregated IndexResult.
+        """
+        config = config or PipelineConfig(url_batch_size=url_batch_size)
+        discoverer = StreamingSitemapDiscoverer(config=config)
+
+        total_indexed = 0
+        total_failed = 0
+        all_errors: list[dict[str, Any]] = []
+        batch_index = 0
+        total_urls_processed = 0
+
+        async for url_batch in discoverer.discover_urls_streaming(
+            sitemap_url=source,
+            batch_size=url_batch_size,
+            max_urls=options.get("max_urls", 10000),
+            max_depth=options.get("max_depth", 3),
+            allowed_patterns=options.get("allowed_patterns", []),
+            blocked_patterns=options.get("blocked_patterns", []),
+        ):
+            # Fetch content for this batch of URLs
+            documents = await self._fetch_url_batch(url_batch, source, **options)
+            total_urls_processed += len(url_batch)
+
+            # Emit URL batch processed event
+            self._events.emit(
+                UrlBatchProcessedEvent(
+                    batch_index=batch_index,
+                    urls_in_batch=len(url_batch),
+                    documents_created=len(documents),
+                    total_urls_processed=total_urls_processed,
+                )
+            )
+
+            # Index in sub-batches
+            for i in range(0, len(documents), doc_batch_size):
+                doc_batch = documents[i : i + doc_batch_size]
+                result = await index_callback(doc_batch)
+                total_indexed += result.indexed_count
+                total_failed += result.failed_count
+                if result.errors:
+                    all_errors.extend(result.errors)
+
+            # Memory freed: url_batch and documents go out of scope
+            self._sitemap_logger.info(
+                f"Processed URL batch {batch_index}: {len(url_batch)} URLs, "
+                f"{len(documents)} docs, {total_indexed} total indexed"
+            )
+            batch_index += 1
+
+        return IndexResult(
+            success=total_failed == 0,
+            indexed_count=total_indexed,
+            failed_count=total_failed,
+            errors=all_errors,
+        )
+
+    async def _fetch_url_batch(
+        self,
+        urls: list[str],
+        source: str,
+        **options: Any,
+    ) -> list[Document]:
+        """Fetch and chunk content for a batch of URLs.
+
+        Args:
+            urls: List of URLs to fetch.
+            source: Original source identifier.
+            **options: Loader options.
+
+        Returns:
+            List of Document objects from all URLs.
+        """
+        semaphore = asyncio.Semaphore(self._max_concurrent)
+
+        async def fetch_one(url: str) -> list[Document]:
+            async with semaphore:
+                return await self._load_url(url, source, **options)
+
+        results = await asyncio.gather(
+            *[fetch_one(url) for url in urls],
+            return_exceptions=True,
+        )
+
+        documents: list[Document] = []
+        for i, result in enumerate(results):
+            if isinstance(result, Exception):
+                self._sitemap_logger.error(f"Failed to fetch {urls[i]}: {result}")
+            elif isinstance(result, list):
+                documents.extend(result)
+
+        return documents
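A minimal sketch of how a caller might drive the new `load_streaming_with_indexing` method. It assumes a `SitemapLoader` instance has already been constructed elsewhere (for example via `LoaderFactory`), and the callback is a stand-in that only counts documents instead of calling a real indexer:

```python
from gnosisllm_knowledge.core.domain.document import Document
from gnosisllm_knowledge.core.domain.result import IndexResult
from gnosisllm_knowledge.loaders.sitemap import SitemapLoader


async def stream_index_sitemap(loader: SitemapLoader, sitemap_url: str) -> IndexResult:
    """Drive the streaming pipeline with a trivial stand-in callback."""

    async def index_batch(docs: list[Document]) -> IndexResult:
        # A real callback would hand the batch to an indexer (e.g. the
        # OpenSearch backend); this stub only reports what it received.
        return IndexResult(
            success=True,
            indexed_count=len(docs),
            failed_count=0,
            errors=[],
        )

    return await loader.load_streaming_with_indexing(
        source=sitemap_url,
        index_callback=index_batch,
        url_batch_size=50,
        doc_batch_size=100,
        max_urls=1000,  # forwarded via **options to the discoverer
    )
```

Each batch is indexed before the next batch of URLs is fetched, which is what keeps memory bounded during large sitemap loads.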
gnosisllm_knowledge/loaders/sitemap_streaming.py

@@ -0,0 +1,258 @@
+"""Streaming sitemap discovery with bounded memory.
+
+This module provides streaming URL discovery from sitemaps, yielding
+batches of URLs as they're discovered rather than collecting all URLs
+first. This enables immediate processing and keeps memory bounded.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import fnmatch
+import logging
+import re
+from collections.abc import AsyncIterator
+from typing import Any
+from xml.etree import ElementTree
+
+import httpx
+
+from gnosisllm_knowledge.core.streaming.pipeline import BoundedQueue, PipelineConfig
+
+# XML namespace for sitemaps
+SITEMAP_NS = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
+
+
+class StreamingSitemapDiscoverer:
+    """Discovers sitemap URLs in a streaming fashion.
+
+    Instead of collecting all URLs before processing, this yields
+    batches of URLs as they're discovered, enabling immediate processing.
+
+    Key differences from SitemapLoader._get_urls():
+    - Yields batches instead of returning complete list
+    - Uses bounded queue for backpressure
+    - Memory usage is O(batch_size) not O(total_urls)
+
+    Example:
+        ```python
+        discoverer = StreamingSitemapDiscoverer()
+
+        async for url_batch in discoverer.discover_urls_streaming(
+            sitemap_url="https://example.com/sitemap.xml",
+            batch_size=50,
+            max_urls=1000,
+        ):
+            # Process batch immediately
+            for url in url_batch:
+                await fetch_and_process(url)
+        ```
+    """
+
+    def __init__(
+        self,
+        config: PipelineConfig | None = None,
+    ) -> None:
+        """Initialize the streaming sitemap discoverer.
+
+        Args:
+            config: Pipeline configuration with batch sizes and concurrency.
+        """
+        self._config = config or PipelineConfig()
+        self._logger = logging.getLogger(__name__)
+
+    async def discover_urls_streaming(
+        self,
+        sitemap_url: str,
+        batch_size: int | None = None,
+        max_urls: int = 10000,
+        max_depth: int = 3,
+        allowed_patterns: list[str] | None = None,
+        blocked_patterns: list[str] | None = None,
+        **options: Any,
+    ) -> AsyncIterator[list[str]]:
+        """Yield batches of URLs as they're discovered.
+
+        Args:
+            sitemap_url: Root sitemap URL.
+            batch_size: URLs per batch (default from config).
+            max_urls: Maximum total URLs to discover.
+            max_depth: Maximum sitemap recursion depth.
+            allowed_patterns: URL patterns to include.
+            blocked_patterns: URL patterns to exclude.
+            **options: Additional options (unused, for compatibility).
+
+        Yields:
+            Lists of discovered URLs, batch_size at a time.
+        """
+        batch_size = batch_size or self._config.url_batch_size
+        allowed_patterns = allowed_patterns or []
+        blocked_patterns = blocked_patterns or []
+
+        # Use bounded queue for discovered URLs
+        # Queue size is 2x batch_size to allow producer to stay ahead
+        url_queue: BoundedQueue[str] = BoundedQueue(maxsize=batch_size * 2)
+
+        # Tracking state
+        discovered_count = 0
+        seen_urls: set[str] = set()
+
+        async def discover_recursive(url: str, depth: int) -> None:
+            """Recursively discover URLs, pushing to queue."""
+            nonlocal discovered_count
+
+            if depth > max_depth or discovered_count >= max_urls:
+                return
+
+            content = await self._fetch_sitemap(url)
+            if not content:
+                return
+
+            try:
+                root = ElementTree.fromstring(content)
+            except ElementTree.ParseError as e:
+                self._logger.error(f"Failed to parse sitemap {url}: {e}")
+                return
+
+            # Check for sitemap index
+            sitemap_refs = root.findall(".//sm:sitemap/sm:loc", SITEMAP_NS)
+            if sitemap_refs:
+                self._logger.info(
+                    f"Found sitemap index with {len(sitemap_refs)} sitemaps at depth {depth}"
+                )
+                # Process nested sitemaps (limited parallelism to avoid overwhelming)
+                tasks = []
+                for ref in sitemap_refs[:10]:  # Limit parallel sitemap fetches
+                    if ref.text and discovered_count < max_urls:
+                        tasks.append(discover_recursive(ref.text.strip(), depth + 1))
+                await asyncio.gather(*tasks, return_exceptions=True)
+                return
+
+            # Process URL entries
+            url_elements = root.findall(".//sm:url/sm:loc", SITEMAP_NS)
+            for url_elem in url_elements:
+                if url_elem.text and discovered_count < max_urls:
+                    page_url = url_elem.text.strip()
+
+                    if page_url in seen_urls:
+                        continue
+
+                    if not self._should_include_url(
+                        page_url, allowed_patterns, blocked_patterns
+                    ):
+                        continue
+
+                    seen_urls.add(page_url)
+                    await url_queue.put(page_url)  # Backpressure if queue full
+                    discovered_count += 1
+
+                    if discovered_count % 100 == 0:
+                        self._logger.debug(f"Discovered {discovered_count} URLs so far")
+
+        async def discover_and_close() -> None:
+            """Run discovery and close queue when done."""
+            try:
+                await discover_recursive(sitemap_url, depth=0)
+            except Exception as e:
+                self._logger.error(f"Discovery error: {e}")
+            finally:
+                url_queue.close()
+
+        # Start discovery in background task
+        discovery_task = asyncio.create_task(discover_and_close())
+
+        # Yield batches from queue
+        batch: list[str] = []
+        try:
+            async for url in url_queue:
+                batch.append(url)
+                if len(batch) >= batch_size:
+                    yield batch
+                    batch = []
+
+            # Yield remaining URLs
+            if batch:
+                yield batch
+
+        finally:
+            # Ensure discovery task is complete or cancelled
+            if not discovery_task.done():
+                discovery_task.cancel()
+                try:
+                    await discovery_task
+                except asyncio.CancelledError:
+                    pass
+
+    async def _fetch_sitemap(self, url: str) -> str | None:
+        """Fetch sitemap XML content.
+
+        Args:
+            url: The sitemap URL to fetch.
+
+        Returns:
+            Sitemap XML content or None if fetch failed.
+        """
+        try:
+            async with httpx.AsyncClient(
+                timeout=self._config.fetch_timeout_seconds
+            ) as client:
+                response = await client.get(
+                    url,
+                    headers={"Accept": "application/xml, text/xml, */*"},
+                    follow_redirects=True,
+                )
+                response.raise_for_status()
+                return response.text
+        except Exception as e:
+            self._logger.error(f"Failed to fetch sitemap {url}: {e}")
+            return None
+
+    def _should_include_url(
+        self,
+        url: str,
+        allowed_patterns: list[str],
+        blocked_patterns: list[str],
+    ) -> bool:
+        """Check if a URL should be included based on patterns.
+
+        Args:
+            url: The URL to check.
+            allowed_patterns: Patterns that must match (if any).
+            blocked_patterns: Patterns that must not match.
+
+        Returns:
+            True if URL should be included.
+        """
+        # Check blocked patterns first
+        for pattern in blocked_patterns:
+            if self._matches_pattern(url, pattern):
+                return False
+
+        # If allowed patterns specified, at least one must match
+        if allowed_patterns:
+            return any(self._matches_pattern(url, p) for p in allowed_patterns)
+
+        return True
+
+    def _matches_pattern(self, url: str, pattern: str) -> bool:
+        """Check if URL matches a pattern.
+
+        Supports both glob patterns (with *) and regex patterns.
+
+        Args:
+            url: The URL to check.
+            pattern: The pattern to match against.
+
+        Returns:
+            True if URL matches the pattern.
+        """
+        # Try fnmatch for glob patterns
+        if "*" in pattern or "?" in pattern:
+            return fnmatch.fnmatch(url, pattern)
+
+        # Try regex
+        try:
+            return bool(re.search(pattern, url))
+        except re.error:
+            # Invalid regex, try substring match
+            return pattern in url
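The discoverer relies on `BoundedQueue` from `core.streaming.pipeline` (not shown in this diff) for backpressure: `put()` blocks once the queue holds `maxsize` items, `close()` marks the end of discovery, and the consumer drains it with `async for`. A self-contained sketch of a queue with that contract, written only to illustrate the pattern; the package's actual `BoundedQueue` implementation may differ:

```python
import asyncio
from collections.abc import AsyncIterator
from typing import Generic, TypeVar

T = TypeVar("T")


class ClosableBoundedQueue(Generic[T]):
    """Illustrative stand-in for a bounded, closable async queue."""

    def __init__(self, maxsize: int) -> None:
        self._queue: asyncio.Queue[T] = asyncio.Queue(maxsize=maxsize)
        self._closed = False

    async def put(self, item: T) -> None:
        # Blocks (backpressure) while the queue already holds `maxsize` items.
        await self._queue.put(item)

    def close(self) -> None:
        # Producer signals that no more items will arrive.
        self._closed = True

    async def __aiter__(self) -> AsyncIterator[T]:
        # Drain until the producer has closed the queue and it is empty.
        # A short poll keeps this sketch simple; a production queue would use
        # a sentinel or an event instead of a timeout.
        while not (self._closed and self._queue.empty()):
            try:
                yield await asyncio.wait_for(self._queue.get(), timeout=0.1)
            except asyncio.TimeoutError:
                continue
```

Because the producer blocks in `put()` whenever the consumer falls behind, discovery never runs more than `maxsize` URLs ahead of processing, which is what keeps memory at O(batch_size) rather than O(total_urls).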