gnosisllm-knowledge 0.3.0__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. The information is provided for informational purposes only.
Files changed (45)
  1. gnosisllm_knowledge/api/knowledge.py +233 -35
  2. gnosisllm_knowledge/backends/memory/indexer.py +27 -2
  3. gnosisllm_knowledge/backends/memory/searcher.py +132 -10
  4. gnosisllm_knowledge/backends/opensearch/agentic.py +14 -9
  5. gnosisllm_knowledge/backends/opensearch/config.py +7 -0
  6. gnosisllm_knowledge/backends/opensearch/indexer.py +48 -3
  7. gnosisllm_knowledge/backends/opensearch/mappings.py +12 -4
  8. gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
  9. gnosisllm_knowledge/backends/opensearch/searcher.py +64 -6
  10. gnosisllm_knowledge/backends/opensearch/setup.py +29 -33
  11. gnosisllm_knowledge/cli/app.py +58 -19
  12. gnosisllm_knowledge/cli/commands/agentic.py +15 -9
  13. gnosisllm_knowledge/cli/commands/load.py +169 -19
  14. gnosisllm_knowledge/cli/commands/memory.py +10 -0
  15. gnosisllm_knowledge/cli/commands/search.py +9 -10
  16. gnosisllm_knowledge/cli/commands/setup.py +25 -1
  17. gnosisllm_knowledge/cli/utils/config.py +4 -4
  18. gnosisllm_knowledge/core/domain/__init__.py +13 -0
  19. gnosisllm_knowledge/core/domain/discovery.py +166 -0
  20. gnosisllm_knowledge/core/domain/document.py +14 -19
  21. gnosisllm_knowledge/core/domain/search.py +10 -25
  22. gnosisllm_knowledge/core/domain/source.py +11 -12
  23. gnosisllm_knowledge/core/events/__init__.py +8 -0
  24. gnosisllm_knowledge/core/events/types.py +122 -5
  25. gnosisllm_knowledge/core/exceptions.py +93 -0
  26. gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
  27. gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
  28. gnosisllm_knowledge/core/interfaces/searcher.py +30 -1
  29. gnosisllm_knowledge/core/interfaces/streaming.py +10 -4
  30. gnosisllm_knowledge/fetchers/__init__.py +8 -0
  31. gnosisllm_knowledge/fetchers/config.py +27 -0
  32. gnosisllm_knowledge/fetchers/neoreader.py +31 -3
  33. gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
  34. gnosisllm_knowledge/loaders/__init__.py +5 -1
  35. gnosisllm_knowledge/loaders/discovery.py +338 -0
  36. gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
  37. gnosisllm_knowledge/loaders/factory.py +46 -0
  38. gnosisllm_knowledge/services/indexing.py +51 -21
  39. gnosisllm_knowledge/services/search.py +42 -28
  40. gnosisllm_knowledge/services/streaming_pipeline.py +45 -7
  41. {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/METADATA +30 -10
  42. gnosisllm_knowledge-0.4.3.dist-info/RECORD +81 -0
  43. gnosisllm_knowledge-0.3.0.dist-info/RECORD +0 -77
  44. {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/WHEEL +0 -0
  45. {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/entry_points.txt +0 -0
gnosisllm_knowledge/fetchers/neoreader_discovery.py (new file)
@@ -0,0 +1,505 @@
+ """Neo Reader Discovery API client for website crawling and URL discovery."""
+
+ from __future__ import annotations
+
+ import asyncio
+ import logging
+ from collections.abc import Awaitable, Callable
+ from typing import Any
+
+ import httpx
+
+ from gnosisllm_knowledge.core.domain.discovery import (
+     DiscoveredURL,
+     DiscoveryConfig,
+     DiscoveryJobStatus,
+     DiscoveryProgress,
+     DiscoveryStats,
+ )
+ from gnosisllm_knowledge.core.exceptions import (
+     ConnectionError,
+     DiscoveryJobFailedError,
+     DiscoveryTimeoutError,
+     FetchError,
+ )
+ from gnosisllm_knowledge.fetchers.config import NeoreaderConfig
+
+
+ class NeoreaderDiscoveryClient:
+     """Client for Neo Reader Discovery API.
+
+     Handles the lifecycle of discovery jobs: creating jobs, polling for status
+     with exponential backoff, and cancellation. Uses httpx.AsyncClient internally
+     for efficient async HTTP operations.
+
+     Example:
+         ```python
+         config = NeoreaderConfig.from_env()
+         client = NeoreaderDiscoveryClient(config)
+
+         # Create a discovery job
+         job_id = await client.create_job(
+             "https://docs.example.com",
+             DiscoveryConfig(max_depth=3, max_pages=100)
+         )
+
+         # Wait for completion with progress callback
+         result = await client.wait_for_completion(
+             job_id,
+             on_progress=lambda p: print(f"Progress: {p.percent}%")
+         )
+
+         # Get discovered URLs
+         for url in result.urls:
+             print(url.url)
+
+         await client.close()
+         ```
+     """
+
+     def __init__(self, config: NeoreaderConfig) -> None:
+         """Initialize the discovery client.
+
+         Args:
+             config: Neo Reader configuration with host, API key, etc.
+         """
+         self._config = config
+         self._logger = logging.getLogger(__name__)
+         self._client: httpx.AsyncClient | None = None
+
+     @classmethod
+     def from_env(cls) -> NeoreaderDiscoveryClient:
+         """Create client from environment variables.
+
+         Uses NeoreaderConfig.from_env() to load configuration from:
+         - NEOREADER_HOST
+         - NEOREADER_API_KEY
+         - NEOREADER_TIMEOUT
+
+         Returns:
+             NeoreaderDiscoveryClient configured from environment.
+         """
+         return cls(NeoreaderConfig.from_env())
+
+     @property
+     def config(self) -> NeoreaderConfig:
+         """Get the Neo Reader configuration.
+
+         Returns:
+             The configuration used by this client.
+         """
+         return self._config
+
+     async def _get_client(self) -> httpx.AsyncClient:
+         """Get or create HTTP client.
+
+         Creates a reusable httpx.AsyncClient with base URL, timeout,
+         and authentication headers configured.
+
+         Returns:
+             Configured httpx.AsyncClient instance.
+         """
+         if self._client is None:
+             headers: dict[str, str] = {}
+             if self._config.api_key:
+                 headers["Authorization"] = f"Bearer {self._config.api_key}"
+
+             self._client = httpx.AsyncClient(
+                 base_url=self._config.host,
+                 timeout=self._config.timeout,
+                 headers=headers,
+                 follow_redirects=True,
+             )
+         return self._client
+
+     async def close(self) -> None:
+         """Close HTTP client and release resources.
+
+         Should be called when done with the client to properly
+         close connections. Safe to call multiple times.
+         """
+         if self._client is not None:
+             await self._client.aclose()
+             self._client = None
+
+     async def __aenter__(self) -> NeoreaderDiscoveryClient:
+         """Enter async context manager.
+
+         Returns:
+             Self for use in async with statement.
+         """
+         return self
+
+     async def __aexit__(
+         self,
+         exc_type: type[BaseException] | None,
+         exc_val: BaseException | None,
+         exc_tb: Any,
+     ) -> None:
+         """Exit async context manager and close client."""
+         await self.close()
+
+     async def create_job(
+         self,
+         url: str,
+         discovery_config: DiscoveryConfig | None = None,
+     ) -> str:
+         """Create a discovery job for the given URL.
+
+         Initiates an async discovery crawl starting from the specified URL.
+         The job runs in the background on the Neo Reader server.
+
+         Args:
+             url: The starting URL for discovery.
+             discovery_config: Configuration for the crawl. Uses defaults if None.
+
+         Returns:
+             The job ID for tracking the discovery job.
+
+         Raises:
+             ConnectionError: If unable to connect to Neo Reader.
+             FetchError: If the API returns an error response.
+         """
+         config = discovery_config or DiscoveryConfig()
+         client = await self._get_client()
+
+         # Use the DiscoveryConfig.to_headers() method for clean conversion
+         headers = config.to_headers()
+
+         self._logger.debug(
+             "Creating discovery job for %s with config: max_depth=%d, max_pages=%d",
+             url,
+             config.max_depth,
+             config.max_pages,
+         )
+
+         try:
+             response = await client.post(
+                 f"/discover/{url}",
+                 headers=headers,
+             )
+             response.raise_for_status()
+
+             data = response.json()
+             job_id = data["job_id"]
+
+             self._logger.info("Created discovery job %s for %s", job_id, url)
+             return job_id
+
+         except httpx.ConnectError as e:
+             raise ConnectionError(
+                 f"Cannot connect to Neo Reader at {self._config.host}",
+                 host=self._config.host,
+                 cause=e,
+             ) from e
+         except httpx.HTTPStatusError as e:
+             raise FetchError(
+                 f"Failed to create discovery job: HTTP {e.response.status_code}",
+                 source=url,
+                 status_code=e.response.status_code,
+                 cause=e,
+             ) from e
+
+     async def get_job_status(
+         self,
+         job_id: str,
+         include_urls: bool = True,
+     ) -> DiscoveryJobStatus:
+         """Get the current status of a discovery job.
+
+         Fetches the job status, progress, stats, and optionally the
+         discovered URLs from the Neo Reader API.
+
+         Args:
+             job_id: The discovery job ID.
+             include_urls: Whether to include discovered URLs in the response.
+
+         Returns:
+             DiscoveryJobStatus with current job state.
+
+         Raises:
+             ConnectionError: If unable to connect to Neo Reader.
+             FetchError: If the API returns an error response.
+         """
+         client = await self._get_client()
+
+         params = {"include_urls": str(include_urls).lower()}
+
+         try:
+             response = await client.get(
+                 f"/discover/jobs/{job_id}",
+                 params=params,
+             )
+             response.raise_for_status()
+
+             data = response.json()
+             return self._parse_job_status(data)
+
+         except httpx.ConnectError as e:
+             raise ConnectionError(
+                 f"Cannot connect to Neo Reader at {self._config.host}",
+                 host=self._config.host,
+                 cause=e,
+             ) from e
+         except httpx.HTTPStatusError as e:
+             raise FetchError(
+                 f"Failed to get job status: HTTP {e.response.status_code}",
+                 source=job_id,
+                 status_code=e.response.status_code,
+                 cause=e,
+             ) from e
+
+     async def wait_for_completion(
+         self,
+         job_id: str,
+         *,
+         initial_interval: float = 1.0,
+         max_interval: float = 10.0,
+         backoff_factor: float = 1.5,
+         timeout: float = 600.0,
+         on_progress: Callable[[DiscoveryProgress], Awaitable[None] | None]
+         | None = None,
+     ) -> DiscoveryJobStatus:
+         """Poll until job completes or fails with exponential backoff.
+
+         Continuously polls the job status with exponential backoff between
+         requests. Calls the optional progress callback on each update.
+         Raises an exception if the job times out or fails.
+
+         Args:
+             job_id: The discovery job ID to wait for.
+             initial_interval: Initial polling interval in seconds.
+             max_interval: Maximum polling interval in seconds.
+             backoff_factor: Multiplier for interval increase (e.g., 1.5 = 50% increase).
+             timeout: Maximum time to wait for completion in seconds.
+             on_progress: Optional async or sync callback for progress updates.
+
+         Returns:
+             DiscoveryJobStatus with completed/failed/cancelled state.
+
+         Raises:
+             DiscoveryTimeoutError: If the job doesn't complete within timeout.
+             ConnectionError: If unable to connect to Neo Reader.
+             FetchError: If the API returns an error response.
+         """
+         loop = asyncio.get_event_loop()
+         start_time = loop.time()
+         interval = initial_interval
+
+         self._logger.info(
+             "Waiting for job %s to complete (timeout: %.0fs)",
+             job_id,
+             timeout,
+         )
+
+         while True:
+             status = await self.get_job_status(job_id)
+
+             # Call progress callback if provided and we have progress
+             if status.progress and on_progress:
+                 result = on_progress(status.progress)
+                 # Handle async callbacks
+                 if asyncio.iscoroutine(result):
+                     await result
+
+             # Check if job is in terminal state
+             if status.is_terminal():
+                 self._logger.info(
+                     "Job %s completed with status: %s",
+                     job_id,
+                     status.status,
+                 )
+                 return status
+
+             # Check timeout
+             elapsed = loop.time() - start_time
+             if elapsed >= timeout:
+                 self._logger.warning(
+                     "Job %s timed out after %.1fs",
+                     job_id,
+                     elapsed,
+                 )
+                 raise DiscoveryTimeoutError(
+                     f"Discovery job {job_id} timed out after {elapsed:.1f}s",
+                     job_id=job_id,
+                     elapsed=elapsed,
+                     timeout=timeout,
+                 )
+
+             # Wait with exponential backoff
+             self._logger.debug(
+                 "Job %s still running, waiting %.1fs before next poll",
+                 job_id,
+                 interval,
+             )
+             await asyncio.sleep(interval)
+             interval = min(interval * backoff_factor, max_interval)
+
+     async def cancel_job(self, job_id: str) -> bool:
+         """Cancel a running discovery job.
+
+         Sends a cancellation request to stop the job. The job will
+         transition to 'cancelled' status.
+
+         Args:
+             job_id: The discovery job ID to cancel.
+
+         Returns:
+             True if cancellation was successful, False if job was
+             already in a terminal state.
+
+         Raises:
+             ConnectionError: If unable to connect to Neo Reader.
+             FetchError: If the API returns an error response.
+         """
+         client = await self._get_client()
+
+         self._logger.info("Cancelling discovery job %s", job_id)
+
+         try:
+             response = await client.delete(f"/discover/jobs/{job_id}")
+             response.raise_for_status()
+
+             self._logger.info("Successfully cancelled job %s", job_id)
+             return True
+
+         except httpx.HTTPStatusError as e:
+             # 404 or similar might mean job is already completed/cancelled
+             if e.response.status_code == 404:
+                 self._logger.warning(
+                     "Job %s not found (may already be completed)",
+                     job_id,
+                 )
+                 return False
+             raise FetchError(
+                 f"Failed to cancel job: HTTP {e.response.status_code}",
+                 source=job_id,
+                 status_code=e.response.status_code,
+                 cause=e,
+             ) from e
+         except httpx.ConnectError as e:
+             raise ConnectionError(
+                 f"Cannot connect to Neo Reader at {self._config.host}",
+                 host=self._config.host,
+                 cause=e,
+             ) from e
+
+     async def discover(
+         self,
+         url: str,
+         discovery_config: DiscoveryConfig | None = None,
+         *,
+         timeout: float = 600.0,
+         on_progress: Callable[[DiscoveryProgress], Awaitable[None] | None]
+         | None = None,
+     ) -> list[str]:
+         """Convenience method to discover all URLs from a website.
+
+         Creates a job, waits for completion, and returns the discovered URLs.
+         Handles job cancellation on errors or interruption.
+
+         Args:
+             url: The starting URL for discovery.
+             discovery_config: Configuration for the crawl. Uses defaults if None.
+             timeout: Maximum time to wait for completion in seconds.
+             on_progress: Optional callback for progress updates.
+
+         Returns:
+             List of discovered URL strings.
+
+         Raises:
+             DiscoveryTimeoutError: If the job doesn't complete within timeout.
+             DiscoveryJobFailedError: If the job fails or is cancelled.
+             ConnectionError: If unable to connect to Neo Reader.
+             FetchError: If the API returns an error response.
+         """
+         job_id = await self.create_job(url, discovery_config)
+
+         try:
+             status = await self.wait_for_completion(
+                 job_id,
+                 timeout=timeout,
+                 on_progress=on_progress,
+             )
+
+             if status.status != "completed":
+                 raise DiscoveryJobFailedError(
+                     f"Discovery job {job_id} failed with status: {status.status}",
+                     job_id=job_id,
+                     status=status.status,
+                     source=url,
+                 )
+
+             return [u.url for u in status.urls]
+
+         except (asyncio.CancelledError, Exception) as e:
+             # Attempt to cancel the job on any error
+             self._logger.warning(
+                 "Cancelling job %s due to error: %s",
+                 job_id,
+                 e,
+             )
+             try:
+                 await self.cancel_job(job_id)
+             except Exception as cancel_err:
+                 self._logger.error(
+                     "Failed to cancel job %s: %s",
+                     job_id,
+                     cancel_err,
+                 )
+             raise
+
+     def _parse_job_status(self, data: dict[str, Any]) -> DiscoveryJobStatus:
+         """Parse API response into DiscoveryJobStatus.
+
+         Args:
+             data: Raw JSON response from the API.
+
+         Returns:
+             Parsed DiscoveryJobStatus instance.
+         """
+         # Parse progress if present
+         progress = None
+         if data.get("progress"):
+             progress = DiscoveryProgress(
+                 percent=data["progress"].get("percent", 0),
+                 pages_crawled=data["progress"].get("pages_crawled", 0),
+                 urls_discovered=data["progress"].get("urls_discovered", 0),
+                 current_depth=data["progress"].get("current_depth", 0),
+                 message=data["progress"].get("message", ""),
+             )
+
+         # Parse stats if present
+         stats = None
+         if data.get("stats"):
+             stats = DiscoveryStats(
+                 pages_crawled=data["stats"].get("pages_crawled", 0),
+                 urls_found=data["stats"].get("urls_found", 0),
+                 urls_returned=data["stats"].get("urls_returned", 0),
+                 urls_filtered=data["stats"].get("urls_filtered", 0),
+                 errors=data["stats"].get("errors", 0),
+                 duration_seconds=data["stats"].get("duration_seconds", 0.0),
+             )
+
+         # Parse URLs if present
+         urls: list[DiscoveredURL] = []
+         if data.get("urls"):
+             for u in data["urls"]:
+                 urls.append(
+                     DiscoveredURL(
+                         url=u["url"],
+                         depth=u.get("depth", 0),
+                         title=u.get("title"),
+                         is_internal=u.get("is_internal", True),
+                     )
+                 )
+
+         return DiscoveryJobStatus(
+             job_id=data["job_id"],
+             status=data["status"],
+             start_url=data["start_url"],
+             progress=progress,
+             stats=stats,
+             urls=urls,
+             error=data.get("error"),
+         )
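
To make the new client concrete, here is a minimal end-to-end sketch assembled from the docstrings above. The start URL and crawl limits are illustrative, and error handling is omitted; it uses only names shown in this diff (from_env, the async context manager, DiscoveryConfig, DiscoveryProgress, and discover).

```python
import asyncio

from gnosisllm_knowledge.core.domain.discovery import DiscoveryConfig, DiscoveryProgress
from gnosisllm_knowledge.fetchers.neoreader_discovery import NeoreaderDiscoveryClient


async def main() -> None:
    # Configuration is read from NEOREADER_HOST / NEOREADER_API_KEY / NEOREADER_TIMEOUT.
    async with NeoreaderDiscoveryClient.from_env() as client:

        def show(progress: DiscoveryProgress) -> None:
            # on_progress may be sync or async; a plain function is enough here.
            print(f"{progress.percent}% ({progress.urls_discovered} URLs found)")

        # discover() creates the job, polls with exponential backoff,
        # and attempts to cancel the job if anything goes wrong.
        urls = await client.discover(
            "https://docs.example.com",  # illustrative start URL
            DiscoveryConfig(max_depth=3, max_pages=100),
            timeout=600.0,
            on_progress=show,
        )
        for url in urls:
            print(url)


asyncio.run(main())
```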
gnosisllm_knowledge/loaders/__init__.py
@@ -1,13 +1,17 @@
  """Content loaders for various source types."""

  from gnosisllm_knowledge.loaders.base import BaseLoader
+ from gnosisllm_knowledge.loaders.discovery import DiscoveryLoader
+ from gnosisllm_knowledge.loaders.discovery_streaming import StreamingDiscoveryDiscoverer
  from gnosisllm_knowledge.loaders.factory import LoaderFactory
  from gnosisllm_knowledge.loaders.sitemap import SitemapLoader
  from gnosisllm_knowledge.loaders.website import WebsiteLoader

  __all__ = [
      "BaseLoader",
+     "DiscoveryLoader",
      "LoaderFactory",
-     "WebsiteLoader",
      "SitemapLoader",
+     "StreamingDiscoveryDiscoverer",
+     "WebsiteLoader",
  ]
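
The two new discovery loaders are re-exported from the package's loaders namespace, so downstream code can import them directly. Their constructor signatures are not part of this diff, so the snippet below only verifies that the names resolve:

```python
# Both names appear in the updated __all__ of gnosisllm_knowledge/loaders/__init__.py.
from gnosisllm_knowledge.loaders import DiscoveryLoader, StreamingDiscoveryDiscoverer

print(DiscoveryLoader.__module__)               # gnosisllm_knowledge.loaders.discovery
print(StreamingDiscoveryDiscoverer.__module__)  # gnosisllm_knowledge.loaders.discovery_streaming
```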