gnosisllm-knowledge 0.3.0__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.
- gnosisllm_knowledge/api/knowledge.py +233 -35
- gnosisllm_knowledge/backends/memory/indexer.py +27 -2
- gnosisllm_knowledge/backends/memory/searcher.py +132 -10
- gnosisllm_knowledge/backends/opensearch/agentic.py +14 -9
- gnosisllm_knowledge/backends/opensearch/config.py +7 -0
- gnosisllm_knowledge/backends/opensearch/indexer.py +48 -3
- gnosisllm_knowledge/backends/opensearch/mappings.py +12 -4
- gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
- gnosisllm_knowledge/backends/opensearch/searcher.py +64 -6
- gnosisllm_knowledge/backends/opensearch/setup.py +29 -33
- gnosisllm_knowledge/cli/app.py +58 -19
- gnosisllm_knowledge/cli/commands/agentic.py +15 -9
- gnosisllm_knowledge/cli/commands/load.py +169 -19
- gnosisllm_knowledge/cli/commands/memory.py +10 -0
- gnosisllm_knowledge/cli/commands/search.py +9 -10
- gnosisllm_knowledge/cli/commands/setup.py +25 -1
- gnosisllm_knowledge/cli/utils/config.py +4 -4
- gnosisllm_knowledge/core/domain/__init__.py +13 -0
- gnosisllm_knowledge/core/domain/discovery.py +166 -0
- gnosisllm_knowledge/core/domain/document.py +14 -19
- gnosisllm_knowledge/core/domain/search.py +10 -25
- gnosisllm_knowledge/core/domain/source.py +11 -12
- gnosisllm_knowledge/core/events/__init__.py +8 -0
- gnosisllm_knowledge/core/events/types.py +122 -5
- gnosisllm_knowledge/core/exceptions.py +93 -0
- gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
- gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
- gnosisllm_knowledge/core/interfaces/searcher.py +30 -1
- gnosisllm_knowledge/core/interfaces/streaming.py +10 -4
- gnosisllm_knowledge/fetchers/__init__.py +8 -0
- gnosisllm_knowledge/fetchers/config.py +27 -0
- gnosisllm_knowledge/fetchers/neoreader.py +31 -3
- gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
- gnosisllm_knowledge/loaders/__init__.py +5 -1
- gnosisllm_knowledge/loaders/discovery.py +338 -0
- gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
- gnosisllm_knowledge/loaders/factory.py +46 -0
- gnosisllm_knowledge/services/indexing.py +51 -21
- gnosisllm_knowledge/services/search.py +42 -28
- gnosisllm_knowledge/services/streaming_pipeline.py +45 -7
- {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/METADATA +30 -10
- gnosisllm_knowledge-0.4.3.dist-info/RECORD +81 -0
- gnosisllm_knowledge-0.3.0.dist-info/RECORD +0 -77
- {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/WHEEL +0 -0
- {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/entry_points.txt +0 -0
gnosisllm_knowledge/fetchers/neoreader_discovery.py (new file)
@@ -0,0 +1,505 @@
+"""Neo Reader Discovery API client for website crawling and URL discovery."""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+from collections.abc import Awaitable, Callable
+from typing import Any
+
+import httpx
+
+from gnosisllm_knowledge.core.domain.discovery import (
+    DiscoveredURL,
+    DiscoveryConfig,
+    DiscoveryJobStatus,
+    DiscoveryProgress,
+    DiscoveryStats,
+)
+from gnosisllm_knowledge.core.exceptions import (
+    ConnectionError,
+    DiscoveryJobFailedError,
+    DiscoveryTimeoutError,
+    FetchError,
+)
+from gnosisllm_knowledge.fetchers.config import NeoreaderConfig
+
+
+class NeoreaderDiscoveryClient:
+    """Client for Neo Reader Discovery API.
+
+    Handles the lifecycle of discovery jobs: creating jobs, polling for status
+    with exponential backoff, and cancellation. Uses httpx.AsyncClient internally
+    for efficient async HTTP operations.
+
+    Example:
+        ```python
+        config = NeoreaderConfig.from_env()
+        client = NeoreaderDiscoveryClient(config)
+
+        # Create a discovery job
+        job_id = await client.create_job(
+            "https://docs.example.com",
+            DiscoveryConfig(max_depth=3, max_pages=100)
+        )
+
+        # Wait for completion with progress callback
+        result = await client.wait_for_completion(
+            job_id,
+            on_progress=lambda p: print(f"Progress: {p.percent}%")
+        )
+
+        # Get discovered URLs
+        for url in result.urls:
+            print(url.url)
+
+        await client.close()
+        ```
+    """
+
+    def __init__(self, config: NeoreaderConfig) -> None:
+        """Initialize the discovery client.
+
+        Args:
+            config: Neo Reader configuration with host, API key, etc.
+        """
+        self._config = config
+        self._logger = logging.getLogger(__name__)
+        self._client: httpx.AsyncClient | None = None
+
+    @classmethod
+    def from_env(cls) -> NeoreaderDiscoveryClient:
+        """Create client from environment variables.
+
+        Uses NeoreaderConfig.from_env() to load configuration from:
+        - NEOREADER_HOST
+        - NEOREADER_API_KEY
+        - NEOREADER_TIMEOUT
+
+        Returns:
+            NeoreaderDiscoveryClient configured from environment.
+        """
+        return cls(NeoreaderConfig.from_env())
+
+    @property
+    def config(self) -> NeoreaderConfig:
+        """Get the Neo Reader configuration.
+
+        Returns:
+            The configuration used by this client.
+        """
+        return self._config
+
+    async def _get_client(self) -> httpx.AsyncClient:
+        """Get or create HTTP client.
+
+        Creates a reusable httpx.AsyncClient with base URL, timeout,
+        and authentication headers configured.
+
+        Returns:
+            Configured httpx.AsyncClient instance.
+        """
+        if self._client is None:
+            headers: dict[str, str] = {}
+            if self._config.api_key:
+                headers["Authorization"] = f"Bearer {self._config.api_key}"
+
+            self._client = httpx.AsyncClient(
+                base_url=self._config.host,
+                timeout=self._config.timeout,
+                headers=headers,
+                follow_redirects=True,
+            )
+        return self._client
+
+    async def close(self) -> None:
+        """Close HTTP client and release resources.
+
+        Should be called when done with the client to properly
+        close connections. Safe to call multiple times.
+        """
+        if self._client is not None:
+            await self._client.aclose()
+            self._client = None
+
+    async def __aenter__(self) -> NeoreaderDiscoveryClient:
+        """Enter async context manager.
+
+        Returns:
+            Self for use in async with statement.
+        """
+        return self
+
+    async def __aexit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_val: BaseException | None,
+        exc_tb: Any,
+    ) -> None:
+        """Exit async context manager and close client."""
+        await self.close()
+
+    async def create_job(
+        self,
+        url: str,
+        discovery_config: DiscoveryConfig | None = None,
+    ) -> str:
+        """Create a discovery job for the given URL.
+
+        Initiates an async discovery crawl starting from the specified URL.
+        The job runs in the background on the Neo Reader server.
+
+        Args:
+            url: The starting URL for discovery.
+            discovery_config: Configuration for the crawl. Uses defaults if None.
+
+        Returns:
+            The job ID for tracking the discovery job.
+
+        Raises:
+            ConnectionError: If unable to connect to Neo Reader.
+            FetchError: If the API returns an error response.
+        """
+        config = discovery_config or DiscoveryConfig()
+        client = await self._get_client()
+
+        # Use the DiscoveryConfig.to_headers() method for clean conversion
+        headers = config.to_headers()
+
+        self._logger.debug(
+            "Creating discovery job for %s with config: max_depth=%d, max_pages=%d",
+            url,
+            config.max_depth,
+            config.max_pages,
+        )
+
+        try:
+            response = await client.post(
+                f"/discover/{url}",
+                headers=headers,
+            )
+            response.raise_for_status()
+
+            data = response.json()
+            job_id = data["job_id"]
+
+            self._logger.info("Created discovery job %s for %s", job_id, url)
+            return job_id
+
+        except httpx.ConnectError as e:
+            raise ConnectionError(
+                f"Cannot connect to Neo Reader at {self._config.host}",
+                host=self._config.host,
+                cause=e,
+            ) from e
+        except httpx.HTTPStatusError as e:
+            raise FetchError(
+                f"Failed to create discovery job: HTTP {e.response.status_code}",
+                source=url,
+                status_code=e.response.status_code,
+                cause=e,
+            ) from e
+
+    async def get_job_status(
+        self,
+        job_id: str,
+        include_urls: bool = True,
+    ) -> DiscoveryJobStatus:
+        """Get the current status of a discovery job.
+
+        Fetches the job status, progress, stats, and optionally the
+        discovered URLs from the Neo Reader API.
+
+        Args:
+            job_id: The discovery job ID.
+            include_urls: Whether to include discovered URLs in the response.
+
+        Returns:
+            DiscoveryJobStatus with current job state.
+
+        Raises:
+            ConnectionError: If unable to connect to Neo Reader.
+            FetchError: If the API returns an error response.
+        """
+        client = await self._get_client()
+
+        params = {"include_urls": str(include_urls).lower()}
+
+        try:
+            response = await client.get(
+                f"/discover/jobs/{job_id}",
+                params=params,
+            )
+            response.raise_for_status()
+
+            data = response.json()
+            return self._parse_job_status(data)
+
+        except httpx.ConnectError as e:
+            raise ConnectionError(
+                f"Cannot connect to Neo Reader at {self._config.host}",
+                host=self._config.host,
+                cause=e,
+            ) from e
+        except httpx.HTTPStatusError as e:
+            raise FetchError(
+                f"Failed to get job status: HTTP {e.response.status_code}",
+                source=job_id,
+                status_code=e.response.status_code,
+                cause=e,
+            ) from e
+
+    async def wait_for_completion(
+        self,
+        job_id: str,
+        *,
+        initial_interval: float = 1.0,
+        max_interval: float = 10.0,
+        backoff_factor: float = 1.5,
+        timeout: float = 600.0,
+        on_progress: Callable[[DiscoveryProgress], Awaitable[None] | None]
+        | None = None,
+    ) -> DiscoveryJobStatus:
+        """Poll until job completes or fails with exponential backoff.
+
+        Continuously polls the job status with exponential backoff between
+        requests. Calls the optional progress callback on each update.
+        Raises an exception if the job times out or fails.
+
+        Args:
+            job_id: The discovery job ID to wait for.
+            initial_interval: Initial polling interval in seconds.
+            max_interval: Maximum polling interval in seconds.
+            backoff_factor: Multiplier for interval increase (e.g., 1.5 = 50% increase).
+            timeout: Maximum time to wait for completion in seconds.
+            on_progress: Optional async or sync callback for progress updates.
+
+        Returns:
+            DiscoveryJobStatus with completed/failed/cancelled state.
+
+        Raises:
+            DiscoveryTimeoutError: If the job doesn't complete within timeout.
+            ConnectionError: If unable to connect to Neo Reader.
+            FetchError: If the API returns an error response.
+        """
+        loop = asyncio.get_event_loop()
+        start_time = loop.time()
+        interval = initial_interval
+
+        self._logger.info(
+            "Waiting for job %s to complete (timeout: %.0fs)",
+            job_id,
+            timeout,
+        )
+
+        while True:
+            status = await self.get_job_status(job_id)
+
+            # Call progress callback if provided and we have progress
+            if status.progress and on_progress:
+                result = on_progress(status.progress)
+                # Handle async callbacks
+                if asyncio.iscoroutine(result):
+                    await result
+
+            # Check if job is in terminal state
+            if status.is_terminal():
+                self._logger.info(
+                    "Job %s completed with status: %s",
+                    job_id,
+                    status.status,
+                )
+                return status
+
+            # Check timeout
+            elapsed = loop.time() - start_time
+            if elapsed >= timeout:
+                self._logger.warning(
+                    "Job %s timed out after %.1fs",
+                    job_id,
+                    elapsed,
+                )
+                raise DiscoveryTimeoutError(
+                    f"Discovery job {job_id} timed out after {elapsed:.1f}s",
+                    job_id=job_id,
+                    elapsed=elapsed,
+                    timeout=timeout,
+                )
+
+            # Wait with exponential backoff
+            self._logger.debug(
+                "Job %s still running, waiting %.1fs before next poll",
+                job_id,
+                interval,
+            )
+            await asyncio.sleep(interval)
+            interval = min(interval * backoff_factor, max_interval)
+
+    async def cancel_job(self, job_id: str) -> bool:
+        """Cancel a running discovery job.
+
+        Sends a cancellation request to stop the job. The job will
+        transition to 'cancelled' status.
+
+        Args:
+            job_id: The discovery job ID to cancel.
+
+        Returns:
+            True if cancellation was successful, False if job was
+            already in a terminal state.
+
+        Raises:
+            ConnectionError: If unable to connect to Neo Reader.
+            FetchError: If the API returns an error response.
+        """
+        client = await self._get_client()
+
+        self._logger.info("Cancelling discovery job %s", job_id)
+
+        try:
+            response = await client.delete(f"/discover/jobs/{job_id}")
+            response.raise_for_status()
+
+            self._logger.info("Successfully cancelled job %s", job_id)
+            return True
+
+        except httpx.HTTPStatusError as e:
+            # 404 or similar might mean job is already completed/cancelled
+            if e.response.status_code == 404:
+                self._logger.warning(
+                    "Job %s not found (may already be completed)",
+                    job_id,
+                )
+                return False
+            raise FetchError(
+                f"Failed to cancel job: HTTP {e.response.status_code}",
+                source=job_id,
+                status_code=e.response.status_code,
+                cause=e,
+            ) from e
+        except httpx.ConnectError as e:
+            raise ConnectionError(
+                f"Cannot connect to Neo Reader at {self._config.host}",
+                host=self._config.host,
+                cause=e,
+            ) from e
+
+    async def discover(
+        self,
+        url: str,
+        discovery_config: DiscoveryConfig | None = None,
+        *,
+        timeout: float = 600.0,
+        on_progress: Callable[[DiscoveryProgress], Awaitable[None] | None]
+        | None = None,
+    ) -> list[str]:
+        """Convenience method to discover all URLs from a website.
+
+        Creates a job, waits for completion, and returns the discovered URLs.
+        Handles job cancellation on errors or interruption.
+
+        Args:
+            url: The starting URL for discovery.
+            discovery_config: Configuration for the crawl. Uses defaults if None.
+            timeout: Maximum time to wait for completion in seconds.
+            on_progress: Optional callback for progress updates.
+
+        Returns:
+            List of discovered URL strings.
+
+        Raises:
+            DiscoveryTimeoutError: If the job doesn't complete within timeout.
+            DiscoveryJobFailedError: If the job fails or is cancelled.
+            ConnectionError: If unable to connect to Neo Reader.
+            FetchError: If the API returns an error response.
+        """
+        job_id = await self.create_job(url, discovery_config)
+
+        try:
+            status = await self.wait_for_completion(
+                job_id,
+                timeout=timeout,
+                on_progress=on_progress,
+            )
+
+            if status.status != "completed":
+                raise DiscoveryJobFailedError(
+                    f"Discovery job {job_id} failed with status: {status.status}",
+                    job_id=job_id,
+                    status=status.status,
+                    source=url,
+                )
+
+            return [u.url for u in status.urls]
+
+        except (asyncio.CancelledError, Exception) as e:
+            # Attempt to cancel the job on any error
+            self._logger.warning(
+                "Cancelling job %s due to error: %s",
+                job_id,
+                e,
+            )
+            try:
+                await self.cancel_job(job_id)
+            except Exception as cancel_err:
+                self._logger.error(
+                    "Failed to cancel job %s: %s",
+                    job_id,
+                    cancel_err,
+                )
+            raise
+
+    def _parse_job_status(self, data: dict[str, Any]) -> DiscoveryJobStatus:
+        """Parse API response into DiscoveryJobStatus.
+
+        Args:
+            data: Raw JSON response from the API.
+
+        Returns:
+            Parsed DiscoveryJobStatus instance.
+        """
+        # Parse progress if present
+        progress = None
+        if data.get("progress"):
+            progress = DiscoveryProgress(
+                percent=data["progress"].get("percent", 0),
+                pages_crawled=data["progress"].get("pages_crawled", 0),
+                urls_discovered=data["progress"].get("urls_discovered", 0),
+                current_depth=data["progress"].get("current_depth", 0),
+                message=data["progress"].get("message", ""),
+            )
+
+        # Parse stats if present
+        stats = None
+        if data.get("stats"):
+            stats = DiscoveryStats(
+                pages_crawled=data["stats"].get("pages_crawled", 0),
+                urls_found=data["stats"].get("urls_found", 0),
+                urls_returned=data["stats"].get("urls_returned", 0),
+                urls_filtered=data["stats"].get("urls_filtered", 0),
+                errors=data["stats"].get("errors", 0),
+                duration_seconds=data["stats"].get("duration_seconds", 0.0),
+            )
+
+        # Parse URLs if present
+        urls: list[DiscoveredURL] = []
+        if data.get("urls"):
+            for u in data["urls"]:
+                urls.append(
+                    DiscoveredURL(
+                        url=u["url"],
+                        depth=u.get("depth", 0),
+                        title=u.get("title"),
+                        is_internal=u.get("is_internal", True),
+                    )
+                )
+
+        return DiscoveryJobStatus(
+            job_id=data["job_id"],
+            status=data["status"],
+            start_url=data["start_url"],
+            progress=progress,
+            stats=stats,
+            urls=urls,
+            error=data.get("error"),
+        )
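
A minimal usage sketch for the new client, based only on the docstrings and signatures added above. The starting URL, crawl limits, and timeout are illustrative; NEOREADER_HOST / NEOREADER_API_KEY are assumed to be set in the environment.

```python
import asyncio

from gnosisllm_knowledge.core.domain.discovery import DiscoveryConfig
from gnosisllm_knowledge.fetchers.neoreader_discovery import NeoreaderDiscoveryClient


async def main() -> None:
    # Configuration is read from NEOREADER_HOST / NEOREADER_API_KEY / NEOREADER_TIMEOUT.
    # The async context manager closes the underlying httpx client on exit.
    async with NeoreaderDiscoveryClient.from_env() as client:
        urls = await client.discover(
            "https://docs.example.com",  # illustrative starting URL
            DiscoveryConfig(max_depth=2, max_pages=50),
            timeout=300.0,
            on_progress=lambda p: print(f"{p.percent}% ({p.urls_discovered} URLs)"),
        )
        for url in urls:
            print(url)


asyncio.run(main())
```

The `discover()` convenience wrapper shown in the diff handles job creation, polling with exponential backoff, and cancellation on error, so most callers would not need `create_job()` / `wait_for_completion()` directly.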
gnosisllm_knowledge/loaders/__init__.py
@@ -1,13 +1,17 @@
 """Content loaders for various source types."""
 
 from gnosisllm_knowledge.loaders.base import BaseLoader
+from gnosisllm_knowledge.loaders.discovery import DiscoveryLoader
+from gnosisllm_knowledge.loaders.discovery_streaming import StreamingDiscoveryDiscoverer
 from gnosisllm_knowledge.loaders.factory import LoaderFactory
 from gnosisllm_knowledge.loaders.sitemap import SitemapLoader
 from gnosisllm_knowledge.loaders.website import WebsiteLoader
 
 __all__ = [
     "BaseLoader",
+    "DiscoveryLoader",
     "LoaderFactory",
-    "WebsiteLoader",
     "SitemapLoader",
+    "StreamingDiscoveryDiscoverer",
+    "WebsiteLoader",
 ]
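
With this hunk applied, the new discovery loaders become part of the package's public loader namespace, so (assuming 0.4.3 is installed) they can be imported directly; only the names listed in `__all__` above are used here.

```python
# Public exports added to gnosisllm_knowledge.loaders in 0.4.3 (per __all__ above).
from gnosisllm_knowledge.loaders import DiscoveryLoader, StreamingDiscoveryDiscoverer
```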