gnosisllm-knowledge 0.3.0__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff compares two package versions publicly released to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
Files changed (45)
  1. gnosisllm_knowledge/api/knowledge.py +233 -35
  2. gnosisllm_knowledge/backends/memory/indexer.py +27 -2
  3. gnosisllm_knowledge/backends/memory/searcher.py +132 -10
  4. gnosisllm_knowledge/backends/opensearch/agentic.py +14 -9
  5. gnosisllm_knowledge/backends/opensearch/config.py +7 -0
  6. gnosisllm_knowledge/backends/opensearch/indexer.py +48 -3
  7. gnosisllm_knowledge/backends/opensearch/mappings.py +12 -4
  8. gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
  9. gnosisllm_knowledge/backends/opensearch/searcher.py +64 -6
  10. gnosisllm_knowledge/backends/opensearch/setup.py +29 -33
  11. gnosisllm_knowledge/cli/app.py +58 -19
  12. gnosisllm_knowledge/cli/commands/agentic.py +15 -9
  13. gnosisllm_knowledge/cli/commands/load.py +169 -19
  14. gnosisllm_knowledge/cli/commands/memory.py +10 -0
  15. gnosisllm_knowledge/cli/commands/search.py +9 -10
  16. gnosisllm_knowledge/cli/commands/setup.py +25 -1
  17. gnosisllm_knowledge/cli/utils/config.py +4 -4
  18. gnosisllm_knowledge/core/domain/__init__.py +13 -0
  19. gnosisllm_knowledge/core/domain/discovery.py +166 -0
  20. gnosisllm_knowledge/core/domain/document.py +14 -19
  21. gnosisllm_knowledge/core/domain/search.py +10 -25
  22. gnosisllm_knowledge/core/domain/source.py +11 -12
  23. gnosisllm_knowledge/core/events/__init__.py +8 -0
  24. gnosisllm_knowledge/core/events/types.py +122 -5
  25. gnosisllm_knowledge/core/exceptions.py +93 -0
  26. gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
  27. gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
  28. gnosisllm_knowledge/core/interfaces/searcher.py +30 -1
  29. gnosisllm_knowledge/core/interfaces/streaming.py +10 -4
  30. gnosisllm_knowledge/fetchers/__init__.py +8 -0
  31. gnosisllm_knowledge/fetchers/config.py +27 -0
  32. gnosisllm_knowledge/fetchers/neoreader.py +31 -3
  33. gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
  34. gnosisllm_knowledge/loaders/__init__.py +5 -1
  35. gnosisllm_knowledge/loaders/discovery.py +338 -0
  36. gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
  37. gnosisllm_knowledge/loaders/factory.py +46 -0
  38. gnosisllm_knowledge/services/indexing.py +51 -21
  39. gnosisllm_knowledge/services/search.py +42 -28
  40. gnosisllm_knowledge/services/streaming_pipeline.py +45 -7
  41. {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/METADATA +30 -10
  42. gnosisllm_knowledge-0.4.3.dist-info/RECORD +81 -0
  43. gnosisllm_knowledge-0.3.0.dist-info/RECORD +0 -77
  44. {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/WHEEL +0 -0
  45. {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/entry_points.txt +0 -0
gnosisllm_knowledge/core/domain/discovery.py (new file)
@@ -0,0 +1,166 @@
+"""Domain models for website discovery."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+
+
+@dataclass
+class DiscoveryConfig:
+    """Configuration for website discovery crawl.
+
+    Controls how the Neo Reader Discovery API crawls and discovers URLs.
+
+    Attributes:
+        max_depth: Maximum crawl depth from start URL.
+        max_pages: Maximum number of pages to crawl.
+        same_domain: Only crawl URLs on the same domain.
+        include_subdomains: Include subdomains when same_domain is True.
+        respect_robots: Respect robots.txt rules.
+        parse_sitemap: Also parse sitemap if available.
+        with_metadata: Include page metadata (title, etc.) in results.
+        crawl_timeout: Overall timeout for the crawl in seconds.
+        concurrent_requests: Number of concurrent crawl requests.
+        request_delay: Delay between requests in milliseconds.
+        include_pattern: Regex pattern for URLs to include.
+        exclude_pattern: Regex pattern for URLs to exclude.
+        path_prefix: Only crawl URLs with this path prefix.
+    """
+
+    max_depth: int = 3
+    max_pages: int = 100
+    same_domain: bool = True
+    include_subdomains: bool = True
+    respect_robots: bool = True
+    parse_sitemap: bool = False
+    with_metadata: bool = True
+    crawl_timeout: int = 300
+    concurrent_requests: int = 5
+    request_delay: int = 100
+    include_pattern: str | None = None
+    exclude_pattern: str | None = None
+    path_prefix: str | None = None
+
+    def to_headers(self) -> dict[str, str]:
+        """Convert config to HTTP headers for Neo Reader API.
+
+        Returns:
+            Dictionary of header name to value.
+        """
+        headers = {
+            "X-Max-Depth": str(self.max_depth),
+            "X-Max-Pages": str(self.max_pages),
+            "X-Same-Domain": str(self.same_domain).lower(),
+            "X-Include-Subdomains": str(self.include_subdomains).lower(),
+            "X-Respect-Robots": str(self.respect_robots).lower(),
+            "X-Parse-Sitemap": str(self.parse_sitemap).lower(),
+            "X-With-Metadata": str(self.with_metadata).lower(),
+            "X-Crawl-Timeout": str(self.crawl_timeout),
+            "X-Concurrent-Requests": str(self.concurrent_requests),
+            "X-Request-Delay": str(self.request_delay),
+        }
+        if self.include_pattern:
+            headers["X-Include-Pattern"] = self.include_pattern
+        if self.exclude_pattern:
+            headers["X-Exclude-Pattern"] = self.exclude_pattern
+        if self.path_prefix:
+            headers["X-Path-Prefix"] = self.path_prefix
+        return headers
+
+
+@dataclass
+class DiscoveryProgress:
+    """Progress information for a running discovery job.
+
+    Attributes:
+        percent: Completion percentage (0-100).
+        pages_crawled: Number of pages crawled so far.
+        urls_discovered: Number of URLs discovered so far.
+        current_depth: Current crawl depth.
+        message: Human-readable progress message.
+    """
+
+    percent: int = 0
+    pages_crawled: int = 0
+    urls_discovered: int = 0
+    current_depth: int = 0
+    message: str = ""
+
+
+@dataclass
+class DiscoveryStats:
+    """Statistics for a completed discovery job.
+
+    Attributes:
+        pages_crawled: Total pages crawled.
+        urls_found: Total URLs found during crawl.
+        urls_returned: URLs returned in results (after filtering).
+        urls_filtered: URLs excluded by filters.
+        errors: Number of errors during crawl.
+        duration_seconds: Total crawl duration.
+    """
+
+    pages_crawled: int = 0
+    urls_found: int = 0
+    urls_returned: int = 0
+    urls_filtered: int = 0
+    errors: int = 0
+    duration_seconds: float = 0.0
+
+
+@dataclass
+class DiscoveredURL:
+    """A URL discovered during crawl.
+
+    Attributes:
+        url: The discovered URL.
+        depth: Crawl depth at which URL was found.
+        title: Page title if available.
+        is_internal: Whether URL is internal to the domain.
+    """
+
+    url: str
+    depth: int = 0
+    title: str | None = None
+    is_internal: bool = True
+
+
+@dataclass
+class DiscoveryJobStatus:
+    """Status of a discovery job.
+
+    Represents the current state of an async discovery job.
+
+    Attributes:
+        job_id: Unique job identifier.
+        status: Job status (pending, queued, running, completed, failed, cancelled).
+        start_url: The URL that started the discovery.
+        progress: Progress information if job is running.
+        stats: Statistics if job is completed.
+        urls: Discovered URLs if job is completed.
+        error: Error message if job failed.
+    """
+
+    job_id: str
+    status: str
+    start_url: str
+    progress: DiscoveryProgress | None = None
+    stats: DiscoveryStats | None = None
+    urls: list[DiscoveredURL] = field(default_factory=list)
+    error: str | None = None
+
+    def is_terminal(self) -> bool:
+        """Check if job is in a terminal state.
+
+        Returns:
+            True if job is completed, failed, or cancelled.
+        """
+        return self.status in ("completed", "failed", "cancelled")
+
+    def is_running(self) -> bool:
+        """Check if job is currently running.
+
+        Returns:
+            True if job is pending, queued, or running.
+        """
+        return self.status in ("pending", "queued", "running")
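
To make the new config concrete, here is a minimal usage sketch of DiscoveryConfig.to_headers(). The httpx client and the https://neoreader.example.com/discover endpoint are illustrative assumptions, not part of this package; the real transport lives in gnosisllm_knowledge/fetchers/neoreader_discovery.py.

import httpx

from gnosisllm_knowledge.core.domain.discovery import DiscoveryConfig

config = DiscoveryConfig(
    max_depth=2,
    max_pages=50,
    parse_sitemap=True,
    exclude_pattern=r"/(login|signup)",
)

# Every setting becomes an X-* header; booleans serialize as "true"/"false",
# and the optional regex filters are only added when set.
headers = config.to_headers()
assert headers["X-Max-Depth"] == "2"
assert headers["X-Parse-Sitemap"] == "true"
assert "X-Path-Prefix" not in headers

# Hypothetical request -- endpoint URL and payload shape are assumptions.
response = httpx.post(
    "https://neoreader.example.com/discover",
    json={"url": "https://docs.example.com"},
    headers=headers,
)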
gnosisllm_knowledge/core/domain/document.py
@@ -26,17 +26,22 @@ class Document:
     This is the core domain object that flows through the knowledge pipeline.
     Documents are created by loaders, processed by chunkers, and stored by indexers.
 
+    Note:
+        This library is tenant-agnostic. Multi-tenancy is achieved through index
+        isolation (e.g., `knowledge-{account_id}`). Tenant information like account_id
+        should be passed in the metadata dictionary if needed for audit purposes.
+
     Attributes:
         content: The main text content of the document.
         source: Source identifier (URL, file path, etc.).
         doc_id: Unique identifier. Auto-generated from content hash if not provided.
         title: Optional document title.
         url: URL where the document was fetched from.
-        metadata: Arbitrary metadata dictionary.
+        metadata: Arbitrary metadata dictionary (can include tenant info for audit).
 
-    Multi-tenancy fields:
-        account_id: Account/tenant identifier.
+    Collection fields:
         collection_id: Collection the document belongs to.
+        collection_name: Collection name for display in aggregations.
         source_id: Source identifier within the collection.
 
     Chunking info:

@@ -70,8 +75,7 @@ class Document:
     url: str | None = None
     metadata: dict[str, Any] = field(default_factory=dict)
 
-    # Multi-tenancy fields
-    account_id: str | None = None
+    # Collection fields
     collection_id: str | None = None
     collection_name: str | None = None  # For display in aggregations
     source_id: str | None = None

@@ -139,7 +143,6 @@ class Document:
             title=self.title,
             url=self.url,
             metadata=self.metadata.copy(),
-            account_id=self.account_id,
             collection_id=self.collection_id,
             collection_name=self.collection_name,
             source_id=self.source_id,

@@ -154,23 +157,21 @@ class Document:
             created_at=self.created_at,
         )
 
-    def with_tenant(
+    def with_collection(
         self,
-        account_id: str,
-        collection_id: str | None = None,
+        collection_id: str,
         collection_name: str | None = None,
         source_id: str | None = None,
     ) -> Document:
-        """Create a new document with tenant information.
+        """Create a new document with collection information.
 
         Args:
-            account_id: Account/tenant identifier.
             collection_id: Collection identifier.
             collection_name: Collection name for display.
             source_id: Source identifier.
 
         Returns:
-            New Document instance with tenant information set.
+            New Document instance with collection information set.
         """
         return Document(
             content=self.content,

@@ -179,8 +180,7 @@ class Document:
             title=self.title,
             url=self.url,
             metadata=self.metadata.copy(),
-            account_id=account_id,
-            collection_id=collection_id or self.collection_id,
+            collection_id=collection_id,
             collection_name=collection_name or self.collection_name,
             source_id=source_id or self.source_id,
             chunk_index=self.chunk_index,

@@ -203,11 +203,6 @@ class Document:
         """Check if this document is a chunk of a larger document."""
         return self.chunk_index is not None and self.total_chunks is not None
 
-    @property
-    def is_multi_tenant(self) -> bool:
-        """Check if this document has tenant information."""
-        return self.account_id is not None
-
 
 @dataclass
 class TextChunk:
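
For callers migrating from 0.3.0, a sketch of the change in practice: tenant identity moves off the Document and into the index name (and optionally metadata for audit), per the knowledge-{account_id} convention the new docstrings describe.

from gnosisllm_knowledge.core.domain.document import Document

# 0.3.0 (removed): doc.with_tenant(account_id="acme", collection_id="docs")
doc = Document(
    content="Hybrid search combines BM25 and vector scoring.",
    source="https://docs.example.com/search",
    metadata={"account_id": "acme"},  # optional, audit trail only
)
doc = doc.with_collection(collection_id="docs", collection_name="Product Docs")

# Tenancy is index isolation: the caller derives the index per tenant.
account_id = "acme"
index_name = f"knowledge-{account_id}"
assert not hasattr(doc, "account_id")  # the field is gone in 0.4.3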
gnosisllm_knowledge/core/domain/search.py
@@ -39,6 +39,11 @@ class AgentType(str, Enum):
 class SearchQuery:
     """Search query with filters and options.
 
+    Note:
+        This library is tenant-agnostic. Multi-tenancy is achieved through index
+        isolation (e.g., `knowledge-{account_id}`). Callers should ensure they're
+        searching the correct tenant-specific index.
+
     Attributes:
         text: The search query text.
         mode: Search mode to use.

@@ -49,7 +54,6 @@ class SearchQuery:
     Filters:
         collection_ids: Filter by collection IDs.
         source_ids: Filter by source IDs.
-        account_id: Multi-tenant account filter.
         metadata_filters: Custom metadata filters.
 
     Advanced options:

@@ -69,7 +73,6 @@
     # Filters
     collection_ids: list[str] | None = None
     source_ids: list[str] | None = None
-    account_id: str | None = None
     metadata_filters: dict[str, Any] = field(default_factory=dict)
 
     # Advanced options

@@ -89,26 +92,6 @@
             min_score=self.min_score,
             collection_ids=self.collection_ids,
             source_ids=self.source_ids,
-            account_id=self.account_id,
-            metadata_filters=self.metadata_filters.copy(),
-            field_boosts=self.field_boosts.copy() if self.field_boosts else None,
-            include_highlights=self.include_highlights,
-            include_fields=self.include_fields,
-            exclude_fields=self.exclude_fields,
-            explain=self.explain,
-        )
-
-    def with_tenant(self, account_id: str) -> SearchQuery:
-        """Create a copy with tenant information."""
-        return SearchQuery(
-            text=self.text,
-            mode=self.mode,
-            limit=self.limit,
-            offset=self.offset,
-            min_score=self.min_score,
-            collection_ids=self.collection_ids,
-            source_ids=self.source_ids,
-            account_id=account_id,
             metadata_filters=self.metadata_filters.copy(),
             field_boosts=self.field_boosts.copy() if self.field_boosts else None,
             include_highlights=self.include_highlights,

@@ -216,13 +199,17 @@ class ReasoningStep:
 class AgenticSearchQuery:
     """Query for agentic search with conversation support.
 
+    Note:
+        This library is tenant-agnostic. Multi-tenancy is achieved through index
+        isolation (e.g., `knowledge-{account_id}`). Callers should ensure they're
+        searching the correct tenant-specific index.
+
     Attributes:
         text: The search query text.
         agent_type: Type of agent to use.
         conversation_id: ID for continuing a conversation.
         collection_ids: Filter by collection IDs.
         source_ids: Filter by source IDs.
-        account_id: Multi-tenant account filter.
         limit: Maximum number of source documents to retrieve.
         include_reasoning: Whether to include reasoning steps.
         metadata_filters: Custom metadata filters.

@@ -235,7 +222,6 @@
     conversation_id: str | None = None
     collection_ids: list[str] | None = None
     source_ids: list[str] | None = None
-    account_id: str | None = None
     limit: int = 10
     include_reasoning: bool = True
     metadata_filters: dict[str, Any] = field(default_factory=dict)

@@ -250,7 +236,6 @@
             limit=self.limit,
             collection_ids=self.collection_ids,
             source_ids=self.source_ids,
-            account_id=self.account_id,
             metadata_filters=self.metadata_filters.copy(),
         )
 
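
Queries follow the same pattern: SearchQuery.with_tenant() is removed rather than replaced, so tenant routing happens at index selection. A sketch; the service call is commented out because its exact signature is not part of this diff.

from gnosisllm_knowledge.core.domain.search import SearchQuery

query = SearchQuery(
    text="how do I enable hybrid search?",
    collection_ids=["docs"],  # collection filters remain on the query
    limit=5,
)

# The caller, not the query, decides which tenant index to hit.
account_id = "acme"
index_name = f"knowledge-{account_id}"
# results = await search_service.search(query, index=index_name)  # hypothetical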
gnosisllm_knowledge/core/domain/source.py
@@ -10,6 +10,11 @@ from typing import Any
 class SourceConfig:
     """Configuration for a content source.
 
+    Note:
+        This library is tenant-agnostic. Multi-tenancy is achieved through index
+        isolation (e.g., `knowledge-{account_id}`). Tenant information should be
+        managed by the caller, not embedded in source configuration.
+
     Attributes:
         url: The source URL or path.
         source_type: Type of source (website, sitemap, file, etc.).

@@ -26,8 +31,7 @@
         remove_selector: CSS selector for elements to remove.
         timeout: Request timeout in seconds.
 
-    Multi-tenancy:
-        account_id: Account/tenant identifier.
+    Collection:
         collection_id: Collection identifier.
         source_id: Source identifier within collection.
     """

@@ -47,8 +51,7 @@
     remove_selector: str | None = None
     timeout: int | None = None
 
-    # Multi-tenancy
-    account_id: str | None = None
+    # Collection
     collection_id: str | None = None
     source_id: str | None = None
 

@@ -73,26 +76,23 @@
             target_selector=self.target_selector,
             remove_selector=self.remove_selector,
             timeout=self.timeout,
-            account_id=self.account_id,
             collection_id=self.collection_id,
             source_id=self.source_id,
         )
 
-    def with_tenant(
+    def with_collection(
         self,
-        account_id: str,
-        collection_id: str | None = None,
+        collection_id: str,
         source_id: str | None = None,
     ) -> SourceConfig:
-        """Create a copy with tenant information.
+        """Create a copy with collection information.
 
         Args:
-            account_id: Account/tenant identifier.
             collection_id: Collection identifier.
             source_id: Source identifier.
 
         Returns:
-            New SourceConfig with tenant information.
+            New SourceConfig with collection information.
         """
         return SourceConfig(
             url=self.url,

@@ -105,7 +105,6 @@
             target_selector=self.target_selector,
             remove_selector=self.remove_selector,
             timeout=self.timeout,
-            account_id=account_id,
             collection_id=collection_id,
             source_id=source_id,
         )
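
SourceConfig mirrors the Document change; note that collection_id is now a required argument of with_collection() instead of an optional override. A minimal sketch, assuming url is the only required constructor field:

from gnosisllm_knowledge.core.domain.source import SourceConfig

source = SourceConfig(url="https://docs.example.com/sitemap.xml")
source = source.with_collection(collection_id="docs", source_id="docs-sitemap")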
gnosisllm_knowledge/core/events/__init__.py
@@ -4,6 +4,10 @@ from gnosisllm_knowledge.core.events.emitter import EventEmitter
 from gnosisllm_knowledge.core.events.types import (
     BatchCompletedEvent,
     BatchStartedEvent,
+    DiscoveryCompletedEvent,
+    DiscoveryFailedEvent,
+    DiscoveryProgressEvent,
+    DiscoveryStartedEvent,
     DocumentIndexedEvent,
     DocumentLoadedEvent,
     Event,

@@ -20,4 +24,8 @@ __all__ = [
     "SitemapDiscoveryEvent",
     "BatchStartedEvent",
     "BatchCompletedEvent",
+    "DiscoveryStartedEvent",
+    "DiscoveryProgressEvent",
+    "DiscoveryCompletedEvent",
+    "DiscoveryFailedEvent",
 ]
gnosisllm_knowledge/core/events/types.py
@@ -34,6 +34,12 @@ class EventType(str, Enum):
     LOAD_FAILED = "load_failed"
     SITEMAP_DISCOVERED = "sitemap_discovered"
 
+    # Discovery events
+    DISCOVERY_STARTED = "discovery_started"
+    DISCOVERY_PROGRESS = "discovery_progress"
+    DISCOVERY_COMPLETED = "discovery_completed"
+    DISCOVERY_FAILED = "discovery_failed"
+
     # Streaming events
     STREAMING_PROGRESS = "streaming_progress"
     URL_BATCH_PROCESSED = "url_batch_processed"

@@ -91,11 +97,14 @@
 class Event:
     """Base event class.
 
+    Note:
+        This library is tenant-agnostic. Multi-tenancy is achieved through index
+        isolation. Any tenant-specific context should be passed in the data dict.
+
     Attributes:
         event_type: The type of event.
         timestamp: When the event occurred.
-        data: Additional event data.
-        account_id: Account ID for multi-tenant context.
+        data: Additional event data (can include tenant context for audit).
         user_id: User ID if applicable.
         request_id: Request ID for tracing.
         trace_id: Distributed trace ID.

@@ -107,7 +116,6 @@
     data: dict[str, Any] = field(default_factory=dict)
 
     # Context
-    account_id: str | None = None
     user_id: str | None = None
     request_id: str | None = None
 

@@ -117,7 +125,6 @@
 
     def with_context(
         self,
-        account_id: str | None = None,
         user_id: str | None = None,
         request_id: str | None = None,
     ) -> Event:

@@ -126,7 +133,6 @@
             event_type=self.event_type,
             timestamp=self.timestamp,
             data=self.data.copy(),
-            account_id=account_id or self.account_id,
             user_id=user_id or self.user_id,
             request_id=request_id or self.request_id,
             trace_id=self.trace_id,

@@ -300,3 +306,114 @@ class StreamingCompletedEvent(Event):
             "failed_count": self.failed_count,
             "duration_ms": self.duration_ms,
         }
+
+
+# === Discovery Events ===
+
+
+@dataclass
+class DiscoveryStartedEvent(Event):
+    """Event emitted when a discovery job starts.
+
+    Attributes:
+        url: The starting URL for discovery.
+        job_id: The discovery job ID.
+        config: Discovery configuration as dictionary.
+    """
+
+    url: str = ""
+    job_id: str = ""
+    config: dict[str, Any] = field(default_factory=dict)
+
+    def __post_init__(self) -> None:
+        """Set event type."""
+        self.event_type = EventType.DISCOVERY_STARTED
+        self.data = {
+            "url": self.url,
+            "job_id": self.job_id,
+            "config": self.config,
+        }
+
+
+@dataclass
+class DiscoveryProgressEvent(Event):
+    """Event emitted during discovery progress updates.
+
+    Attributes:
+        job_id: The discovery job ID.
+        percent: Progress percentage (0-100).
+        pages_crawled: Number of pages crawled so far.
+        urls_discovered: Number of URLs discovered so far.
+        current_depth: Current crawl depth.
+        message: Human-readable progress message.
+    """
+
+    job_id: str = ""
+    percent: int = 0
+    pages_crawled: int = 0
+    urls_discovered: int = 0
+    current_depth: int = 0
+    message: str = ""
+
+    def __post_init__(self) -> None:
+        """Set event type."""
+        self.event_type = EventType.DISCOVERY_PROGRESS
+        self.data = {
+            "job_id": self.job_id,
+            "percent": self.percent,
+            "pages_crawled": self.pages_crawled,
+            "urls_discovered": self.urls_discovered,
+            "current_depth": self.current_depth,
+            "message": self.message,
+        }
+
+
+@dataclass
+class DiscoveryCompletedEvent(Event):
+    """Event emitted when discovery completes successfully.
+
+    Attributes:
+        job_id: The discovery job ID.
+        urls_count: Total number of URLs discovered.
+        pages_crawled: Total number of pages crawled.
+        duration_seconds: Total discovery duration.
+        errors: Number of errors encountered during discovery.
+    """
+
+    job_id: str = ""
+    urls_count: int = 0
+    pages_crawled: int = 0
+    duration_seconds: float = 0.0
+    errors: int = 0
+
+    def __post_init__(self) -> None:
+        """Set event type."""
+        self.event_type = EventType.DISCOVERY_COMPLETED
+        self.data = {
+            "job_id": self.job_id,
+            "urls_count": self.urls_count,
+            "pages_crawled": self.pages_crawled,
+            "duration_seconds": self.duration_seconds,
+            "errors": self.errors,
+        }
+
+
+@dataclass
+class DiscoveryFailedEvent(Event):
+    """Event emitted when discovery fails.
+
+    Attributes:
+        job_id: The discovery job ID.
+        error: Error message describing the failure.
+    """
+
+    job_id: str = ""
+    error: str = ""
+
+    def __post_init__(self) -> None:
+        """Set event type."""
+        self.event_type = EventType.DISCOVERY_FAILED
+        self.data = {
+            "job_id": self.job_id,
+            "error": self.error,
+        }
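
Each discovery event mirrors its typed fields into the base data dict in __post_init__, so generic consumers can dispatch on event_type without knowing the concrete class. A sketch (the handler is invoked directly here; EventEmitter's subscription API is outside this diff):

from gnosisllm_knowledge.core.events.types import (
    DiscoveryProgressEvent,
    Event,
    EventType,
)

def on_event(event: Event) -> None:
    # Schema-agnostic dispatch: only event_type and data are needed.
    if event.event_type == EventType.DISCOVERY_PROGRESS:
        print(f"[{event.data['percent']}%] {event.data['message']}")

on_event(
    DiscoveryProgressEvent(
        job_id="job-123",
        percent=40,
        pages_crawled=20,
        urls_discovered=87,
        current_depth=2,
        message="crawling depth 2",
    )
)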
gnosisllm_knowledge/core/exceptions.py
@@ -539,3 +539,96 @@ class MemoryConfigurationError(MemoryError):
         self.missing_config = missing_config
         if missing_config:
             self.details["missing_config"] = missing_config
+
+
+# === Discovery Exceptions ===
+
+
+class DiscoveryError(KnowledgeError):
+    """Base exception for discovery operations.
+
+    Raised when website discovery fails.
+    All discovery-related exceptions inherit from this class.
+    """
+
+    def __init__(
+        self,
+        message: str = "Discovery error",
+        *,
+        job_id: str | None = None,
+        source: str | None = None,
+        **kwargs: Any,
+    ) -> None:
+        """Initialize the exception.
+
+        Args:
+            message: Human-readable error message.
+            job_id: The discovery job ID if available.
+            source: The source URL being discovered.
+            **kwargs: Additional arguments for parent class.
+        """
+        super().__init__(message, **kwargs)
+        self.job_id = job_id
+        self.source = source
+        if job_id:
+            self.details["job_id"] = job_id
+        if source:
+            self.details["source"] = source
+
+
+class DiscoveryTimeoutError(DiscoveryError):
+    """Discovery job timed out.
+
+    Raised when a discovery job exceeds its configured timeout
+    while waiting for completion.
+    """
+
+    def __init__(
+        self,
+        message: str = "Discovery job timed out",
+        *,
+        elapsed: float | None = None,
+        timeout: float | None = None,
+        **kwargs: Any,
+    ) -> None:
+        """Initialize the exception.
+
+        Args:
+            message: Human-readable error message.
+            elapsed: Time elapsed before timeout.
+            timeout: The timeout value that was exceeded.
+            **kwargs: Additional arguments for parent class.
+        """
+        super().__init__(message, **kwargs)
+        self.elapsed = elapsed
+        self.timeout = timeout
+        if elapsed is not None:
+            self.details["elapsed"] = elapsed
+        if timeout is not None:
+            self.details["timeout"] = timeout
+
+
+class DiscoveryJobFailedError(DiscoveryError):
+    """Discovery job failed on the server.
+
+    Raised when a discovery job completes with a failed or cancelled status.
+    """
+
+    def __init__(
+        self,
+        message: str = "Discovery job failed",
+        *,
+        status: str | None = None,
+        **kwargs: Any,
+    ) -> None:
+        """Initialize the exception.
+
+        Args:
+            message: Human-readable error message.
+            status: The final job status.
+            **kwargs: Additional arguments for parent class.
+        """
+        super().__init__(message, **kwargs)
+        self.status = status
+        if status:
+            self.details["status"] = status
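
A sketch of how the new exceptions compose with DiscoveryJobStatus.is_terminal() in a polling loop. fetch_status() is a hypothetical stand-in for the real Neo Reader client; only the exception signatures come from this diff.

import time

from gnosisllm_knowledge.core.domain.discovery import DiscoveryJobStatus
from gnosisllm_knowledge.core.exceptions import (
    DiscoveryJobFailedError,
    DiscoveryTimeoutError,
)

def wait_for_discovery(job_id: str, timeout: float = 300.0) -> DiscoveryJobStatus:
    """Poll until the job reaches a terminal state or the timeout elapses."""
    start = time.monotonic()
    while True:
        status = fetch_status(job_id)  # hypothetical status call
        if status.is_terminal():
            if status.status != "completed":
                # failed or cancelled -> surface as DiscoveryJobFailedError
                raise DiscoveryJobFailedError(
                    f"Discovery ended with status {status.status!r}",
                    job_id=job_id,
                    status=status.status,
                )
            return status
        elapsed = time.monotonic() - start
        if elapsed > timeout:
            raise DiscoveryTimeoutError(job_id=job_id, elapsed=elapsed, timeout=timeout)
        time.sleep(2.0)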