gnosisllm-knowledge 0.3.0__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff compares the contents of two package versions publicly released to one of the supported registries, and is provided for informational purposes only. It reflects the changes between the versions as they appear in their respective public registries.
- gnosisllm_knowledge/api/knowledge.py +233 -35
- gnosisllm_knowledge/backends/memory/indexer.py +27 -2
- gnosisllm_knowledge/backends/memory/searcher.py +132 -10
- gnosisllm_knowledge/backends/opensearch/agentic.py +14 -9
- gnosisllm_knowledge/backends/opensearch/config.py +7 -0
- gnosisllm_knowledge/backends/opensearch/indexer.py +48 -3
- gnosisllm_knowledge/backends/opensearch/mappings.py +12 -4
- gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
- gnosisllm_knowledge/backends/opensearch/searcher.py +64 -6
- gnosisllm_knowledge/backends/opensearch/setup.py +29 -33
- gnosisllm_knowledge/cli/app.py +58 -19
- gnosisllm_knowledge/cli/commands/agentic.py +15 -9
- gnosisllm_knowledge/cli/commands/load.py +169 -19
- gnosisllm_knowledge/cli/commands/memory.py +10 -0
- gnosisllm_knowledge/cli/commands/search.py +9 -10
- gnosisllm_knowledge/cli/commands/setup.py +25 -1
- gnosisllm_knowledge/cli/utils/config.py +4 -4
- gnosisllm_knowledge/core/domain/__init__.py +13 -0
- gnosisllm_knowledge/core/domain/discovery.py +166 -0
- gnosisllm_knowledge/core/domain/document.py +14 -19
- gnosisllm_knowledge/core/domain/search.py +10 -25
- gnosisllm_knowledge/core/domain/source.py +11 -12
- gnosisllm_knowledge/core/events/__init__.py +8 -0
- gnosisllm_knowledge/core/events/types.py +122 -5
- gnosisllm_knowledge/core/exceptions.py +93 -0
- gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
- gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
- gnosisllm_knowledge/core/interfaces/searcher.py +30 -1
- gnosisllm_knowledge/core/interfaces/streaming.py +10 -4
- gnosisllm_knowledge/fetchers/__init__.py +8 -0
- gnosisllm_knowledge/fetchers/config.py +27 -0
- gnosisllm_knowledge/fetchers/neoreader.py +31 -3
- gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
- gnosisllm_knowledge/loaders/__init__.py +5 -1
- gnosisllm_knowledge/loaders/discovery.py +338 -0
- gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
- gnosisllm_knowledge/loaders/factory.py +46 -0
- gnosisllm_knowledge/services/indexing.py +51 -21
- gnosisllm_knowledge/services/search.py +42 -28
- gnosisllm_knowledge/services/streaming_pipeline.py +45 -7
- {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/METADATA +30 -10
- gnosisllm_knowledge-0.4.3.dist-info/RECORD +81 -0
- gnosisllm_knowledge-0.3.0.dist-info/RECORD +0 -77
- {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/WHEEL +0 -0
- {gnosisllm_knowledge-0.3.0.dist-info → gnosisllm_knowledge-0.4.3.dist-info}/entry_points.txt +0 -0
gnosisllm_knowledge/core/domain/discovery.py (new file)

@@ -0,0 +1,166 @@
+"""Domain models for website discovery."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+
+
+@dataclass
+class DiscoveryConfig:
+    """Configuration for website discovery crawl.
+
+    Controls how the Neo Reader Discovery API crawls and discovers URLs.
+
+    Attributes:
+        max_depth: Maximum crawl depth from start URL.
+        max_pages: Maximum number of pages to crawl.
+        same_domain: Only crawl URLs on the same domain.
+        include_subdomains: Include subdomains when same_domain is True.
+        respect_robots: Respect robots.txt rules.
+        parse_sitemap: Also parse sitemap if available.
+        with_metadata: Include page metadata (title, etc.) in results.
+        crawl_timeout: Overall timeout for the crawl in seconds.
+        concurrent_requests: Number of concurrent crawl requests.
+        request_delay: Delay between requests in milliseconds.
+        include_pattern: Regex pattern for URLs to include.
+        exclude_pattern: Regex pattern for URLs to exclude.
+        path_prefix: Only crawl URLs with this path prefix.
+    """
+
+    max_depth: int = 3
+    max_pages: int = 100
+    same_domain: bool = True
+    include_subdomains: bool = True
+    respect_robots: bool = True
+    parse_sitemap: bool = False
+    with_metadata: bool = True
+    crawl_timeout: int = 300
+    concurrent_requests: int = 5
+    request_delay: int = 100
+    include_pattern: str | None = None
+    exclude_pattern: str | None = None
+    path_prefix: str | None = None
+
+    def to_headers(self) -> dict[str, str]:
+        """Convert config to HTTP headers for Neo Reader API.
+
+        Returns:
+            Dictionary of header name to value.
+        """
+        headers = {
+            "X-Max-Depth": str(self.max_depth),
+            "X-Max-Pages": str(self.max_pages),
+            "X-Same-Domain": str(self.same_domain).lower(),
+            "X-Include-Subdomains": str(self.include_subdomains).lower(),
+            "X-Respect-Robots": str(self.respect_robots).lower(),
+            "X-Parse-Sitemap": str(self.parse_sitemap).lower(),
+            "X-With-Metadata": str(self.with_metadata).lower(),
+            "X-Crawl-Timeout": str(self.crawl_timeout),
+            "X-Concurrent-Requests": str(self.concurrent_requests),
+            "X-Request-Delay": str(self.request_delay),
+        }
+        if self.include_pattern:
+            headers["X-Include-Pattern"] = self.include_pattern
+        if self.exclude_pattern:
+            headers["X-Exclude-Pattern"] = self.exclude_pattern
+        if self.path_prefix:
+            headers["X-Path-Prefix"] = self.path_prefix
+        return headers
+
+
+@dataclass
+class DiscoveryProgress:
+    """Progress information for a running discovery job.
+
+    Attributes:
+        percent: Completion percentage (0-100).
+        pages_crawled: Number of pages crawled so far.
+        urls_discovered: Number of URLs discovered so far.
+        current_depth: Current crawl depth.
+        message: Human-readable progress message.
+    """
+
+    percent: int = 0
+    pages_crawled: int = 0
+    urls_discovered: int = 0
+    current_depth: int = 0
+    message: str = ""
+
+
+@dataclass
+class DiscoveryStats:
+    """Statistics for a completed discovery job.
+
+    Attributes:
+        pages_crawled: Total pages crawled.
+        urls_found: Total URLs found during crawl.
+        urls_returned: URLs returned in results (after filtering).
+        urls_filtered: URLs excluded by filters.
+        errors: Number of errors during crawl.
+        duration_seconds: Total crawl duration.
+    """
+
+    pages_crawled: int = 0
+    urls_found: int = 0
+    urls_returned: int = 0
+    urls_filtered: int = 0
+    errors: int = 0
+    duration_seconds: float = 0.0
+
+
+@dataclass
+class DiscoveredURL:
+    """A URL discovered during crawl.
+
+    Attributes:
+        url: The discovered URL.
+        depth: Crawl depth at which URL was found.
+        title: Page title if available.
+        is_internal: Whether URL is internal to the domain.
+    """
+
+    url: str
+    depth: int = 0
+    title: str | None = None
+    is_internal: bool = True
+
+
+@dataclass
+class DiscoveryJobStatus:
+    """Status of a discovery job.
+
+    Represents the current state of an async discovery job.
+
+    Attributes:
+        job_id: Unique job identifier.
+        status: Job status (pending, queued, running, completed, failed, cancelled).
+        start_url: The URL that started the discovery.
+        progress: Progress information if job is running.
+        stats: Statistics if job is completed.
+        urls: Discovered URLs if job is completed.
+        error: Error message if job failed.
+    """
+
+    job_id: str
+    status: str
+    start_url: str
+    progress: DiscoveryProgress | None = None
+    stats: DiscoveryStats | None = None
+    urls: list[DiscoveredURL] = field(default_factory=list)
+    error: str | None = None
+
+    def is_terminal(self) -> bool:
+        """Check if job is in a terminal state.
+
+        Returns:
+            True if job is completed, failed, or cancelled.
+        """
+        return self.status in ("completed", "failed", "cancelled")
+
+    def is_running(self) -> bool:
+        """Check if job is currently running.
+
+        Returns:
+            True if job is pending, queued, or running.
+        """
+        return self.status in ("pending", "queued", "running")
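The hunk above is the whole of the new discovery domain module. A minimal sketch of how these models compose, using only what the file itself defines (the `get_status` callable is a hypothetical stand-in for whatever client returns a `DiscoveryJobStatus`):

```python
import time

from gnosisllm_knowledge.core.domain.discovery import DiscoveryConfig, DiscoveryJobStatus

config = DiscoveryConfig(max_depth=2, max_pages=50, path_prefix="/docs")

# to_headers() stringifies numbers and lowercases booleans.
headers = config.to_headers()
assert headers["X-Same-Domain"] == "true"
assert headers["X-Max-Pages"] == "50"
assert headers["X-Path-Prefix"] == "/docs"


def wait_for_discovery(get_status, job_id: str, poll_interval: float = 1.0) -> DiscoveryJobStatus:
    """Poll until the job reaches completed/failed/cancelled."""
    status = get_status(job_id)
    while not status.is_terminal():
        time.sleep(poll_interval)
        status = get_status(job_id)
    return status
```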
gnosisllm_knowledge/core/domain/document.py

@@ -26,17 +26,22 @@ class Document:
     This is the core domain object that flows through the knowledge pipeline.
     Documents are created by loaders, processed by chunkers, and stored by indexers.
 
+    Note:
+        This library is tenant-agnostic. Multi-tenancy is achieved through index
+        isolation (e.g., `knowledge-{account_id}`). Tenant information like account_id
+        should be passed in the metadata dictionary if needed for audit purposes.
+
     Attributes:
         content: The main text content of the document.
         source: Source identifier (URL, file path, etc.).
         doc_id: Unique identifier. Auto-generated from content hash if not provided.
         title: Optional document title.
         url: URL where the document was fetched from.
-        metadata: Arbitrary metadata dictionary.
+        metadata: Arbitrary metadata dictionary (can include tenant info for audit).
 
-
-        account_id: Account/tenant identifier.
+    Collection fields:
         collection_id: Collection the document belongs to.
+        collection_name: Collection name for display in aggregations.
         source_id: Source identifier within the collection.
 
     Chunking info:
@@ -70,8 +75,7 @@ class Document:
     url: str | None = None
     metadata: dict[str, Any] = field(default_factory=dict)
 
-    #
-    account_id: str | None = None
+    # Collection fields
     collection_id: str | None = None
     collection_name: str | None = None  # For display in aggregations
     source_id: str | None = None
@@ -139,7 +143,6 @@ class Document:
             title=self.title,
             url=self.url,
             metadata=self.metadata.copy(),
-            account_id=self.account_id,
             collection_id=self.collection_id,
             collection_name=self.collection_name,
             source_id=self.source_id,
@@ -154,23 +157,21 @@ class Document:
             created_at=self.created_at,
         )
 
-    def
+    def with_collection(
         self,
-
-        collection_id: str | None = None,
+        collection_id: str,
         collection_name: str | None = None,
         source_id: str | None = None,
     ) -> Document:
-        """Create a new document with
+        """Create a new document with collection information.
 
         Args:
-            account_id: Account/tenant identifier.
             collection_id: Collection identifier.
             collection_name: Collection name for display.
             source_id: Source identifier.
 
         Returns:
-            New Document instance with
+            New Document instance with collection information set.
         """
         return Document(
             content=self.content,
@@ -179,8 +180,7 @@ class Document:
             title=self.title,
             url=self.url,
             metadata=self.metadata.copy(),
-
-            collection_id=collection_id or self.collection_id,
+            collection_id=collection_id,
             collection_name=collection_name or self.collection_name,
             source_id=source_id or self.source_id,
             chunk_index=self.chunk_index,
@@ -203,11 +203,6 @@ class Document:
         """Check if this document is a chunk of a larger document."""
        return self.chunk_index is not None and self.total_chunks is not None
 
-    @property
-    def is_multi_tenant(self) -> bool:
-        """Check if this document has tenant information."""
-        return self.account_id is not None
-
 
 @dataclass
 class TextChunk:
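With `account_id` and `is_multi_tenant` gone from `Document`, tenant context becomes plain metadata and collection assignment goes through the narrowed `with_collection`, which now requires a `collection_id`. A minimal sketch using only the fields shown above:

```python
from gnosisllm_knowledge.core.domain.document import Document

doc = Document(
    content="How to configure the search backend...",
    source="https://example.com/docs/setup",
    metadata={"account_id": "acct-42"},  # tenant info is audit metadata now
)

scoped = doc.with_collection(
    collection_id="docs",
    collection_name="Product Docs",
    source_id="setup-guide",
)
assert scoped.collection_id == "docs"
assert doc.collection_id is None  # with_collection returns a copy
```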
gnosisllm_knowledge/core/domain/search.py

@@ -39,6 +39,11 @@ class AgentType(str, Enum):
 class SearchQuery:
     """Search query with filters and options.
 
+    Note:
+        This library is tenant-agnostic. Multi-tenancy is achieved through index
+        isolation (e.g., `knowledge-{account_id}`). Callers should ensure they're
+        searching the correct tenant-specific index.
+
     Attributes:
         text: The search query text.
         mode: Search mode to use.
@@ -49,7 +54,6 @@ class SearchQuery:
     Filters:
         collection_ids: Filter by collection IDs.
         source_ids: Filter by source IDs.
-        account_id: Multi-tenant account filter.
         metadata_filters: Custom metadata filters.
 
     Advanced options:
@@ -69,7 +73,6 @@ class SearchQuery:
     # Filters
     collection_ids: list[str] | None = None
     source_ids: list[str] | None = None
-    account_id: str | None = None
     metadata_filters: dict[str, Any] = field(default_factory=dict)
 
     # Advanced options
@@ -89,26 +92,6 @@ class SearchQuery:
             min_score=self.min_score,
             collection_ids=self.collection_ids,
             source_ids=self.source_ids,
-            account_id=self.account_id,
-            metadata_filters=self.metadata_filters.copy(),
-            field_boosts=self.field_boosts.copy() if self.field_boosts else None,
-            include_highlights=self.include_highlights,
-            include_fields=self.include_fields,
-            exclude_fields=self.exclude_fields,
-            explain=self.explain,
-        )
-
-    def with_tenant(self, account_id: str) -> SearchQuery:
-        """Create a copy with tenant information."""
-        return SearchQuery(
-            text=self.text,
-            mode=self.mode,
-            limit=self.limit,
-            offset=self.offset,
-            min_score=self.min_score,
-            collection_ids=self.collection_ids,
-            source_ids=self.source_ids,
-            account_id=account_id,
             metadata_filters=self.metadata_filters.copy(),
             field_boosts=self.field_boosts.copy() if self.field_boosts else None,
             include_highlights=self.include_highlights,
@@ -216,13 +199,17 @@ class ReasoningStep:
 class AgenticSearchQuery:
     """Query for agentic search with conversation support.
 
+    Note:
+        This library is tenant-agnostic. Multi-tenancy is achieved through index
+        isolation (e.g., `knowledge-{account_id}`). Callers should ensure they're
+        searching the correct tenant-specific index.
+
     Attributes:
         text: The search query text.
         agent_type: Type of agent to use.
         conversation_id: ID for continuing a conversation.
         collection_ids: Filter by collection IDs.
         source_ids: Filter by source IDs.
-        account_id: Multi-tenant account filter.
         limit: Maximum number of source documents to retrieve.
         include_reasoning: Whether to include reasoning steps.
         metadata_filters: Custom metadata filters.
@@ -235,7 +222,6 @@ class AgenticSearchQuery:
     conversation_id: str | None = None
     collection_ids: list[str] | None = None
     source_ids: list[str] | None = None
-    account_id: str | None = None
     limit: int = 10
     include_reasoning: bool = True
     metadata_filters: dict[str, Any] = field(default_factory=dict)
@@ -250,7 +236,6 @@ class AgenticSearchQuery:
             limit=self.limit,
             collection_ids=self.collection_ids,
             source_ids=self.source_ids,
-            account_id=self.account_id,
             metadata_filters=self.metadata_filters.copy(),
         )
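The removed `with_tenant` helper is not replaced; the tenant is now selected by which index you search. A sketch of that pattern, where the `searcher.search(...)` call shape is illustrative rather than confirmed by this diff:

```python
from gnosisllm_knowledge.core.domain.search import SearchQuery


def tenant_index(account_id: str) -> str:
    # Index-per-tenant naming convention from the docstrings above.
    return f"knowledge-{account_id}"


query = SearchQuery(
    text="how do I configure opensearch?",
    collection_ids=["docs"],
    limit=5,
)

# Hypothetical call shape: tenancy via index name, not a query field.
# results = await searcher.search(query, index=tenant_index("acct-42"))
```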
gnosisllm_knowledge/core/domain/source.py

@@ -10,6 +10,11 @@ from typing import Any
 class SourceConfig:
     """Configuration for a content source.
 
+    Note:
+        This library is tenant-agnostic. Multi-tenancy is achieved through index
+        isolation (e.g., `knowledge-{account_id}`). Tenant information should be
+        managed by the caller, not embedded in source configuration.
+
     Attributes:
         url: The source URL or path.
         source_type: Type of source (website, sitemap, file, etc.).
@@ -26,8 +31,7 @@ class SourceConfig:
         remove_selector: CSS selector for elements to remove.
         timeout: Request timeout in seconds.
 
-
-        account_id: Account/tenant identifier.
+    Collection:
         collection_id: Collection identifier.
         source_id: Source identifier within collection.
     """
@@ -47,8 +51,7 @@ class SourceConfig:
     remove_selector: str | None = None
     timeout: int | None = None
 
-    #
-    account_id: str | None = None
+    # Collection
     collection_id: str | None = None
     source_id: str | None = None
 
@@ -73,26 +76,23 @@ class SourceConfig:
             target_selector=self.target_selector,
             remove_selector=self.remove_selector,
             timeout=self.timeout,
-            account_id=self.account_id,
             collection_id=self.collection_id,
             source_id=self.source_id,
         )
 
-    def
+    def with_collection(
         self,
-
-        collection_id: str | None = None,
+        collection_id: str,
         source_id: str | None = None,
     ) -> SourceConfig:
-        """Create a copy with
+        """Create a copy with collection information.
 
         Args:
-            account_id: Account/tenant identifier.
             collection_id: Collection identifier.
             source_id: Source identifier.
 
         Returns:
-            New SourceConfig with
+            New SourceConfig with collection information.
         """
         return SourceConfig(
             url=self.url,
@@ -105,7 +105,6 @@ class SourceConfig:
             target_selector=self.target_selector,
             remove_selector=self.remove_selector,
             timeout=self.timeout,
-            account_id=account_id,
             collection_id=collection_id,
             source_id=source_id,
         )
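`SourceConfig` follows the same pattern as `Document`. A short sketch; `source_type` is written as a plain string here, though the real field may be an enum:

```python
from gnosisllm_knowledge.core.domain.source import SourceConfig

base = SourceConfig(url="https://example.com/sitemap.xml", source_type="sitemap")
scoped = base.with_collection(collection_id="docs", source_id="main-sitemap")

assert scoped.collection_id == "docs"
assert base.collection_id is None  # with_collection returns a copy
```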
gnosisllm_knowledge/core/events/__init__.py

@@ -4,6 +4,10 @@ from gnosisllm_knowledge.core.events.emitter import EventEmitter
 from gnosisllm_knowledge.core.events.types import (
     BatchCompletedEvent,
     BatchStartedEvent,
+    DiscoveryCompletedEvent,
+    DiscoveryFailedEvent,
+    DiscoveryProgressEvent,
+    DiscoveryStartedEvent,
     DocumentIndexedEvent,
     DocumentLoadedEvent,
     Event,
@@ -20,4 +24,8 @@ __all__ = [
     "SitemapDiscoveryEvent",
     "BatchStartedEvent",
     "BatchCompletedEvent",
+    "DiscoveryStartedEvent",
+    "DiscoveryProgressEvent",
+    "DiscoveryCompletedEvent",
+    "DiscoveryFailedEvent",
 ]
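The four event classes defined later in `types.py` are re-exported at the package level, so consumers can import them next to the existing event types:

```python
from gnosisllm_knowledge.core.events import (
    DiscoveryCompletedEvent,
    DiscoveryFailedEvent,
    DiscoveryProgressEvent,
    DiscoveryStartedEvent,
)
```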
gnosisllm_knowledge/core/events/types.py

@@ -34,6 +34,12 @@ class EventType(str, Enum):
     LOAD_FAILED = "load_failed"
     SITEMAP_DISCOVERED = "sitemap_discovered"
 
+    # Discovery events
+    DISCOVERY_STARTED = "discovery_started"
+    DISCOVERY_PROGRESS = "discovery_progress"
+    DISCOVERY_COMPLETED = "discovery_completed"
+    DISCOVERY_FAILED = "discovery_failed"
+
     # Streaming events
     STREAMING_PROGRESS = "streaming_progress"
     URL_BATCH_PROCESSED = "url_batch_processed"
@@ -91,11 +97,14 @@ class EventType(str, Enum):
 class Event:
     """Base event class.
 
+    Note:
+        This library is tenant-agnostic. Multi-tenancy is achieved through index
+        isolation. Any tenant-specific context should be passed in the data dict.
+
     Attributes:
         event_type: The type of event.
         timestamp: When the event occurred.
-        data: Additional event data.
-        account_id: Account ID for multi-tenant context.
+        data: Additional event data (can include tenant context for audit).
         user_id: User ID if applicable.
         request_id: Request ID for tracing.
         trace_id: Distributed trace ID.
@@ -107,7 +116,6 @@ class Event:
     data: dict[str, Any] = field(default_factory=dict)
 
     # Context
-    account_id: str | None = None
     user_id: str | None = None
     request_id: str | None = None
 
@@ -117,7 +125,6 @@ class Event:
 
     def with_context(
         self,
-        account_id: str | None = None,
         user_id: str | None = None,
         request_id: str | None = None,
     ) -> Event:
@@ -126,7 +133,6 @@ class Event:
             event_type=self.event_type,
             timestamp=self.timestamp,
             data=self.data.copy(),
-            account_id=account_id or self.account_id,
             user_id=user_id or self.user_id,
             request_id=request_id or self.request_id,
             trace_id=self.trace_id,
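`with_context` now threads only request-scoped identifiers; tenant context travels in `data`. A sketch, assuming `timestamp` has a default (it is not shown in this diff):

```python
from gnosisllm_knowledge.core.events.types import Event, EventType

event = Event(
    event_type=EventType.SITEMAP_DISCOVERED,
    data={"account_id": "acct-42"},  # tenant context is ordinary event data now
)
traced = event.with_context(user_id="user-7", request_id="req-123")

assert traced.data["account_id"] == "acct-42"
assert traced.request_id == "req-123"
```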
gnosisllm_knowledge/core/events/types.py (continued)

@@ -300,3 +306,114 @@ class StreamingCompletedEvent(Event):
             "failed_count": self.failed_count,
             "duration_ms": self.duration_ms,
         }
+
+
+# === Discovery Events ===
+
+
+@dataclass
+class DiscoveryStartedEvent(Event):
+    """Event emitted when a discovery job starts.
+
+    Attributes:
+        url: The starting URL for discovery.
+        job_id: The discovery job ID.
+        config: Discovery configuration as dictionary.
+    """
+
+    url: str = ""
+    job_id: str = ""
+    config: dict[str, Any] = field(default_factory=dict)
+
+    def __post_init__(self) -> None:
+        """Set event type."""
+        self.event_type = EventType.DISCOVERY_STARTED
+        self.data = {
+            "url": self.url,
+            "job_id": self.job_id,
+            "config": self.config,
+        }
+
+
+@dataclass
+class DiscoveryProgressEvent(Event):
+    """Event emitted during discovery progress updates.
+
+    Attributes:
+        job_id: The discovery job ID.
+        percent: Progress percentage (0-100).
+        pages_crawled: Number of pages crawled so far.
+        urls_discovered: Number of URLs discovered so far.
+        current_depth: Current crawl depth.
+        message: Human-readable progress message.
+    """
+
+    job_id: str = ""
+    percent: int = 0
+    pages_crawled: int = 0
+    urls_discovered: int = 0
+    current_depth: int = 0
+    message: str = ""
+
+    def __post_init__(self) -> None:
+        """Set event type."""
+        self.event_type = EventType.DISCOVERY_PROGRESS
+        self.data = {
+            "job_id": self.job_id,
+            "percent": self.percent,
+            "pages_crawled": self.pages_crawled,
+            "urls_discovered": self.urls_discovered,
+            "current_depth": self.current_depth,
+            "message": self.message,
+        }
+
+
+@dataclass
+class DiscoveryCompletedEvent(Event):
+    """Event emitted when discovery completes successfully.
+
+    Attributes:
+        job_id: The discovery job ID.
+        urls_count: Total number of URLs discovered.
+        pages_crawled: Total number of pages crawled.
+        duration_seconds: Total discovery duration.
+        errors: Number of errors encountered during discovery.
+    """
+
+    job_id: str = ""
+    urls_count: int = 0
+    pages_crawled: int = 0
+    duration_seconds: float = 0.0
+    errors: int = 0
+
+    def __post_init__(self) -> None:
+        """Set event type."""
+        self.event_type = EventType.DISCOVERY_COMPLETED
+        self.data = {
+            "job_id": self.job_id,
+            "urls_count": self.urls_count,
+            "pages_crawled": self.pages_crawled,
+            "duration_seconds": self.duration_seconds,
+            "errors": self.errors,
+        }
+
+
+@dataclass
+class DiscoveryFailedEvent(Event):
+    """Event emitted when discovery fails.
+
+    Attributes:
+        job_id: The discovery job ID.
+        error: Error message describing the failure.
+    """
+
+    job_id: str = ""
+    error: str = ""
+
+    def __post_init__(self) -> None:
+        """Set event type."""
+        self.event_type = EventType.DISCOVERY_FAILED
+        self.data = {
+            "job_id": self.job_id,
+            "error": self.error,
+        }
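Each discovery event sets its own `event_type` and mirrors its fields into the generic `data` payload in `__post_init__`, so subscribers that only know the base `Event` shape still see everything:

```python
from gnosisllm_knowledge.core.events import DiscoveryProgressEvent

event = DiscoveryProgressEvent(
    job_id="job-1",
    percent=40,
    pages_crawled=20,
    urls_discovered=85,
    message="crawling depth 2",
)

assert event.event_type.value == "discovery_progress"
assert event.data["urls_discovered"] == 85
```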
gnosisllm_knowledge/core/exceptions.py

@@ -539,3 +539,96 @@ class MemoryConfigurationError(MemoryError):
         self.missing_config = missing_config
         if missing_config:
             self.details["missing_config"] = missing_config
+
+
+# === Discovery Exceptions ===
+
+
+class DiscoveryError(KnowledgeError):
+    """Base exception for discovery operations.
+
+    Raised when website discovery fails.
+    All discovery-related exceptions inherit from this class.
+    """
+
+    def __init__(
+        self,
+        message: str = "Discovery error",
+        *,
+        job_id: str | None = None,
+        source: str | None = None,
+        **kwargs: Any,
+    ) -> None:
+        """Initialize the exception.
+
+        Args:
+            message: Human-readable error message.
+            job_id: The discovery job ID if available.
+            source: The source URL being discovered.
+            **kwargs: Additional arguments for parent class.
+        """
+        super().__init__(message, **kwargs)
+        self.job_id = job_id
+        self.source = source
+        if job_id:
+            self.details["job_id"] = job_id
+        if source:
+            self.details["source"] = source
+
+
+class DiscoveryTimeoutError(DiscoveryError):
+    """Discovery job timed out.
+
+    Raised when a discovery job exceeds its configured timeout
+    while waiting for completion.
+    """
+
+    def __init__(
+        self,
+        message: str = "Discovery job timed out",
+        *,
+        elapsed: float | None = None,
+        timeout: float | None = None,
+        **kwargs: Any,
+    ) -> None:
+        """Initialize the exception.
+
+        Args:
+            message: Human-readable error message.
+            elapsed: Time elapsed before timeout.
+            timeout: The timeout value that was exceeded.
+            **kwargs: Additional arguments for parent class.
+        """
+        super().__init__(message, **kwargs)
+        self.elapsed = elapsed
+        self.timeout = timeout
+        if elapsed is not None:
+            self.details["elapsed"] = elapsed
+        if timeout is not None:
+            self.details["timeout"] = timeout
+
+
+class DiscoveryJobFailedError(DiscoveryError):
+    """Discovery job failed on the server.
+
+    Raised when a discovery job completes with a failed or cancelled status.
+    """
+
+    def __init__(
+        self,
+        message: str = "Discovery job failed",
+        *,
+        status: str | None = None,
+        **kwargs: Any,
+    ) -> None:
+        """Initialize the exception.
+
+        Args:
+            message: Human-readable error message.
+            status: The final job status.
+            **kwargs: Additional arguments for parent class.
+        """
+        super().__init__(message, **kwargs)
+        self.status = status
+        if status:
+            self.details["status"] = status