gnosisllm-knowledge 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. gnosisllm_knowledge/__init__.py +152 -0
  2. gnosisllm_knowledge/api/__init__.py +5 -0
  3. gnosisllm_knowledge/api/knowledge.py +548 -0
  4. gnosisllm_knowledge/backends/__init__.py +26 -0
  5. gnosisllm_knowledge/backends/memory/__init__.py +9 -0
  6. gnosisllm_knowledge/backends/memory/indexer.py +384 -0
  7. gnosisllm_knowledge/backends/memory/searcher.py +516 -0
  8. gnosisllm_knowledge/backends/opensearch/__init__.py +19 -0
  9. gnosisllm_knowledge/backends/opensearch/agentic.py +738 -0
  10. gnosisllm_knowledge/backends/opensearch/config.py +195 -0
  11. gnosisllm_knowledge/backends/opensearch/indexer.py +499 -0
  12. gnosisllm_knowledge/backends/opensearch/mappings.py +255 -0
  13. gnosisllm_knowledge/backends/opensearch/queries.py +445 -0
  14. gnosisllm_knowledge/backends/opensearch/searcher.py +383 -0
  15. gnosisllm_knowledge/backends/opensearch/setup.py +1390 -0
  16. gnosisllm_knowledge/chunking/__init__.py +9 -0
  17. gnosisllm_knowledge/chunking/fixed.py +138 -0
  18. gnosisllm_knowledge/chunking/sentence.py +239 -0
  19. gnosisllm_knowledge/cli/__init__.py +18 -0
  20. gnosisllm_knowledge/cli/app.py +509 -0
  21. gnosisllm_knowledge/cli/commands/__init__.py +7 -0
  22. gnosisllm_knowledge/cli/commands/agentic.py +529 -0
  23. gnosisllm_knowledge/cli/commands/load.py +369 -0
  24. gnosisllm_knowledge/cli/commands/search.py +440 -0
  25. gnosisllm_knowledge/cli/commands/setup.py +228 -0
  26. gnosisllm_knowledge/cli/display/__init__.py +5 -0
  27. gnosisllm_knowledge/cli/display/service.py +555 -0
  28. gnosisllm_knowledge/cli/utils/__init__.py +5 -0
  29. gnosisllm_knowledge/cli/utils/config.py +207 -0
  30. gnosisllm_knowledge/core/__init__.py +87 -0
  31. gnosisllm_knowledge/core/domain/__init__.py +43 -0
  32. gnosisllm_knowledge/core/domain/document.py +240 -0
  33. gnosisllm_knowledge/core/domain/result.py +176 -0
  34. gnosisllm_knowledge/core/domain/search.py +327 -0
  35. gnosisllm_knowledge/core/domain/source.py +139 -0
  36. gnosisllm_knowledge/core/events/__init__.py +23 -0
  37. gnosisllm_knowledge/core/events/emitter.py +216 -0
  38. gnosisllm_knowledge/core/events/types.py +226 -0
  39. gnosisllm_knowledge/core/exceptions.py +407 -0
  40. gnosisllm_knowledge/core/interfaces/__init__.py +20 -0
  41. gnosisllm_knowledge/core/interfaces/agentic.py +136 -0
  42. gnosisllm_knowledge/core/interfaces/chunker.py +64 -0
  43. gnosisllm_knowledge/core/interfaces/fetcher.py +112 -0
  44. gnosisllm_knowledge/core/interfaces/indexer.py +244 -0
  45. gnosisllm_knowledge/core/interfaces/loader.py +102 -0
  46. gnosisllm_knowledge/core/interfaces/searcher.py +178 -0
  47. gnosisllm_knowledge/core/interfaces/setup.py +164 -0
  48. gnosisllm_knowledge/fetchers/__init__.py +12 -0
  49. gnosisllm_knowledge/fetchers/config.py +77 -0
  50. gnosisllm_knowledge/fetchers/http.py +167 -0
  51. gnosisllm_knowledge/fetchers/neoreader.py +204 -0
  52. gnosisllm_knowledge/loaders/__init__.py +13 -0
  53. gnosisllm_knowledge/loaders/base.py +399 -0
  54. gnosisllm_knowledge/loaders/factory.py +202 -0
  55. gnosisllm_knowledge/loaders/sitemap.py +285 -0
  56. gnosisllm_knowledge/loaders/website.py +57 -0
  57. gnosisllm_knowledge/py.typed +0 -0
  58. gnosisllm_knowledge/services/__init__.py +9 -0
  59. gnosisllm_knowledge/services/indexing.py +387 -0
  60. gnosisllm_knowledge/services/search.py +349 -0
  61. gnosisllm_knowledge-0.2.0.dist-info/METADATA +382 -0
  62. gnosisllm_knowledge-0.2.0.dist-info/RECORD +64 -0
  63. gnosisllm_knowledge-0.2.0.dist-info/WHEEL +4 -0
  64. gnosisllm_knowledge-0.2.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,176 @@
1
+ """Result domain models."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import Any
7
+
8
+
9
@dataclass
class LoadResult:
    """Outcome of loading content from a single source.

    Attributes:
        source: The source that was loaded (URL, file path, etc.).
        source_type: Kind of source (website, sitemap, file, etc.).
        document_count: How many documents were produced by the load.
        success: Whether the load succeeded overall.
        error_message: Failure description, or None on success.
        duration_ms: Wall-clock duration of the load in milliseconds.
        metadata: Extra loader-specific details about the operation.
        urls_processed: URLs fetched successfully (multi-URL sources).
        urls_failed: URLs that could not be fetched.
        bytes_loaded: Total bytes of content retrieved.
    """

    source: str
    source_type: str
    document_count: int
    success: bool
    error_message: str | None = None
    duration_ms: float = 0.0
    metadata: dict[str, Any] = field(default_factory=dict)
    urls_processed: int = 0
    urls_failed: int = 0
    bytes_loaded: int = 0

    @property
    def success_rate(self) -> float:
        """Fraction of attempted URLs that loaded successfully."""
        attempted = self.urls_processed + self.urls_failed
        if not attempted:
            # No per-URL accounting recorded: fall back to the overall flag.
            return 1.0 if self.success else 0.0
        return self.urls_processed / attempted
44
+
45
+
46
@dataclass
class IndexResult:
    """Outcome of an indexing operation (single document or batch).

    Attributes:
        success: Whether the operation succeeded.
        document_id: ID of the indexed document (single-doc operations).
        index_name: Index where the documents were stored.
        indexed_count: Documents indexed successfully.
        failed_count: Documents that failed to index.
        error_message: Failure description when the operation failed outright.
        duration_ms: Wall-clock duration in milliseconds.
        failed_doc_ids: IDs of the documents that failed.
        errors: Per-document error details for the failures.
    """

    success: bool
    document_id: str | None = None
    index_name: str | None = None
    indexed_count: int = 0
    failed_count: int = 0
    error_message: str | None = None
    duration_ms: float = 0.0
    failed_doc_ids: list[str] = field(default_factory=list)
    errors: list[dict[str, Any]] = field(default_factory=list)

    @property
    def total_attempted(self) -> int:
        """Total number of documents this result accounts for."""
        return self.indexed_count + self.failed_count

    @property
    def success_rate(self) -> float:
        """Fraction of attempted documents that were indexed."""
        attempted = self.total_attempted
        if not attempted:
            # Nothing attempted: report 1.0 or 0.0 from the success flag.
            return 1.0 if self.success else 0.0
        return self.indexed_count / attempted

    def merge(self, other: IndexResult) -> IndexResult:
        """Combine this result with *other* into a new IndexResult.

        Useful for folding per-batch results into an aggregate. Counts,
        durations, and error lists are summed/concatenated; `success` is
        the conjunction of both flags; error messages are joined with
        "; " when both are present. `index_name` is taken from `self`.

        Args:
            other: Another IndexResult to merge with.

        Returns:
            New IndexResult combining both results.
        """
        if self.error_message and other.error_message:
            combined_error = f"{self.error_message}; {other.error_message}"
        else:
            combined_error = self.error_message or other.error_message
        return IndexResult(
            success=self.success and other.success,
            index_name=self.index_name,
            indexed_count=self.indexed_count + other.indexed_count,
            failed_count=self.failed_count + other.failed_count,
            error_message=combined_error,
            duration_ms=self.duration_ms + other.duration_ms,
            failed_doc_ids=self.failed_doc_ids + other.failed_doc_ids,
            errors=self.errors + other.errors,
        )
110
+
111
+
112
@dataclass
class BatchResult:
    """Outcome of a batch operation.

    Attributes:
        total: Total items processed.
        succeeded: Operations that succeeded.
        failed: Operations that failed.
        duration_ms: Wall-clock duration of the batch in milliseconds.
        errors: Messages for the errors that occurred.
    """

    total: int
    succeeded: int
    failed: int
    duration_ms: float = 0.0
    errors: list[str] = field(default_factory=list)

    @property
    def success_rate(self) -> float:
        """Fraction of items that succeeded (1.0 for an empty batch)."""
        if not self.total:
            return 1.0
        return self.succeeded / self.total
137
+
138
+
139
@dataclass
class ValidationResult:
    """Outcome of a validation check.

    Attributes:
        valid: Whether the content/source passed validation.
        message: Human-readable summary of the validation.
        errors: Validation errors, if any.
        warnings: Validation warnings, if any.
        metadata: Additional validation details.
    """

    valid: bool
    message: str = ""
    errors: list[str] = field(default_factory=list)
    warnings: list[str] = field(default_factory=list)
    metadata: dict[str, Any] = field(default_factory=dict)

    @classmethod
    def success(cls, message: str = "Validation passed") -> ValidationResult:
        """Build a passing result with an optional summary message."""
        return cls(valid=True, message=message)

    @classmethod
    def failure(cls, message: str, errors: list[str] | None = None) -> ValidationResult:
        """Build a failing result with a message and optional error list."""
        return cls(valid=False, message=message, errors=errors or [])

    def add_error(self, error: str) -> ValidationResult:
        """Record an error, mark the result invalid, and return self."""
        self.errors.append(error)
        self.valid = False
        return self

    def add_warning(self, warning: str) -> ValidationResult:
        """Record a warning (does not affect validity) and return self."""
        self.warnings.append(warning)
        return self
@@ -0,0 +1,327 @@
1
+ """Search domain models."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from enum import Enum
7
+ from typing import Any
8
+
9
+
10
class SearchMode(str, Enum):
    """Available strategies for executing a search.

    Members:
        SEMANTIC: Vector similarity search only (KNN).
        KEYWORD: BM25 full-text search only.
        HYBRID: Semantic and keyword search combined (the default).
        AGENTIC: AI-powered search with reasoning and answer generation.
    """

    SEMANTIC = "semantic"  # vector-only retrieval
    KEYWORD = "keyword"  # BM25-only retrieval
    HYBRID = "hybrid"  # fused vector + BM25
    AGENTIC = "agentic"  # LLM-agent orchestrated search
24
+
25
+
26
class AgentType(str, Enum):
    """Kinds of agent available for agentic search.

    Members:
        FLOW: Fast RAG — single query/response, no conversation memory.
        CONVERSATIONAL: Multi-turn conversations with memory support.
    """

    FLOW = "flow"  # stateless, single-shot
    CONVERSATIONAL = "conversational"  # stateful, multi-turn
36
+
37
+
38
@dataclass
class SearchQuery:
    """Search query with filters and options.

    Attributes:
        text: The search query text.
        mode: Search mode to use.
        limit: Maximum number of results to return.
        offset: Number of results to skip (for pagination).
        min_score: Minimum score threshold for results.

    Filters:
        collection_ids: Filter by collection IDs.
        source_ids: Filter by source IDs.
        account_id: Multi-tenant account filter.
        metadata_filters: Custom metadata filters.

    Advanced options:
        field_boosts: Field boosting weights.
        include_highlights: Whether to include highlights.
        include_fields: Fields to include in results.
        exclude_fields: Fields to exclude from results.
        explain: Whether to include score explanation.
    """

    text: str
    mode: SearchMode = SearchMode.HYBRID
    limit: int = 10
    offset: int = 0
    min_score: float | None = None

    # Filters
    collection_ids: list[str] | None = None
    source_ids: list[str] | None = None
    account_id: str | None = None
    metadata_filters: dict[str, Any] = field(default_factory=dict)

    # Advanced options
    field_boosts: dict[str, float] | None = None
    include_highlights: bool = True
    include_fields: list[str] | None = None
    exclude_fields: list[str] | None = None
    explain: bool = False

    def _copy(self, **overrides: Any) -> SearchQuery:
        """Clone this query, applying *overrides* to the copy.

        Shallow-copies the mutable dict fields (metadata_filters,
        field_boosts) so the copy cannot mutate this query's state;
        other fields are carried over by reference, matching the
        previous per-method copy behavior.
        """
        state: dict[str, Any] = {
            "text": self.text,
            "mode": self.mode,
            "limit": self.limit,
            "offset": self.offset,
            "min_score": self.min_score,
            "collection_ids": self.collection_ids,
            "source_ids": self.source_ids,
            "account_id": self.account_id,
            "metadata_filters": self.metadata_filters.copy(),
            "field_boosts": self.field_boosts.copy() if self.field_boosts else None,
            "include_highlights": self.include_highlights,
            "include_fields": self.include_fields,
            "exclude_fields": self.exclude_fields,
            "explain": self.explain,
        }
        state.update(overrides)
        return SearchQuery(**state)

    def with_mode(self, mode: SearchMode) -> SearchQuery:
        """Create a copy with a different search mode."""
        return self._copy(mode=mode)

    def with_tenant(self, account_id: str) -> SearchQuery:
        """Create a copy with tenant information."""
        return self._copy(account_id=account_id)
119
+
120
+
121
@dataclass
class SearchResultItem:
    """A single search result.

    Attributes:
        doc_id: Document identifier.
        content: Document content.
        score: Relevance score.
        title: Document title.
        url: Document URL.
        source: Source identifier.
        collection_id: Collection identifier.
        source_id: Source identifier within collection.
        chunk_index: Chunk index if document is chunked.
        total_chunks: Total chunks in parent document.
        metadata: Document metadata.
        highlights: Highlighted snippets from matching content.
        highlighted_title: Title with highlight markup applied, if any.
        explanation: Score explanation (when explain=True).
    """

    # Required fields: identity, body text, and relevance.
    doc_id: str
    content: str
    score: float
    # Optional descriptive fields.
    title: str | None = None
    url: str | None = None
    source: str | None = None
    # Multi-tenant / provenance fields.
    collection_id: str | None = None
    source_id: str | None = None
    # Chunking position within the parent document.
    chunk_index: int | None = None
    total_chunks: int | None = None
    metadata: dict[str, Any] | None = None
    # Highlighting and scoring extras (populated on request).
    highlights: list[str] | None = None
    highlighted_title: str | None = None
    explanation: dict[str, Any] | None = None
155
+
156
+
157
@dataclass
class SearchResult:
    """Complete search result with metadata.

    Attributes:
        query: The original search query text.
        mode: Search mode that was used.
        items: List of search result items.
        total_hits: Total number of matching documents.
        duration_ms: Search duration in milliseconds.
        max_score: Maximum score among results.
        from_cache: Whether results came from cache.
        cache_key: Cache key if results are cacheable.
        search_after_token: Opaque token for cursor-based pagination.
        has_more: Whether more results are available beyond this page.
    """

    query: str
    mode: SearchMode
    items: list[SearchResultItem]
    total_hits: int
    duration_ms: float
    max_score: float | None = None
    from_cache: bool = False
    cache_key: str | None = None
    search_after_token: Any | None = None  # For cursor-based pagination
    has_more: bool = False

    @property
    def has_results(self) -> bool:
        """True when this page contains at least one result."""
        return bool(self.items)

    @property
    def count(self) -> int:
        """Number of results in this page (not the total hit count)."""
        return len(self.items)
192
+
193
+
194
@dataclass
class ReasoningStep:
    """A single step in the agent's reasoning process.

    Attributes:
        tool: The tool that was used (e.g., "VectorDBTool", "MLModelTool").
        action: The action performed.
        input: Input provided to the tool.
        output: Output from the tool.
        duration_ms: Duration of this step in milliseconds.
        tokens_used: Number of tokens consumed by this step.
    """

    # NOTE(review): `input` shadows the builtin as a field name; harmless
    # on a dataclass attribute but worth knowing when reading the class.
    tool: str
    action: str
    input: str | None = None
    output: str | None = None
    duration_ms: float = 0.0
    tokens_used: int = 0
213
+
214
+
215
@dataclass
class AgenticSearchQuery:
    """Query for agentic search with conversation support.

    Attributes:
        text: The search query text.
        agent_type: Type of agent to use.
        conversation_id: ID for continuing a conversation.
        collection_ids: Filter by collection IDs.
        source_ids: Filter by source IDs.
        account_id: Multi-tenant account filter.
        limit: Maximum number of source documents to retrieve.
        include_reasoning: Whether to include reasoning steps.
        metadata_filters: Custom metadata filters.
        temperature: LLM temperature (0.0 to 1.0).
        max_iterations: Maximum agent iterations.
    """

    text: str
    agent_type: AgentType = AgentType.FLOW
    conversation_id: str | None = None
    collection_ids: list[str] | None = None
    source_ids: list[str] | None = None
    account_id: str | None = None
    limit: int = 10
    include_reasoning: bool = True
    metadata_filters: dict[str, Any] = field(default_factory=dict)
    temperature: float = 0.0
    max_iterations: int = 5

    def to_search_query(self) -> SearchQuery:
        """Build an equivalent plain SearchQuery (hybrid mode) for fallback.

        Only the retrieval-related fields carry over; agent settings
        (temperature, iterations, conversation) have no counterpart.
        """
        fallback = SearchQuery(
            text=self.text,
            mode=SearchMode.HYBRID,
            limit=self.limit,
            collection_ids=self.collection_ids,
            source_ids=self.source_ids,
            account_id=self.account_id,
            metadata_filters=self.metadata_filters.copy(),
        )
        return fallback
256
+
257
+
258
@dataclass
class AgenticSearchResult:
    """Search result with agentic enhancements.

    Extends SearchResult with AI-generated answer and reasoning.

    Attributes:
        query: The original search query text.
        mode: Search mode (always AGENTIC).
        items: Retrieved source documents.
        total_hits: Total number of matching documents.
        duration_ms: Total search duration in milliseconds.
        max_score: Maximum score among results.
        answer: AI-generated answer to the query.
        reasoning_steps: Reasoning steps taken by the agent.
        conversation_id: Conversation ID for multi-turn searches.
        agent_type: Type of agent that was used.
        citations: References to source documents used in answer.
        total_tokens: Total tokens consumed.
        prompt_tokens: Tokens used in prompts.
        completion_tokens: Tokens used in completions.
    """

    query: str
    mode: SearchMode
    items: list[SearchResultItem]
    total_hits: int
    duration_ms: float
    max_score: float | None = None
    answer: str | None = None
    reasoning_steps: list[ReasoningStep] = field(default_factory=list)
    conversation_id: str | None = None
    agent_type: AgentType = AgentType.FLOW
    citations: list[str] = field(default_factory=list)
    total_tokens: int = 0
    prompt_tokens: int = 0
    completion_tokens: int = 0

    @property
    def has_answer(self) -> bool:
        """True when a non-empty answer string was generated."""
        if self.answer is None:
            return False
        return len(self.answer) > 0

    @property
    def has_reasoning(self) -> bool:
        """True when at least one reasoning step was recorded."""
        return bool(self.reasoning_steps)

    @classmethod
    def from_search_result(
        cls,
        result: SearchResult,
        answer: str | None = None,
        reasoning_steps: list[ReasoningStep] | None = None,
        agent_type: AgentType = AgentType.FLOW,
        conversation_id: str | None = None,
    ) -> AgenticSearchResult:
        """Wrap a plain SearchResult, attaching agentic fields.

        Mode is forced to AGENTIC; token counters are left at their
        zero defaults since the source result carries no usage data.
        """
        return cls(
            query=result.query,
            mode=SearchMode.AGENTIC,
            items=result.items,
            total_hits=result.total_hits,
            duration_ms=result.duration_ms,
            max_score=result.max_score,
            answer=answer,
            reasoning_steps=reasoning_steps or [],
            conversation_id=conversation_id,
            agent_type=agent_type,
        )
@@ -0,0 +1,139 @@
1
+ """Source configuration domain model."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import Any
7
+
8
+
9
@dataclass
class SourceConfig:
    """Configuration for a content source.

    Attributes:
        url: The source URL or path.
        source_type: Type of source (website, sitemap, file, etc.).
        options: Additional loader-specific options.

    Sitemap-specific options:
        max_urls: Maximum number of URLs to process.
        max_depth: Maximum sitemap recursion depth.
        allowed_patterns: URL patterns to include.
        blocked_patterns: URL patterns to exclude.

    Fetcher options:
        target_selector: CSS selector for content extraction.
        remove_selector: CSS selector for elements to remove.
        timeout: Request timeout in seconds.

    Multi-tenancy:
        account_id: Account/tenant identifier.
        collection_id: Collection identifier.
        source_id: Source identifier within collection.
    """

    url: str
    source_type: str = "website"
    options: dict[str, Any] = field(default_factory=dict)

    # Sitemap-specific options
    max_urls: int | None = None
    max_depth: int | None = None
    allowed_patterns: list[str] = field(default_factory=list)
    blocked_patterns: list[str] = field(default_factory=list)

    # Fetcher options
    target_selector: str | None = None
    remove_selector: str | None = None
    timeout: int | None = None

    # Multi-tenancy
    account_id: str | None = None
    collection_id: str | None = None
    source_id: str | None = None

    def _clone(self, **overrides: Any) -> SourceConfig:
        """Return a copy with mutable fields shallow-copied, applying *overrides*.

        Central copy point so `with_options`/`with_tenant` cannot drift
        apart when fields are added to this dataclass.
        """
        state: dict[str, Any] = {
            "url": self.url,
            "source_type": self.source_type,
            "options": self.options.copy(),
            "max_urls": self.max_urls,
            "max_depth": self.max_depth,
            "allowed_patterns": self.allowed_patterns.copy(),
            "blocked_patterns": self.blocked_patterns.copy(),
            "target_selector": self.target_selector,
            "remove_selector": self.remove_selector,
            "timeout": self.timeout,
            "account_id": self.account_id,
            "collection_id": self.collection_id,
            "source_id": self.source_id,
        }
        state.update(overrides)
        return SourceConfig(**state)

    def with_options(self, **options: Any) -> SourceConfig:
        """Create a copy with additional options merged.

        Args:
            **options: Options to merge into the config (new keys win).

        Returns:
            New SourceConfig with merged options.
        """
        return self._clone(options={**self.options, **options})

    def with_tenant(
        self,
        account_id: str,
        collection_id: str | None = None,
        source_id: str | None = None,
    ) -> SourceConfig:
        """Create a copy with tenant information.

        Note: `collection_id`/`source_id` are always overwritten on the
        copy (with None when omitted), matching the original behavior.

        Args:
            account_id: Account/tenant identifier.
            collection_id: Collection identifier.
            source_id: Source identifier.

        Returns:
            New SourceConfig with tenant information.
        """
        return self._clone(
            account_id=account_id,
            collection_id=collection_id,
            source_id=source_id,
        )

    @property
    def is_sitemap(self) -> bool:
        """Check if this is a sitemap source (by type or URL suffix)."""
        return self.source_type == "sitemap" or self.url.endswith("sitemap.xml")

    @property
    def is_website(self) -> bool:
        """Check if this is a website source."""
        return self.source_type == "website"

    @classmethod
    def from_url(cls, url: str, **kwargs: Any) -> SourceConfig:
        """Create a SourceConfig from a URL, auto-detecting source type.

        Any URL containing "sitemap" (case-insensitive) or ending in
        ".xml" is treated as a sitemap; everything else is a website.

        Args:
            url: The source URL.
            **kwargs: Additional configuration options.

        Returns:
            SourceConfig with auto-detected source type.
        """
        source_type = "website"
        if "sitemap" in url.lower() or url.endswith(".xml"):
            source_type = "sitemap"

        return cls(url=url, source_type=source_type, **kwargs)
@@ -0,0 +1,23 @@
1
+ """Event system for decoupled communication (Observer pattern)."""
2
+
3
+ from gnosisllm_knowledge.core.events.emitter import EventEmitter
4
+ from gnosisllm_knowledge.core.events.types import (
5
+ BatchCompletedEvent,
6
+ BatchStartedEvent,
7
+ DocumentIndexedEvent,
8
+ DocumentLoadedEvent,
9
+ Event,
10
+ EventType,
11
+ SitemapDiscoveryEvent,
12
+ )
13
+
14
+ __all__ = [
15
+ "Event",
16
+ "EventType",
17
+ "EventEmitter",
18
+ "DocumentLoadedEvent",
19
+ "DocumentIndexedEvent",
20
+ "SitemapDiscoveryEvent",
21
+ "BatchStartedEvent",
22
+ "BatchCompletedEvent",
23
+ ]