gnosisllm-knowledge 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. gnosisllm_knowledge/__init__.py +152 -0
  2. gnosisllm_knowledge/api/__init__.py +5 -0
  3. gnosisllm_knowledge/api/knowledge.py +548 -0
  4. gnosisllm_knowledge/backends/__init__.py +26 -0
  5. gnosisllm_knowledge/backends/memory/__init__.py +9 -0
  6. gnosisllm_knowledge/backends/memory/indexer.py +384 -0
  7. gnosisllm_knowledge/backends/memory/searcher.py +516 -0
  8. gnosisllm_knowledge/backends/opensearch/__init__.py +19 -0
  9. gnosisllm_knowledge/backends/opensearch/agentic.py +738 -0
  10. gnosisllm_knowledge/backends/opensearch/config.py +195 -0
  11. gnosisllm_knowledge/backends/opensearch/indexer.py +499 -0
  12. gnosisllm_knowledge/backends/opensearch/mappings.py +255 -0
  13. gnosisllm_knowledge/backends/opensearch/queries.py +445 -0
  14. gnosisllm_knowledge/backends/opensearch/searcher.py +383 -0
  15. gnosisllm_knowledge/backends/opensearch/setup.py +1390 -0
  16. gnosisllm_knowledge/chunking/__init__.py +9 -0
  17. gnosisllm_knowledge/chunking/fixed.py +138 -0
  18. gnosisllm_knowledge/chunking/sentence.py +239 -0
  19. gnosisllm_knowledge/cli/__init__.py +18 -0
  20. gnosisllm_knowledge/cli/app.py +509 -0
  21. gnosisllm_knowledge/cli/commands/__init__.py +7 -0
  22. gnosisllm_knowledge/cli/commands/agentic.py +529 -0
  23. gnosisllm_knowledge/cli/commands/load.py +369 -0
  24. gnosisllm_knowledge/cli/commands/search.py +440 -0
  25. gnosisllm_knowledge/cli/commands/setup.py +228 -0
  26. gnosisllm_knowledge/cli/display/__init__.py +5 -0
  27. gnosisllm_knowledge/cli/display/service.py +555 -0
  28. gnosisllm_knowledge/cli/utils/__init__.py +5 -0
  29. gnosisllm_knowledge/cli/utils/config.py +207 -0
  30. gnosisllm_knowledge/core/__init__.py +87 -0
  31. gnosisllm_knowledge/core/domain/__init__.py +43 -0
  32. gnosisllm_knowledge/core/domain/document.py +240 -0
  33. gnosisllm_knowledge/core/domain/result.py +176 -0
  34. gnosisllm_knowledge/core/domain/search.py +327 -0
  35. gnosisllm_knowledge/core/domain/source.py +139 -0
  36. gnosisllm_knowledge/core/events/__init__.py +23 -0
  37. gnosisllm_knowledge/core/events/emitter.py +216 -0
  38. gnosisllm_knowledge/core/events/types.py +226 -0
  39. gnosisllm_knowledge/core/exceptions.py +407 -0
  40. gnosisllm_knowledge/core/interfaces/__init__.py +20 -0
  41. gnosisllm_knowledge/core/interfaces/agentic.py +136 -0
  42. gnosisllm_knowledge/core/interfaces/chunker.py +64 -0
  43. gnosisllm_knowledge/core/interfaces/fetcher.py +112 -0
  44. gnosisllm_knowledge/core/interfaces/indexer.py +244 -0
  45. gnosisllm_knowledge/core/interfaces/loader.py +102 -0
  46. gnosisllm_knowledge/core/interfaces/searcher.py +178 -0
  47. gnosisllm_knowledge/core/interfaces/setup.py +164 -0
  48. gnosisllm_knowledge/fetchers/__init__.py +12 -0
  49. gnosisllm_knowledge/fetchers/config.py +77 -0
  50. gnosisllm_knowledge/fetchers/http.py +167 -0
  51. gnosisllm_knowledge/fetchers/neoreader.py +204 -0
  52. gnosisllm_knowledge/loaders/__init__.py +13 -0
  53. gnosisllm_knowledge/loaders/base.py +399 -0
  54. gnosisllm_knowledge/loaders/factory.py +202 -0
  55. gnosisllm_knowledge/loaders/sitemap.py +285 -0
  56. gnosisllm_knowledge/loaders/website.py +57 -0
  57. gnosisllm_knowledge/py.typed +0 -0
  58. gnosisllm_knowledge/services/__init__.py +9 -0
  59. gnosisllm_knowledge/services/indexing.py +387 -0
  60. gnosisllm_knowledge/services/search.py +349 -0
  61. gnosisllm_knowledge-0.2.0.dist-info/METADATA +382 -0
  62. gnosisllm_knowledge-0.2.0.dist-info/RECORD +64 -0
  63. gnosisllm_knowledge-0.2.0.dist-info/WHEEL +4 -0
  64. gnosisllm_knowledge-0.2.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,112 @@
1
+ """Content fetcher protocol - Single Responsibility Principle."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import Any, Protocol, runtime_checkable
7
+
8
+
9
@dataclass
class FetchResult:
    """Outcome of fetching content from a single URL.

    Attributes:
        content: The fetched payload (typically text or markdown).
        status_code: HTTP status code or an equivalent numeric status.
        content_type: MIME type reported for the payload.
        url: Final URL after any redirects were followed.
        title: Document title, when one could be extracted.
        metadata: Extra metadata gathered during the fetch.
        encoding: Character encoding of the payload, if known.
        headers: Response headers keyed by header name.
    """

    content: str
    status_code: int
    content_type: str
    url: str
    title: str | None = None
    metadata: dict[str, Any] = field(default_factory=dict)
    encoding: str | None = None
    headers: dict[str, str] = field(default_factory=dict)

    @property
    def is_success(self) -> bool:
        """True when the status code falls in the 2xx range."""
        return 200 <= self.status_code <= 299

    @property
    def is_html(self) -> bool:
        """True when the MIME type mentions HTML."""
        lowered = self.content_type.lower()
        return "html" in lowered

    @property
    def is_text(self) -> bool:
        """True when the MIME type mentions text."""
        lowered = self.content_type.lower()
        return "text" in lowered

    @property
    def content_length(self) -> int:
        """Number of characters in the fetched content."""
        return len(self.content)
54
@runtime_checkable
class IContentFetcher(Protocol):
    """Structural interface for retrieving raw content from URLs.

    A fetcher's job is narrowly scoped, per the Single Responsibility
    Principle: issue the HTTP request, normalize the payload to a
    standard format (e.g. markdown), apply authentication and headers,
    and surface metadata such as the page title. Parsing and chunking
    belong to other components.
    """

    async def fetch(self, url: str, **options: Any) -> FetchResult:
        """Retrieve the content behind *url*.

        Args:
            url: The URL to retrieve.
            **options: Implementation-specific knobs, for example:
                - target_selector: CSS selector for content extraction
                - remove_selector: CSS selector for elements to remove
                - timeout: Request timeout in seconds
                - headers: Additional HTTP headers

        Returns:
            A FetchResult carrying the payload and its metadata.

        Raises:
            ConnectionError: If the URL cannot be reached.
            TimeoutError: If the request times out.
        """
        ...

    async def health_check(self) -> bool:
        """Report whether the underlying fetch service is available.

        Returns:
            True when the service is healthy, False otherwise.
        """
        ...

    async def fetch_batch(
        self,
        urls: list[str],
        max_concurrent: int = 10,
        **options: Any,
    ) -> list[FetchResult | Exception]:
        """Retrieve several URLs concurrently.

        Args:
            urls: URLs to retrieve.
            max_concurrent: Upper bound on in-flight requests.
            **options: Forwarded to every individual fetch call.

        Returns:
            A list holding a FetchResult per successful fetch, or the
            Exception raised for a fetch that failed.
        """
        ...
@@ -0,0 +1,244 @@
1
+ """Document indexer protocol - Interface Segregation Principle."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import AsyncIterator, Callable, Sequence
6
+ from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable
7
+
8
+ if TYPE_CHECKING:
9
+ from gnosisllm_knowledge.core.domain.document import Document
10
+ from gnosisllm_knowledge.core.domain.result import BatchResult, IndexResult
11
+
12
+
13
@runtime_checkable
class IDocumentIndexer(Protocol):
    """Structural interface for writing documents into a search backend.

    An indexer covers the write side of the backend:
    - embedding generation for stored documents
    - single, bulk, and streaming ingestion
    - deletion by id, by source, or by arbitrary query
    - index lifecycle management (create/verify, delete, refresh)

    Each operation is exposed as its own focused coroutine, in keeping
    with the Interface Segregation Principle.
    """

    async def index(
        self,
        document: Document,
        index_name: str,
        **options: Any,
    ) -> IndexResult:
        """Write one document to the given index.

        Args:
            document: Document to store.
            index_name: Name of the target index.
            **options: Backend-specific options.

        Returns:
            IndexResult describing success or failure.
        """
        ...

    async def bulk_index(
        self,
        documents: Sequence[Document],
        index_name: str,
        batch_size: int = 100,
        **options: Any,
    ) -> IndexResult:
        """Write many documents efficiently in batches.

        Args:
            documents: Documents to store.
            index_name: Name of the target index.
            batch_size: How many documents go into each batch.
            **options: Backend-specific options.

        Returns:
            A single IndexResult aggregated over every document.
        """
        ...

    async def bulk_index_streaming(
        self,
        documents: AsyncIterator[Document],
        index_name: str,
        batch_size: int = 100,
        max_concurrent_batches: int = 3,
        on_batch_complete: Callable[[BatchResult], None] | None = None,
        **options: Any,
    ) -> IndexResult:
        """Consume an async stream of documents and index it with backpressure.

        Suited to large inputs where materializing the whole document
        list would be too costly.

        Args:
            documents: Async iterator yielding documents to store.
            index_name: Name of the target index.
            batch_size: How many documents go into each batch.
            max_concurrent_batches: Cap on batches in flight at once.
            on_batch_complete: Invoked with each finished BatchResult.
            **options: Backend-specific options.

        Returns:
            A single IndexResult aggregated over every document.
        """
        ...

    async def upsert(
        self,
        document: Document,
        index_name: str,
        **options: Any,
    ) -> IndexResult:
        """Insert the document, or update it if it already exists.

        Args:
            document: Document to write.
            index_name: Name of the target index.
            **options: Backend-specific options.

        Returns:
            IndexResult describing the operation outcome.
        """
        ...

    async def delete(
        self,
        doc_id: str,
        index_name: str,
    ) -> bool:
        """Remove a single document by its id.

        Args:
            doc_id: Identifier of the document to remove.
            index_name: Name of the target index.

        Returns:
            True on deletion, False when no such document exists.
        """
        ...

    async def bulk_delete(
        self,
        doc_ids: Sequence[str],
        index_name: str,
    ) -> int:
        """Remove several documents by id.

        Args:
            doc_ids: Identifiers of the documents to remove.
            index_name: Name of the target index.

        Returns:
            How many documents were removed.
        """
        ...

    async def delete_by_source(
        self,
        source: str,
        index_name: str,
    ) -> int:
        """Remove every document that came from one source.

        Args:
            source: Source identifier to match.
            index_name: Name of the target index.

        Returns:
            How many documents were removed.
        """
        ...

    async def delete_by_query(
        self,
        query: dict[str, Any],
        index_name: str,
    ) -> int:
        """Remove every document matched by a backend query.

        Args:
            query: Query expressed in the backend's native format.
            index_name: Name of the target index.

        Returns:
            How many documents were removed.
        """
        ...

    async def ensure_index(
        self,
        index_name: str,
        **options: Any,
    ) -> bool:
        """Guarantee the index exists with a suitable mapping.

        Creates the index when absent; otherwise checks that the
        current mapping is compatible.

        Args:
            index_name: Name of the index to guarantee.
            **options: Index settings and mapping options.

        Returns:
            True when the index exists or was created successfully.
        """
        ...

    async def delete_index(self, index_name: str) -> bool:
        """Drop an entire index.

        Args:
            index_name: Name of the index to drop.

        Returns:
            True on deletion, False when no such index exists.
        """
        ...

    async def refresh_index(self, index_name: str) -> bool:
        """Refresh the index so newly written documents become searchable.

        Args:
            index_name: Name of the index to refresh.

        Returns:
            True when the refresh succeeded.
        """
        ...

    async def get_document(
        self,
        doc_id: str,
        index_name: str,
    ) -> Document | None:
        """Fetch one document by its id.

        Args:
            doc_id: Identifier of the document to fetch.
            index_name: Name of the index to read from.

        Returns:
            The Document when present, otherwise None.
        """
        ...

    async def document_exists(
        self,
        doc_id: str,
        index_name: str,
    ) -> bool:
        """Tell whether a document with the given id is stored.

        Args:
            doc_id: Identifier to look up.
            index_name: Name of the index to read from.

        Returns:
            True when the document exists.
        """
        ...
@@ -0,0 +1,102 @@
1
+ """Content loader protocol - Interface Segregation Principle."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import AsyncIterator, Callable
6
+ from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable
7
+
8
+ if TYPE_CHECKING:
9
+ from gnosisllm_knowledge.core.domain.document import Document
10
+ from gnosisllm_knowledge.core.domain.result import LoadResult, ValidationResult
11
+
12
+
13
@runtime_checkable
class IContentLoader(Protocol):
    """Protocol for loading content from various sources.

    Content loaders are responsible for:
    - Fetching content from a source (URL, file, etc.)
    - Chunking content into documents
    - Supporting both batch and streaming loading

    Implementations should follow the Single Responsibility Principle
    and handle only content loading, not indexing.
    """

    @property
    def name(self) -> str:
        """Return the loader name for registry identification."""
        ...

    def supports(self, source: str) -> bool:
        """Check if this loader supports the given source.

        Args:
            source: The source URL or path.

        Returns:
            True if this loader can handle the source.
        """
        ...

    async def validate_source(self, source: str) -> ValidationResult:
        """Validate that the source is accessible and valid.

        Args:
            source: The source URL or path.

        Returns:
            ValidationResult with validation status and any errors.
        """
        ...

    async def load(self, source: str, **options: Any) -> LoadResult:
        """Load all documents from source.

        Args:
            source: The source URL or path.
            **options: Loader-specific options.

        Returns:
            LoadResult with loaded documents and metadata.
        """
        ...

    # NOTE: declared as a plain ``def`` (not ``async def``) on purpose.
    # An async-generator implementation (``async def`` containing ``yield``)
    # is a synchronous callable returning an AsyncIterator, so ``async def``
    # here would force implementers to return an *awaitable* of an iterator
    # and would make async-generator implementations fail structural type
    # checks. With this spelling callers simply write
    # ``async for doc in loader.load_streaming(source)``.
    def load_streaming(
        self,
        source: str,
        **options: Any,
    ) -> AsyncIterator[Document]:
        """Stream documents from source for memory-efficient processing.

        This method yields documents one at a time, which is more
        memory-efficient for large sources.

        Args:
            source: The source URL or path.
            **options: Loader-specific options.

        Yields:
            Document objects as they are loaded.
        """
        ...

    async def load_with_callback(
        self,
        source: str,
        callback: Callable[[list[Document]], Any],
        batch_size: int = 5,
        **options: Any,
    ) -> int:
        """Load documents with a callback for batch processing.

        Args:
            source: The source URL or path.
            callback: Callback function called with each batch of documents.
            batch_size: Number of documents per batch.
            **options: Loader-specific options.

        Returns:
            Total number of documents loaded.
        """
        ...
@@ -0,0 +1,178 @@
1
+ """Knowledge searcher protocol - Interface Segregation Principle."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable
6
+
7
+ if TYPE_CHECKING:
8
+ from gnosisllm_knowledge.core.domain.search import SearchQuery, SearchResult
9
+
10
+
11
@runtime_checkable
class IKnowledgeSearcher(Protocol):
    """Structural interface for querying documents in a search backend.

    A searcher covers the read side of the backend:
    - the three retrieval modes: semantic, keyword, and hybrid
    - query embedding generation
    - result filtering, ranking, and pagination

    Each retrieval mode is exposed as its own focused coroutine, in
    keeping with the Interface Segregation Principle.
    """

    async def search(
        self,
        query: SearchQuery,
        index_name: str,
        **options: Any,
    ) -> SearchResult:
        """Run a search, dispatching on the query's mode.

        Picks the matching retrieval method according to the mode
        carried by the query itself.

        Args:
            query: Search query including filters and options.
            index_name: Name of the index to query.
            **options: Backend-specific options.

        Returns:
            SearchResult containing hits and metadata.
        """
        ...

    async def semantic_search(
        self,
        query: SearchQuery,
        index_name: str,
        **options: Any,
    ) -> SearchResult:
        """Run a pure vector (semantic) search.

        Ranks documents by embedding similarity to the query.

        Args:
            query: Search query.
            index_name: Name of the index to query.
            **options: Backend-specific options.

        Returns:
            SearchResult of semantically similar documents.
        """
        ...

    async def keyword_search(
        self,
        query: SearchQuery,
        index_name: str,
        **options: Any,
    ) -> SearchResult:
        """Run a pure keyword (BM25) search.

        Ranks documents by classic text matching.

        Args:
            query: Search query.
            index_name: Name of the index to query.
            **options: Backend-specific options.

        Returns:
            SearchResult of keyword-matching documents.
        """
        ...

    async def hybrid_search(
        self,
        query: SearchQuery,
        index_name: str,
        **options: Any,
    ) -> SearchResult:
        """Run a combined semantic-plus-keyword search.

        Blends vector similarity with text matching for best results.

        Args:
            query: Search query.
            index_name: Name of the index to query.
            **options: Backend-specific options.

        Returns:
            SearchResult using the combined ranking.
        """
        ...

    async def get_embedding(
        self,
        text: str,
        **options: Any,
    ) -> list[float]:
        """Compute the embedding vector for a piece of text.

        Args:
            text: Text to embed.
            **options: Embedding model options.

        Returns:
            The embedding as a list of floats.
        """
        ...

    async def get_embeddings_batch(
        self,
        texts: list[str],
        batch_size: int = 100,
        **options: Any,
    ) -> list[list[float]]:
        """Compute embeddings for many texts efficiently.

        Args:
            texts: Texts to embed.
            batch_size: How many texts go into each API call.
            **options: Embedding model options.

        Returns:
            One embedding vector per input text.
        """
        ...

    async def get_similar_documents(
        self,
        doc_id: str,
        index_name: str,
        limit: int = 10,
        **options: Any,
    ) -> SearchResult:
        """Look up documents that resemble an existing document.

        Args:
            doc_id: Identifier of the reference document.
            index_name: Name of the index to query.
            limit: Maximum number of results to return.
            **options: Backend-specific options.

        Returns:
            SearchResult of similar documents.
        """
        ...

    async def multi_search(
        self,
        queries: list[SearchQuery],
        index_name: str,
        **options: Any,
    ) -> list[SearchResult]:
        """Run several searches within one backend request.

        Cheaper than issuing each search individually.

        Args:
            queries: Queries to run.
            index_name: Name of the index to query.
            **options: Backend-specific options.

        Returns:
            SearchResults in the same order as the input queries.
        """
        ...