gnosisllm-knowledge 0.3.0__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/PKG-INFO +30 -10
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/README.md +29 -9
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/pyproject.toml +1 -1
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/api/knowledge.py +225 -35
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/backends/memory/indexer.py +27 -2
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/backends/memory/searcher.py +111 -10
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/backends/opensearch/agentic.py +14 -9
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/backends/opensearch/indexer.py +48 -3
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/backends/opensearch/mappings.py +12 -4
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/backends/opensearch/searcher.py +9 -6
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/cli/app.py +58 -19
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/cli/commands/agentic.py +15 -9
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/cli/commands/load.py +169 -19
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/cli/commands/memory.py +10 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/cli/commands/search.py +9 -10
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/cli/commands/setup.py +25 -1
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/cli/utils/config.py +4 -4
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/core/domain/__init__.py +13 -0
- gnosisllm_knowledge-0.4.0/src/gnosisllm_knowledge/core/domain/discovery.py +166 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/core/domain/document.py +14 -19
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/core/domain/search.py +10 -25
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/core/domain/source.py +11 -12
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/core/events/__init__.py +8 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/core/events/types.py +122 -5
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/core/exceptions.py +93 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/core/interfaces/searcher.py +10 -1
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/core/interfaces/streaming.py +10 -4
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/fetchers/__init__.py +8 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/fetchers/config.py +27 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/fetchers/neoreader.py +31 -3
- gnosisllm_knowledge-0.4.0/src/gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/loaders/__init__.py +5 -1
- gnosisllm_knowledge-0.4.0/src/gnosisllm_knowledge/loaders/discovery.py +338 -0
- gnosisllm_knowledge-0.4.0/src/gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/loaders/factory.py +46 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/services/indexing.py +35 -20
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/services/search.py +37 -20
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/services/streaming_pipeline.py +39 -7
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/__init__.py +0 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/api/__init__.py +0 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/api/memory.py +0 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/backends/__init__.py +0 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/backends/memory/__init__.py +0 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/backends/opensearch/__init__.py +0 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/backends/opensearch/config.py +0 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/backends/opensearch/memory/__init__.py +0 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/backends/opensearch/memory/client.py +0 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/backends/opensearch/memory/config.py +0 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/backends/opensearch/memory/setup.py +0 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/backends/opensearch/setup.py +0 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/chunking/__init__.py +0 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/chunking/fixed.py +0 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/chunking/sentence.py +0 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/cli/__init__.py +0 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/cli/commands/__init__.py +0 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/cli/display/__init__.py +0 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/cli/display/service.py +0 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/cli/utils/__init__.py +0 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/core/__init__.py +0 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/core/domain/memory.py +0 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/core/domain/result.py +0 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/core/events/emitter.py +0 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/core/interfaces/__init__.py +0 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/core/interfaces/chunker.py +0 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/core/interfaces/fetcher.py +0 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/core/interfaces/loader.py +0 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/core/interfaces/memory.py +0 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/core/interfaces/setup.py +0 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/core/streaming/__init__.py +0 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/core/streaming/pipeline.py +0 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/fetchers/http.py +0 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/loaders/base.py +0 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/loaders/sitemap.py +0 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/loaders/sitemap_streaming.py +0 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/loaders/website.py +0 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/py.typed +0 -0
- {gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/services/__init__.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: gnosisllm-knowledge
|
|
3
|
-
Version: 0.3.0
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: Enterprise-grade knowledge loading, indexing, and search for Python
|
|
5
5
|
License: MIT
|
|
6
6
|
Keywords: knowledge-base,rag,semantic-search,vector-search,opensearch,llm,embeddings,enterprise
|
|
@@ -46,7 +46,7 @@ Enterprise-grade knowledge loading, indexing, and semantic search library for Py
|
|
|
46
46
|
- **Multiple Loaders**: Load content from websites, sitemaps, and files
|
|
47
47
|
- **Intelligent Chunking**: Sentence-aware text splitting with configurable overlap
|
|
48
48
|
- **OpenSearch Backend**: Production-ready with k-NN vector search
|
|
49
|
-
- **Multi-Tenancy**:
|
|
49
|
+
- **Multi-Tenancy**: Index isolation for complete tenant separation (tenant-agnostic library)
|
|
50
50
|
- **Event-Driven**: Observer pattern for progress tracking and monitoring
|
|
51
51
|
- **SOLID Architecture**: Clean, maintainable, and extensible codebase
|
|
52
52
|
|
|
@@ -144,14 +144,15 @@ gnosisllm-knowledge load <URL> [OPTIONS]
|
|
|
144
144
|
|
|
145
145
|
Options:
|
|
146
146
|
--type Source type: website, sitemap (auto-detects)
|
|
147
|
-
--index Target index name (
|
|
148
|
-
--account-id Multi-tenant account ID
|
|
147
|
+
--index Target index name (e.g., knowledge-tenant-123)
|
|
149
148
|
--collection-id Collection grouping ID
|
|
150
149
|
--batch-size Documents per batch (default: 100)
|
|
151
150
|
--max-urls Max URLs from sitemap (default: 1000)
|
|
152
151
|
--dry-run Preview without indexing
|
|
153
152
|
```
|
|
154
153
|
|
|
154
|
+
Multi-tenancy is achieved through index isolation. Use `--index` with tenant-specific names (e.g., `--index knowledge-tenant-123`).
|
|
155
|
+
|
|
155
156
|
### Search
|
|
156
157
|
|
|
157
158
|
Search indexed content with multiple modes:
|
|
@@ -161,14 +162,15 @@ gnosisllm-knowledge search <QUERY> [OPTIONS]
|
|
|
161
162
|
|
|
162
163
|
Options:
|
|
163
164
|
--mode Search mode: semantic, keyword, hybrid, agentic
|
|
164
|
-
--index Index to search (
|
|
165
|
+
--index Index to search (e.g., knowledge-tenant-123)
|
|
165
166
|
--limit Max results (default: 5)
|
|
166
|
-
--account-id Filter by account
|
|
167
167
|
--collection-ids Filter by collections (comma-separated)
|
|
168
168
|
--json Output as JSON for scripting
|
|
169
169
|
--interactive Interactive search session
|
|
170
170
|
```
|
|
171
171
|
|
|
172
|
+
Multi-tenancy is achieved through index isolation. Use `--index` with tenant-specific names.
|
|
173
|
+
|
|
172
174
|
## Architecture
|
|
173
175
|
|
|
174
176
|
```
|
|
@@ -319,22 +321,40 @@ agent_body = {
|
|
|
319
321
|
|
|
320
322
|
## Multi-Tenancy
|
|
321
323
|
|
|
324
|
+
This library is **tenant-agnostic**. Multi-tenancy is achieved through **index isolation** - each tenant gets their own OpenSearch index.
|
|
325
|
+
|
|
322
326
|
```python
|
|
323
|
-
#
|
|
327
|
+
# The calling application (e.g., API) constructs tenant-specific index names
|
|
328
|
+
index_name = f"knowledge-{account_id}"
|
|
329
|
+
|
|
330
|
+
# Create Knowledge instance for the tenant
|
|
331
|
+
knowledge = Knowledge.from_opensearch(
|
|
332
|
+
host="localhost",
|
|
333
|
+
port=9200,
|
|
334
|
+
index_prefix=index_name, # knowledge-tenant-123
|
|
335
|
+
)
|
|
336
|
+
|
|
337
|
+
# Load content to tenant's isolated index
|
|
324
338
|
await knowledge.load(
|
|
325
339
|
source="https://docs.example.com/sitemap.xml",
|
|
326
|
-
account_id="tenant-123",
|
|
327
340
|
collection_id="docs",
|
|
328
341
|
)
|
|
329
342
|
|
|
330
|
-
# Search within tenant
|
|
343
|
+
# Search within tenant's index (no account_id filter needed)
|
|
331
344
|
results = await knowledge.search(
|
|
332
345
|
"query",
|
|
333
|
-
account_id="tenant-123",
|
|
334
346
|
collection_ids=["docs"],
|
|
335
347
|
)
|
|
336
348
|
```
|
|
337
349
|
|
|
350
|
+
**Note**: For audit purposes, you can store `account_id` in document metadata:
|
|
351
|
+
```python
|
|
352
|
+
await knowledge.load(
|
|
353
|
+
source="https://docs.example.com/sitemap.xml",
|
|
354
|
+
document_defaults={"metadata": {"account_id": "tenant-123"}},
|
|
355
|
+
)
|
|
356
|
+
```
|
|
357
|
+
|
|
338
358
|
## Agentic Memory
|
|
339
359
|
|
|
340
360
|
Conversational memory with automatic fact extraction using OpenSearch's ML Memory plugin.
|
|
@@ -11,7 +11,7 @@ Enterprise-grade knowledge loading, indexing, and semantic search library for Py
|
|
|
11
11
|
- **Multiple Loaders**: Load content from websites, sitemaps, and files
|
|
12
12
|
- **Intelligent Chunking**: Sentence-aware text splitting with configurable overlap
|
|
13
13
|
- **OpenSearch Backend**: Production-ready with k-NN vector search
|
|
14
|
-
- **Multi-Tenancy**:
|
|
14
|
+
- **Multi-Tenancy**: Index isolation for complete tenant separation (tenant-agnostic library)
|
|
15
15
|
- **Event-Driven**: Observer pattern for progress tracking and monitoring
|
|
16
16
|
- **SOLID Architecture**: Clean, maintainable, and extensible codebase
|
|
17
17
|
|
|
@@ -109,14 +109,15 @@ gnosisllm-knowledge load <URL> [OPTIONS]
|
|
|
109
109
|
|
|
110
110
|
Options:
|
|
111
111
|
--type Source type: website, sitemap (auto-detects)
|
|
112
|
-
--index Target index name (
|
|
113
|
-
--account-id Multi-tenant account ID
|
|
112
|
+
--index Target index name (e.g., knowledge-tenant-123)
|
|
114
113
|
--collection-id Collection grouping ID
|
|
115
114
|
--batch-size Documents per batch (default: 100)
|
|
116
115
|
--max-urls Max URLs from sitemap (default: 1000)
|
|
117
116
|
--dry-run Preview without indexing
|
|
118
117
|
```
|
|
119
118
|
|
|
119
|
+
Multi-tenancy is achieved through index isolation. Use `--index` with tenant-specific names (e.g., `--index knowledge-tenant-123`).
|
|
120
|
+
|
|
120
121
|
### Search
|
|
121
122
|
|
|
122
123
|
Search indexed content with multiple modes:
|
|
@@ -126,14 +127,15 @@ gnosisllm-knowledge search <QUERY> [OPTIONS]
|
|
|
126
127
|
|
|
127
128
|
Options:
|
|
128
129
|
--mode Search mode: semantic, keyword, hybrid, agentic
|
|
129
|
-
--index Index to search (
|
|
130
|
+
--index Index to search (e.g., knowledge-tenant-123)
|
|
130
131
|
--limit Max results (default: 5)
|
|
131
|
-
--account-id Filter by account
|
|
132
132
|
--collection-ids Filter by collections (comma-separated)
|
|
133
133
|
--json Output as JSON for scripting
|
|
134
134
|
--interactive Interactive search session
|
|
135
135
|
```
|
|
136
136
|
|
|
137
|
+
Multi-tenancy is achieved through index isolation. Use `--index` with tenant-specific names.
|
|
138
|
+
|
|
137
139
|
## Architecture
|
|
138
140
|
|
|
139
141
|
```
|
|
@@ -284,22 +286,40 @@ agent_body = {
|
|
|
284
286
|
|
|
285
287
|
## Multi-Tenancy
|
|
286
288
|
|
|
289
|
+
This library is **tenant-agnostic**. Multi-tenancy is achieved through **index isolation** - each tenant gets their own OpenSearch index.
|
|
290
|
+
|
|
287
291
|
```python
|
|
288
|
-
#
|
|
292
|
+
# The calling application (e.g., API) constructs tenant-specific index names
|
|
293
|
+
index_name = f"knowledge-{account_id}"
|
|
294
|
+
|
|
295
|
+
# Create Knowledge instance for the tenant
|
|
296
|
+
knowledge = Knowledge.from_opensearch(
|
|
297
|
+
host="localhost",
|
|
298
|
+
port=9200,
|
|
299
|
+
index_prefix=index_name, # knowledge-tenant-123
|
|
300
|
+
)
|
|
301
|
+
|
|
302
|
+
# Load content to tenant's isolated index
|
|
289
303
|
await knowledge.load(
|
|
290
304
|
source="https://docs.example.com/sitemap.xml",
|
|
291
|
-
account_id="tenant-123",
|
|
292
305
|
collection_id="docs",
|
|
293
306
|
)
|
|
294
307
|
|
|
295
|
-
# Search within tenant
|
|
308
|
+
# Search within tenant's index (no account_id filter needed)
|
|
296
309
|
results = await knowledge.search(
|
|
297
310
|
"query",
|
|
298
|
-
account_id="tenant-123",
|
|
299
311
|
collection_ids=["docs"],
|
|
300
312
|
)
|
|
301
313
|
```
|
|
302
314
|
|
|
315
|
+
**Note**: For audit purposes, you can store `account_id` in document metadata:
|
|
316
|
+
```python
|
|
317
|
+
await knowledge.load(
|
|
318
|
+
source="https://docs.example.com/sitemap.xml",
|
|
319
|
+
document_defaults={"metadata": {"account_id": "tenant-123"}},
|
|
320
|
+
)
|
|
321
|
+
```
|
|
322
|
+
|
|
303
323
|
## Agentic Memory
|
|
304
324
|
|
|
305
325
|
Conversational memory with automatic fact extraction using OpenSearch's ML Memory plugin.
|
{gnosisllm_knowledge-0.3.0 → gnosisllm_knowledge-0.4.0}/src/gnosisllm_knowledge/api/knowledge.py
RENAMED
|
@@ -1,4 +1,39 @@
|
|
|
1
|
-
"""High-level Knowledge API facade.
|
|
1
|
+
"""High-level Knowledge API facade.
|
|
2
|
+
|
|
3
|
+
This module provides the main entry point for the gnosisllm-knowledge library.
|
|
4
|
+
The Knowledge class is a high-level facade that abstracts the complexity of
|
|
5
|
+
loading, indexing, and searching knowledge documents.
|
|
6
|
+
|
|
7
|
+
Note:
|
|
8
|
+
This library is tenant-agnostic. Multi-tenancy should be handled at the
|
|
9
|
+
API layer by using separate indices per account (e.g.,
|
|
10
|
+
`knowledge-{account_id}`) rather than filtering by account_id.
|
|
11
|
+
|
|
12
|
+
Example:
|
|
13
|
+
```python
|
|
14
|
+
# Create Knowledge instance for a specific tenant
|
|
15
|
+
knowledge = Knowledge.from_opensearch(
|
|
16
|
+
host="localhost",
|
|
17
|
+
port=9200,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
# Use a tenant-specific index
|
|
21
|
+
tenant_index = f"knowledge-{account_id}"
|
|
22
|
+
|
|
23
|
+
# Load content
|
|
24
|
+
await knowledge.load(
|
|
25
|
+
"https://docs.example.com/sitemap.xml",
|
|
26
|
+
index_name=tenant_index,
|
|
27
|
+
collection_id="docs",
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
# Search (tenant isolation via index name)
|
|
31
|
+
results = await knowledge.search(
|
|
32
|
+
"how to configure",
|
|
33
|
+
index_name=tenant_index,
|
|
34
|
+
)
|
|
35
|
+
```
|
|
36
|
+
"""
|
|
2
37
|
|
|
3
38
|
from __future__ import annotations
|
|
4
39
|
|
|
@@ -130,6 +165,10 @@ class Knowledge:
|
|
|
130
165
|
) -> Knowledge:
|
|
131
166
|
"""Create Knowledge instance with OpenSearch backend.
|
|
132
167
|
|
|
168
|
+
This factory creates a Knowledge instance configured for OpenSearch.
|
|
169
|
+
The returned instance is tenant-agnostic - multi-tenancy should be
|
|
170
|
+
handled by using separate indices per account.
|
|
171
|
+
|
|
133
172
|
Args:
|
|
134
173
|
host: OpenSearch host.
|
|
135
174
|
port: OpenSearch port.
|
|
@@ -147,6 +186,19 @@ class Knowledge:
|
|
|
147
186
|
Note:
|
|
148
187
|
Embeddings are generated automatically by OpenSearch ingest pipeline.
|
|
149
188
|
Run 'gnosisllm-knowledge setup' to configure the ML model.
|
|
189
|
+
|
|
190
|
+
Example:
|
|
191
|
+
```python
|
|
192
|
+
# Create a Knowledge instance
|
|
193
|
+
knowledge = Knowledge.from_opensearch(
|
|
194
|
+
host="localhost",
|
|
195
|
+
port=9200,
|
|
196
|
+
)
|
|
197
|
+
|
|
198
|
+
# Use tenant-specific index for isolation
|
|
199
|
+
tenant_index = f"gnosisllm-{account_id}-knowledge"
|
|
200
|
+
await knowledge.load(source, index_name=tenant_index)
|
|
201
|
+
```
|
|
150
202
|
"""
|
|
151
203
|
# Import OpenSearch client
|
|
152
204
|
try:
|
|
@@ -216,15 +268,29 @@ class Knowledge:
|
|
|
216
268
|
def from_env(cls) -> Knowledge:
|
|
217
269
|
"""Create Knowledge instance from environment variables.
|
|
218
270
|
|
|
271
|
+
This factory creates a Knowledge instance using configuration from
|
|
272
|
+
environment variables. The returned instance is tenant-agnostic -
|
|
273
|
+
multi-tenancy should be handled by using separate indices per account.
|
|
274
|
+
|
|
219
275
|
Returns:
|
|
220
276
|
Configured Knowledge instance.
|
|
277
|
+
|
|
278
|
+
Example:
|
|
279
|
+
```python
|
|
280
|
+
# Create from environment
|
|
281
|
+
knowledge = Knowledge.from_env()
|
|
282
|
+
|
|
283
|
+
# Use tenant-specific index for isolation
|
|
284
|
+
tenant_index = f"gnosisllm-{account_id}-knowledge"
|
|
285
|
+
await knowledge.search("query", index_name=tenant_index)
|
|
286
|
+
```
|
|
221
287
|
"""
|
|
222
288
|
config = OpenSearchConfig.from_env()
|
|
223
289
|
neoreader_config = NeoreaderConfig.from_env()
|
|
224
290
|
|
|
225
291
|
return cls.from_opensearch(
|
|
226
292
|
config=config,
|
|
227
|
-
neoreader_url=neoreader_config.
|
|
293
|
+
neoreader_url=neoreader_config.host if neoreader_config.host else None,
|
|
228
294
|
)
|
|
229
295
|
|
|
230
296
|
@property
|
|
@@ -318,7 +384,6 @@ class Knowledge:
|
|
|
318
384
|
source: str,
|
|
319
385
|
*,
|
|
320
386
|
index_name: str | None = None,
|
|
321
|
-
account_id: str | None = None,
|
|
322
387
|
collection_id: str | None = None,
|
|
323
388
|
source_id: str | None = None,
|
|
324
389
|
source_type: str | None = None,
|
|
@@ -329,10 +394,13 @@ class Knowledge:
|
|
|
329
394
|
|
|
330
395
|
Automatically detects source type (sitemap, website, etc.).
|
|
331
396
|
|
|
397
|
+
Note:
|
|
398
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
399
|
+
by using separate indices per account.
|
|
400
|
+
|
|
332
401
|
Args:
|
|
333
402
|
source: Source URL or path.
|
|
334
|
-
index_name: Target index (
|
|
335
|
-
account_id: Account ID for multi-tenancy.
|
|
403
|
+
index_name: Target index (use tenant-specific name for isolation).
|
|
336
404
|
collection_id: Collection ID.
|
|
337
405
|
source_id: Source ID (auto-generated if not provided).
|
|
338
406
|
source_type: Explicit source type (auto-detected if not provided).
|
|
@@ -366,7 +434,6 @@ class Knowledge:
|
|
|
366
434
|
return await service.load_and_index(
|
|
367
435
|
source=source,
|
|
368
436
|
index_name=index,
|
|
369
|
-
account_id=account_id,
|
|
370
437
|
collection_id=collection_id,
|
|
371
438
|
source_id=source_id,
|
|
372
439
|
**options,
|
|
@@ -377,7 +444,6 @@ class Knowledge:
|
|
|
377
444
|
source: str,
|
|
378
445
|
*,
|
|
379
446
|
index_name: str | None = None,
|
|
380
|
-
account_id: str | None = None,
|
|
381
447
|
collection_id: str | None = None,
|
|
382
448
|
collection_name: str | None = None,
|
|
383
449
|
source_id: str | None = None,
|
|
@@ -398,10 +464,13 @@ class Knowledge:
|
|
|
398
464
|
- Document storage: O(index_batch_size)
|
|
399
465
|
- In-flight fetches: O(fetch_concurrency * avg_page_size)
|
|
400
466
|
|
|
467
|
+
Note:
|
|
468
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
469
|
+
by using separate indices per account.
|
|
470
|
+
|
|
401
471
|
Args:
|
|
402
472
|
source: Sitemap URL.
|
|
403
|
-
index_name: Target index (
|
|
404
|
-
account_id: Account ID for multi-tenancy.
|
|
473
|
+
index_name: Target index (use tenant-specific name for isolation).
|
|
405
474
|
collection_id: Collection ID.
|
|
406
475
|
collection_name: Collection name for display.
|
|
407
476
|
source_id: Source ID (auto-generated if not provided).
|
|
@@ -419,6 +488,7 @@ class Knowledge:
|
|
|
419
488
|
# Efficiently load 100k+ URL sitemap
|
|
420
489
|
result = await knowledge.load_streaming(
|
|
421
490
|
"https://large-site.com/sitemap.xml",
|
|
491
|
+
index_name="knowledge-account123", # Tenant-specific
|
|
422
492
|
url_batch_size=100,
|
|
423
493
|
fetch_concurrency=20,
|
|
424
494
|
max_urls=50000,
|
|
@@ -454,7 +524,6 @@ class Knowledge:
|
|
|
454
524
|
return await pipeline.execute(
|
|
455
525
|
source=source,
|
|
456
526
|
index_name=index,
|
|
457
|
-
account_id=account_id,
|
|
458
527
|
collection_id=collection_id,
|
|
459
528
|
collection_name=collection_name,
|
|
460
529
|
source_id=source_id,
|
|
@@ -471,7 +540,6 @@ class Knowledge:
|
|
|
471
540
|
mode: SearchMode = SearchMode.HYBRID,
|
|
472
541
|
limit: int = 10,
|
|
473
542
|
offset: int = 0,
|
|
474
|
-
account_id: str | None = None,
|
|
475
543
|
collection_ids: list[str] | None = None,
|
|
476
544
|
source_ids: list[str] | None = None,
|
|
477
545
|
min_score: float | None = None,
|
|
@@ -479,13 +547,16 @@ class Knowledge:
|
|
|
479
547
|
) -> SearchResult:
|
|
480
548
|
"""Search for knowledge documents.
|
|
481
549
|
|
|
550
|
+
Note:
|
|
551
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
552
|
+
by using separate indices per account.
|
|
553
|
+
|
|
482
554
|
Args:
|
|
483
555
|
query: Search query text.
|
|
484
|
-
index_name: Index to search (
|
|
556
|
+
index_name: Index to search (use tenant-specific name for isolation).
|
|
485
557
|
mode: Search mode (semantic, keyword, hybrid).
|
|
486
558
|
limit: Maximum results.
|
|
487
559
|
offset: Result offset for pagination.
|
|
488
|
-
account_id: Account ID for multi-tenancy.
|
|
489
560
|
collection_ids: Filter by collection IDs.
|
|
490
561
|
source_ids: Filter by source IDs.
|
|
491
562
|
min_score: Minimum score threshold.
|
|
@@ -500,7 +571,6 @@ class Knowledge:
|
|
|
500
571
|
mode=mode,
|
|
501
572
|
limit=limit,
|
|
502
573
|
offset=offset,
|
|
503
|
-
account_id=account_id,
|
|
504
574
|
collection_ids=collection_ids,
|
|
505
575
|
source_ids=source_ids,
|
|
506
576
|
min_score=min_score,
|
|
@@ -578,19 +648,73 @@ class Knowledge:
|
|
|
578
648
|
|
|
579
649
|
# === Management Methods ===
|
|
580
650
|
|
|
651
|
+
async def get_document(
|
|
652
|
+
self,
|
|
653
|
+
document_id: str,
|
|
654
|
+
*,
|
|
655
|
+
index_name: str | None = None,
|
|
656
|
+
) -> dict[str, Any] | None:
|
|
657
|
+
"""Get a single document by ID.
|
|
658
|
+
|
|
659
|
+
Note:
|
|
660
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
661
|
+
by using separate indices per account.
|
|
662
|
+
|
|
663
|
+
Args:
|
|
664
|
+
document_id: Document ID to retrieve.
|
|
665
|
+
index_name: Index name (use tenant-specific name for isolation).
|
|
666
|
+
Uses default index if not provided.
|
|
667
|
+
|
|
668
|
+
Returns:
|
|
669
|
+
Document dict with all fields (excluding embeddings) or None if not found.
|
|
670
|
+
"""
|
|
671
|
+
index = index_name or self._default_index
|
|
672
|
+
if not index:
|
|
673
|
+
raise ValueError("No index specified and no default index configured")
|
|
674
|
+
|
|
675
|
+
return await self._indexer.get(document_id, index)
|
|
676
|
+
|
|
677
|
+
async def delete_document(
|
|
678
|
+
self,
|
|
679
|
+
document_id: str,
|
|
680
|
+
*,
|
|
681
|
+
index_name: str | None = None,
|
|
682
|
+
) -> bool:
|
|
683
|
+
"""Delete a single document by ID.
|
|
684
|
+
|
|
685
|
+
Note:
|
|
686
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
687
|
+
by using separate indices per account.
|
|
688
|
+
|
|
689
|
+
Args:
|
|
690
|
+
document_id: Document ID to delete.
|
|
691
|
+
index_name: Index name (use tenant-specific name for isolation).
|
|
692
|
+
Uses default index if not provided.
|
|
693
|
+
|
|
694
|
+
Returns:
|
|
695
|
+
True if deleted, False if not found.
|
|
696
|
+
"""
|
|
697
|
+
index = index_name or self._default_index
|
|
698
|
+
if not index:
|
|
699
|
+
raise ValueError("No index specified and no default index configured")
|
|
700
|
+
|
|
701
|
+
return await self._indexer.delete(document_id, index)
|
|
702
|
+
|
|
581
703
|
async def delete_source(
|
|
582
704
|
self,
|
|
583
705
|
source_id: str,
|
|
584
706
|
*,
|
|
585
707
|
index_name: str | None = None,
|
|
586
|
-
account_id: str | None = None,
|
|
587
708
|
) -> int:
|
|
588
709
|
"""Delete all documents from a source.
|
|
589
710
|
|
|
711
|
+
Note:
|
|
712
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
713
|
+
by using separate indices per account.
|
|
714
|
+
|
|
590
715
|
Args:
|
|
591
716
|
source_id: Source ID to delete.
|
|
592
|
-
index_name: Index name.
|
|
593
|
-
account_id: Account ID for multi-tenancy.
|
|
717
|
+
index_name: Index name (use tenant-specific name for isolation).
|
|
594
718
|
|
|
595
719
|
Returns:
|
|
596
720
|
Count of deleted documents.
|
|
@@ -599,21 +723,23 @@ class Knowledge:
|
|
|
599
723
|
if not index:
|
|
600
724
|
raise ValueError("No index specified")
|
|
601
725
|
|
|
602
|
-
return await self.indexing.delete_source(source_id, index
|
|
726
|
+
return await self.indexing.delete_source(source_id, index)
|
|
603
727
|
|
|
604
728
|
async def delete_collection(
|
|
605
729
|
self,
|
|
606
730
|
collection_id: str,
|
|
607
731
|
*,
|
|
608
732
|
index_name: str | None = None,
|
|
609
|
-
account_id: str | None = None,
|
|
610
733
|
) -> int:
|
|
611
734
|
"""Delete all documents from a collection.
|
|
612
735
|
|
|
736
|
+
Note:
|
|
737
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
738
|
+
by using separate indices per account.
|
|
739
|
+
|
|
613
740
|
Args:
|
|
614
741
|
collection_id: Collection ID to delete.
|
|
615
|
-
index_name: Index name.
|
|
616
|
-
account_id: Account ID for multi-tenancy.
|
|
742
|
+
index_name: Index name (use tenant-specific name for isolation).
|
|
617
743
|
|
|
618
744
|
Returns:
|
|
619
745
|
Count of deleted documents.
|
|
@@ -622,54 +748,85 @@ class Knowledge:
|
|
|
622
748
|
if not index:
|
|
623
749
|
raise ValueError("No index specified")
|
|
624
750
|
|
|
625
|
-
return await self.indexing.delete_collection(collection_id, index
|
|
751
|
+
return await self.indexing.delete_collection(collection_id, index)
|
|
626
752
|
|
|
627
753
|
async def count(
|
|
628
754
|
self,
|
|
629
755
|
*,
|
|
630
756
|
index_name: str | None = None,
|
|
631
|
-
account_id: str | None = None,
|
|
632
757
|
collection_id: str | None = None,
|
|
758
|
+
source_id: str | None = None,
|
|
633
759
|
) -> int:
|
|
634
760
|
"""Count documents.
|
|
635
761
|
|
|
762
|
+
Note:
|
|
763
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
764
|
+
by using separate indices per account.
|
|
765
|
+
|
|
636
766
|
Args:
|
|
637
|
-
index_name: Index to count.
|
|
638
|
-
account_id: Filter by account.
|
|
767
|
+
index_name: Index to count (use tenant-specific name for isolation).
|
|
639
768
|
collection_id: Filter by collection.
|
|
769
|
+
source_id: Filter by source (for source deletion confirmation).
|
|
640
770
|
|
|
641
771
|
Returns:
|
|
642
772
|
Document count.
|
|
643
773
|
"""
|
|
644
774
|
return await self.search_service.count(
|
|
645
775
|
index_name=index_name,
|
|
646
|
-
account_id=account_id,
|
|
647
776
|
collection_id=collection_id,
|
|
777
|
+
source_id=source_id,
|
|
648
778
|
)
|
|
649
779
|
|
|
650
780
|
# === Collection and Stats Methods ===
|
|
651
781
|
|
|
652
|
-
async def get_collections(
|
|
782
|
+
async def get_collections(
|
|
783
|
+
self,
|
|
784
|
+
*,
|
|
785
|
+
index_name: str | None = None,
|
|
786
|
+
) -> list[dict[str, Any]]:
|
|
653
787
|
"""Get all collections with document counts.
|
|
654
788
|
|
|
655
789
|
Aggregates unique collection_ids from indexed documents.
|
|
656
790
|
|
|
791
|
+
Note:
|
|
792
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
793
|
+
by using separate indices per account.
|
|
794
|
+
|
|
795
|
+
Args:
|
|
796
|
+
index_name: Index to query (use tenant-specific name for isolation).
|
|
797
|
+
Uses default index if not provided.
|
|
798
|
+
|
|
657
799
|
Returns:
|
|
658
800
|
List of collection dictionaries with id, name, and document_count.
|
|
659
801
|
"""
|
|
660
|
-
|
|
802
|
+
index = index_name or self._default_index
|
|
803
|
+
return await self.search_service.get_collections(index_name=index)
|
|
661
804
|
|
|
662
|
-
async def get_stats(
|
|
805
|
+
async def get_stats(
|
|
806
|
+
self,
|
|
807
|
+
*,
|
|
808
|
+
index_name: str | None = None,
|
|
809
|
+
) -> dict[str, Any]:
|
|
663
810
|
"""Get index statistics.
|
|
664
811
|
|
|
812
|
+
Note:
|
|
813
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
814
|
+
by using separate indices per account.
|
|
815
|
+
|
|
816
|
+
Args:
|
|
817
|
+
index_name: Index to query (use tenant-specific name for isolation).
|
|
818
|
+
Uses default index if not provided.
|
|
819
|
+
|
|
665
820
|
Returns:
|
|
666
821
|
Dictionary with document_count, index_name, and other stats.
|
|
667
822
|
"""
|
|
668
|
-
|
|
823
|
+
index = index_name or self._default_index
|
|
824
|
+
return await self.search_service.get_stats(index_name=index)
|
|
669
825
|
|
|
670
826
|
async def list_documents(
|
|
671
827
|
self,
|
|
672
828
|
*,
|
|
829
|
+
index_name: str | None = None,
|
|
673
830
|
source_id: str | None = None,
|
|
674
831
|
collection_id: str | None = None,
|
|
675
832
|
limit: int = 50,
|
|
@@ -677,7 +834,13 @@ class Knowledge:
|
|
|
677
834
|
) -> dict[str, Any]:
|
|
678
835
|
"""List documents with optional filters.
|
|
679
836
|
|
|
837
|
+
Note:
|
|
838
|
+
This method is tenant-agnostic. Multi-tenancy should be handled
|
|
839
|
+
by using separate indices per account.
|
|
840
|
+
|
|
680
841
|
Args:
|
|
842
|
+
index_name: Index to query (use tenant-specific name for isolation).
|
|
843
|
+
Uses default index if not provided.
|
|
681
844
|
source_id: Optional source ID filter.
|
|
682
845
|
collection_id: Optional collection ID filter.
|
|
683
846
|
limit: Maximum documents to return (max 100).
|
|
@@ -686,9 +849,9 @@ class Knowledge:
|
|
|
686
849
|
Returns:
|
|
687
850
|
Dictionary with documents, total, limit, offset.
|
|
688
851
|
"""
|
|
689
|
-
index = self._default_index
|
|
852
|
+
index = index_name or self._default_index
|
|
690
853
|
if not index:
|
|
691
|
-
raise ValueError("No default index configured")
|
|
854
|
+
raise ValueError("No index specified and no default index configured")
|
|
692
855
|
|
|
693
856
|
# Clamp limit to reasonable bounds
|
|
694
857
|
limit = min(max(1, limit), 100)
|
|
@@ -823,6 +986,33 @@ class Knowledge:
|
|
|
823
986
|
return await agentic_searcher.agentic_search(agentic_query, index, **options)
|
|
824
987
|
|
|
825
988
|
async def close(self) -> None:
|
|
826
|
-
"""Close connections and clean up resources.
|
|
827
|
-
|
|
828
|
-
|
|
989
|
+
"""Close connections and clean up resources.
|
|
990
|
+
|
|
991
|
+
Closes the underlying AsyncOpenSearch client to prevent
|
|
992
|
+
unclosed aiohttp session warnings. Properly handles
|
|
993
|
+
CancelledError during event loop shutdown.
|
|
994
|
+
"""
|
|
995
|
+
import asyncio
|
|
996
|
+
|
|
997
|
+
# Close the OpenSearch client via the searcher
|
|
998
|
+
# Note: indexer, searcher, and setup share the same client instance,
|
|
999
|
+
# so closing via searcher is sufficient
|
|
1000
|
+
if hasattr(self._searcher, '_client') and self._searcher._client is not None:
|
|
1001
|
+
client = self._searcher._client
|
|
1002
|
+
try:
|
|
1003
|
+
await client.close()
|
|
1004
|
+
logger.debug("Closed OpenSearch client connection")
|
|
1005
|
+
except asyncio.CancelledError:
|
|
1006
|
+
# Event loop is shutting down - this is expected during cleanup
|
|
1007
|
+
logger.debug("OpenSearch client close cancelled (event loop shutting down)")
|
|
1008
|
+
except Exception as e:
|
|
1009
|
+
logger.warning(f"Error closing OpenSearch client: {e}")
|
|
1010
|
+
finally:
|
|
1011
|
+
# Clear client reference on all components that share it
|
|
1012
|
+
# This prevents any accidental reuse after close
|
|
1013
|
+
if hasattr(self._searcher, '_client'):
|
|
1014
|
+
self._searcher._client = None
|
|
1015
|
+
if hasattr(self._indexer, '_client'):
|
|
1016
|
+
self._indexer._client = None
|
|
1017
|
+
if self._setup and hasattr(self._setup, '_client'):
|
|
1018
|
+
self._setup._client = None
|