gnosisllm-knowledge 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. gnosisllm_knowledge-0.2.0/PKG-INFO +382 -0
  2. gnosisllm_knowledge-0.2.0/README.md +346 -0
  3. gnosisllm_knowledge-0.2.0/pyproject.toml +127 -0
  4. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/__init__.py +152 -0
  5. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/api/__init__.py +5 -0
  6. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/api/knowledge.py +548 -0
  7. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/backends/__init__.py +26 -0
  8. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/backends/memory/__init__.py +9 -0
  9. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/backends/memory/indexer.py +384 -0
  10. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/backends/memory/searcher.py +516 -0
  11. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/backends/opensearch/__init__.py +19 -0
  12. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/backends/opensearch/agentic.py +738 -0
  13. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/backends/opensearch/config.py +195 -0
  14. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/backends/opensearch/indexer.py +499 -0
  15. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/backends/opensearch/mappings.py +255 -0
  16. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/backends/opensearch/queries.py +445 -0
  17. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/backends/opensearch/searcher.py +383 -0
  18. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/backends/opensearch/setup.py +1390 -0
  19. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/chunking/__init__.py +9 -0
  20. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/chunking/fixed.py +138 -0
  21. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/chunking/sentence.py +239 -0
  22. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/cli/__init__.py +18 -0
  23. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/cli/app.py +509 -0
  24. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/cli/commands/__init__.py +7 -0
  25. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/cli/commands/agentic.py +529 -0
  26. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/cli/commands/load.py +369 -0
  27. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/cli/commands/search.py +440 -0
  28. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/cli/commands/setup.py +228 -0
  29. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/cli/display/__init__.py +5 -0
  30. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/cli/display/service.py +555 -0
  31. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/cli/utils/__init__.py +5 -0
  32. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/cli/utils/config.py +207 -0
  33. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/core/__init__.py +87 -0
  34. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/core/domain/__init__.py +43 -0
  35. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/core/domain/document.py +240 -0
  36. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/core/domain/result.py +176 -0
  37. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/core/domain/search.py +327 -0
  38. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/core/domain/source.py +139 -0
  39. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/core/events/__init__.py +23 -0
  40. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/core/events/emitter.py +216 -0
  41. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/core/events/types.py +226 -0
  42. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/core/exceptions.py +407 -0
  43. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/core/interfaces/__init__.py +20 -0
  44. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/core/interfaces/agentic.py +136 -0
  45. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/core/interfaces/chunker.py +64 -0
  46. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/core/interfaces/fetcher.py +112 -0
  47. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/core/interfaces/indexer.py +244 -0
  48. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/core/interfaces/loader.py +102 -0
  49. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/core/interfaces/searcher.py +178 -0
  50. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/core/interfaces/setup.py +164 -0
  51. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/fetchers/__init__.py +12 -0
  52. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/fetchers/config.py +77 -0
  53. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/fetchers/http.py +167 -0
  54. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/fetchers/neoreader.py +204 -0
  55. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/loaders/__init__.py +13 -0
  56. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/loaders/base.py +399 -0
  57. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/loaders/factory.py +202 -0
  58. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/loaders/sitemap.py +285 -0
  59. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/loaders/website.py +57 -0
  60. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/py.typed +0 -0
  61. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/services/__init__.py +9 -0
  62. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/services/indexing.py +387 -0
  63. gnosisllm_knowledge-0.2.0/src/gnosisllm_knowledge/services/search.py +349 -0
@@ -0,0 +1,382 @@
1
+ Metadata-Version: 2.4
2
+ Name: gnosisllm-knowledge
3
+ Version: 0.2.0
4
+ Summary: Enterprise-grade knowledge loading, indexing, and search for Python
5
+ License: MIT
6
+ Keywords: knowledge-base,rag,semantic-search,vector-search,opensearch,llm,embeddings,enterprise
7
+ Author: David Marsa
8
+ Author-email: david.marsa@neomanex.com
9
+ Requires-Python: >=3.11,<4.0
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
+ Classifier: Topic :: Text Processing :: Indexing
20
+ Classifier: Typing :: Typed
21
+ Classifier: Framework :: AsyncIO
22
+ Requires-Dist: aiohttp (>=3.13.2,<4.0.0)
23
+ Requires-Dist: httpx (>=0.28.1,<0.29.0)
24
+ Requires-Dist: opensearch-py (>=3.1.0,<4.0.0)
25
+ Requires-Dist: pydantic (>=2.12.5,<3.0.0)
26
+ Requires-Dist: pydantic-settings (>=2.12.0,<3.0.0)
27
+ Requires-Dist: python-dotenv (>=1.2.1,<2.0.0)
28
+ Requires-Dist: rich (>=13.7.0,<14.0.0)
29
+ Requires-Dist: typer (>=0.12.0,<1.0.0)
30
+ Project-URL: Changelog, https://github.com/gnosisllm/gnosisllm-knowledge/blob/main/CHANGELOG.md
31
+ Project-URL: Documentation, https://gnosisllm-knowledge.readthedocs.io
32
+ Project-URL: Homepage, https://github.com/gnosisllm/gnosisllm-knowledge
33
+ Project-URL: Repository, https://github.com/gnosisllm/gnosisllm-knowledge
34
+ Description-Content-Type: text/markdown
35
+
36
+ # GnosisLLM Knowledge
37
+
38
+ Enterprise-grade knowledge loading, indexing, and semantic search library for Python.
39
+
40
+ ## Features
41
+
42
+ - **Semantic Search**: Vector-based similarity search using OpenAI embeddings
43
+ - **Hybrid Search**: Combine semantic and keyword (BM25) search for best results
44
+ - **Multiple Loaders**: Load content from websites, sitemaps, and files
45
+ - **Intelligent Chunking**: Sentence-aware text splitting with configurable overlap
46
+ - **OpenSearch Backend**: Production-ready with k-NN vector search
47
+ - **Multi-Tenancy**: Built-in support for account and collection isolation
48
+ - **Event-Driven**: Observer pattern for progress tracking and monitoring
49
+ - **SOLID Architecture**: Clean, maintainable, and extensible codebase
50
+
51
+ ## Installation
52
+
53
+ ```bash
54
+ pip install gnosisllm-knowledge
55
+
56
+ # With OpenSearch backend
57
+ pip install "gnosisllm-knowledge[opensearch]"
58
+
59
+ # With all optional dependencies
60
+ pip install "gnosisllm-knowledge[all]"
61
+ ```
62
+
63
+ ## Quick Start (CLI)
64
+
65
+ ```bash
66
+ # Install
67
+ pip install gnosisllm-knowledge
68
+
69
+ # Set OpenAI API key for embeddings
70
+ export OPENAI_API_KEY=sk-...
71
+
72
+ # Setup OpenSearch with ML model
73
+ gnosisllm-knowledge setup --host localhost --port 9200
74
+ # ✓ Created connector, model, pipelines, index
75
+ # Model ID: abc123 → Add to .env: OPENSEARCH_MODEL_ID=abc123
76
+
77
+ export OPENSEARCH_MODEL_ID=abc123
78
+
79
+ # Load content from a sitemap
80
+ gnosisllm-knowledge load https://docs.example.com/sitemap.xml
81
+ # ✓ Loaded 247 documents (1,248 chunks) in 45.3s
82
+
83
+ # Search
84
+ gnosisllm-knowledge search "how to configure authentication"
85
+ # Found 42 results (23.4ms)
86
+ # 1. Authentication Guide (92.3%)
87
+ # To configure authentication, set AUTH_PROVIDER...
88
+
89
+ # Interactive search mode
90
+ gnosisllm-knowledge search --interactive
91
+ ```
92
+
93
+ ## Quick Start (Python API)
94
+
95
+ ```python
96
+ from gnosisllm_knowledge import Knowledge
97
+
98
+ # Create instance with OpenSearch backend
99
+ knowledge = Knowledge.from_opensearch(
100
+ host="localhost",
101
+ port=9200,
102
+ )
103
+
104
+ # Setup backend (creates indices)
105
+ await knowledge.setup()
106
+
107
+ # Load and index a sitemap
108
+ await knowledge.load(
109
+ "https://docs.example.com/sitemap.xml",
110
+ collection_id="docs",
111
+ )
112
+
113
+ # Search
114
+ results = await knowledge.search("how to configure authentication")
115
+ for item in results.items:
116
+ print(f"{item.title}: {item.score}")
117
+ ```
118
+
119
+ ## CLI Commands
120
+
121
+ ### Setup
122
+
123
+ Configure OpenSearch with neural search capabilities:
124
+
125
+ ```bash
126
+ gnosisllm-knowledge setup [OPTIONS]
127
+
128
+ Options:
129
+ --host OpenSearch host (default: localhost)
130
+ --port OpenSearch port (default: 9200)
131
+ --use-ssl Enable SSL connection
132
+ --force Clean up existing resources first
133
+ --no-hybrid Skip hybrid search pipeline
134
+ ```
135
+
136
+ ### Load
137
+
138
+ Load and index content from URLs or sitemaps:
139
+
140
+ ```bash
141
+ gnosisllm-knowledge load <URL> [OPTIONS]
142
+
143
+ Options:
144
+ --type Source type: website, sitemap (auto-detects)
145
+ --index Target index name (default: knowledge)
146
+ --account-id Multi-tenant account ID
147
+ --collection-id Collection grouping ID
148
+ --batch-size Documents per batch (default: 100)
149
+ --max-urls Max URLs from sitemap (default: 1000)
150
+ --dry-run Preview without indexing
151
+ ```
152
+
153
+ ### Search
154
+
155
+ Search indexed content with multiple modes:
156
+
157
+ ```bash
158
+ gnosisllm-knowledge search <QUERY> [OPTIONS]
159
+
160
+ Options:
161
+ --mode Search mode: semantic, keyword, hybrid, agentic
162
+ --index Index to search (default: knowledge)
163
+ --limit Max results (default: 5)
164
+ --account-id Filter by account
165
+ --collection-ids Filter by collections (comma-separated)
166
+ --json Output as JSON for scripting
167
+ --interactive Interactive search session
168
+ ```
169
+
170
+ ## Architecture
171
+
172
+ ```
173
+ gnosisllm-knowledge/
174
+ ├── api/ # High-level Knowledge facade
175
+ ├── core/
176
+ │ ├── domain/ # Document, SearchQuery, SearchResult models
177
+ │ ├── interfaces/ # Protocol definitions (IContentLoader, etc.)
178
+ │ ├── events/ # Event system for progress tracking
179
+ │ └── exceptions.py # Exception hierarchy
180
+ ├── loaders/ # Content loaders (website, sitemap)
181
+ ├── fetchers/ # Content fetchers (HTTP, Neoreader)
182
+ ├── chunking/ # Text chunking strategies
183
+ ├── backends/
184
+ │ ├── opensearch/ # OpenSearch implementation
185
+ │ └── memory/ # In-memory backend for testing
186
+ └── services/ # Indexing and search orchestration
187
+ ```
188
+
189
+ ## Search Modes
190
+
191
+ ```python
192
+ from gnosisllm_knowledge import SearchMode
193
+
194
+ # Semantic search (vector similarity)
195
+ results = await knowledge.search(query, mode=SearchMode.SEMANTIC)
196
+
197
+ # Keyword search (BM25)
198
+ results = await knowledge.search(query, mode=SearchMode.KEYWORD)
199
+
200
+ # Hybrid search (default - combines both)
201
+ results = await knowledge.search(query, mode=SearchMode.HYBRID)
202
+ ```
203
+
204
+ ## Agentic Search
205
+
206
+ AI-powered search with reasoning and natural language answers using OpenSearch ML agents.
207
+
208
+ **Requirements:** OpenSearch 3.4+ for conversational memory support.
209
+
210
+ ### Setup
211
+
212
+ ```bash
213
+ # 1. First run standard setup (creates embedding model)
214
+ gnosisllm-knowledge setup --port 9201
215
+
216
+ # 2. Setup agentic agents (creates LLM connector, VectorDBTool, MLModelTool, agents)
217
+ gnosisllm-knowledge agentic setup
218
+ # ✓ Flow Agent ID: abc123
219
+ # ✓ Conversational Agent ID: def456
220
+
221
+ # 3. Add agent IDs to environment
222
+ export OPENSEARCH_FLOW_AGENT_ID=abc123
223
+ export OPENSEARCH_CONVERSATIONAL_AGENT_ID=def456
224
+ ```
225
+
226
+ ### Usage
227
+
228
+ ```bash
229
+ # Single-turn agentic search (uses flow agent)
230
+ gnosisllm-knowledge search --mode agentic "What is Typer?"
231
+
232
+ # Interactive multi-turn chat (uses conversational agent with memory)
233
+ gnosisllm-knowledge agentic chat
234
+ # You: What is Typer?
235
+ # Assistant: Typer is a library for building CLI applications...
236
+ # You: What did you just say about it?
237
+ # Assistant: I told you that Typer is a library for building CLI...
238
+ ```
239
+
240
+ ### How It Works
241
+
242
+ ```
243
+ ┌─────────────────────────────────────────────────────────────────────┐
244
+ │ User Query │
245
+ │ "What is Typer?" │
246
+ └─────────────────────────────────────────────────────────────────────┘
247
+
248
+
249
+ ┌─────────────────────────────────────────────────────────────────────┐
250
+ │ OpenSearch ML Agent │
251
+ │ (Flow or Conversational) │
252
+ └─────────────────────────────────────────────────────────────────────┘
253
+
254
+ ┌─────────────────┴─────────────────┐
255
+ ▼ ▼
256
+ ┌───────────────────────┐ ┌───────────────────────┐
257
+ │ VectorDBTool │ │ Conversation Memory │
258
+ │ (Knowledge Search) │ │ (Conversational │
259
+ │ │ │ Agent Only) │
260
+ │ - Searches index │ │ │
261
+ │ - Returns context │ │ - Stores Q&A pairs │
262
+ │ │ │ - Injects chat_history│
263
+ └───────────────────────┘ └───────────────────────┘
264
+ │ │
265
+ └─────────────────┬─────────────────┘
266
+
267
+ ┌─────────────────────────────────────────────────────────────────────┐
268
+ │ MLModelTool (answer_generator) │
269
+ │ │
270
+ │ Prompt Template: │
271
+ │ ┌────────────────────────────────────────────────────────────────┐ │
272
+ │ │ Context from knowledge base: │ │
273
+ │ │ ${parameters.knowledge_search.output} │ │
274
+ │ │ │ │
275
+ │ │ Previous conversation: ← Only for conversational agent │ │
276
+ │ │ ${parameters.chat_history:-} │ │
277
+ │ │ │ │
278
+ │ │ Question: ${parameters.question} │ │
279
+ │ └────────────────────────────────────────────────────────────────┘ │
280
+ └─────────────────────────────────────────────────────────────────────┘
281
+
282
+
283
+ ┌─────────────────────────────────────────────────────────────────────┐
284
+ │ AI-Generated Answer │
285
+ │ "Typer is a library for building CLI applications in Python..." │
286
+ └─────────────────────────────────────────────────────────────────────┘
287
+ ```
288
+
289
+ ### Agent Types
290
+
291
+ | Agent | Type | Use Case | Memory |
292
+ |-------|------|----------|--------|
293
+ | Flow | `flow` | Fast single-turn RAG, API calls | No |
294
+ | Conversational | `conversational_flow` | Multi-turn dialogue, chat | Yes |
295
+
296
+ ### Key Configuration
297
+
298
+ The conversational agent requires these settings for memory to work:
299
+
300
+ ```python
301
+ # In agent registration (setup.py)
302
+ agent_body = {
303
+ "type": "conversational_flow",
304
+ "app_type": "rag", # Required for memory injection
305
+ "llm": {
306
+ "model_id": llm_model_id,
307
+ "parameters": {
308
+ "message_history_limit": 10, # Include last N messages
309
+ },
310
+ },
311
+ "memory": {"type": "conversation_index"},
312
+ }
313
+
314
+ # MLModelTool prompt must include:
315
+ # ${parameters.chat_history:-} ← Receives conversation history
316
+ ```
317
+
318
+ ## Multi-Tenancy
319
+
320
+ ```python
321
+ # Load with tenant isolation
322
+ await knowledge.load(
323
+ source="https://docs.example.com/sitemap.xml",
324
+ account_id="tenant-123",
325
+ collection_id="docs",
326
+ )
327
+
328
+ # Search within tenant
329
+ results = await knowledge.search(
330
+ "query",
331
+ account_id="tenant-123",
332
+ collection_ids=["docs"],
333
+ )
334
+ ```
335
+
336
+ ## Event Tracking
337
+
338
+ ```python
339
+ from gnosisllm_knowledge import EventType
340
+
341
+ # Subscribe to events
342
+ @knowledge.events.on(EventType.DOCUMENT_INDEXED)
343
+ def on_indexed(event):
344
+ print(f"Indexed: {event.document_id}")
345
+
346
+ @knowledge.events.on(EventType.BATCH_COMPLETED)
347
+ def on_batch(event):
348
+ print(f"Batch complete: {event.documents_indexed} docs")
349
+ ```
350
+
351
+ ## Configuration
352
+
353
+ ```python
354
+ from gnosisllm_knowledge import OpenSearchConfig
355
+
356
+ # From environment variables
357
+ config = OpenSearchConfig.from_env()
358
+
359
+ # Explicit configuration
360
+ config = OpenSearchConfig(
361
+ host="search.example.com",
362
+ port=443,
363
+ use_ssl=True,
364
+ username="admin",
365
+ password="secret",
366
+ embedding_model="text-embedding-3-small",
367
+ embedding_dimension=1536,
368
+ )
369
+
370
+ knowledge = Knowledge.from_opensearch(config=config)
371
+ ```
372
+
373
+ ## Requirements
374
+
375
+ - Python 3.11+
376
+ - OpenSearch 2.0+ (for production use)
377
+ - OpenSearch 3.4+ (for agentic search with conversation memory)
378
+
379
+ ## License
380
+
381
+ MIT
382
+