gnosisllm-knowledge 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gnosisllm_knowledge/__init__.py +91 -39
- gnosisllm_knowledge/api/__init__.py +3 -2
- gnosisllm_knowledge/api/knowledge.py +502 -32
- gnosisllm_knowledge/api/memory.py +966 -0
- gnosisllm_knowledge/backends/__init__.py +14 -5
- gnosisllm_knowledge/backends/memory/indexer.py +27 -2
- gnosisllm_knowledge/backends/memory/searcher.py +111 -10
- gnosisllm_knowledge/backends/opensearch/agentic.py +355 -48
- gnosisllm_knowledge/backends/opensearch/config.py +49 -28
- gnosisllm_knowledge/backends/opensearch/indexer.py +49 -3
- gnosisllm_knowledge/backends/opensearch/mappings.py +14 -5
- gnosisllm_knowledge/backends/opensearch/memory/__init__.py +12 -0
- gnosisllm_knowledge/backends/opensearch/memory/client.py +1380 -0
- gnosisllm_knowledge/backends/opensearch/memory/config.py +127 -0
- gnosisllm_knowledge/backends/opensearch/memory/setup.py +322 -0
- gnosisllm_knowledge/backends/opensearch/queries.py +33 -33
- gnosisllm_knowledge/backends/opensearch/searcher.py +238 -0
- gnosisllm_knowledge/backends/opensearch/setup.py +308 -148
- gnosisllm_knowledge/cli/app.py +436 -31
- gnosisllm_knowledge/cli/commands/agentic.py +26 -9
- gnosisllm_knowledge/cli/commands/load.py +169 -19
- gnosisllm_knowledge/cli/commands/memory.py +733 -0
- gnosisllm_knowledge/cli/commands/search.py +9 -10
- gnosisllm_knowledge/cli/commands/setup.py +49 -23
- gnosisllm_knowledge/cli/display/service.py +43 -0
- gnosisllm_knowledge/cli/utils/config.py +62 -4
- gnosisllm_knowledge/core/domain/__init__.py +54 -0
- gnosisllm_knowledge/core/domain/discovery.py +166 -0
- gnosisllm_knowledge/core/domain/document.py +19 -19
- gnosisllm_knowledge/core/domain/memory.py +440 -0
- gnosisllm_knowledge/core/domain/result.py +11 -3
- gnosisllm_knowledge/core/domain/search.py +12 -25
- gnosisllm_knowledge/core/domain/source.py +11 -12
- gnosisllm_knowledge/core/events/__init__.py +8 -0
- gnosisllm_knowledge/core/events/types.py +198 -5
- gnosisllm_knowledge/core/exceptions.py +227 -0
- gnosisllm_knowledge/core/interfaces/__init__.py +17 -0
- gnosisllm_knowledge/core/interfaces/agentic.py +11 -3
- gnosisllm_knowledge/core/interfaces/indexer.py +10 -1
- gnosisllm_knowledge/core/interfaces/memory.py +524 -0
- gnosisllm_knowledge/core/interfaces/searcher.py +10 -1
- gnosisllm_knowledge/core/interfaces/streaming.py +133 -0
- gnosisllm_knowledge/core/streaming/__init__.py +36 -0
- gnosisllm_knowledge/core/streaming/pipeline.py +228 -0
- gnosisllm_knowledge/fetchers/__init__.py +8 -0
- gnosisllm_knowledge/fetchers/config.py +27 -0
- gnosisllm_knowledge/fetchers/neoreader.py +31 -3
- gnosisllm_knowledge/fetchers/neoreader_discovery.py +505 -0
- gnosisllm_knowledge/loaders/__init__.py +5 -1
- gnosisllm_knowledge/loaders/base.py +3 -4
- gnosisllm_knowledge/loaders/discovery.py +338 -0
- gnosisllm_knowledge/loaders/discovery_streaming.py +343 -0
- gnosisllm_knowledge/loaders/factory.py +46 -0
- gnosisllm_knowledge/loaders/sitemap.py +129 -1
- gnosisllm_knowledge/loaders/sitemap_streaming.py +258 -0
- gnosisllm_knowledge/services/indexing.py +100 -93
- gnosisllm_knowledge/services/search.py +84 -31
- gnosisllm_knowledge/services/streaming_pipeline.py +334 -0
- {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/METADATA +73 -10
- gnosisllm_knowledge-0.4.0.dist-info/RECORD +81 -0
- gnosisllm_knowledge-0.2.0.dist-info/RECORD +0 -64
- {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/WHEEL +0 -0
- {gnosisllm_knowledge-0.2.0.dist-info → gnosisllm_knowledge-0.4.0.dist-info}/entry_points.txt +0 -0
gnosisllm_knowledge/cli/commands/search.py

@@ -5,6 +5,10 @@ Supports multiple search modes:
 - keyword: Traditional BM25 text matching
 - hybrid: Combined semantic + keyword (default, best results)
 - agentic: AI-powered search with reasoning and answer generation
+
+Note:
+    This library is tenant-agnostic. Multi-tenancy is achieved through index
+    isolation - each tenant should use a separate index (e.g., "knowledge-{account_id}").
 """
 
 from __future__ import annotations
@@ -44,7 +48,6 @@ async def search_command(
     index_name: str = "knowledge",
     limit: int = 5,
     offset: int = 0,
-    account_id: str | None = None,
     collection_ids: str | None = None,
     source_ids: str | None = None,
     min_score: float = 0.0,
@@ -55,14 +58,17 @@
 ) -> None:
     """Execute the search command.
 
+    Note:
+        Multi-tenancy is achieved through index isolation. Use tenant-specific
+        index names instead (e.g., --index knowledge-tenant-123).
+
     Args:
         display: Display service for output.
         query: Search query text.
         mode: Search mode (semantic, keyword, hybrid, agentic).
-        index_name: Index to search.
+        index_name: Index to search (use tenant-specific name for isolation).
         limit: Maximum results to return.
         offset: Pagination offset.
-        account_id: Filter by account ID.
         collection_ids: Filter by collection IDs (comma-separated).
         source_ids: Filter by source IDs (comma-separated).
         min_score: Minimum score threshold.
@@ -86,7 +92,6 @@ async def search_command(
         query=query or "",
         index_name=index_name,
         agent_type="flow",  # Default to flow for single queries
-        account_id=account_id,
         collection_ids=collection_ids,
         source_ids=source_ids,
         limit=limit,
@@ -117,7 +122,6 @@ async def search_command(
         index_name=index_name,
         mode=mode,
         limit=limit,
-        account_id=account_id,
         collection_ids=collection_ids,
         source_ids=source_ids,
         min_score=min_score,
@@ -146,7 +150,6 @@ async def search_command(
         index_name=index_name,
         limit=limit,
         offset=offset,
-        account_id=account_id,
         collection_ids=collection_ids,
         source_ids=source_ids,
         min_score=min_score,
@@ -167,7 +170,6 @@ async def _execute_search(
     index_name: str,
     limit: int,
     offset: int,
-    account_id: str | None,
     collection_ids: str | None,
     source_ids: str | None,
     min_score: float,
@@ -214,7 +216,6 @@ async def _execute_search(
         mode=_get_search_mode(mode),
         limit=limit,
         offset=offset,
-        account_id=account_id,
         collection_ids=collection_list,
         source_ids=source_list,
         min_score=min_score,
@@ -315,7 +316,6 @@ async def _interactive_search(
     index_name: str,
     mode: str,
     limit: int,
-    account_id: str | None,
     collection_ids: str | None,
     source_ids: str | None,
     min_score: float,
@@ -396,7 +396,6 @@ async def _interactive_search(
         mode=_get_search_mode(mode),
         limit=limit,
         offset=0,
-        account_id=account_id,
         collection_ids=collection_list,
         source_ids=source_list,
         min_score=min_score,
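
The docstring note above replaces the removed `account_id` filter with index-level isolation: the tenant lives in the index name rather than in a query-time filter. A minimal sketch of that convention, assuming a hypothetical `tenant_index` helper that is not part of the package's API:

```python
# Sketch of the index-per-tenant convention described in the docstring above.
# `tenant_index` is a hypothetical helper, not something exported by gnosisllm-knowledge.
def tenant_index(account_id: str, prefix: str = "knowledge") -> str:
    """Build a tenant-specific index name, e.g. "knowledge-tenant-123"."""
    return f"{prefix}-{account_id}"

index_name = tenant_index("tenant-123")  # -> "knowledge-tenant-123"
# All reads and writes for this tenant then target `index_name`; no account_id
# argument is passed, matching the parameters removed in the diff above.
```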
gnosisllm_knowledge/cli/commands/setup.py

@@ -26,12 +26,12 @@ if TYPE_CHECKING:
 
 async def setup_command(
     display: RichDisplayService,
-    host: str =
-    port: int =
+    host: str | None = None,
+    port: int | None = None,
     username: str | None = None,
     password: str | None = None,
-    use_ssl: bool =
-    verify_certs: bool =
+    use_ssl: bool | None = None,
+    verify_certs: bool | None = None,
     force: bool = False,
     no_sample_data: bool = False,
     no_hybrid: bool = False,
@@ -40,24 +40,26 @@ async def setup_command(
 
     Args:
         display: Display service for output.
-        host: OpenSearch host.
-        port: OpenSearch port.
-        username: OpenSearch username.
-        password: OpenSearch password.
-        use_ssl: Enable SSL.
-        verify_certs: Verify SSL certificates.
+        host: OpenSearch host (overrides env).
+        port: OpenSearch port (overrides env).
+        username: OpenSearch username (overrides env).
+        password: OpenSearch password (overrides env).
+        use_ssl: Enable SSL (overrides env).
+        verify_certs: Verify SSL certificates (overrides env).
         force: Clean up existing resources first.
         no_sample_data: Skip sample data ingestion.
         no_hybrid: Skip hybrid search pipeline.
     """
-    # Load configuration
+    # Load configuration from environment
    cli_config = CliConfig.from_env()
 
-    #
-    final_host = host
-    final_port = port
-    final_username = username
-    final_password = password
+    # CLI arguments override environment variables (only if explicitly provided)
+    final_host = host if host is not None else cli_config.opensearch_host
+    final_port = port if port is not None else cli_config.opensearch_port
+    final_username = username if username is not None else cli_config.opensearch_username
+    final_password = password if password is not None else cli_config.opensearch_password
+    final_use_ssl = use_ssl if use_ssl is not None else cli_config.opensearch_use_ssl
+    final_verify_certs = verify_certs if verify_certs is not None else cli_config.opensearch_verify_certs
 
     # Validate required config
     if not cli_config.openai_api_key:
@@ -79,7 +81,7 @@ async def setup_command(
         "Configuration",
         [
             ("Host", f"{final_host}:{final_port}"),
-            ("SSL", "Enabled" if
+            ("SSL", "Enabled" if final_use_ssl else "Disabled"),
             ("Auth", "Configured" if final_username else "None"),
             ("Hybrid Search", "Disabled" if no_hybrid else "Enabled"),
             ("Force Recreate", "Yes" if force else "No"),
@@ -88,17 +90,41 @@ async def setup_command(
 
     display.newline()
 
-    # Create OpenSearch config
+    # Create OpenSearch config from environment, then override with CLI args
+    # This ensures all env vars (including pipeline names) are respected
+    base_config = OpenSearchConfig.from_env()
     opensearch_config = OpenSearchConfig(
+        # CLI overrides (if provided)
         host=final_host,
         port=final_port,
         username=final_username,
         password=final_password,
-        use_ssl=
-        verify_certs=
+        use_ssl=final_use_ssl,
+        verify_certs=final_verify_certs,
         openai_api_key=cli_config.openai_api_key,
         embedding_model=cli_config.openai_embedding_model,
         embedding_dimension=cli_config.openai_embedding_dimension,
+        # Preserve env-based config for pipelines and other settings
+        ingest_pipeline_name=base_config.ingest_pipeline_name,
+        search_pipeline_name=base_config.search_pipeline_name,
+        index_prefix=base_config.index_prefix,
+        model_id=base_config.model_id,
+        model_group_id=base_config.model_group_id,
+        embedding_field=base_config.embedding_field,
+        # k-NN settings
+        knn_engine=base_config.knn_engine,
+        knn_space_type=base_config.knn_space_type,
+        knn_algo_param_ef_search=base_config.knn_algo_param_ef_search,
+        knn_algo_param_ef_construction=base_config.knn_algo_param_ef_construction,
+        knn_algo_param_m=base_config.knn_algo_param_m,
+        # Index settings
+        number_of_shards=base_config.number_of_shards,
+        number_of_replicas=base_config.number_of_replicas,
+        refresh_interval=base_config.refresh_interval,
+        # Agentic settings
+        agentic_llm_model=base_config.agentic_llm_model,
+        agentic_max_iterations=base_config.agentic_max_iterations,
+        agentic_timeout_seconds=base_config.agentic_timeout_seconds,
     )
 
     # Create OpenSearch client
@@ -109,8 +135,8 @@ async def setup_command(
     client = AsyncOpenSearch(
         hosts=[{"host": final_host, "port": final_port}],
         http_auth=http_auth,
-        use_ssl=
-        verify_certs=
+        use_ssl=final_use_ssl,
+        verify_certs=final_verify_certs,
         ssl_show_warn=False,
     )
 
@@ -124,7 +150,7 @@ async def setup_command(
         display.format_error_with_suggestion(
             error=f"Cannot connect to OpenSearch at {final_host}:{final_port}",
             suggestion="Ensure OpenSearch is running and accessible.",
-            command=f"curl http{'s' if
+            command=f"curl http{'s' if final_use_ssl else ''}://{final_host}:{final_port}",
         )
         sys.exit(1)
 
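
The reworked `setup_command` resolves every connection setting with one precedence rule: an explicitly passed CLI value wins, otherwise the value from `CliConfig.from_env()` is used. A small standalone sketch (plain Python, not package code) of why the `is not None` check matters:

```python
# Precedence rule used above: explicit CLI value first, env-derived value second.
# Comparing against None (rather than truthiness) keeps an explicit False or 0 as a valid override.
def resolve(cli_value, env_value):
    return cli_value if cli_value is not None else env_value

assert resolve(None, "localhost") == "localhost"                       # falls back to env
assert resolve("opensearch.internal", "localhost") == "opensearch.internal"  # CLI override
assert resolve(False, True) is False                                   # explicit "disable SSL" survives
```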
gnosisllm_knowledge/cli/display/service.py

@@ -553,3 +553,46 @@ class RichDisplayService:
             suggestion="Run agentic setup to create agents.",
             command="gnosisllm-knowledge agentic setup",
         )
+
+    def memory_status(
+        self,
+        llm_model_id: str | None,
+        embedding_model_id: str | None,
+        llm_model: str = "gpt-4o",
+        embedding_model: str = "text-embedding-3-small",
+    ) -> None:
+        """Display agentic memory configuration status.
+
+        Args:
+            llm_model_id: LLM model ID if configured.
+            embedding_model_id: Embedding model ID if configured.
+            llm_model: LLM model name for fact extraction.
+            embedding_model: Embedding model name.
+        """
+        status_rows = []
+
+        # LLM Model
+        if llm_model_id:
+            status_rows.append(("LLM Model", "[green]Configured[/green]"))
+            status_rows.append((" ID", f"[dim]{llm_model_id}[/dim]"))
+            status_rows.append((" Model", llm_model))
+        else:
+            status_rows.append(("LLM Model", "[red]Not configured[/red]"))
+
+        # Embedding Model
+        if embedding_model_id:
+            status_rows.append(("Embedding Model", "[green]Configured[/green]"))
+            status_rows.append((" ID", f"[dim]{embedding_model_id}[/dim]"))
+            status_rows.append((" Model", embedding_model))
+        else:
+            status_rows.append(("Embedding Model", "[red]Not configured[/red]"))
+
+        self.table("Agentic Memory Configuration", status_rows)
+
+        if not llm_model_id or not embedding_model_id:
+            self.newline()
+            self.format_error_with_suggestion(
+                error="Memory models not configured.",
+                suggestion="Run memory setup to create connectors and models.",
+                command="gnosisllm-knowledge memory setup --openai-key sk-...",
+            )
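
The new `memory_status` method renders a table of the memory model configuration and prints a setup hint when anything is missing. A hedged usage sketch, assuming `display` is an existing `RichDisplayService` instance and using placeholder IDs:

```python
# `display` is assumed to be a RichDisplayService created elsewhere in the CLI.
# The IDs below are placeholders; real values come from the memory setup flow.
display.memory_status(
    llm_model_id="llm-model-id-placeholder",
    embedding_model_id=None,  # not configured -> row shows red and the setup hint is printed
    llm_model="gpt-4o",
    embedding_model="text-embedding-3-small",
)
```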
gnosisllm_knowledge/cli/utils/config.py

@@ -27,7 +27,7 @@ class CliConfig:
     opensearch_verify_certs: bool = False
     opensearch_model_id: str | None = None
     opensearch_index_name: str = "knowledge"
-
+    opensearch_ingest_pipeline_name: str = "gnosisllm-ingest-pipeline"
     opensearch_search_pipeline_name: str = "gnosisllm-search-pipeline"
 
     # OpenAI
@@ -42,6 +42,13 @@ class CliConfig:
     agentic_max_iterations: int = 5
     agentic_timeout_seconds: int = 60
 
+    # Agentic Memory
+    memory_llm_model_id: str | None = None
+    memory_embedding_model_id: str | None = None
+    memory_llm_model: str = "gpt-4o"
+    memory_embedding_model: str = "text-embedding-3-small"
+    memory_embedding_dimension: int = 1536
+
     # Neoreader
     neoreader_host: str = "https://api.neoreader.dev"
 
@@ -71,11 +78,11 @@ class CliConfig:
             == "true",
             opensearch_model_id=os.getenv("OPENSEARCH_MODEL_ID"),
             opensearch_index_name=os.getenv("OPENSEARCH_INDEX_NAME", "knowledge"),
-
-            "
+            opensearch_ingest_pipeline_name=os.getenv(
+                "OPENSEARCH_INGEST_PIPELINE", "gnosisllm-ingest-pipeline"
             ),
             opensearch_search_pipeline_name=os.getenv(
-                "
+                "OPENSEARCH_SEARCH_PIPELINE", "gnosisllm-search-pipeline"
             ),
             openai_api_key=os.getenv("OPENAI_API_KEY"),
             openai_embedding_model=os.getenv("OPENAI_EMBEDDING_MODEL", "text-embedding-ada-002"),
@@ -86,6 +93,12 @@ class CliConfig:
             agentic_llm_model=os.getenv("AGENTIC_LLM_MODEL", "gpt-4o"),
             agentic_max_iterations=int(os.getenv("AGENTIC_MAX_ITERATIONS", "5")),
             agentic_timeout_seconds=int(os.getenv("AGENTIC_TIMEOUT_SECONDS", "60")),
+            # Agentic Memory configuration
+            memory_llm_model_id=os.getenv("OPENSEARCH_MEMORY_LLM_MODEL_ID"),
+            memory_embedding_model_id=os.getenv("OPENSEARCH_MEMORY_EMBEDDING_MODEL_ID"),
+            memory_llm_model=os.getenv("MEMORY_LLM_MODEL", "gpt-4o"),
+            memory_embedding_model=os.getenv("MEMORY_EMBEDDING_MODEL", "text-embedding-3-small"),
+            memory_embedding_dimension=int(os.getenv("MEMORY_EMBEDDING_DIMENSION", "1536")),
             neoreader_host=os.getenv("NEOREADER_HOST", "https://api.neoreader.dev"),
         )
 
@@ -205,3 +218,48 @@ class CliConfig:
     def has_conversational_agent(self) -> bool:
         """Check if conversational agent is configured."""
         return bool(self.opensearch_conversational_agent_id)
+
+    # === Memory Configuration ===
+
+    def validate_for_memory(self) -> list[str]:
+        """Validate configuration for memory commands.
+
+        Returns:
+            List of validation errors (empty if valid).
+        """
+        errors = []
+        if not self.memory_llm_model_id:
+            errors.append(
+                "OPENSEARCH_MEMORY_LLM_MODEL_ID is required for memory operations. "
+                "Run 'gnosisllm-knowledge memory setup' first."
+            )
+        if not self.memory_embedding_model_id:
+            errors.append(
+                "OPENSEARCH_MEMORY_EMBEDDING_MODEL_ID is required for memory operations. "
+                "Run 'gnosisllm-knowledge memory setup' first."
+            )
+        return errors
+
+    def validate_for_memory_setup(self) -> list[str]:
+        """Validate configuration for memory setup command.
+
+        Returns:
+            List of validation errors (empty if valid).
+        """
+        errors = []
+        if not self.openai_api_key:
+            errors.append(
+                "OPENAI_API_KEY is required for memory setup. "
+                "Use --openai-key or set the environment variable."
+            )
+        return errors
+
+    @property
+    def has_memory_models(self) -> bool:
+        """Check if memory models are configured."""
+        return bool(self.memory_llm_model_id and self.memory_embedding_model_id)
+
+    @property
+    def memory_is_configured(self) -> bool:
+        """Check if memory is fully configured for operations."""
+        return self.has_memory_models
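
Taken together, the new `CliConfig` fields and validators let memory commands fail fast with actionable messages. A short sketch of the intended flow, assuming the two model-ID environment variables were produced by an earlier `memory setup` run (the values below are placeholders):

```python
import os

from gnosisllm_knowledge.cli.utils.config import CliConfig

# Placeholder IDs; in practice these come from `gnosisllm-knowledge memory setup`.
os.environ["OPENSEARCH_MEMORY_LLM_MODEL_ID"] = "llm-model-id-placeholder"
os.environ["OPENSEARCH_MEMORY_EMBEDDING_MODEL_ID"] = "embedding-model-id-placeholder"

config = CliConfig.from_env()
errors = config.validate_for_memory()
if errors:
    for message in errors:
        print(message)  # e.g. "OPENSEARCH_MEMORY_LLM_MODEL_ID is required for memory operations. ..."
else:
    assert config.has_memory_models and config.memory_is_configured
```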
gnosisllm_knowledge/core/domain/__init__.py

@@ -1,6 +1,34 @@
 """Domain models - Value objects and entities."""
 
+from gnosisllm_knowledge.core.domain.discovery import (
+    DiscoveredURL,
+    DiscoveryConfig,
+    DiscoveryJobStatus,
+    DiscoveryProgress,
+    DiscoveryStats,
+)
 from gnosisllm_knowledge.core.domain.document import Document, DocumentStatus, TextChunk
+from gnosisllm_knowledge.core.domain.memory import (
+    ContainerConfig,
+    ContainerIndexSettings,
+    ContainerInfo,
+    EmbeddingModelType,
+    HistoryAction,
+    HistoryEntry,
+    IndexSettings,
+    MemoryEntry,
+    MemoryStats,
+    MemoryStrategy,
+    MemoryType,
+    Message,
+    Namespace,
+    PayloadType,
+    RecallResult,
+    SessionInfo,
+    StoreRequest,
+    StoreResult,
+    StrategyConfig,
+)
 from gnosisllm_knowledge.core.domain.result import (
     BatchResult,
     IndexResult,
@@ -20,10 +48,36 @@ from gnosisllm_knowledge.core.domain.search import (
 from gnosisllm_knowledge.core.domain.source import SourceConfig
 
 __all__ = [
+    # Discovery
+    "DiscoveredURL",
+    "DiscoveryConfig",
+    "DiscoveryJobStatus",
+    "DiscoveryProgress",
+    "DiscoveryStats",
     # Document
     "Document",
     "DocumentStatus",
     "TextChunk",
+    # Memory
+    "MemoryStrategy",
+    "MemoryType",
+    "PayloadType",
+    "EmbeddingModelType",
+    "HistoryAction",
+    "StrategyConfig",
+    "IndexSettings",
+    "ContainerIndexSettings",
+    "ContainerConfig",
+    "ContainerInfo",
+    "Message",
+    "Namespace",
+    "StoreRequest",
+    "StoreResult",
+    "MemoryEntry",
+    "RecallResult",
+    "SessionInfo",
+    "HistoryEntry",
+    "MemoryStats",
     # Result
     "LoadResult",
     "IndexResult",
gnosisllm_knowledge/core/domain/discovery.py (new file)

@@ -0,0 +1,166 @@
+"""Domain models for website discovery."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+
+
+@dataclass
+class DiscoveryConfig:
+    """Configuration for website discovery crawl.
+
+    Controls how the Neo Reader Discovery API crawls and discovers URLs.
+
+    Attributes:
+        max_depth: Maximum crawl depth from start URL.
+        max_pages: Maximum number of pages to crawl.
+        same_domain: Only crawl URLs on the same domain.
+        include_subdomains: Include subdomains when same_domain is True.
+        respect_robots: Respect robots.txt rules.
+        parse_sitemap: Also parse sitemap if available.
+        with_metadata: Include page metadata (title, etc.) in results.
+        crawl_timeout: Overall timeout for the crawl in seconds.
+        concurrent_requests: Number of concurrent crawl requests.
+        request_delay: Delay between requests in milliseconds.
+        include_pattern: Regex pattern for URLs to include.
+        exclude_pattern: Regex pattern for URLs to exclude.
+        path_prefix: Only crawl URLs with this path prefix.
+    """
+
+    max_depth: int = 3
+    max_pages: int = 100
+    same_domain: bool = True
+    include_subdomains: bool = True
+    respect_robots: bool = True
+    parse_sitemap: bool = False
+    with_metadata: bool = True
+    crawl_timeout: int = 300
+    concurrent_requests: int = 5
+    request_delay: int = 100
+    include_pattern: str | None = None
+    exclude_pattern: str | None = None
+    path_prefix: str | None = None
+
+    def to_headers(self) -> dict[str, str]:
+        """Convert config to HTTP headers for Neo Reader API.
+
+        Returns:
+            Dictionary of header name to value.
+        """
+        headers = {
+            "X-Max-Depth": str(self.max_depth),
+            "X-Max-Pages": str(self.max_pages),
+            "X-Same-Domain": str(self.same_domain).lower(),
+            "X-Include-Subdomains": str(self.include_subdomains).lower(),
+            "X-Respect-Robots": str(self.respect_robots).lower(),
+            "X-Parse-Sitemap": str(self.parse_sitemap).lower(),
+            "X-With-Metadata": str(self.with_metadata).lower(),
+            "X-Crawl-Timeout": str(self.crawl_timeout),
+            "X-Concurrent-Requests": str(self.concurrent_requests),
+            "X-Request-Delay": str(self.request_delay),
+        }
+        if self.include_pattern:
+            headers["X-Include-Pattern"] = self.include_pattern
+        if self.exclude_pattern:
+            headers["X-Exclude-Pattern"] = self.exclude_pattern
+        if self.path_prefix:
+            headers["X-Path-Prefix"] = self.path_prefix
+        return headers
+
+
+@dataclass
+class DiscoveryProgress:
+    """Progress information for a running discovery job.
+
+    Attributes:
+        percent: Completion percentage (0-100).
+        pages_crawled: Number of pages crawled so far.
+        urls_discovered: Number of URLs discovered so far.
+        current_depth: Current crawl depth.
+        message: Human-readable progress message.
+    """
+
+    percent: int = 0
+    pages_crawled: int = 0
+    urls_discovered: int = 0
+    current_depth: int = 0
+    message: str = ""
+
+
+@dataclass
+class DiscoveryStats:
+    """Statistics for a completed discovery job.
+
+    Attributes:
+        pages_crawled: Total pages crawled.
+        urls_found: Total URLs found during crawl.
+        urls_returned: URLs returned in results (after filtering).
+        urls_filtered: URLs excluded by filters.
+        errors: Number of errors during crawl.
+        duration_seconds: Total crawl duration.
+    """
+
+    pages_crawled: int = 0
+    urls_found: int = 0
+    urls_returned: int = 0
+    urls_filtered: int = 0
+    errors: int = 0
+    duration_seconds: float = 0.0
+
+
+@dataclass
+class DiscoveredURL:
+    """A URL discovered during crawl.
+
+    Attributes:
+        url: The discovered URL.
+        depth: Crawl depth at which URL was found.
+        title: Page title if available.
+        is_internal: Whether URL is internal to the domain.
+    """
+
+    url: str
+    depth: int = 0
+    title: str | None = None
+    is_internal: bool = True
+
+
+@dataclass
+class DiscoveryJobStatus:
+    """Status of a discovery job.
+
+    Represents the current state of an async discovery job.
+
+    Attributes:
+        job_id: Unique job identifier.
+        status: Job status (pending, queued, running, completed, failed, cancelled).
+        start_url: The URL that started the discovery.
+        progress: Progress information if job is running.
+        stats: Statistics if job is completed.
+        urls: Discovered URLs if job is completed.
+        error: Error message if job failed.
+    """
+
+    job_id: str
+    status: str
+    start_url: str
+    progress: DiscoveryProgress | None = None
+    stats: DiscoveryStats | None = None
+    urls: list[DiscoveredURL] = field(default_factory=list)
+    error: str | None = None
+
+    def is_terminal(self) -> bool:
+        """Check if job is in a terminal state.
+
+        Returns:
+            True if job is completed, failed, or cancelled.
+        """
+        return self.status in ("completed", "failed", "cancelled")
+
+    def is_running(self) -> bool:
+        """Check if job is currently running.
+
+        Returns:
+            True if job is pending, queued, or running.
+        """
+        return self.status in ("pending", "queued", "running")