cognee 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cognee/__init__.py +1 -0
- cognee/api/health.py +2 -12
- cognee/api/v1/add/add.py +46 -6
- cognee/api/v1/add/routers/get_add_router.py +5 -1
- cognee/api/v1/cognify/cognify.py +29 -9
- cognee/api/v1/datasets/datasets.py +11 -0
- cognee/api/v1/responses/default_tools.py +0 -1
- cognee/api/v1/responses/dispatch_function.py +1 -1
- cognee/api/v1/responses/routers/default_tools.py +0 -1
- cognee/api/v1/search/search.py +11 -9
- cognee/api/v1/settings/routers/get_settings_router.py +7 -1
- cognee/api/v1/ui/ui.py +47 -16
- cognee/api/v1/update/routers/get_update_router.py +1 -1
- cognee/api/v1/update/update.py +3 -3
- cognee/cli/_cognee.py +61 -10
- cognee/cli/commands/add_command.py +3 -3
- cognee/cli/commands/cognify_command.py +3 -3
- cognee/cli/commands/config_command.py +9 -7
- cognee/cli/commands/delete_command.py +3 -3
- cognee/cli/commands/search_command.py +3 -7
- cognee/cli/config.py +0 -1
- cognee/context_global_variables.py +5 -0
- cognee/exceptions/exceptions.py +1 -1
- cognee/infrastructure/databases/cache/__init__.py +2 -0
- cognee/infrastructure/databases/cache/cache_db_interface.py +79 -0
- cognee/infrastructure/databases/cache/config.py +44 -0
- cognee/infrastructure/databases/cache/get_cache_engine.py +67 -0
- cognee/infrastructure/databases/cache/redis/RedisAdapter.py +243 -0
- cognee/infrastructure/databases/exceptions/__init__.py +1 -0
- cognee/infrastructure/databases/exceptions/exceptions.py +18 -2
- cognee/infrastructure/databases/graph/get_graph_engine.py +1 -1
- cognee/infrastructure/databases/graph/graph_db_interface.py +5 -0
- cognee/infrastructure/databases/graph/kuzu/adapter.py +67 -44
- cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +13 -3
- cognee/infrastructure/databases/graph/neo4j_driver/deadlock_retry.py +1 -1
- cognee/infrastructure/databases/graph/neptune_driver/neptune_utils.py +1 -1
- cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +1 -1
- cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +21 -3
- cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +17 -10
- cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +17 -4
- cognee/infrastructure/databases/vector/embeddings/config.py +2 -3
- cognee/infrastructure/databases/vector/exceptions/exceptions.py +1 -1
- cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +0 -1
- cognee/infrastructure/files/exceptions.py +1 -1
- cognee/infrastructure/files/storage/LocalFileStorage.py +9 -9
- cognee/infrastructure/files/storage/S3FileStorage.py +11 -11
- cognee/infrastructure/files/utils/guess_file_type.py +6 -0
- cognee/infrastructure/llm/prompts/search_type_selector_prompt.txt +0 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +19 -9
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +17 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +17 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +32 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/__init__.py +0 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +109 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +33 -8
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +40 -18
- cognee/infrastructure/loaders/LoaderEngine.py +27 -7
- cognee/infrastructure/loaders/external/__init__.py +7 -0
- cognee/infrastructure/loaders/external/advanced_pdf_loader.py +2 -8
- cognee/infrastructure/loaders/external/beautiful_soup_loader.py +310 -0
- cognee/infrastructure/loaders/supported_loaders.py +7 -0
- cognee/modules/data/exceptions/exceptions.py +1 -1
- cognee/modules/data/methods/__init__.py +3 -0
- cognee/modules/data/methods/get_dataset_data.py +4 -1
- cognee/modules/data/methods/has_dataset_data.py +21 -0
- cognee/modules/engine/models/TableRow.py +0 -1
- cognee/modules/ingestion/save_data_to_file.py +9 -2
- cognee/modules/pipelines/exceptions/exceptions.py +1 -1
- cognee/modules/pipelines/operations/pipeline.py +12 -1
- cognee/modules/pipelines/operations/run_tasks.py +25 -197
- cognee/modules/pipelines/operations/run_tasks_data_item.py +260 -0
- cognee/modules/pipelines/operations/run_tasks_distributed.py +121 -38
- cognee/modules/retrieval/EntityCompletionRetriever.py +48 -8
- cognee/modules/retrieval/base_graph_retriever.py +3 -1
- cognee/modules/retrieval/base_retriever.py +3 -1
- cognee/modules/retrieval/chunks_retriever.py +5 -1
- cognee/modules/retrieval/code_retriever.py +20 -2
- cognee/modules/retrieval/completion_retriever.py +50 -9
- cognee/modules/retrieval/cypher_search_retriever.py +11 -1
- cognee/modules/retrieval/graph_completion_context_extension_retriever.py +47 -8
- cognee/modules/retrieval/graph_completion_cot_retriever.py +32 -1
- cognee/modules/retrieval/graph_completion_retriever.py +54 -10
- cognee/modules/retrieval/lexical_retriever.py +20 -2
- cognee/modules/retrieval/natural_language_retriever.py +10 -1
- cognee/modules/retrieval/summaries_retriever.py +5 -1
- cognee/modules/retrieval/temporal_retriever.py +62 -10
- cognee/modules/retrieval/user_qa_feedback.py +3 -2
- cognee/modules/retrieval/utils/completion.py +5 -0
- cognee/modules/retrieval/utils/description_to_codepart_search.py +1 -1
- cognee/modules/retrieval/utils/session_cache.py +156 -0
- cognee/modules/search/methods/get_search_type_tools.py +0 -5
- cognee/modules/search/methods/no_access_control_search.py +12 -1
- cognee/modules/search/methods/search.py +34 -2
- cognee/modules/search/types/SearchType.py +0 -1
- cognee/modules/settings/get_settings.py +23 -0
- cognee/modules/users/methods/get_authenticated_user.py +3 -1
- cognee/modules/users/methods/get_default_user.py +1 -6
- cognee/modules/users/roles/methods/create_role.py +2 -2
- cognee/modules/users/tenants/methods/create_tenant.py +2 -2
- cognee/shared/exceptions/exceptions.py +1 -1
- cognee/tasks/codingagents/coding_rule_associations.py +1 -2
- cognee/tasks/documents/exceptions/exceptions.py +1 -1
- cognee/tasks/graph/extract_graph_from_data.py +2 -0
- cognee/tasks/ingestion/data_item_to_text_file.py +3 -3
- cognee/tasks/ingestion/ingest_data.py +11 -5
- cognee/tasks/ingestion/save_data_item_to_storage.py +12 -1
- cognee/tasks/storage/add_data_points.py +3 -10
- cognee/tasks/storage/index_data_points.py +19 -14
- cognee/tasks/storage/index_graph_edges.py +25 -11
- cognee/tasks/web_scraper/__init__.py +34 -0
- cognee/tasks/web_scraper/config.py +26 -0
- cognee/tasks/web_scraper/default_url_crawler.py +446 -0
- cognee/tasks/web_scraper/models.py +46 -0
- cognee/tasks/web_scraper/types.py +4 -0
- cognee/tasks/web_scraper/utils.py +142 -0
- cognee/tasks/web_scraper/web_scraper_task.py +396 -0
- cognee/tests/cli_tests/cli_unit_tests/test_cli_utils.py +0 -1
- cognee/tests/integration/web_url_crawler/test_default_url_crawler.py +13 -0
- cognee/tests/integration/web_url_crawler/test_tavily_crawler.py +19 -0
- cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py +344 -0
- cognee/tests/subprocesses/reader.py +25 -0
- cognee/tests/subprocesses/simple_cognify_1.py +31 -0
- cognee/tests/subprocesses/simple_cognify_2.py +31 -0
- cognee/tests/subprocesses/writer.py +32 -0
- cognee/tests/tasks/descriptive_metrics/metrics_test_utils.py +0 -2
- cognee/tests/tasks/descriptive_metrics/neo4j_metrics_test.py +8 -3
- cognee/tests/tasks/entity_extraction/entity_extraction_test.py +89 -0
- cognee/tests/tasks/web_scraping/web_scraping_test.py +172 -0
- cognee/tests/test_add_docling_document.py +56 -0
- cognee/tests/test_chromadb.py +7 -11
- cognee/tests/test_concurrent_subprocess_access.py +76 -0
- cognee/tests/test_conversation_history.py +240 -0
- cognee/tests/test_kuzu.py +27 -15
- cognee/tests/test_lancedb.py +7 -11
- cognee/tests/test_library.py +32 -2
- cognee/tests/test_neo4j.py +24 -16
- cognee/tests/test_neptune_analytics_vector.py +7 -11
- cognee/tests/test_permissions.py +9 -13
- cognee/tests/test_pgvector.py +4 -4
- cognee/tests/test_remote_kuzu.py +8 -11
- cognee/tests/test_s3_file_storage.py +1 -1
- cognee/tests/test_search_db.py +6 -8
- cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py +89 -0
- cognee/tests/unit/modules/retrieval/conversation_history_test.py +154 -0
- {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/METADATA +22 -7
- {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/RECORD +155 -128
- {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/entry_points.txt +1 -0
- distributed/Dockerfile +0 -3
- distributed/entrypoint.py +21 -9
- distributed/signal.py +5 -0
- distributed/workers/data_point_saving_worker.py +64 -34
- distributed/workers/graph_saving_worker.py +71 -47
- cognee/infrastructure/databases/graph/memgraph/memgraph_adapter.py +0 -1116
- cognee/modules/retrieval/insights_retriever.py +0 -133
- cognee/tests/test_memgraph.py +0 -109
- cognee/tests/unit/modules/retrieval/insights_retriever_test.py +0 -251
- distributed/poetry.lock +0 -12238
- distributed/pyproject.toml +0 -185
- {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/WHEEL +0 -0
- {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/licenses/LICENSE +0 -0
- {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/licenses/NOTICE.md +0 -0
cognee/__init__.py
CHANGED
@@ -19,6 +19,7 @@ from .api.v1.add import add
 from .api.v1.delete import delete
 from .api.v1.cognify import cognify
 from .modules.memify import memify
+from .api.v1.update import update
 from .api.v1.config.config import config
 from .api.v1.datasets.datasets import datasets
 from .api.v1.prune import prune
cognee/api/health.py
CHANGED
@@ -241,16 +241,6 @@ class HealthChecker:
         """Get comprehensive health status."""
         components = {}
 
-        # Critical services
-        critical_components = [
-            "relational_db",
-            "vector_db",
-            "graph_db",
-            "file_storage",
-            "llm_provider",
-            "embedding_service",
-        ]
-
         critical_checks = [
             ("relational_db", self.check_relational_db()),
             ("vector_db", self.check_vector_db()),
@@ -296,11 +286,11 @@ class HealthChecker:
             else:
                 components[name] = result
 
+        critical_comps = [check[0] for check in critical_checks]
         # Determine overall status
         critical_unhealthy = any(
-            comp.status == HealthStatus.UNHEALTHY
+            comp.status == HealthStatus.UNHEALTHY and name in critical_comps
            for name, comp in components.items()
-            if name in critical_components
        )
 
        has_degraded = any(comp.status == HealthStatus.DEGRADED for comp in components.values())
cognee/api/v1/add/add.py
CHANGED
@@ -1,6 +1,5 @@
 from uuid import UUID
-from typing import Union, BinaryIO, List, Optional
-
+from typing import Union, BinaryIO, List, Optional, Any
 from cognee.modules.users.models import User
 from cognee.modules.pipelines import Task, run_pipeline
 from cognee.modules.pipelines.layers.resolve_authorized_user_dataset import (
@@ -11,6 +10,9 @@ from cognee.modules.pipelines.layers.reset_dataset_pipeline_run_status import (
 )
 from cognee.modules.engine.operations.setup import setup
 from cognee.tasks.ingestion import ingest_data, resolve_data_directories
+from cognee.shared.logging_utils import get_logger
+
+logger = get_logger()
 
 
 async def add(
@@ -21,14 +23,15 @@ async def add(
     vector_db_config: dict = None,
     graph_db_config: dict = None,
     dataset_id: Optional[UUID] = None,
-    preferred_loaders: List[str] = None,
+    preferred_loaders: Optional[List[Union[str, dict[str, dict[str, Any]]]]] = None,
     incremental_loading: bool = True,
+    data_per_batch: Optional[int] = 20,
 ):
     """
     Add data to Cognee for knowledge graph processing.
 
     This is the first step in the Cognee workflow - it ingests raw data and prepares it
-    for processing. The function accepts various data formats including text, files, and
+    for processing. The function accepts various data formats including text, files, urls and
     binary streams, then stores them in a specified dataset for further processing.
 
     Prerequisites:
@@ -68,6 +71,7 @@ async def add(
             - S3 path: "s3://my-bucket/documents/file.pdf"
             - List of mixed types: ["text content", "/path/file.pdf", "file://doc.txt", file_handle]
             - Binary file object: open("file.txt", "rb")
+            - url: A web link url (https or http)
         dataset_name: Name of the dataset to store data in. Defaults to "main_dataset".
             Create separate datasets to organize different knowledge domains.
         user: User object for authentication and permissions. Uses default user if None.
@@ -78,6 +82,9 @@ async def add(
         vector_db_config: Optional configuration for vector database (for custom setups).
         graph_db_config: Optional configuration for graph database (for custom setups).
         dataset_id: Optional specific dataset UUID to use instead of dataset_name.
+        extraction_rules: Optional dictionary of rules (e.g., CSS selectors, XPath) for extracting specific content from web pages using BeautifulSoup
+        tavily_config: Optional configuration for Tavily API, including API key and extraction settings
+        soup_crawler_config: Optional configuration for BeautifulSoup crawler, specifying concurrency, crawl delay, and extraction rules.
 
     Returns:
         PipelineRunInfo: Information about the ingestion pipeline execution including:
@@ -126,6 +133,21 @@ async def add(
 
         # Add a single file
         await cognee.add("/home/user/documents/analysis.pdf")
+
+        # Add a single url and bs4 extract ingestion method
+        extraction_rules = {
+            "title": "h1",
+            "description": "p",
+            "more_info": "a[href*='more-info']"
+        }
+        await cognee.add("https://example.com",extraction_rules=extraction_rules)
+
+        # Add a single url and tavily extract ingestion method
+        # Make sure to set TAVILY_API_KEY = YOUR_TAVILY_API_KEY as a environment variable
+        await cognee.add("https://example.com")
+
+        # Add multiple urls
+        await cognee.add(["https://example.com","https://books.toscrape.com"])
         ```
 
     Environment Variables:
@@ -133,17 +155,34 @@ async def add(
         - LLM_API_KEY: API key for your LLM provider (OpenAI, Anthropic, etc.)
 
         Optional:
-        - LLM_PROVIDER: "openai" (default), "anthropic", "gemini", "ollama"
+        - LLM_PROVIDER: "openai" (default), "anthropic", "gemini", "ollama", "mistral"
         - LLM_MODEL: Model name (default: "gpt-5-mini")
         - DEFAULT_USER_EMAIL: Custom default user email
         - DEFAULT_USER_PASSWORD: Custom default user password
         - VECTOR_DB_PROVIDER: "lancedb" (default), "chromadb", "pgvector"
         - GRAPH_DATABASE_PROVIDER: "kuzu" (default), "neo4j"
+        - TAVILY_API_KEY: YOUR_TAVILY_API_KEY
 
     """
+    if preferred_loaders is not None:
+        transformed = {}
+        for item in preferred_loaders:
+            if isinstance(item, dict):
+                transformed.update(item)
+            else:
+                transformed[item] = {}
+        preferred_loaders = transformed
+
     tasks = [
         Task(resolve_data_directories, include_subdirectories=True),
-        Task(
+        Task(
+            ingest_data,
+            dataset_name,
+            user,
+            node_set,
+            dataset_id,
+            preferred_loaders,
+        ),
     ]
 
     await setup()
@@ -167,6 +206,7 @@ async def add(
         vector_db_config=vector_db_config,
         graph_db_config=graph_db_config,
         incremental_loading=incremental_loading,
+        data_per_batch=data_per_batch,
     ):
         pipeline_run_info = run_info
 
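
The new preferred_loaders handling above accepts both bare loader names and per-loader option dicts, normalizing everything into a single mapping before the pipeline runs. A standalone sketch of that normalization; the loader names used here are illustrative, not necessarily registered loaders:

from typing import Any, Union

def normalize_loaders(preferred_loaders: list) -> dict:
    # Bare strings become loader names with empty option dicts;
    # dict entries are merged in with their options intact.
    transformed: dict[str, dict[str, Any]] = {}
    for item in preferred_loaders:
        if isinstance(item, dict):
            transformed.update(item)
        else:
            transformed[item] = {}
    return transformed

print(normalize_loaders(["text_loader", {"beautiful_soup_loader": {"crawl_delay": 1.0}}]))
# {'text_loader': {}, 'beautiful_soup_loader': {'crawl_delay': 1.0}}
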
cognee/api/v1/add/routers/get_add_router.py
CHANGED
@@ -73,7 +73,11 @@ def get_add_router() -> APIRouter:
 
         try:
             add_run = await cognee_add(
-                data,
+                data,
+                datasetName,
+                user=user,
+                dataset_id=datasetId,
+                node_set=node_set if node_set else None,
             )
 
             if isinstance(add_run, PipelineRunErrored):
cognee/api/v1/cognify/cognify.py
CHANGED
@@ -44,6 +44,7 @@ async def cognify(
     graph_model: BaseModel = KnowledgeGraph,
     chunker=TextChunker,
     chunk_size: int = None,
+    chunks_per_batch: int = None,
     config: Config = None,
     vector_db_config: dict = None,
     graph_db_config: dict = None,
@@ -51,6 +52,7 @@ async def cognify(
     incremental_loading: bool = True,
     custom_prompt: Optional[str] = None,
     temporal_cognify: bool = False,
+    data_per_batch: int = 20,
 ):
     """
     Transform ingested data into a structured knowledge graph.
@@ -105,6 +107,7 @@ async def cognify(
             Formula: min(embedding_max_completion_tokens, llm_max_completion_tokens // 2)
             Default limits: ~512-8192 tokens depending on models.
             Smaller chunks = more granular but potentially fragmented knowledge.
+        chunks_per_batch: Number of chunks to be processed in a single batch in Cognify tasks.
         vector_db_config: Custom vector database configuration for embeddings storage.
         graph_db_config: Custom graph database configuration for relationship storage.
         run_in_background: If True, starts processing asynchronously and returns immediately.
@@ -148,7 +151,7 @@ async def cognify(
         # 2. Get entity relationships and connections
         relationships = await cognee.search(
             "connections between concepts",
-            query_type=SearchType.
+            query_type=SearchType.GRAPH_COMPLETION
         )
 
         # 3. Find relevant document chunks
@@ -209,10 +212,18 @@ async def cognify(
     }
 
     if temporal_cognify:
-        tasks = await get_temporal_tasks(
+        tasks = await get_temporal_tasks(
+            user=user, chunker=chunker, chunk_size=chunk_size, chunks_per_batch=chunks_per_batch
+        )
     else:
         tasks = await get_default_tasks(
-            user,
+            user=user,
+            graph_model=graph_model,
+            chunker=chunker,
+            chunk_size=chunk_size,
+            config=config,
+            custom_prompt=custom_prompt,
+            chunks_per_batch=chunks_per_batch,
         )
 
     # By calling get pipeline executor we get a function that will have the run_pipeline run in the background or a function that we will need to wait for
@@ -228,6 +239,7 @@ async def cognify(
         graph_db_config=graph_db_config,
         incremental_loading=incremental_loading,
         pipeline_name="cognify_pipeline",
+        data_per_batch=data_per_batch,
     )
 
 
@@ -238,6 +250,7 @@ async def get_default_tasks(  # TODO: Find out a better way to do this (Boris's
     chunk_size: int = None,
     config: Config = None,
     custom_prompt: Optional[str] = None,
+    chunks_per_batch: int = 100,
 ) -> list[Task]:
     if config is None:
         ontology_config = get_ontology_env_config()
@@ -256,6 +269,9 @@ async def get_default_tasks(  # TODO: Find out a better way to do this (Boris's
             "ontology_config": {"ontology_resolver": get_default_ontology_resolver()}
         }
 
+    if chunks_per_batch is None:
+        chunks_per_batch = 100
+
     default_tasks = [
         Task(classify_documents),
         Task(check_permissions_on_dataset, user=user, permissions=["write"]),
@@ -269,20 +285,20 @@ async def get_default_tasks(  # TODO: Find out a better way to do this (Boris's
             graph_model=graph_model,
             config=config,
             custom_prompt=custom_prompt,
-            task_config={"batch_size":
+            task_config={"batch_size": chunks_per_batch},
         ),  # Generate knowledge graphs from the document chunks.
         Task(
             summarize_text,
-            task_config={"batch_size":
+            task_config={"batch_size": chunks_per_batch},
         ),
-        Task(add_data_points, task_config={"batch_size":
+        Task(add_data_points, task_config={"batch_size": chunks_per_batch}),
     ]
 
     return default_tasks
 
 
 async def get_temporal_tasks(
-    user: User = None, chunker=TextChunker, chunk_size: int = None
+    user: User = None, chunker=TextChunker, chunk_size: int = None, chunks_per_batch: int = 10
 ) -> list[Task]:
     """
     Builds and returns a list of temporal processing tasks to be executed in sequence.
@@ -299,10 +315,14 @@ async def get_temporal_tasks(
         user (User, optional): The user requesting task execution, used for permission checks.
         chunker (Callable, optional): A text chunking function/class to split documents. Defaults to TextChunker.
         chunk_size (int, optional): Maximum token size per chunk. If not provided, uses system default.
+        chunks_per_batch (int, optional): Number of chunks to process in a single batch in Cognify
 
     Returns:
         list[Task]: A list of Task objects representing the temporal processing pipeline.
     """
+    if chunks_per_batch is None:
+        chunks_per_batch = 10
+
     temporal_tasks = [
         Task(classify_documents),
         Task(check_permissions_on_dataset, user=user, permissions=["write"]),
@@ -311,9 +331,9 @@ async def get_temporal_tasks(
             max_chunk_size=chunk_size or get_max_chunk_tokens(),
             chunker=chunker,
         ),
-        Task(extract_events_and_timestamps, task_config={"
+        Task(extract_events_and_timestamps, task_config={"batch_size": chunks_per_batch}),
         Task(extract_knowledge_graph_from_events),
-        Task(add_data_points, task_config={"batch_size":
+        Task(add_data_points, task_config={"batch_size": chunks_per_batch}),
     ]
 
     return temporal_tasks
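
Both new batching knobs are plumbed through to the task configs above. A hedged usage sketch based on the updated signature; the values are illustrative, not recommendations:

import asyncio
import cognee

async def main():
    await cognee.add("Natural language processing is a field of AI.")
    await cognee.cognify(
        chunks_per_batch=50,  # chunks handled per batch inside cognify tasks
        data_per_batch=10,    # data items handled per pipeline batch
    )

asyncio.run(main())
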
cognee/api/v1/datasets/datasets.py
CHANGED
@@ -1,4 +1,5 @@
 from uuid import UUID
+from cognee.modules.data.methods import has_dataset_data
 from cognee.modules.users.methods import get_default_user
 from cognee.modules.ingestion import discover_directory_datasets
 from cognee.modules.pipelines.operations.get_pipeline_status import get_pipeline_status
@@ -26,6 +27,16 @@ class datasets:
 
         return await get_dataset_data(dataset.id)
 
+    @staticmethod
+    async def has_data(dataset_id: str) -> bool:
+        from cognee.modules.data.methods import get_dataset
+
+        user = await get_default_user()
+
+        dataset = await get_dataset(user.id, dataset_id)
+
+        return await has_dataset_data(dataset.id)
+
     @staticmethod
     async def get_status(dataset_ids: list[UUID]) -> dict:
         return await get_pipeline_status(dataset_ids, pipeline_name="cognify_pipeline")
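
The new datasets.has_data helper resolves the dataset for the default user and reports whether it already contains any data, which is handy for skipping redundant ingestion. A minimal sketch, assuming you already hold a dataset id (the UUID below is a placeholder):

import asyncio
import cognee

async def main():
    dataset_id = "00000000-0000-0000-0000-000000000000"  # placeholder
    if not await cognee.datasets.has_data(dataset_id):
        await cognee.add("New content", dataset_name="main_dataset")

asyncio.run(main())
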
cognee/api/v1/responses/dispatch_function.py
CHANGED
@@ -59,7 +59,7 @@ async def handle_search(arguments: Dict[str, Any], user) -> list:
     valid_search_types = (
         search_tool["parameters"]["properties"]["search_type"]["enum"]
         if search_tool
-        else ["
+        else ["CODE", "GRAPH_COMPLETION", "NATURAL_LANGUAGE"]
     )
 
     if search_type_str not in valid_search_types:
cognee/api/v1/search/search.py
CHANGED
@@ -1,6 +1,7 @@
 from uuid import UUID
 from typing import Union, Optional, List, Type
 
+from cognee.infrastructure.databases.graph import get_graph_engine
 from cognee.modules.engine.models.node_set import NodeSet
 from cognee.modules.users.models import User
 from cognee.modules.search.types import SearchResult, SearchType, CombinedSearchResult
@@ -8,6 +9,10 @@ from cognee.modules.users.methods import get_default_user
 from cognee.modules.search.methods import search as search_function
 from cognee.modules.data.methods import get_authorized_existing_datasets
 from cognee.modules.data.exceptions import DatasetNotFoundError
+from cognee.context_global_variables import set_session_user_context_variable
+from cognee.shared.logging_utils import get_logger
+
+logger = get_logger()
 
 
 async def search(
@@ -25,6 +30,7 @@ async def search(
     last_k: Optional[int] = 1,
     only_context: bool = False,
     use_combined_context: bool = False,
+    session_id: Optional[str] = None,
 ) -> Union[List[SearchResult], CombinedSearchResult]:
     """
     Search and query the knowledge graph for insights, information, and connections.
@@ -52,11 +58,6 @@ async def search(
             Best for: Direct document retrieval, specific fact-finding.
             Returns: LLM responses based on relevant text chunks.
 
-        **INSIGHTS**:
-            Structured entity relationships and semantic connections.
-            Best for: Understanding concept relationships, knowledge mapping.
-            Returns: Formatted relationship data and entity connections.
-
         **CHUNKS**:
             Raw text segments that match the query semantically.
             Best for: Finding specific passages, citations, exact content.
@@ -118,15 +119,14 @@ async def search(
 
         save_interaction: Save interaction (query, context, answer connected to triplet endpoints) results into the graph or not
 
+        session_id: Optional session identifier for caching Q&A interactions. Defaults to 'default_session' if None.
+
     Returns:
         list: Search results in format determined by query_type:
 
         **GRAPH_COMPLETION/RAG_COMPLETION**:
             [List of conversational AI response strings]
 
-        **INSIGHTS**:
-            [List of formatted relationship descriptions and entity connections]
-
         **CHUNKS**:
             [List of relevant text passages with source metadata]
 
@@ -146,7 +146,6 @@ async def search(
     Performance & Optimization:
         - **GRAPH_COMPLETION**: Slower but most intelligent, uses LLM + graph context
         - **RAG_COMPLETION**: Medium speed, uses LLM + document chunks (no graph traversal)
-        - **INSIGHTS**: Fast, returns structured relationships without LLM processing
         - **CHUNKS**: Fastest, pure vector similarity search without LLM
         - **SUMMARIES**: Fast, returns pre-computed summaries
         - **CODE**: Medium speed, specialized for code understanding
@@ -177,6 +176,8 @@ async def search(
     if user is None:
         user = await get_default_user()
 
+    await set_session_user_context_variable(user)
+
     # Transform string based datasets to UUID - String based datasets can only be found for current user
     if datasets is not None and [all(isinstance(dataset, str) for dataset in datasets)]:
         datasets = await get_authorized_existing_datasets(datasets, "read", user)
@@ -198,6 +199,7 @@ async def search(
         last_k=last_k,
         only_context=only_context,
         use_combined_context=use_combined_context,
+        session_id=session_id,
     )
 
     return filtered_search_results
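
The new session_id parameter threads through to the session cache added in this release (see cognee/modules/retrieval/utils/session_cache.py in the file list), scoping cached Q&A interactions per session. A hedged usage sketch based on the updated signature:

import asyncio
import cognee
from cognee.modules.search.types import SearchType

async def main():
    results = await cognee.search(
        "What are the main concepts?",
        query_type=SearchType.GRAPH_COMPLETION,
        session_id="demo-session",  # falls back to 'default_session' when None
    )
    print(results)

asyncio.run(main())
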
cognee/api/v1/settings/routers/get_settings_router.py
CHANGED
@@ -21,7 +21,13 @@ class SettingsDTO(OutDTO):
 
 
 class LLMConfigInputDTO(InDTO):
-    provider: Union[
+    provider: Union[
+        Literal["openai"],
+        Literal["ollama"],
+        Literal["anthropic"],
+        Literal["gemini"],
+        Literal["mistral"],
+    ]
     model: str
     api_key: str
 
cognee/api/v1/ui/ui.py
CHANGED
@@ -502,22 +502,48 @@ def start_ui(
 
     if start_mcp:
         logger.info("Starting Cognee MCP server with Docker...")
-        cwd = os.getcwd()
-        env_file = os.path.join(cwd, ".env")
         try:
+            image = "cognee/cognee-mcp:feature-standalone-mcp"  # TODO: change to "cognee/cognee-mcp:main" right before merging into main
+            subprocess.run(["docker", "pull", image], check=True)
+
+            import uuid
+
+            container_name = f"cognee-mcp-{uuid.uuid4().hex[:8]}"
+
+            docker_cmd = [
+                "docker",
+                "run",
+                "--name",
+                container_name,
+                "-p",
+                f"{mcp_port}:8000",
+                "--rm",
+                "-e",
+                "TRANSPORT_MODE=sse",
+            ]
+
+            if start_backend:
+                docker_cmd.extend(
+                    [
+                        "-e",
+                        f"API_URL=http://localhost:{backend_port}",
+                    ]
+                )
+                logger.info(
+                    f"Configuring MCP to connect to backend API at http://localhost:{backend_port}"
+                )
+                logger.info("(localhost will be auto-converted to host.docker.internal)")
+            else:
+                cwd = os.getcwd()
+                env_file = os.path.join(cwd, ".env")
+                docker_cmd.extend(["--env-file", env_file])
+
+            docker_cmd.append(
+                image
+            )  # TODO: change to "cognee/cognee-mcp:main" right before merging into main
+
             mcp_process = subprocess.Popen(
-                [
-                    "docker",
-                    "run",
-                    "-p",
-                    f"{mcp_port}:8000",
-                    "--rm",
-                    "--env-file",
-                    env_file,
-                    "-e",
-                    "TRANSPORT_MODE=sse",
-                    "cognee/cognee-mcp:main",
-                ],
+                docker_cmd,
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
                 preexec_fn=os.setsid if hasattr(os, "setsid") else None,
@@ -526,8 +552,13 @@ def start_ui(
             _stream_process_output(mcp_process, "stdout", "[MCP]", "\033[34m")  # Blue
             _stream_process_output(mcp_process, "stderr", "[MCP]", "\033[34m")  # Blue
 
-
-
+            # Pass both PID and container name using a tuple
+            pid_callback((mcp_process.pid, container_name))
+
+            mode_info = "API mode" if start_backend else "direct mode"
+            logger.info(
+                f"✓ Cognee MCP server starting on http://127.0.0.1:{mcp_port}/sse ({mode_info})"
+            )
         except Exception as e:
             logger.error(f"Failed to start MCP server with Docker: {str(e)}")
     # Start backend server if requested
cognee/api/v1/update/update.py
CHANGED
@@ -1,5 +1,5 @@
 from uuid import UUID
-from typing import Union, BinaryIO, List, Optional
+from typing import Union, BinaryIO, List, Optional, Any
 
 from cognee.modules.users.models import User
 from cognee.api.v1.delete import delete
@@ -10,12 +10,12 @@ from cognee.api.v1.cognify import cognify
 
 async def update(
     data_id: UUID,
     data: Union[BinaryIO, list[BinaryIO], str, list[str]],
+    dataset_id: UUID,
     user: User = None,
     node_set: Optional[List[str]] = None,
-    dataset_id: Optional[UUID] = None,
     vector_db_config: dict = None,
     graph_db_config: dict = None,
-    preferred_loaders:
+    preferred_loaders: dict[str, dict[str, Any]] = None,
     incremental_loading: bool = True,
 ):
     """
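
Note that dataset_id moved from an optional keyword to a required parameter of update(). A hedged call sketch; both UUIDs below are placeholders:

import asyncio
from uuid import UUID
import cognee

async def main():
    await cognee.update(
        data_id=UUID("00000000-0000-0000-0000-000000000001"),    # placeholder
        data="Updated document text",
        dataset_id=UUID("00000000-0000-0000-0000-000000000002"),  # placeholder, now required
    )

asyncio.run(main())
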
cognee/cli/_cognee.py
CHANGED
@@ -175,19 +175,59 @@ def main() -> int:
     # Handle UI flag
     if hasattr(args, "start_ui") and args.start_ui:
         spawned_pids = []
+        docker_container = None
 
         def signal_handler(signum, frame):
             """Handle Ctrl+C and other termination signals"""
-            nonlocal spawned_pids
-            fmt.echo("\nShutting down UI server...")
+            nonlocal spawned_pids, docker_container
 
+            try:
+                fmt.echo("\nShutting down UI server...")
+            except (BrokenPipeError, OSError):
+                pass
+
+            # First, stop Docker container if running
+            if docker_container:
+                try:
+                    result = subprocess.run(
+                        ["docker", "stop", docker_container],
+                        capture_output=True,
+                        timeout=10,
+                        check=False,
+                    )
+                    try:
+                        if result.returncode == 0:
+                            fmt.success(f"✓ Docker container {docker_container} stopped.")
+                        else:
+                            fmt.warning(
+                                f"Could not stop container {docker_container}: {result.stderr.decode()}"
+                            )
+                    except (BrokenPipeError, OSError):
+                        pass
+                except subprocess.TimeoutExpired:
+                    try:
+                        fmt.warning(
+                            f"Timeout stopping container {docker_container}, forcing removal..."
+                        )
+                    except (BrokenPipeError, OSError):
+                        pass
+                    subprocess.run(
+                        ["docker", "rm", "-f", docker_container], capture_output=True, check=False
+                    )
+                except Exception:
+                    pass
+
+            # Then, stop regular processes
             for pid in spawned_pids:
                 try:
                     if hasattr(os, "killpg"):
                         # Unix-like systems: Use process groups
                         pgid = os.getpgid(pid)
                         os.killpg(pgid, signal.SIGTERM)
-
+                        try:
+                            fmt.success(f"✓ Process group {pgid} (PID {pid}) terminated.")
+                        except (BrokenPipeError, OSError):
+                            pass
                     else:
                         # Windows: Use taskkill to terminate process and its children
                         subprocess.run(
@@ -195,24 +235,35 @@ def main() -> int:
                             capture_output=True,
                             check=False,
                         )
-
-
-
+                        try:
+                            fmt.success(f"✓ Process {pid} and its children terminated.")
+                        except (BrokenPipeError, OSError):
+                            pass
+                except (OSError, ProcessLookupError, subprocess.SubprocessError):
+                    pass
 
             sys.exit(0)
 
         signal.signal(signal.SIGINT, signal_handler)  # Ctrl+C
         signal.signal(signal.SIGTERM, signal_handler)  # Termination request
+        if hasattr(signal, "SIGHUP"):
+            signal.signal(signal.SIGHUP, signal_handler)
 
         try:
            from cognee import start_ui
 
            fmt.echo("Starting cognee UI...")
 
-            # Callback to capture PIDs of all spawned processes
-            def pid_callback(
-                nonlocal spawned_pids
-
+            # Callback to capture PIDs and Docker container of all spawned processes
+            def pid_callback(pid_or_tuple):
+                nonlocal spawned_pids, docker_container
+                # Handle both regular PIDs and (PID, container_name) tuples
+                if isinstance(pid_or_tuple, tuple):
+                    pid, container_name = pid_or_tuple
+                    spawned_pids.append(pid)
+                    docker_container = container_name
+                else:
+                    spawned_pids.append(pid_or_tuple)
 
             frontend_port = 3000
             start_backend, backend_port = True, 8000
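
The updated pid_callback accepts either a bare PID or a (PID, container_name) tuple, which is how start_ui now reports the Docker-backed MCP server so the signal handler can docker-stop it on shutdown. A standalone sketch of that small protocol; the PIDs and container name are made-up values:

spawned_pids = []
docker_container = None

def pid_callback(pid_or_tuple):
    global docker_container
    # Plain ints are subprocess PIDs; tuples carry (pid, docker container name).
    if isinstance(pid_or_tuple, tuple):
        pid, container_name = pid_or_tuple
        spawned_pids.append(pid)
        docker_container = container_name
    else:
        spawned_pids.append(pid_or_tuple)

pid_callback(4242)                           # regular child process
pid_callback((4243, "cognee-mcp-ab12cd34"))  # Docker-backed MCP server
assert spawned_pids == [4242, 4243]
assert docker_container == "cognee-mcp-ab12cd34"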