cognee 0.3.6__py3-none-any.whl → 0.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159)
  1. cognee/__init__.py +1 -0
  2. cognee/api/health.py +2 -12
  3. cognee/api/v1/add/add.py +46 -6
  4. cognee/api/v1/add/routers/get_add_router.py +5 -1
  5. cognee/api/v1/cognify/cognify.py +29 -9
  6. cognee/api/v1/datasets/datasets.py +11 -0
  7. cognee/api/v1/responses/default_tools.py +0 -1
  8. cognee/api/v1/responses/dispatch_function.py +1 -1
  9. cognee/api/v1/responses/routers/default_tools.py +0 -1
  10. cognee/api/v1/search/search.py +11 -9
  11. cognee/api/v1/settings/routers/get_settings_router.py +7 -1
  12. cognee/api/v1/ui/ui.py +47 -16
  13. cognee/api/v1/update/routers/get_update_router.py +1 -1
  14. cognee/api/v1/update/update.py +3 -3
  15. cognee/cli/_cognee.py +61 -10
  16. cognee/cli/commands/add_command.py +3 -3
  17. cognee/cli/commands/cognify_command.py +3 -3
  18. cognee/cli/commands/config_command.py +9 -7
  19. cognee/cli/commands/delete_command.py +3 -3
  20. cognee/cli/commands/search_command.py +3 -7
  21. cognee/cli/config.py +0 -1
  22. cognee/context_global_variables.py +5 -0
  23. cognee/exceptions/exceptions.py +1 -1
  24. cognee/infrastructure/databases/cache/__init__.py +2 -0
  25. cognee/infrastructure/databases/cache/cache_db_interface.py +79 -0
  26. cognee/infrastructure/databases/cache/config.py +44 -0
  27. cognee/infrastructure/databases/cache/get_cache_engine.py +67 -0
  28. cognee/infrastructure/databases/cache/redis/RedisAdapter.py +243 -0
  29. cognee/infrastructure/databases/exceptions/__init__.py +1 -0
  30. cognee/infrastructure/databases/exceptions/exceptions.py +18 -2
  31. cognee/infrastructure/databases/graph/get_graph_engine.py +1 -1
  32. cognee/infrastructure/databases/graph/graph_db_interface.py +5 -0
  33. cognee/infrastructure/databases/graph/kuzu/adapter.py +67 -44
  34. cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +13 -3
  35. cognee/infrastructure/databases/graph/neo4j_driver/deadlock_retry.py +1 -1
  36. cognee/infrastructure/databases/graph/neptune_driver/neptune_utils.py +1 -1
  37. cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +1 -1
  38. cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +21 -3
  39. cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +17 -10
  40. cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +17 -4
  41. cognee/infrastructure/databases/vector/embeddings/config.py +2 -3
  42. cognee/infrastructure/databases/vector/exceptions/exceptions.py +1 -1
  43. cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +0 -1
  44. cognee/infrastructure/files/exceptions.py +1 -1
  45. cognee/infrastructure/files/storage/LocalFileStorage.py +9 -9
  46. cognee/infrastructure/files/storage/S3FileStorage.py +11 -11
  47. cognee/infrastructure/files/utils/guess_file_type.py +6 -0
  48. cognee/infrastructure/llm/prompts/search_type_selector_prompt.txt +0 -5
  49. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +19 -9
  50. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +17 -5
  51. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +17 -5
  52. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +32 -0
  53. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/__init__.py +0 -0
  54. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +109 -0
  55. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +33 -8
  56. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +40 -18
  57. cognee/infrastructure/loaders/LoaderEngine.py +27 -7
  58. cognee/infrastructure/loaders/external/__init__.py +7 -0
  59. cognee/infrastructure/loaders/external/advanced_pdf_loader.py +2 -8
  60. cognee/infrastructure/loaders/external/beautiful_soup_loader.py +310 -0
  61. cognee/infrastructure/loaders/supported_loaders.py +7 -0
  62. cognee/modules/data/exceptions/exceptions.py +1 -1
  63. cognee/modules/data/methods/__init__.py +3 -0
  64. cognee/modules/data/methods/get_dataset_data.py +4 -1
  65. cognee/modules/data/methods/has_dataset_data.py +21 -0
  66. cognee/modules/engine/models/TableRow.py +0 -1
  67. cognee/modules/ingestion/save_data_to_file.py +9 -2
  68. cognee/modules/pipelines/exceptions/exceptions.py +1 -1
  69. cognee/modules/pipelines/operations/pipeline.py +12 -1
  70. cognee/modules/pipelines/operations/run_tasks.py +25 -197
  71. cognee/modules/pipelines/operations/run_tasks_data_item.py +260 -0
  72. cognee/modules/pipelines/operations/run_tasks_distributed.py +121 -38
  73. cognee/modules/retrieval/EntityCompletionRetriever.py +48 -8
  74. cognee/modules/retrieval/base_graph_retriever.py +3 -1
  75. cognee/modules/retrieval/base_retriever.py +3 -1
  76. cognee/modules/retrieval/chunks_retriever.py +5 -1
  77. cognee/modules/retrieval/code_retriever.py +20 -2
  78. cognee/modules/retrieval/completion_retriever.py +50 -9
  79. cognee/modules/retrieval/cypher_search_retriever.py +11 -1
  80. cognee/modules/retrieval/graph_completion_context_extension_retriever.py +47 -8
  81. cognee/modules/retrieval/graph_completion_cot_retriever.py +32 -1
  82. cognee/modules/retrieval/graph_completion_retriever.py +54 -10
  83. cognee/modules/retrieval/lexical_retriever.py +20 -2
  84. cognee/modules/retrieval/natural_language_retriever.py +10 -1
  85. cognee/modules/retrieval/summaries_retriever.py +5 -1
  86. cognee/modules/retrieval/temporal_retriever.py +62 -10
  87. cognee/modules/retrieval/user_qa_feedback.py +3 -2
  88. cognee/modules/retrieval/utils/completion.py +5 -0
  89. cognee/modules/retrieval/utils/description_to_codepart_search.py +1 -1
  90. cognee/modules/retrieval/utils/session_cache.py +156 -0
  91. cognee/modules/search/methods/get_search_type_tools.py +0 -5
  92. cognee/modules/search/methods/no_access_control_search.py +12 -1
  93. cognee/modules/search/methods/search.py +34 -2
  94. cognee/modules/search/types/SearchType.py +0 -1
  95. cognee/modules/settings/get_settings.py +23 -0
  96. cognee/modules/users/methods/get_authenticated_user.py +3 -1
  97. cognee/modules/users/methods/get_default_user.py +1 -6
  98. cognee/modules/users/roles/methods/create_role.py +2 -2
  99. cognee/modules/users/tenants/methods/create_tenant.py +2 -2
  100. cognee/shared/exceptions/exceptions.py +1 -1
  101. cognee/tasks/codingagents/coding_rule_associations.py +1 -2
  102. cognee/tasks/documents/exceptions/exceptions.py +1 -1
  103. cognee/tasks/graph/extract_graph_from_data.py +2 -0
  104. cognee/tasks/ingestion/data_item_to_text_file.py +3 -3
  105. cognee/tasks/ingestion/ingest_data.py +11 -5
  106. cognee/tasks/ingestion/save_data_item_to_storage.py +12 -1
  107. cognee/tasks/storage/add_data_points.py +3 -10
  108. cognee/tasks/storage/index_data_points.py +19 -14
  109. cognee/tasks/storage/index_graph_edges.py +25 -11
  110. cognee/tasks/web_scraper/__init__.py +34 -0
  111. cognee/tasks/web_scraper/config.py +26 -0
  112. cognee/tasks/web_scraper/default_url_crawler.py +446 -0
  113. cognee/tasks/web_scraper/models.py +46 -0
  114. cognee/tasks/web_scraper/types.py +4 -0
  115. cognee/tasks/web_scraper/utils.py +142 -0
  116. cognee/tasks/web_scraper/web_scraper_task.py +396 -0
  117. cognee/tests/cli_tests/cli_unit_tests/test_cli_utils.py +0 -1
  118. cognee/tests/integration/web_url_crawler/test_default_url_crawler.py +13 -0
  119. cognee/tests/integration/web_url_crawler/test_tavily_crawler.py +19 -0
  120. cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py +344 -0
  121. cognee/tests/subprocesses/reader.py +25 -0
  122. cognee/tests/subprocesses/simple_cognify_1.py +31 -0
  123. cognee/tests/subprocesses/simple_cognify_2.py +31 -0
  124. cognee/tests/subprocesses/writer.py +32 -0
  125. cognee/tests/tasks/descriptive_metrics/metrics_test_utils.py +0 -2
  126. cognee/tests/tasks/descriptive_metrics/neo4j_metrics_test.py +8 -3
  127. cognee/tests/tasks/entity_extraction/entity_extraction_test.py +89 -0
  128. cognee/tests/tasks/web_scraping/web_scraping_test.py +172 -0
  129. cognee/tests/test_add_docling_document.py +56 -0
  130. cognee/tests/test_chromadb.py +7 -11
  131. cognee/tests/test_concurrent_subprocess_access.py +76 -0
  132. cognee/tests/test_conversation_history.py +240 -0
  133. cognee/tests/test_kuzu.py +27 -15
  134. cognee/tests/test_lancedb.py +7 -11
  135. cognee/tests/test_library.py +32 -2
  136. cognee/tests/test_neo4j.py +24 -16
  137. cognee/tests/test_neptune_analytics_vector.py +7 -11
  138. cognee/tests/test_permissions.py +9 -13
  139. cognee/tests/test_pgvector.py +4 -4
  140. cognee/tests/test_remote_kuzu.py +8 -11
  141. cognee/tests/test_s3_file_storage.py +1 -1
  142. cognee/tests/test_search_db.py +6 -8
  143. cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py +89 -0
  144. cognee/tests/unit/modules/retrieval/conversation_history_test.py +154 -0
  145. {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/METADATA +21 -6
  146. {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/RECORD +155 -126
  147. {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/entry_points.txt +1 -0
  148. distributed/Dockerfile +0 -3
  149. distributed/entrypoint.py +21 -9
  150. distributed/signal.py +5 -0
  151. distributed/workers/data_point_saving_worker.py +64 -34
  152. distributed/workers/graph_saving_worker.py +71 -47
  153. cognee/infrastructure/databases/graph/memgraph/memgraph_adapter.py +0 -1116
  154. cognee/modules/retrieval/insights_retriever.py +0 -133
  155. cognee/tests/test_memgraph.py +0 -109
  156. cognee/tests/unit/modules/retrieval/insights_retriever_test.py +0 -251
  157. {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/WHEEL +0 -0
  158. {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/licenses/LICENSE +0 -0
  159. {cognee-0.3.6.dist-info → cognee-0.3.7.dist-info}/licenses/NOTICE.md +0 -0
cognee/__init__.py CHANGED
@@ -19,6 +19,7 @@ from .api.v1.add import add
  from .api.v1.delete import delete
  from .api.v1.cognify import cognify
  from .modules.memify import memify
+ from .api.v1.update import update
  from .api.v1.config.config import config
  from .api.v1.datasets.datasets import datasets
  from .api.v1.prune import prune
cognee/api/health.py CHANGED
@@ -241,16 +241,6 @@ class HealthChecker:
  """Get comprehensive health status."""
  components = {}

- # Critical services
- critical_components = [
- "relational_db",
- "vector_db",
- "graph_db",
- "file_storage",
- "llm_provider",
- "embedding_service",
- ]
-
  critical_checks = [
  ("relational_db", self.check_relational_db()),
  ("vector_db", self.check_vector_db()),
@@ -296,11 +286,11 @@ class HealthChecker:
  else:
  components[name] = result

+ critical_comps = [check[0] for check in critical_checks]
  # Determine overall status
  critical_unhealthy = any(
- comp.status == HealthStatus.UNHEALTHY
+ comp.status == HealthStatus.UNHEALTHY and name in critical_comps
  for name, comp in components.items()
- if name in critical_components
  )

  has_degraded = any(comp.status == HealthStatus.DEGRADED for comp in components.values())
cognee/api/v1/add/add.py CHANGED
@@ -1,6 +1,5 @@
  from uuid import UUID
- from typing import Union, BinaryIO, List, Optional
-
+ from typing import Union, BinaryIO, List, Optional, Any
  from cognee.modules.users.models import User
  from cognee.modules.pipelines import Task, run_pipeline
  from cognee.modules.pipelines.layers.resolve_authorized_user_dataset import (
@@ -11,6 +10,9 @@ from cognee.modules.pipelines.layers.reset_dataset_pipeline_run_status import (
  )
  from cognee.modules.engine.operations.setup import setup
  from cognee.tasks.ingestion import ingest_data, resolve_data_directories
+ from cognee.shared.logging_utils import get_logger
+
+ logger = get_logger()


  async def add(
@@ -21,14 +23,15 @@ async def add(
  vector_db_config: dict = None,
  graph_db_config: dict = None,
  dataset_id: Optional[UUID] = None,
- preferred_loaders: List[str] = None,
+ preferred_loaders: Optional[List[Union[str, dict[str, dict[str, Any]]]]] = None,
  incremental_loading: bool = True,
+ data_per_batch: Optional[int] = 20,
  ):
  """
  Add data to Cognee for knowledge graph processing.

  This is the first step in the Cognee workflow - it ingests raw data and prepares it
- for processing. The function accepts various data formats including text, files, and
+ for processing. The function accepts various data formats including text, files, urls and
  binary streams, then stores them in a specified dataset for further processing.

  Prerequisites:
@@ -68,6 +71,7 @@ async def add(
  - S3 path: "s3://my-bucket/documents/file.pdf"
  - List of mixed types: ["text content", "/path/file.pdf", "file://doc.txt", file_handle]
  - Binary file object: open("file.txt", "rb")
+ - url: A web link url (https or http)
  dataset_name: Name of the dataset to store data in. Defaults to "main_dataset".
  Create separate datasets to organize different knowledge domains.
  user: User object for authentication and permissions. Uses default user if None.
@@ -78,6 +82,9 @@ async def add(
  vector_db_config: Optional configuration for vector database (for custom setups).
  graph_db_config: Optional configuration for graph database (for custom setups).
  dataset_id: Optional specific dataset UUID to use instead of dataset_name.
+ extraction_rules: Optional dictionary of rules (e.g., CSS selectors, XPath) for extracting specific content from web pages using BeautifulSoup
+ tavily_config: Optional configuration for Tavily API, including API key and extraction settings
+ soup_crawler_config: Optional configuration for BeautifulSoup crawler, specifying concurrency, crawl delay, and extraction rules.

  Returns:
  PipelineRunInfo: Information about the ingestion pipeline execution including:
@@ -126,6 +133,21 @@ async def add(

  # Add a single file
  await cognee.add("/home/user/documents/analysis.pdf")
+
+ # Add a single url and bs4 extract ingestion method
+ extraction_rules = {
+ "title": "h1",
+ "description": "p",
+ "more_info": "a[href*='more-info']"
+ }
+ await cognee.add("https://example.com",extraction_rules=extraction_rules)
+
+ # Add a single url and tavily extract ingestion method
+ Make sure to set TAVILY_API_KEY = YOUR_TAVILY_API_KEY as a environment variable
+ await cognee.add("https://example.com")
+
+ # Add multiple urls
+ await cognee.add(["https://example.com","https://books.toscrape.com"])
  ```

  Environment Variables:
@@ -133,17 +155,34 @@ async def add(
  - LLM_API_KEY: API key for your LLM provider (OpenAI, Anthropic, etc.)

  Optional:
- - LLM_PROVIDER: "openai" (default), "anthropic", "gemini", "ollama"
+ - LLM_PROVIDER: "openai" (default), "anthropic", "gemini", "ollama", "mistral"
  - LLM_MODEL: Model name (default: "gpt-5-mini")
  - DEFAULT_USER_EMAIL: Custom default user email
  - DEFAULT_USER_PASSWORD: Custom default user password
  - VECTOR_DB_PROVIDER: "lancedb" (default), "chromadb", "pgvector"
  - GRAPH_DATABASE_PROVIDER: "kuzu" (default), "neo4j"
+ - TAVILY_API_KEY: YOUR_TAVILY_API_KEY

  """
+ if preferred_loaders is not None:
+ transformed = {}
+ for item in preferred_loaders:
+ if isinstance(item, dict):
+ transformed.update(item)
+ else:
+ transformed[item] = {}
+ preferred_loaders = transformed
+
  tasks = [
  Task(resolve_data_directories, include_subdirectories=True),
- Task(ingest_data, dataset_name, user, node_set, dataset_id, preferred_loaders),
+ Task(
+ ingest_data,
+ dataset_name,
+ user,
+ node_set,
+ dataset_id,
+ preferred_loaders,
+ ),
  ]

  await setup()
@@ -167,6 +206,7 @@ async def add(
  vector_db_config=vector_db_config,
  graph_db_config=graph_db_config,
  incremental_loading=incremental_loading,
+ data_per_batch=data_per_batch,
  ):
  pipeline_run_info = run_info
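The hunk above also shows how `preferred_loaders` is normalized: plain loader names become `{name: {}}` entries and dicts are merged in, so both forms can be mixed in one call, and the new `data_per_batch` argument is forwarded to `run_pipeline`. A minimal usage sketch based on that signature (the loader names and option keys below are illustrative assumptions, not taken from this diff):

```python
import asyncio

import cognee


async def main():
    # Loader names may be given as plain strings or as {name: options} dicts;
    # add() flattens them into a single mapping before building the pipeline.
    await cognee.add(
        ["https://example.com", "/home/user/documents/analysis.pdf"],
        dataset_name="main_dataset",
        preferred_loaders=[
            "advanced_pdf_loader",                          # no options (assumed name)
            {"beautiful_soup_loader": {"crawl_delay": 1}},  # with options (assumed keys)
        ],
        data_per_batch=10,  # new in 0.3.7: data items handled per pipeline batch
    )


if __name__ == "__main__":
    asyncio.run(main())
```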
cognee/api/v1/add/routers/get_add_router.py CHANGED
@@ -73,7 +73,11 @@ def get_add_router() -> APIRouter:

  try:
  add_run = await cognee_add(
- data, datasetName, user=user, dataset_id=datasetId, node_set=node_set
+ data,
+ datasetName,
+ user=user,
+ dataset_id=datasetId,
+ node_set=node_set if node_set else None,
  )

  if isinstance(add_run, PipelineRunErrored):
cognee/api/v1/cognify/cognify.py CHANGED
@@ -44,6 +44,7 @@ async def cognify(
  graph_model: BaseModel = KnowledgeGraph,
  chunker=TextChunker,
  chunk_size: int = None,
+ chunks_per_batch: int = None,
  config: Config = None,
  vector_db_config: dict = None,
  graph_db_config: dict = None,
@@ -51,6 +52,7 @@ async def cognify(
  incremental_loading: bool = True,
  custom_prompt: Optional[str] = None,
  temporal_cognify: bool = False,
+ data_per_batch: int = 20,
  ):
  """
  Transform ingested data into a structured knowledge graph.
@@ -105,6 +107,7 @@ async def cognify(
  Formula: min(embedding_max_completion_tokens, llm_max_completion_tokens // 2)
  Default limits: ~512-8192 tokens depending on models.
  Smaller chunks = more granular but potentially fragmented knowledge.
+ chunks_per_batch: Number of chunks to be processed in a single batch in Cognify tasks.
  vector_db_config: Custom vector database configuration for embeddings storage.
  graph_db_config: Custom graph database configuration for relationship storage.
  run_in_background: If True, starts processing asynchronously and returns immediately.
@@ -148,7 +151,7 @@ async def cognify(
  # 2. Get entity relationships and connections
  relationships = await cognee.search(
  "connections between concepts",
- query_type=SearchType.INSIGHTS
+ query_type=SearchType.GRAPH_COMPLETION
  )

  # 3. Find relevant document chunks
@@ -209,10 +212,18 @@ async def cognify(
  }

  if temporal_cognify:
- tasks = await get_temporal_tasks(user, chunker, chunk_size)
+ tasks = await get_temporal_tasks(
+ user=user, chunker=chunker, chunk_size=chunk_size, chunks_per_batch=chunks_per_batch
+ )
  else:
  tasks = await get_default_tasks(
- user, graph_model, chunker, chunk_size, config, custom_prompt
+ user=user,
+ graph_model=graph_model,
+ chunker=chunker,
+ chunk_size=chunk_size,
+ config=config,
+ custom_prompt=custom_prompt,
+ chunks_per_batch=chunks_per_batch,
  )

  # By calling get pipeline executor we get a function that will have the run_pipeline run in the background or a function that we will need to wait for
@@ -228,6 +239,7 @@ async def cognify(
  graph_db_config=graph_db_config,
  incremental_loading=incremental_loading,
  pipeline_name="cognify_pipeline",
+ data_per_batch=data_per_batch,
  )


@@ -238,6 +250,7 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's
  chunk_size: int = None,
  config: Config = None,
  custom_prompt: Optional[str] = None,
+ chunks_per_batch: int = 100,
  ) -> list[Task]:
  if config is None:
  ontology_config = get_ontology_env_config()
@@ -256,6 +269,9 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's
  "ontology_config": {"ontology_resolver": get_default_ontology_resolver()}
  }

+ if chunks_per_batch is None:
+ chunks_per_batch = 100
+
  default_tasks = [
  Task(classify_documents),
  Task(check_permissions_on_dataset, user=user, permissions=["write"]),
@@ -269,20 +285,20 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's
  graph_model=graph_model,
  config=config,
  custom_prompt=custom_prompt,
- task_config={"batch_size": 10},
+ task_config={"batch_size": chunks_per_batch},
  ), # Generate knowledge graphs from the document chunks.
  Task(
  summarize_text,
- task_config={"batch_size": 10},
+ task_config={"batch_size": chunks_per_batch},
  ),
- Task(add_data_points, task_config={"batch_size": 10}),
+ Task(add_data_points, task_config={"batch_size": chunks_per_batch}),
  ]

  return default_tasks


  async def get_temporal_tasks(
- user: User = None, chunker=TextChunker, chunk_size: int = None
+ user: User = None, chunker=TextChunker, chunk_size: int = None, chunks_per_batch: int = 10
  ) -> list[Task]:
  """
  Builds and returns a list of temporal processing tasks to be executed in sequence.
@@ -299,10 +315,14 @@ async def get_temporal_tasks(
  user (User, optional): The user requesting task execution, used for permission checks.
  chunker (Callable, optional): A text chunking function/class to split documents. Defaults to TextChunker.
  chunk_size (int, optional): Maximum token size per chunk. If not provided, uses system default.
+ chunks_per_batch (int, optional): Number of chunks to process in a single batch in Cognify

  Returns:
  list[Task]: A list of Task objects representing the temporal processing pipeline.
  """
+ if chunks_per_batch is None:
+ chunks_per_batch = 10
+
  temporal_tasks = [
  Task(classify_documents),
  Task(check_permissions_on_dataset, user=user, permissions=["write"]),
@@ -311,9 +331,9 @@ async def get_temporal_tasks(
  max_chunk_size=chunk_size or get_max_chunk_tokens(),
  chunker=chunker,
  ),
- Task(extract_events_and_timestamps, task_config={"chunk_size": 10}),
+ Task(extract_events_and_timestamps, task_config={"batch_size": chunks_per_batch}),
  Task(extract_knowledge_graph_from_events),
- Task(add_data_points, task_config={"batch_size": 10}),
+ Task(add_data_points, task_config={"batch_size": chunks_per_batch}),
  ]

  return temporal_tasks
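Taken together, the hunks above let callers tune batching: `chunks_per_batch` replaces the hard-coded `batch_size` of 10 in the extraction, summarization, and `add_data_points` tasks (defaulting to 100 for the standard pipeline and 10 for the temporal one), while `data_per_batch` is passed through to the pipeline executor. A hedged usage sketch:

```python
import asyncio

import cognee


async def main():
    await cognee.add("Cognee turns documents into a knowledge graph.")

    # Larger chunks_per_batch -> fewer, bigger task batches during cognify;
    # data_per_batch controls how many data items each pipeline run handles.
    await cognee.cognify(chunks_per_batch=50, data_per_batch=20)

    # The temporal pipeline accepts the same knob (its own default is 10):
    # await cognee.cognify(temporal_cognify=True, chunks_per_batch=25)


if __name__ == "__main__":
    asyncio.run(main())
```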
cognee/api/v1/datasets/datasets.py CHANGED
@@ -1,4 +1,5 @@
  from uuid import UUID
+ from cognee.modules.data.methods import has_dataset_data
  from cognee.modules.users.methods import get_default_user
  from cognee.modules.ingestion import discover_directory_datasets
  from cognee.modules.pipelines.operations.get_pipeline_status import get_pipeline_status
@@ -26,6 +27,16 @@ class datasets:

  return await get_dataset_data(dataset.id)

+ @staticmethod
+ async def has_data(dataset_id: str) -> bool:
+ from cognee.modules.data.methods import get_dataset
+
+ user = await get_default_user()
+
+ dataset = await get_dataset(user.id, dataset_id)
+
+ return await has_dataset_data(dataset.id)
+
  @staticmethod
  async def get_status(dataset_ids: list[UUID]) -> dict:
  return await get_pipeline_status(dataset_ids, pipeline_name="cognify_pipeline")
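The new `datasets.has_data` helper resolves the dataset for the default user and delegates to `has_dataset_data`, which makes an "ingest only if empty" check straightforward. A small sketch (the dataset id below is a placeholder):

```python
import asyncio

import cognee


async def main():
    # Placeholder id; in practice pass the id of an existing dataset,
    # which has_data() resolves for the default user via get_dataset().
    dataset_id = "00000000-0000-0000-0000-000000000000"

    if await cognee.datasets.has_data(dataset_id):
        print("Dataset already contains data; skipping ingestion.")
    else:
        print("Dataset is empty; run cognee.add() first.")


if __name__ == "__main__":
    asyncio.run(main())
```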
cognee/api/v1/responses/default_tools.py CHANGED
@@ -14,7 +14,6 @@ DEFAULT_TOOLS = [
  "type": "string",
  "description": "Type of search to perform",
  "enum": [
- "INSIGHTS",
  "CODE",
  "GRAPH_COMPLETION",
  "NATURAL_LANGUAGE",
cognee/api/v1/responses/dispatch_function.py CHANGED
@@ -59,7 +59,7 @@ async def handle_search(arguments: Dict[str, Any], user) -> list:
  valid_search_types = (
  search_tool["parameters"]["properties"]["search_type"]["enum"]
  if search_tool
- else ["INSIGHTS", "CODE", "GRAPH_COMPLETION", "NATURAL_LANGUAGE"]
+ else ["CODE", "GRAPH_COMPLETION", "NATURAL_LANGUAGE"]
  )

  if search_type_str not in valid_search_types:
cognee/api/v1/responses/routers/default_tools.py CHANGED
@@ -14,7 +14,6 @@ DEFAULT_TOOLS = [
  "type": "string",
  "description": "Type of search to perform",
  "enum": [
- "INSIGHTS",
  "CODE",
  "GRAPH_COMPLETION",
  "NATURAL_LANGUAGE",
cognee/api/v1/search/search.py CHANGED
@@ -1,6 +1,7 @@
  from uuid import UUID
  from typing import Union, Optional, List, Type

+ from cognee.infrastructure.databases.graph import get_graph_engine
  from cognee.modules.engine.models.node_set import NodeSet
  from cognee.modules.users.models import User
  from cognee.modules.search.types import SearchResult, SearchType, CombinedSearchResult
@@ -8,6 +9,10 @@ from cognee.modules.users.methods import get_default_user
  from cognee.modules.search.methods import search as search_function
  from cognee.modules.data.methods import get_authorized_existing_datasets
  from cognee.modules.data.exceptions import DatasetNotFoundError
+ from cognee.context_global_variables import set_session_user_context_variable
+ from cognee.shared.logging_utils import get_logger
+
+ logger = get_logger()


  async def search(
@@ -25,6 +30,7 @@ async def search(
  last_k: Optional[int] = 1,
  only_context: bool = False,
  use_combined_context: bool = False,
+ session_id: Optional[str] = None,
  ) -> Union[List[SearchResult], CombinedSearchResult]:
  """
  Search and query the knowledge graph for insights, information, and connections.
@@ -52,11 +58,6 @@ async def search(
  Best for: Direct document retrieval, specific fact-finding.
  Returns: LLM responses based on relevant text chunks.

- **INSIGHTS**:
- Structured entity relationships and semantic connections.
- Best for: Understanding concept relationships, knowledge mapping.
- Returns: Formatted relationship data and entity connections.
-
  **CHUNKS**:
  Raw text segments that match the query semantically.
  Best for: Finding specific passages, citations, exact content.
@@ -118,15 +119,14 @@ async def search(

  save_interaction: Save interaction (query, context, answer connected to triplet endpoints) results into the graph or not

+ session_id: Optional session identifier for caching Q&A interactions. Defaults to 'default_session' if None.
+
  Returns:
  list: Search results in format determined by query_type:

  **GRAPH_COMPLETION/RAG_COMPLETION**:
  [List of conversational AI response strings]

- **INSIGHTS**:
- [List of formatted relationship descriptions and entity connections]
-
  **CHUNKS**:
  [List of relevant text passages with source metadata]

@@ -146,7 +146,6 @@ async def search(
  Performance & Optimization:
  - **GRAPH_COMPLETION**: Slower but most intelligent, uses LLM + graph context
  - **RAG_COMPLETION**: Medium speed, uses LLM + document chunks (no graph traversal)
- - **INSIGHTS**: Fast, returns structured relationships without LLM processing
  - **CHUNKS**: Fastest, pure vector similarity search without LLM
  - **SUMMARIES**: Fast, returns pre-computed summaries
  - **CODE**: Medium speed, specialized for code understanding
@@ -177,6 +176,8 @@ async def search(
  if user is None:
  user = await get_default_user()

+ await set_session_user_context_variable(user)
+
  # Transform string based datasets to UUID - String based datasets can only be found for current user
  if datasets is not None and [all(isinstance(dataset, str) for dataset in datasets)]:
  datasets = await get_authorized_existing_datasets(datasets, "read", user)
@@ -198,6 +199,7 @@ async def search(
  last_k=last_k,
  only_context=only_context,
  use_combined_context=use_combined_context,
+ session_id=session_id,
  )

  return filtered_search_results
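With INSIGHTS removed from the public search types and the new `session_id` parameter threaded through to the underlying search method, repeated queries can share a cached Q&A session (falling back to 'default_session' when None, per the docstring above). A sketch of the updated call:

```python
import asyncio

import cognee
from cognee.modules.search.types import SearchType


async def main():
    # session_id groups Q&A interactions for caching; the value is arbitrary.
    results = await cognee.search(
        "What are the main concepts in my documents?",
        query_type=SearchType.GRAPH_COMPLETION,  # INSIGHTS is gone in 0.3.7
        session_id="demo-session-1",
    )

    for result in results:
        print(result)


if __name__ == "__main__":
    asyncio.run(main())
```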
cognee/api/v1/settings/routers/get_settings_router.py CHANGED
@@ -21,7 +21,13 @@ class SettingsDTO(OutDTO):


  class LLMConfigInputDTO(InDTO):
- provider: Union[Literal["openai"], Literal["ollama"], Literal["anthropic"], Literal["gemini"]]
+ provider: Union[
+ Literal["openai"],
+ Literal["ollama"],
+ Literal["anthropic"],
+ Literal["gemini"],
+ Literal["mistral"],
+ ]
  model: str
  api_key: str
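`LLMConfigInputDTO` now accepts "mistral" alongside the existing providers. A sketch of the payload shape implied by the DTO's fields (the model name is an illustrative assumption):

```python
# Shape implied by LLMConfigInputDTO: provider, model, api_key.
llm_settings_payload = {
    "provider": "mistral",  # newly accepted alongside openai/ollama/anthropic/gemini
    "model": "mistral-large-latest",  # assumed model name
    "api_key": "YOUR_MISTRAL_API_KEY",
}
```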
cognee/api/v1/ui/ui.py CHANGED
@@ -502,22 +502,48 @@ def start_ui(
  if start_mcp:
  logger.info("Starting Cognee MCP server with Docker...")
- cwd = os.getcwd()
- env_file = os.path.join(cwd, ".env")
  try:
+ image = "cognee/cognee-mcp:feature-standalone-mcp" # TODO: change to "cognee/cognee-mcp:main" right before merging into main
+ subprocess.run(["docker", "pull", image], check=True)
+
+ import uuid
+
+ container_name = f"cognee-mcp-{uuid.uuid4().hex[:8]}"
+
+ docker_cmd = [
+ "docker",
+ "run",
+ "--name",
+ container_name,
+ "-p",
+ f"{mcp_port}:8000",
+ "--rm",
+ "-e",
+ "TRANSPORT_MODE=sse",
+ ]
+
+ if start_backend:
+ docker_cmd.extend(
+ [
+ "-e",
+ f"API_URL=http://localhost:{backend_port}",
+ ]
+ )
+ logger.info(
+ f"Configuring MCP to connect to backend API at http://localhost:{backend_port}"
+ )
+ logger.info("(localhost will be auto-converted to host.docker.internal)")
+ else:
+ cwd = os.getcwd()
+ env_file = os.path.join(cwd, ".env")
+ docker_cmd.extend(["--env-file", env_file])
+
+ docker_cmd.append(
+ image
+ ) # TODO: change to "cognee/cognee-mcp:main" right before merging into main
+
  mcp_process = subprocess.Popen(
- [
- "docker",
- "run",
- "-p",
- f"{mcp_port}:8000",
- "--rm",
- "--env-file",
- env_file,
- "-e",
- "TRANSPORT_MODE=sse",
- "cognee/cognee-mcp:main",
- ],
+ docker_cmd,
  stdout=subprocess.PIPE,
  stderr=subprocess.PIPE,
  preexec_fn=os.setsid if hasattr(os, "setsid") else None,
@@ -526,8 +552,13 @@ def start_ui(
  _stream_process_output(mcp_process, "stdout", "[MCP]", "\033[34m") # Blue
  _stream_process_output(mcp_process, "stderr", "[MCP]", "\033[34m") # Blue

- pid_callback(mcp_process.pid)
- logger.info(f"✓ Cognee MCP server starting on http://127.0.0.1:{mcp_port}/sse")
+ # Pass both PID and container name using a tuple
+ pid_callback((mcp_process.pid, container_name))
+
+ mode_info = "API mode" if start_backend else "direct mode"
+ logger.info(
+ f"✓ Cognee MCP server starting on http://127.0.0.1:{mcp_port}/sse ({mode_info})"
+ )
  except Exception as e:
  logger.error(f"Failed to start MCP server with Docker: {str(e)}")
  # Start backend server if requested
cognee/api/v1/update/routers/get_update_router.py CHANGED
@@ -75,7 +75,7 @@ def get_update_router() -> APIRouter:
  data=data,
  dataset_id=dataset_id,
  user=user,
- node_set=node_set,
+ node_set=node_set if node_set else None,
  )

  # If any cognify run errored return JSONResponse with proper error status code
cognee/api/v1/update/update.py CHANGED
@@ -1,5 +1,5 @@
  from uuid import UUID
- from typing import Union, BinaryIO, List, Optional
+ from typing import Union, BinaryIO, List, Optional, Any

  from cognee.modules.users.models import User
  from cognee.api.v1.delete import delete
@@ -10,12 +10,12 @@ from cognee.api.v1.cognify import cognify
  async def update(
  data_id: UUID,
  data: Union[BinaryIO, list[BinaryIO], str, list[str]],
+ dataset_id: UUID,
  user: User = None,
  node_set: Optional[List[str]] = None,
- dataset_id: Optional[UUID] = None,
  vector_db_config: dict = None,
  graph_db_config: dict = None,
- preferred_loaders: List[str] = None,
+ preferred_loaders: dict[str, dict[str, Any]] = None,
  incremental_loading: bool = True,
  ):
  """
cognee/cli/_cognee.py CHANGED
@@ -175,19 +175,59 @@ def main() -> int:
  # Handle UI flag
  if hasattr(args, "start_ui") and args.start_ui:
  spawned_pids = []
+ docker_container = None

  def signal_handler(signum, frame):
  """Handle Ctrl+C and other termination signals"""
- nonlocal spawned_pids
- fmt.echo("\nShutting down UI server...")
+ nonlocal spawned_pids, docker_container

+ try:
+ fmt.echo("\nShutting down UI server...")
+ except (BrokenPipeError, OSError):
+ pass
+
+ # First, stop Docker container if running
+ if docker_container:
+ try:
+ result = subprocess.run(
+ ["docker", "stop", docker_container],
+ capture_output=True,
+ timeout=10,
+ check=False,
+ )
+ try:
+ if result.returncode == 0:
+ fmt.success(f"✓ Docker container {docker_container} stopped.")
+ else:
+ fmt.warning(
+ f"Could not stop container {docker_container}: {result.stderr.decode()}"
+ )
+ except (BrokenPipeError, OSError):
+ pass
+ except subprocess.TimeoutExpired:
+ try:
+ fmt.warning(
+ f"Timeout stopping container {docker_container}, forcing removal..."
+ )
+ except (BrokenPipeError, OSError):
+ pass
+ subprocess.run(
+ ["docker", "rm", "-f", docker_container], capture_output=True, check=False
+ )
+ except Exception:
+ pass
+
+ # Then, stop regular processes
  for pid in spawned_pids:
  try:
  if hasattr(os, "killpg"):
  # Unix-like systems: Use process groups
  pgid = os.getpgid(pid)
  os.killpg(pgid, signal.SIGTERM)
- fmt.success(f"✓ Process group {pgid} (PID {pid}) terminated.")
+ try:
+ fmt.success(f"✓ Process group {pgid} (PID {pid}) terminated.")
+ except (BrokenPipeError, OSError):
+ pass
  else:
  # Windows: Use taskkill to terminate process and its children
  subprocess.run(
@@ -195,24 +235,35 @@ def main() -> int:
  capture_output=True,
  check=False,
  )
- fmt.success(f"✓ Process {pid} and its children terminated.")
- except (OSError, ProcessLookupError, subprocess.SubprocessError) as e:
- fmt.warning(f"Could not terminate process {pid}: {e}")
+ try:
+ fmt.success(f"✓ Process {pid} and its children terminated.")
+ except (BrokenPipeError, OSError):
+ pass
+ except (OSError, ProcessLookupError, subprocess.SubprocessError):
+ pass

  sys.exit(0)

  signal.signal(signal.SIGINT, signal_handler) # Ctrl+C
  signal.signal(signal.SIGTERM, signal_handler) # Termination request
+ if hasattr(signal, "SIGHUP"):
+ signal.signal(signal.SIGHUP, signal_handler)

  try:
  from cognee import start_ui

  fmt.echo("Starting cognee UI...")

- # Callback to capture PIDs of all spawned processes
- def pid_callback(pid):
- nonlocal spawned_pids
- spawned_pids.append(pid)
+ # Callback to capture PIDs and Docker container of all spawned processes
+ def pid_callback(pid_or_tuple):
+ nonlocal spawned_pids, docker_container
+ # Handle both regular PIDs and (PID, container_name) tuples
+ if isinstance(pid_or_tuple, tuple):
+ pid, container_name = pid_or_tuple
+ spawned_pids.append(pid)
+ docker_container = container_name
+ else:
+ spawned_pids.append(pid_or_tuple)

  frontend_port = 3000
  start_backend, backend_port = True, 8000