cognee 0.2.2.dev0__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (214)
  1. cognee/api/client.py +41 -3
  2. cognee/api/health.py +332 -0
  3. cognee/api/v1/add/add.py +5 -2
  4. cognee/api/v1/add/routers/get_add_router.py +3 -0
  5. cognee/api/v1/cognify/code_graph_pipeline.py +3 -1
  6. cognee/api/v1/cognify/cognify.py +8 -0
  7. cognee/api/v1/cognify/routers/get_cognify_router.py +8 -1
  8. cognee/api/v1/config/config.py +3 -1
  9. cognee/api/v1/datasets/routers/get_datasets_router.py +1 -7
  10. cognee/api/v1/delete/delete.py +16 -12
  11. cognee/api/v1/responses/routers/get_responses_router.py +3 -1
  12. cognee/api/v1/search/search.py +10 -0
  13. cognee/api/v1/settings/routers/get_settings_router.py +0 -2
  14. cognee/base_config.py +1 -0
  15. cognee/eval_framework/evaluation/direct_llm_eval_adapter.py +5 -6
  16. cognee/infrastructure/databases/graph/config.py +2 -0
  17. cognee/infrastructure/databases/graph/get_graph_engine.py +58 -12
  18. cognee/infrastructure/databases/graph/graph_db_interface.py +15 -10
  19. cognee/infrastructure/databases/graph/kuzu/adapter.py +12 -7
  20. cognee/infrastructure/databases/graph/kuzu/kuzu_migrate.py +1 -1
  21. cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +48 -13
  22. cognee/infrastructure/databases/graph/neptune_driver/__init__.py +15 -0
  23. cognee/infrastructure/databases/graph/neptune_driver/adapter.py +1427 -0
  24. cognee/infrastructure/databases/graph/neptune_driver/exceptions.py +115 -0
  25. cognee/infrastructure/databases/graph/neptune_driver/neptune_utils.py +224 -0
  26. cognee/infrastructure/databases/graph/networkx/adapter.py +3 -3
  27. cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py +449 -0
  28. cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +1 -0
  29. cognee/infrastructure/databases/vector/chromadb/ChromaDBAdapter.py +8 -3
  30. cognee/infrastructure/databases/vector/create_vector_engine.py +31 -15
  31. cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +3 -1
  32. cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +21 -6
  33. cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +4 -3
  34. cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py +3 -1
  35. cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +22 -16
  36. cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py +36 -34
  37. cognee/infrastructure/databases/vector/vector_db_interface.py +78 -7
  38. cognee/infrastructure/files/utils/get_data_file_path.py +39 -0
  39. cognee/infrastructure/files/utils/guess_file_type.py +2 -2
  40. cognee/infrastructure/files/utils/open_data_file.py +4 -23
  41. cognee/infrastructure/llm/LLMGateway.py +137 -0
  42. cognee/infrastructure/llm/__init__.py +14 -4
  43. cognee/infrastructure/llm/config.py +29 -1
  44. cognee/infrastructure/llm/prompts/answer_hotpot_question.txt +1 -1
  45. cognee/infrastructure/llm/prompts/answer_hotpot_using_cognee_search.txt +1 -1
  46. cognee/infrastructure/llm/prompts/answer_simple_question.txt +1 -1
  47. cognee/infrastructure/llm/prompts/answer_simple_question_restricted.txt +1 -1
  48. cognee/infrastructure/llm/prompts/categorize_categories.txt +1 -1
  49. cognee/infrastructure/llm/prompts/classify_content.txt +1 -1
  50. cognee/infrastructure/llm/prompts/context_for_question.txt +1 -1
  51. cognee/infrastructure/llm/prompts/graph_context_for_question.txt +1 -1
  52. cognee/infrastructure/llm/prompts/natural_language_retriever_system.txt +1 -1
  53. cognee/infrastructure/llm/prompts/patch_gen_instructions.txt +1 -1
  54. cognee/infrastructure/llm/prompts/search_type_selector_prompt.txt +130 -0
  55. cognee/infrastructure/llm/prompts/summarize_code.txt +2 -2
  56. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/__init__.py +57 -0
  57. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/async_client.py +533 -0
  58. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/config.py +94 -0
  59. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/globals.py +37 -0
  60. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/inlinedbaml.py +21 -0
  61. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/parser.py +131 -0
  62. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/runtime.py +266 -0
  63. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/stream_types.py +137 -0
  64. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/sync_client.py +550 -0
  65. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/tracing.py +26 -0
  66. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_builder.py +962 -0
  67. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_map.py +52 -0
  68. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/types.py +166 -0
  69. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_categories.baml +109 -0
  70. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_content_graph.baml +343 -0
  71. cognee/{modules/data → infrastructure/llm/structured_output_framework/baml/baml_src}/extraction/__init__.py +1 -0
  72. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/extract_summary.py +89 -0
  73. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/knowledge_graph/extract_content_graph.py +33 -0
  74. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/generators.baml +18 -0
  75. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/__init__.py +3 -0
  76. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/extract_categories.py +12 -0
  77. cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/extract_summary.py +16 -7
  78. cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/knowledge_graph/extract_content_graph.py +7 -6
  79. cognee/infrastructure/llm/{anthropic → structured_output_framework/litellm_instructor/llm/anthropic}/adapter.py +10 -4
  80. cognee/infrastructure/llm/{gemini → structured_output_framework/litellm_instructor/llm/gemini}/adapter.py +6 -5
  81. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/__init__.py +0 -0
  82. cognee/infrastructure/llm/{generic_llm_api → structured_output_framework/litellm_instructor/llm/generic_llm_api}/adapter.py +7 -3
  83. cognee/infrastructure/llm/{get_llm_client.py → structured_output_framework/litellm_instructor/llm/get_llm_client.py} +18 -6
  84. cognee/infrastructure/llm/{llm_interface.py → structured_output_framework/litellm_instructor/llm/llm_interface.py} +2 -2
  85. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/__init__.py +0 -0
  86. cognee/infrastructure/llm/{ollama → structured_output_framework/litellm_instructor/llm/ollama}/adapter.py +4 -2
  87. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/__init__.py +0 -0
  88. cognee/infrastructure/llm/{openai → structured_output_framework/litellm_instructor/llm/openai}/adapter.py +6 -4
  89. cognee/infrastructure/llm/{rate_limiter.py → structured_output_framework/litellm_instructor/llm/rate_limiter.py} +0 -5
  90. cognee/infrastructure/llm/tokenizer/Gemini/adapter.py +4 -2
  91. cognee/infrastructure/llm/tokenizer/TikToken/adapter.py +7 -3
  92. cognee/infrastructure/llm/tokenizer/__init__.py +4 -0
  93. cognee/infrastructure/llm/utils.py +3 -1
  94. cognee/infrastructure/loaders/LoaderEngine.py +156 -0
  95. cognee/infrastructure/loaders/LoaderInterface.py +73 -0
  96. cognee/infrastructure/loaders/__init__.py +18 -0
  97. cognee/infrastructure/loaders/core/__init__.py +7 -0
  98. cognee/infrastructure/loaders/core/audio_loader.py +98 -0
  99. cognee/infrastructure/loaders/core/image_loader.py +114 -0
  100. cognee/infrastructure/loaders/core/text_loader.py +90 -0
  101. cognee/infrastructure/loaders/create_loader_engine.py +32 -0
  102. cognee/infrastructure/loaders/external/__init__.py +22 -0
  103. cognee/infrastructure/loaders/external/pypdf_loader.py +96 -0
  104. cognee/infrastructure/loaders/external/unstructured_loader.py +127 -0
  105. cognee/infrastructure/loaders/get_loader_engine.py +18 -0
  106. cognee/infrastructure/loaders/supported_loaders.py +18 -0
  107. cognee/infrastructure/loaders/use_loader.py +21 -0
  108. cognee/infrastructure/loaders/utils/__init__.py +0 -0
  109. cognee/modules/data/methods/__init__.py +1 -0
  110. cognee/modules/data/methods/get_authorized_dataset.py +23 -0
  111. cognee/modules/data/models/Data.py +11 -1
  112. cognee/modules/data/processing/document_types/AudioDocument.py +2 -2
  113. cognee/modules/data/processing/document_types/ImageDocument.py +2 -2
  114. cognee/modules/data/processing/document_types/PdfDocument.py +4 -11
  115. cognee/modules/engine/utils/generate_edge_id.py +5 -0
  116. cognee/modules/graph/cognee_graph/CogneeGraph.py +9 -18
  117. cognee/modules/graph/methods/get_formatted_graph_data.py +7 -1
  118. cognee/modules/graph/utils/get_graph_from_model.py +93 -101
  119. cognee/modules/ingestion/data_types/TextData.py +8 -2
  120. cognee/modules/ingestion/save_data_to_file.py +1 -1
  121. cognee/modules/pipelines/exceptions/__init__.py +1 -0
  122. cognee/modules/pipelines/exceptions/exceptions.py +12 -0
  123. cognee/modules/pipelines/models/DataItemStatus.py +5 -0
  124. cognee/modules/pipelines/models/PipelineRunInfo.py +6 -0
  125. cognee/modules/pipelines/models/__init__.py +1 -0
  126. cognee/modules/pipelines/operations/pipeline.py +10 -2
  127. cognee/modules/pipelines/operations/run_tasks.py +251 -19
  128. cognee/modules/retrieval/code_retriever.py +3 -5
  129. cognee/modules/retrieval/completion_retriever.py +1 -1
  130. cognee/modules/retrieval/context_providers/TripletSearchContextProvider.py +0 -2
  131. cognee/modules/retrieval/graph_completion_context_extension_retriever.py +0 -2
  132. cognee/modules/retrieval/graph_completion_cot_retriever.py +8 -9
  133. cognee/modules/retrieval/natural_language_retriever.py +3 -5
  134. cognee/modules/retrieval/utils/completion.py +6 -9
  135. cognee/modules/retrieval/utils/description_to_codepart_search.py +2 -3
  136. cognee/modules/search/methods/search.py +5 -1
  137. cognee/modules/search/operations/__init__.py +1 -0
  138. cognee/modules/search/operations/select_search_type.py +42 -0
  139. cognee/modules/search/types/SearchType.py +1 -0
  140. cognee/modules/settings/get_settings.py +0 -4
  141. cognee/modules/settings/save_vector_db_config.py +1 -1
  142. cognee/shared/data_models.py +3 -1
  143. cognee/shared/logging_utils.py +0 -5
  144. cognee/tasks/chunk_naive_llm_classifier/chunk_naive_llm_classifier.py +2 -2
  145. cognee/tasks/documents/extract_chunks_from_documents.py +10 -12
  146. cognee/tasks/entity_completion/entity_extractors/llm_entity_extractor.py +4 -6
  147. cognee/tasks/graph/cascade_extract/utils/extract_content_nodes_and_relationship_names.py +4 -6
  148. cognee/tasks/graph/cascade_extract/utils/extract_edge_triplets.py +6 -7
  149. cognee/tasks/graph/cascade_extract/utils/extract_nodes.py +4 -7
  150. cognee/tasks/graph/extract_graph_from_code.py +3 -2
  151. cognee/tasks/graph/extract_graph_from_data.py +4 -3
  152. cognee/tasks/graph/infer_data_ontology.py +5 -6
  153. cognee/tasks/ingestion/data_item_to_text_file.py +79 -0
  154. cognee/tasks/ingestion/ingest_data.py +91 -61
  155. cognee/tasks/ingestion/resolve_data_directories.py +3 -0
  156. cognee/tasks/repo_processor/get_repo_file_dependencies.py +3 -0
  157. cognee/tasks/storage/index_data_points.py +1 -1
  158. cognee/tasks/storage/index_graph_edges.py +4 -1
  159. cognee/tasks/summarization/summarize_code.py +2 -3
  160. cognee/tasks/summarization/summarize_text.py +3 -2
  161. cognee/tests/test_cognee_server_start.py +12 -7
  162. cognee/tests/test_deduplication.py +2 -2
  163. cognee/tests/test_deletion.py +58 -17
  164. cognee/tests/test_graph_visualization_permissions.py +161 -0
  165. cognee/tests/test_neptune_analytics_graph.py +309 -0
  166. cognee/tests/test_neptune_analytics_hybrid.py +176 -0
  167. cognee/tests/{test_qdrant.py → test_neptune_analytics_vector.py} +86 -16
  168. cognee/tests/test_pgvector.py +5 -5
  169. cognee/tests/test_s3.py +1 -6
  170. cognee/tests/unit/infrastructure/databases/test_rate_limiter.py +11 -10
  171. cognee/tests/unit/infrastructure/databases/vector/__init__.py +0 -0
  172. cognee/tests/unit/infrastructure/mock_embedding_engine.py +1 -1
  173. cognee/tests/unit/infrastructure/test_embedding_rate_limiting_realistic.py +5 -5
  174. cognee/tests/unit/infrastructure/test_rate_limiting_realistic.py +6 -4
  175. cognee/tests/unit/infrastructure/test_rate_limiting_retry.py +1 -1
  176. cognee/tests/unit/interfaces/graph/get_graph_from_model_unit_test.py +61 -3
  177. cognee/tests/unit/modules/search/search_methods_test.py +55 -0
  178. {cognee-0.2.2.dev0.dist-info → cognee-0.2.3.dist-info}/METADATA +12 -6
  179. {cognee-0.2.2.dev0.dist-info → cognee-0.2.3.dist-info}/RECORD +195 -156
  180. cognee/infrastructure/databases/vector/pinecone/adapter.py +0 -8
  181. cognee/infrastructure/databases/vector/qdrant/QDrantAdapter.py +0 -514
  182. cognee/infrastructure/databases/vector/qdrant/__init__.py +0 -2
  183. cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py +0 -527
  184. cognee/infrastructure/databases/vector/weaviate_db/__init__.py +0 -1
  185. cognee/modules/data/extraction/extract_categories.py +0 -14
  186. distributed/Dockerfile +0 -34
  187. distributed/app.py +0 -4
  188. distributed/entrypoint.py +0 -71
  189. distributed/entrypoint.sh +0 -5
  190. distributed/modal_image.py +0 -11
  191. distributed/queues.py +0 -5
  192. distributed/tasks/queued_add_data_points.py +0 -13
  193. distributed/tasks/queued_add_edges.py +0 -13
  194. distributed/tasks/queued_add_nodes.py +0 -13
  195. distributed/test.py +0 -28
  196. distributed/utils.py +0 -19
  197. distributed/workers/data_point_saving_worker.py +0 -93
  198. distributed/workers/graph_saving_worker.py +0 -104
  199. /cognee/infrastructure/databases/{graph/memgraph → hybrid/neptune_analytics}/__init__.py +0 -0
  200. /cognee/infrastructure/{llm → databases/vector/embeddings}/embedding_rate_limiter.py +0 -0
  201. /cognee/infrastructure/{databases/vector/pinecone → llm/structured_output_framework}/__init__.py +0 -0
  202. /cognee/infrastructure/llm/{anthropic → structured_output_framework/baml/baml_src}/__init__.py +0 -0
  203. /cognee/infrastructure/llm/{gemini/__init__.py → structured_output_framework/baml/baml_src/extraction/extract_categories.py} +0 -0
  204. /cognee/infrastructure/llm/{generic_llm_api → structured_output_framework/baml/baml_src/extraction/knowledge_graph}/__init__.py +0 -0
  205. /cognee/infrastructure/llm/{ollama → structured_output_framework/litellm_instructor}/__init__.py +0 -0
  206. /cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/knowledge_graph/__init__.py +0 -0
  207. /cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/texts.json +0 -0
  208. /cognee/infrastructure/llm/{openai → structured_output_framework/litellm_instructor/llm}/__init__.py +0 -0
  209. {distributed → cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic}/__init__.py +0 -0
  210. {distributed/tasks → cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini}/__init__.py +0 -0
  211. /cognee/modules/data/{extraction/knowledge_graph → methods}/add_model_class_to_graph.py +0 -0
  212. {cognee-0.2.2.dev0.dist-info → cognee-0.2.3.dist-info}/WHEEL +0 -0
  213. {cognee-0.2.2.dev0.dist-info → cognee-0.2.3.dist-info}/licenses/LICENSE +0 -0
  214. {cognee-0.2.2.dev0.dist-info → cognee-0.2.3.dist-info}/licenses/NOTICE.md +0 -0
cognee/modules/pipelines/operations/run_tasks.py
@@ -1,21 +1,31 @@
 import os
+
+import asyncio
 from uuid import UUID
-from typing import Any
+from typing import Any, List
 from functools import wraps
+from sqlalchemy import select
 
+import cognee.modules.ingestion as ingestion
 from cognee.infrastructure.databases.graph import get_graph_engine
 from cognee.infrastructure.databases.relational import get_relational_engine
 from cognee.modules.pipelines.operations.run_tasks_distributed import run_tasks_distributed
 from cognee.modules.users.models import User
+from cognee.modules.data.models import Data
+from cognee.infrastructure.files.utils.open_data_file import open_data_file
 from cognee.shared.logging_utils import get_logger
 from cognee.modules.users.methods import get_default_user
 from cognee.modules.pipelines.utils import generate_pipeline_id
+from cognee.modules.pipelines.exceptions import PipelineRunFailedError
+from cognee.tasks.ingestion import save_data_item_to_storage, resolve_data_directories
 from cognee.modules.pipelines.models.PipelineRunInfo import (
     PipelineRunCompleted,
     PipelineRunErrored,
     PipelineRunStarted,
     PipelineRunYield,
+    PipelineRunAlreadyCompleted,
 )
+from cognee.modules.pipelines.models.DataItemStatus import DataItemStatus
 
 from cognee.modules.pipelines.operations import (
     log_pipeline_run_start,
@@ -50,13 +60,184 @@ def override_run_tasks(new_gen):
 
 @override_run_tasks(run_tasks_distributed)
 async def run_tasks(
-    tasks: list[Task],
+    tasks: List[Task],
     dataset_id: UUID,
-    data: Any = None,
+    data: List[Any] = None,
     user: User = None,
     pipeline_name: str = "unknown_pipeline",
     context: dict = None,
+    incremental_loading: bool = False,
 ):
+    async def _run_tasks_data_item_incremental(
+        data_item,
+        dataset,
+        tasks,
+        pipeline_name,
+        pipeline_id,
+        pipeline_run_id,
+        context,
+        user,
+    ):
+        db_engine = get_relational_engine()
+        # If incremental_loading of data is set to True don't process documents already processed by pipeline
+        # If data is being added to Cognee for the first time calculate the id of the data
+        if not isinstance(data_item, Data):
+            file_path = await save_data_item_to_storage(data_item)
+            # Ingest data and add metadata
+            async with open_data_file(file_path) as file:
+                classified_data = ingestion.classify(file)
+            # data_id is the hash of file contents + owner id to avoid duplicate data
+            data_id = ingestion.identify(classified_data, user)
+        else:
+            # If data was already processed by Cognee get data id
+            data_id = data_item.id
+
+        # Check pipeline status, if Data already processed for pipeline before skip current processing
+        async with db_engine.get_async_session() as session:
+            data_point = (
+                await session.execute(select(Data).filter(Data.id == data_id))
+            ).scalar_one_or_none()
+            if data_point:
+                if (
+                    data_point.pipeline_status.get(pipeline_name, {}).get(str(dataset.id))
+                    == DataItemStatus.DATA_ITEM_PROCESSING_COMPLETED
+                ):
+                    yield {
+                        "run_info": PipelineRunAlreadyCompleted(
+                            pipeline_run_id=pipeline_run_id,
+                            dataset_id=dataset.id,
+                            dataset_name=dataset.name,
+                        ),
+                        "data_id": data_id,
+                    }
+                    return
+
+        try:
+            # Process data based on data_item and list of tasks
+            async for result in run_tasks_with_telemetry(
+                tasks=tasks,
+                data=[data_item],
+                user=user,
+                pipeline_name=pipeline_id,
+                context=context,
+            ):
+                yield PipelineRunYield(
+                    pipeline_run_id=pipeline_run_id,
+                    dataset_id=dataset.id,
+                    dataset_name=dataset.name,
+                    payload=result,
+                )
+
+            # Update pipeline status for Data element
+            async with db_engine.get_async_session() as session:
+                data_point = (
+                    await session.execute(select(Data).filter(Data.id == data_id))
+                ).scalar_one_or_none()
+                data_point.pipeline_status[pipeline_name] = {
+                    str(dataset.id): DataItemStatus.DATA_ITEM_PROCESSING_COMPLETED
+                }
+                await session.merge(data_point)
+                await session.commit()
+
+            yield {
+                "run_info": PipelineRunCompleted(
+                    pipeline_run_id=pipeline_run_id,
+                    dataset_id=dataset.id,
+                    dataset_name=dataset.name,
+                ),
+                "data_id": data_id,
+            }
+
+        except Exception as error:
+            # Temporarily swallow error and try to process rest of documents first, then re-raise error at end of data ingestion pipeline
+            logger.error(
+                f"Exception caught while processing data: {error}.\n Data processing failed for data item: {data_item}."
+            )
+            yield {
+                "run_info": PipelineRunErrored(
+                    pipeline_run_id=pipeline_run_id,
+                    payload=repr(error),
+                    dataset_id=dataset.id,
+                    dataset_name=dataset.name,
+                ),
+                "data_id": data_id,
+            }
+
+            if os.getenv("RAISE_INCREMENTAL_LOADING_ERRORS", "true").lower() == "true":
+                raise error
+
+    async def _run_tasks_data_item_regular(
+        data_item,
+        dataset,
+        tasks,
+        pipeline_id,
+        pipeline_run_id,
+        context,
+        user,
+    ):
+        # Process data based on data_item and list of tasks
+        async for result in run_tasks_with_telemetry(
+            tasks=tasks,
+            data=[data_item],
+            user=user,
+            pipeline_name=pipeline_id,
+            context=context,
+        ):
+            yield PipelineRunYield(
+                pipeline_run_id=pipeline_run_id,
+                dataset_id=dataset.id,
+                dataset_name=dataset.name,
+                payload=result,
+            )
+
+        yield {
+            "run_info": PipelineRunCompleted(
+                pipeline_run_id=pipeline_run_id,
+                dataset_id=dataset.id,
+                dataset_name=dataset.name,
+            )
+        }
+
+    async def _run_tasks_data_item(
+        data_item,
+        dataset,
+        tasks,
+        pipeline_name,
+        pipeline_id,
+        pipeline_run_id,
+        context,
+        user,
+        incremental_loading,
+    ):
+        # Go through async generator and return data item processing result. Result can be PipelineRunAlreadyCompleted when data item is skipped,
+        # PipelineRunCompleted when processing was successful and PipelineRunErrored if there were issues
+        result = None
+        if incremental_loading:
+            async for result in _run_tasks_data_item_incremental(
+                data_item=data_item,
+                dataset=dataset,
+                tasks=tasks,
+                pipeline_name=pipeline_name,
+                pipeline_id=pipeline_id,
+                pipeline_run_id=pipeline_run_id,
+                context=context,
+                user=user,
+            ):
+                pass
+        else:
+            async for result in _run_tasks_data_item_regular(
+                data_item=data_item,
+                dataset=dataset,
+                tasks=tasks,
+                pipeline_id=pipeline_id,
+                pipeline_run_id=pipeline_run_id,
+                context=context,
+                user=user,
+            ):
+                pass
+
+        return result
+
     if not user:
         user = await get_default_user()
 
@@ -68,9 +249,7 @@ async def run_tasks(
         dataset = await session.get(Dataset, dataset_id)
 
     pipeline_id = generate_pipeline_id(user.id, dataset.id, pipeline_name)
-
     pipeline_run = await log_pipeline_run_start(pipeline_id, pipeline_name, dataset_id, data)
-
     pipeline_run_id = pipeline_run.pipeline_run_id
 
     yield PipelineRunStarted(
@@ -81,18 +260,65 @@
     )
 
     try:
-        async for result in run_tasks_with_telemetry(
-            tasks=tasks,
-            data=data,
-            user=user,
-            pipeline_name=pipeline_id,
-            context=context,
-        ):
-            yield PipelineRunYield(
-                pipeline_run_id=pipeline_run_id,
-                dataset_id=dataset.id,
-                dataset_name=dataset.name,
-                payload=result,
+        if not isinstance(data, list):
+            data = [data]
+
+        if incremental_loading:
+            data = await resolve_data_directories(data)
+
+        # TODO: Return to using async.gather for data items after Cognee release
+        # # Create async tasks per data item that will run the pipeline for the data item
+        # data_item_tasks = [
+        #     asyncio.create_task(
+        #         _run_tasks_data_item(
+        #             data_item,
+        #             dataset,
+        #             tasks,
+        #             pipeline_name,
+        #             pipeline_id,
+        #             pipeline_run_id,
+        #             context,
+        #             user,
+        #             incremental_loading,
+        #         )
+        #     )
+        #     for data_item in data
+        # ]
+        # results = await asyncio.gather(*data_item_tasks)
+        # # Remove skipped data items from results
+        # results = [result for result in results if result]
+
+        ### TEMP sync data item handling
+        results = []
+        # Run the pipeline for each data_item sequentially, one after the other
+        for data_item in data:
+            result = await _run_tasks_data_item(
+                data_item,
+                dataset,
+                tasks,
+                pipeline_name,
+                pipeline_id,
+                pipeline_run_id,
+                context,
+                user,
+                incremental_loading,
+            )
+
+            # Skip items that returned a false-y value
+            if result:
+                results.append(result)
+        ### END
+
+        # Remove skipped data items from results
+        results = [result for result in results if result]
+
+        # If any data item could not be processed propagate error
+        errored_results = [
+            result for result in results if isinstance(result["run_info"], PipelineRunErrored)
+        ]
+        if errored_results:
+            raise PipelineRunFailedError(
+                message="Pipeline run failed. Data item could not be processed."
             )
 
         await log_pipeline_run_complete(
@@ -103,6 +329,7 @@ async def run_tasks(
             pipeline_run_id=pipeline_run_id,
             dataset_id=dataset.id,
            dataset_name=dataset.name,
+            data_ingestion_info=results,
         )
 
         graph_engine = await get_graph_engine()
@@ -120,9 +347,14 @@
 
         yield PipelineRunErrored(
             pipeline_run_id=pipeline_run_id,
-            payload=error,
+            payload=repr(error),
             dataset_id=dataset.id,
             dataset_name=dataset.name,
+            data_ingestion_info=locals().get(
+                "results"
+            ),  # Returns results if they exist or returns None
         )
 
-        raise error
+        # In case of error during incremental loading of data just let the user know the pipeline Errored, don't raise error
+        if not isinstance(error, PipelineRunFailedError):
+            raise error
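
The rework above changes the calling contract: `data` is normalized to a list, each item runs through the pipeline individually, and the per-item outcomes (`PipelineRunAlreadyCompleted`, `PipelineRunCompleted`, `PipelineRunErrored`) are collected into `data_ingestion_info` on the terminal run event. A minimal caller sketch follows; the task list, documents, and dataset id are placeholder inputs, and only the `incremental_loading` keyword and the run-info types come from this diff:

```python
# Sketch: driving the reworked run_tasks generator with incremental loading.
from cognee.modules.pipelines.operations.run_tasks import run_tasks


async def ingest_incrementally(tasks, documents, dataset_id):
    async for run_info in run_tasks(
        tasks=tasks,
        dataset_id=dataset_id,
        data=documents,
        pipeline_name="cognify_pipeline",
        incremental_loading=True,
    ):
        # Yields PipelineRunStarted first, then a terminal PipelineRunCompleted
        # or PipelineRunErrored whose data_ingestion_info lists the per-item
        # results; already-ingested items appear as PipelineRunAlreadyCompleted.
        print(type(run_info).__name__)
```

Note the error semantics: a failing item no longer aborts the run mid-stream. The remaining items are processed first, the failure is recorded as `PipelineRunErrored` in the results, and the resulting `PipelineRunFailedError` is deliberately not re-raised to the caller.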

cognee/modules/retrieval/code_retriever.py
@@ -7,8 +7,7 @@ from cognee.shared.logging_utils import get_logger
 from cognee.modules.retrieval.base_retriever import BaseRetriever
 from cognee.infrastructure.databases.graph import get_graph_engine
 from cognee.infrastructure.databases.vector import get_vector_engine
-from cognee.infrastructure.llm.get_llm_client import get_llm_client
-from cognee.infrastructure.llm.prompts import read_query_prompt
+from cognee.infrastructure.llm.LLMGateway import LLMGateway
 
 logger = get_logger("CodeRetriever")
 
@@ -42,11 +41,10 @@ class CodeRetriever(BaseRetriever):
             f"Processing query with LLM: '{query[:100]}{'...' if len(query) > 100 else ''}'"
         )
 
-        system_prompt = read_query_prompt("codegraph_retriever_system.txt")
-        llm_client = get_llm_client()
+        system_prompt = LLMGateway.read_query_prompt("codegraph_retriever_system.txt")
 
         try:
-            result = await llm_client.acreate_structured_output(
+            result = await LLMGateway.acreate_structured_output(
                 text_input=query,
                 system_prompt=system_prompt,
                 response_model=self.CodeQueryInfo,
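
This hunk sets the pattern for the rest of the retriever changes below: per-call `get_llm_client()` plus the module-level prompt helpers are folded into the new static `LLMGateway` facade (added in `cognee/infrastructure/llm/LLMGateway.py`, +137 lines, file 41 above). A before/after sketch of the substitution, with `response_model` standing in for whatever Pydantic model the caller needs:

```python
# Before (0.2.2.dev0): resolve a client, read prompts via module helpers.
#   from cognee.infrastructure.llm.get_llm_client import get_llm_client
#   from cognee.infrastructure.llm.prompts import read_query_prompt
#
#   llm_client = get_llm_client()
#   result = await llm_client.acreate_structured_output(...)

# After (0.2.3): one facade owns client resolution and the prompt helpers.
from cognee.infrastructure.llm.LLMGateway import LLMGateway


async def structured_answer(query: str, response_model):
    system_prompt = LLMGateway.read_query_prompt("codegraph_retriever_system.txt")
    return await LLMGateway.acreate_structured_output(
        text_input=query,
        system_prompt=system_prompt,
        response_model=response_model,
    )
```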

cognee/modules/retrieval/completion_retriever.py
@@ -90,4 +90,4 @@ class CompletionRetriever(BaseRetriever):
         completion = await generate_completion(
             query, context, self.user_prompt_path, self.system_prompt_path
         )
-        return completion
+        return [completion]
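
A small but breaking detail: `CompletionRetriever.get_completion` now wraps its answer in a list, aligning with the other retrievers that return `List[str]`. A hedged caller-side sketch, with constructor defaults assumed:

```python
# Sketch: adapting to the list-valued return (constructor details omitted).
from cognee.modules.retrieval.completion_retriever import CompletionRetriever


async def first_answer(query: str) -> str:
    retriever = CompletionRetriever()
    results = await retriever.get_completion(query)
    return results[0]  # 0.2.2.dev0 returned the bare string instead
```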

cognee/modules/retrieval/context_providers/TripletSearchContextProvider.py
@@ -4,8 +4,6 @@ import asyncio
 from cognee.infrastructure.context.BaseContextProvider import BaseContextProvider
 from cognee.infrastructure.engine import DataPoint
 from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph
-from cognee.infrastructure.llm.get_llm_client import get_llm_client
-from cognee.infrastructure.llm.prompts import read_query_prompt
 from cognee.modules.retrieval.utils.brute_force_triplet_search import (
     brute_force_triplet_search,
     format_triplets,

cognee/modules/retrieval/graph_completion_context_extension_retriever.py
@@ -1,9 +1,7 @@
 from typing import Any, Optional, List, Type
 from cognee.shared.logging_utils import get_logger
-from cognee.infrastructure.llm.get_llm_client import get_llm_client
 from cognee.modules.retrieval.graph_completion_retriever import GraphCompletionRetriever
 from cognee.modules.retrieval.utils.completion import generate_completion
-from cognee.infrastructure.llm.prompts import read_query_prompt, render_prompt
 
 logger = get_logger()
 

cognee/modules/retrieval/graph_completion_cot_retriever.py
@@ -1,9 +1,9 @@
 from typing import Any, Optional, List, Type
 from cognee.shared.logging_utils import get_logger
-from cognee.infrastructure.llm.get_llm_client import get_llm_client
+
 from cognee.modules.retrieval.graph_completion_retriever import GraphCompletionRetriever
 from cognee.modules.retrieval.utils.completion import generate_completion
-from cognee.infrastructure.llm.prompts import read_query_prompt, render_prompt
+from cognee.infrastructure.llm.LLMGateway import LLMGateway
 
 logger = get_logger()
 
@@ -73,7 +73,6 @@ class GraphCompletionCotRetriever(GraphCompletionRetriever):
 
         - List[str]: A list containing the generated answer to the user's query.
         """
-        llm_client = get_llm_client()
         followup_question = ""
         triplets = []
         answer = [""]
@@ -95,27 +94,27 @@ class GraphCompletionCotRetriever(GraphCompletionRetriever):
             logger.info(f"Chain-of-thought: round {round_idx} - answer: {answer}")
             if round_idx < max_iter:
                 valid_args = {"query": query, "answer": answer, "context": context}
-                valid_user_prompt = render_prompt(
+                valid_user_prompt = LLMGateway.render_prompt(
                     filename=self.validation_user_prompt_path, context=valid_args
                 )
-                valid_system_prompt = read_query_prompt(
+                valid_system_prompt = LLMGateway.read_query_prompt(
                     prompt_file_name=self.validation_system_prompt_path
                 )
 
-                reasoning = await llm_client.acreate_structured_output(
+                reasoning = await LLMGateway.acreate_structured_output(
                     text_input=valid_user_prompt,
                     system_prompt=valid_system_prompt,
                     response_model=str,
                 )
                 followup_args = {"query": query, "answer": answer, "reasoning": reasoning}
-                followup_prompt = render_prompt(
+                followup_prompt = LLMGateway.render_prompt(
                     filename=self.followup_user_prompt_path, context=followup_args
                 )
-                followup_system = read_query_prompt(
+                followup_system = LLMGateway.read_query_prompt(
                     prompt_file_name=self.followup_system_prompt_path
                 )
 
-                followup_question = await llm_client.acreate_structured_output(
+                followup_question = await LLMGateway.acreate_structured_output(
                     text_input=followup_prompt, system_prompt=followup_system, response_model=str
                 )
                 logger.info(

cognee/modules/retrieval/natural_language_retriever.py
@@ -2,8 +2,7 @@ from typing import Any, Optional
 from cognee.shared.logging_utils import get_logger
 from cognee.infrastructure.databases.graph import get_graph_engine
 from cognee.infrastructure.databases.graph.networkx.adapter import NetworkXAdapter
-from cognee.infrastructure.llm.get_llm_client import get_llm_client
-from cognee.infrastructure.llm.prompts import render_prompt
+from cognee.infrastructure.llm.LLMGateway import LLMGateway
 from cognee.modules.retrieval.base_retriever import BaseRetriever
 from cognee.modules.retrieval.exceptions import SearchTypeNotSupported
 from cognee.infrastructure.databases.graph.graph_db_interface import GraphDBInterface
@@ -51,8 +50,7 @@ class NaturalLanguageRetriever(BaseRetriever):
 
     async def _generate_cypher_query(self, query: str, edge_schemas, previous_attempts=None) -> str:
         """Generate a Cypher query using LLM based on natural language query and schema information."""
-        llm_client = get_llm_client()
-        system_prompt = render_prompt(
+        system_prompt = LLMGateway.render_prompt(
            self.system_prompt_path,
             context={
                 "edge_schemas": edge_schemas,
@@ -60,7 +58,7 @@ class NaturalLanguageRetriever(BaseRetriever):
             },
         )
 
-        return await llm_client.acreate_structured_output(
+        return await LLMGateway.acreate_structured_output(
             text_input=query,
             system_prompt=system_prompt,
             response_model=str,

cognee/modules/retrieval/utils/completion.py
@@ -1,5 +1,4 @@
-from cognee.infrastructure.llm.get_llm_client import get_llm_client
-from cognee.infrastructure.llm.prompts import read_query_prompt, render_prompt
+from cognee.infrastructure.llm.LLMGateway import LLMGateway
 
 
 async def generate_completion(
@@ -10,11 +9,10 @@ async def generate_completion(
 ) -> str:
     """Generates a completion using LLM with given context and prompts."""
     args = {"question": query, "context": context}
-    user_prompt = render_prompt(user_prompt_path, args)
-    system_prompt = read_query_prompt(system_prompt_path)
+    user_prompt = LLMGateway.render_prompt(user_prompt_path, args)
+    system_prompt = LLMGateway.read_query_prompt(system_prompt_path)
 
-    llm_client = get_llm_client()
-    return await llm_client.acreate_structured_output(
+    return await LLMGateway.acreate_structured_output(
         text_input=user_prompt,
         system_prompt=system_prompt,
         response_model=str,
@@ -26,10 +24,9 @@ async def summarize_text(
     prompt_path: str = "summarize_search_results.txt",
 ) -> str:
     """Summarizes text using LLM with the specified prompt."""
-    system_prompt = read_query_prompt(prompt_path)
-    llm_client = get_llm_client()
+    system_prompt = LLMGateway.read_query_prompt(prompt_path)
 
-    return await llm_client.acreate_structured_output(
+    return await LLMGateway.acreate_structured_output(
         text_input=text,
         system_prompt=system_prompt,
         response_model=str,

cognee/modules/retrieval/utils/description_to_codepart_search.py
@@ -9,7 +9,7 @@ from cognee.modules.users.methods import get_default_user
 from cognee.modules.users.models import User
 from cognee.shared.utils import send_telemetry
 from cognee.modules.search.methods import search
-from cognee.infrastructure.llm.get_llm_client import get_llm_client
+from cognee.infrastructure.llm.LLMGateway import LLMGateway
 
 logger = get_logger(level=ERROR)
 
@@ -71,8 +71,7 @@ async def code_description_to_code_part(
         if isinstance(obj, dict) and "description" in obj
     )
 
-    llm_client = get_llm_client()
-    context_from_documents = await llm_client.acreate_structured_output(
+    context_from_documents = await LLMGateway.acreate_structured_output(
         text_input=f"The retrieved context from documents is {concatenated_descriptions}.",
         system_prompt="You are a Senior Software Engineer, summarize the context from documents"
         f" in a way that it is gonna be provided next to codeparts as context"

cognee/modules/search/methods/search.py
@@ -27,7 +27,7 @@ from cognee.modules.users.models import User
 from cognee.modules.data.models import Dataset
 from cognee.shared.utils import send_telemetry
 from cognee.modules.users.permissions.methods import get_specific_user_permission_datasets
-from cognee.modules.search.operations import log_query, log_result
+from cognee.modules.search.operations import log_query, log_result, select_search_type
 
 
 async def search(
@@ -129,6 +129,10 @@ async def specific_search(
         SearchType.NATURAL_LANGUAGE: NaturalLanguageRetriever().get_completion,
     }
 
+    # If the query type is FEELING_LUCKY, select the search type intelligently
+    if query_type is SearchType.FEELING_LUCKY:
+        query_type = await select_search_type(query)
+
     search_task = search_tasks.get(query_type)
 
     if search_task is None:

cognee/modules/search/operations/__init__.py
@@ -1,3 +1,4 @@
 from .log_query import log_query
 from .log_result import log_result
 from .get_history import get_history
+from .select_search_type import select_search_type

cognee/modules/search/operations/select_search_type.py (new file)
@@ -0,0 +1,42 @@
+from cognee.infrastructure.llm.prompts import read_query_prompt
+from cognee.modules.search.types import SearchType
+from cognee.shared.logging_utils import get_logger
+from cognee.infrastructure.llm.LLMGateway import LLMGateway
+
+logger = get_logger("SearchTypeSelector")
+
+
+async def select_search_type(
+    query: str,
+    system_prompt_path: str = "search_type_selector_prompt.txt",
+) -> SearchType:
+    """
+    Analyzes the query and selects the best search type.
+
+    Args:
+        query: The query to analyze.
+        system_prompt_path: The path to the system prompt.
+
+    Returns:
+        The best search type given by the LLM.
+    """
+    default_search_type = SearchType.RAG_COMPLETION
+    system_prompt = read_query_prompt(system_prompt_path)
+
+    try:
+        response = await LLMGateway.acreate_structured_output(
+            text_input=query,
+            system_prompt=system_prompt,
+            response_model=str,
+        )
+
+        if response.upper() in SearchType.__members__:
+            logger.info(f"Selected lucky search type: {response.upper()}")
+            return SearchType(response.upper())
+
+        # If the response is not a valid search type, return the default search type
+        logger.info(f"LLM gives an invalid search type: {response.upper()}")
+        return default_search_type
+    except Exception as e:
+        logger.error(f"Failed to select search type intelligently from LLM: {str(e)}")
+        return default_search_type

cognee/modules/search/types/SearchType.py
@@ -13,3 +13,4 @@ class SearchType(Enum):
     NATURAL_LANGUAGE = "NATURAL_LANGUAGE"
     GRAPH_COMPLETION_COT = "GRAPH_COMPLETION_COT"
     GRAPH_COMPLETION_CONTEXT_EXTENSION = "GRAPH_COMPLETION_CONTEXT_EXTENSION"
+    FEELING_LUCKY = "FEELING_LUCKY"
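
Taken together, the three search changes wire up a new `FEELING_LUCKY` mode: the dispatcher in `specific_search` intercepts it and calls `select_search_type`, which asks the LLM for a concrete `SearchType` and falls back to `RAG_COMPLETION` on an invalid answer or an exception. A hedged usage sketch; the top-level `cognee.search` signature is assumed from the public API rather than from this diff:

```python
import cognee
from cognee.modules.search.types import SearchType


async def lucky_search(question: str):
    # FEELING_LUCKY is never executed directly; specific_search() swaps in
    # whichever concrete SearchType the LLM selects for this query.
    return await cognee.search(
        query_type=SearchType.FEELING_LUCKY,
        query_text=question,
    )
```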

cognee/modules/settings/get_settings.py
@@ -43,10 +43,6 @@ def get_settings() -> SettingsDict:
     llm_config = get_llm_config()
 
     vector_dbs = [
-        {
-            "value": "qdrant",
-            "label": "Qdrant",
-        },
         {
             "value": "lancedb",
             "label": "LanceDB",

cognee/modules/settings/save_vector_db_config.py
@@ -6,7 +6,7 @@ from cognee.infrastructure.databases.vector import get_vectordb_config
 class VectorDBConfig(BaseModel):
     url: str
     api_key: str
-    provider: Union[Literal["lancedb"], Literal["qdrant"], Literal["pgvector"]]
+    provider: Union[Literal["lancedb"], Literal["pgvector"]]
 
 
 async def save_vector_db_config(vector_db_config: VectorDBConfig):
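
Together with the deleted adapters (`QDrantAdapter.py` and `WeaviateAdapter.py`, files 181-184 above), these two hunks drop Qdrant from the configurable vector providers, so deployments pinned to it must repoint their configuration before upgrading. A migration sketch, assuming the `VECTOR_DB_*` environment variables cognee conventionally reads for vector configuration (the variable names are an assumption, not part of this diff):

```python
# Sketch: repointing vector storage away from the removed qdrant provider.
import os

os.environ["VECTOR_DB_PROVIDER"] = "lancedb"  # "qdrant" is rejected in 0.2.3

import cognee  # import after setting the env so the config picks it up
```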

cognee/shared/data_models.py
@@ -4,7 +4,9 @@ from enum import Enum, auto
 from typing import Any, Dict, List, Optional, Union
 
 from pydantic import BaseModel, Field
-from cognee.infrastructure.llm.config import get_llm_config
+from cognee.infrastructure.llm.config import (
+    get_llm_config,
+)
 
 if get_llm_config().llm_provider.lower() == "gemini":
     """

cognee/shared/logging_utils.py
@@ -175,17 +175,13 @@ def log_database_configuration(logger):
     try:
         # Log relational database configuration
         relational_config = get_relational_config()
-        logger.info(f"Relational database: {relational_config.db_provider}")
         if relational_config.db_provider == "postgres":
             logger.info(f"Postgres host: {relational_config.db_host}:{relational_config.db_port}")
-            logger.info(f"Postgres database: {relational_config.db_name}")
         elif relational_config.db_provider == "sqlite":
             logger.info(f"SQLite path: {relational_config.db_path}")
-            logger.info(f"SQLite database: {relational_config.db_name}")
 
         # Log vector database configuration
         vector_config = get_vectordb_config()
-        logger.info(f"Vector database: {vector_config.vector_db_provider}")
         if vector_config.vector_db_provider == "lancedb":
             logger.info(f"Vector database path: {vector_config.vector_db_url}")
         else:
@@ -193,7 +189,6 @@ def log_database_configuration(logger):
 
         # Log graph database configuration
         graph_config = get_graph_config()
-        logger.info(f"Graph database: {graph_config.graph_database_provider}")
         if graph_config.graph_database_provider == "kuzu":
             logger.info(f"Graph database path: {graph_config.graph_file_path}")
         else: