cognee 0.2.1.dev7__py3-none-any.whl → 0.2.2.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (223)
  1. cognee/api/client.py +44 -4
  2. cognee/api/health.py +332 -0
  3. cognee/api/v1/add/add.py +5 -2
  4. cognee/api/v1/add/routers/get_add_router.py +3 -0
  5. cognee/api/v1/cognify/code_graph_pipeline.py +3 -1
  6. cognee/api/v1/cognify/cognify.py +8 -0
  7. cognee/api/v1/cognify/routers/get_cognify_router.py +8 -1
  8. cognee/api/v1/config/config.py +3 -1
  9. cognee/api/v1/datasets/routers/get_datasets_router.py +2 -8
  10. cognee/api/v1/delete/delete.py +16 -12
  11. cognee/api/v1/responses/routers/get_responses_router.py +3 -1
  12. cognee/api/v1/search/search.py +10 -0
  13. cognee/api/v1/settings/routers/get_settings_router.py +0 -2
  14. cognee/base_config.py +1 -0
  15. cognee/eval_framework/evaluation/direct_llm_eval_adapter.py +5 -6
  16. cognee/infrastructure/databases/graph/config.py +2 -0
  17. cognee/infrastructure/databases/graph/get_graph_engine.py +58 -12
  18. cognee/infrastructure/databases/graph/graph_db_interface.py +15 -10
  19. cognee/infrastructure/databases/graph/kuzu/adapter.py +43 -16
  20. cognee/infrastructure/databases/graph/kuzu/kuzu_migrate.py +281 -0
  21. cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +151 -77
  22. cognee/infrastructure/databases/graph/neptune_driver/__init__.py +15 -0
  23. cognee/infrastructure/databases/graph/neptune_driver/adapter.py +1427 -0
  24. cognee/infrastructure/databases/graph/neptune_driver/exceptions.py +115 -0
  25. cognee/infrastructure/databases/graph/neptune_driver/neptune_utils.py +224 -0
  26. cognee/infrastructure/databases/graph/networkx/adapter.py +3 -3
  27. cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py +449 -0
  28. cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +11 -3
  29. cognee/infrastructure/databases/vector/chromadb/ChromaDBAdapter.py +8 -3
  30. cognee/infrastructure/databases/vector/create_vector_engine.py +31 -23
  31. cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +3 -1
  32. cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +21 -6
  33. cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +4 -3
  34. cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py +3 -1
  35. cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +22 -16
  36. cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py +36 -34
  37. cognee/infrastructure/databases/vector/vector_db_interface.py +78 -7
  38. cognee/infrastructure/files/utils/get_data_file_path.py +39 -0
  39. cognee/infrastructure/files/utils/guess_file_type.py +2 -2
  40. cognee/infrastructure/files/utils/open_data_file.py +4 -23
  41. cognee/infrastructure/llm/LLMGateway.py +137 -0
  42. cognee/infrastructure/llm/__init__.py +14 -4
  43. cognee/infrastructure/llm/config.py +29 -1
  44. cognee/infrastructure/llm/prompts/answer_hotpot_question.txt +1 -1
  45. cognee/infrastructure/llm/prompts/answer_hotpot_using_cognee_search.txt +1 -1
  46. cognee/infrastructure/llm/prompts/answer_simple_question.txt +1 -1
  47. cognee/infrastructure/llm/prompts/answer_simple_question_restricted.txt +1 -1
  48. cognee/infrastructure/llm/prompts/categorize_categories.txt +1 -1
  49. cognee/infrastructure/llm/prompts/classify_content.txt +1 -1
  50. cognee/infrastructure/llm/prompts/context_for_question.txt +1 -1
  51. cognee/infrastructure/llm/prompts/graph_context_for_question.txt +1 -1
  52. cognee/infrastructure/llm/prompts/natural_language_retriever_system.txt +1 -1
  53. cognee/infrastructure/llm/prompts/patch_gen_instructions.txt +1 -1
  54. cognee/infrastructure/llm/prompts/search_type_selector_prompt.txt +130 -0
  55. cognee/infrastructure/llm/prompts/summarize_code.txt +2 -2
  56. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/__init__.py +57 -0
  57. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/async_client.py +533 -0
  58. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/config.py +94 -0
  59. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/globals.py +37 -0
  60. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/inlinedbaml.py +21 -0
  61. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/parser.py +131 -0
  62. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/runtime.py +266 -0
  63. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/stream_types.py +137 -0
  64. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/sync_client.py +550 -0
  65. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/tracing.py +26 -0
  66. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_builder.py +962 -0
  67. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_map.py +52 -0
  68. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/types.py +166 -0
  69. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_categories.baml +109 -0
  70. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_content_graph.baml +343 -0
  71. cognee/{modules/data → infrastructure/llm/structured_output_framework/baml/baml_src}/extraction/__init__.py +1 -0
  72. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/extract_summary.py +89 -0
  73. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/knowledge_graph/extract_content_graph.py +33 -0
  74. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/generators.baml +18 -0
  75. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/__init__.py +3 -0
  76. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/extract_categories.py +12 -0
  77. cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/extract_summary.py +16 -7
  78. cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/knowledge_graph/extract_content_graph.py +7 -6
  79. cognee/infrastructure/llm/{anthropic → structured_output_framework/litellm_instructor/llm/anthropic}/adapter.py +10 -4
  80. cognee/infrastructure/llm/{gemini → structured_output_framework/litellm_instructor/llm/gemini}/adapter.py +6 -5
  81. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/__init__.py +0 -0
  82. cognee/infrastructure/llm/{generic_llm_api → structured_output_framework/litellm_instructor/llm/generic_llm_api}/adapter.py +7 -3
  83. cognee/infrastructure/llm/{get_llm_client.py → structured_output_framework/litellm_instructor/llm/get_llm_client.py} +18 -6
  84. cognee/infrastructure/llm/{llm_interface.py → structured_output_framework/litellm_instructor/llm/llm_interface.py} +2 -2
  85. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/__init__.py +0 -0
  86. cognee/infrastructure/llm/{ollama → structured_output_framework/litellm_instructor/llm/ollama}/adapter.py +4 -2
  87. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/__init__.py +0 -0
  88. cognee/infrastructure/llm/{openai → structured_output_framework/litellm_instructor/llm/openai}/adapter.py +6 -4
  89. cognee/infrastructure/llm/{rate_limiter.py → structured_output_framework/litellm_instructor/llm/rate_limiter.py} +0 -5
  90. cognee/infrastructure/llm/tokenizer/Gemini/adapter.py +4 -2
  91. cognee/infrastructure/llm/tokenizer/TikToken/adapter.py +7 -3
  92. cognee/infrastructure/llm/tokenizer/__init__.py +4 -0
  93. cognee/infrastructure/llm/utils.py +3 -1
  94. cognee/infrastructure/loaders/LoaderEngine.py +156 -0
  95. cognee/infrastructure/loaders/LoaderInterface.py +73 -0
  96. cognee/infrastructure/loaders/__init__.py +18 -0
  97. cognee/infrastructure/loaders/core/__init__.py +7 -0
  98. cognee/infrastructure/loaders/core/audio_loader.py +98 -0
  99. cognee/infrastructure/loaders/core/image_loader.py +114 -0
  100. cognee/infrastructure/loaders/core/text_loader.py +90 -0
  101. cognee/infrastructure/loaders/create_loader_engine.py +32 -0
  102. cognee/infrastructure/loaders/external/__init__.py +22 -0
  103. cognee/infrastructure/loaders/external/pypdf_loader.py +96 -0
  104. cognee/infrastructure/loaders/external/unstructured_loader.py +127 -0
  105. cognee/infrastructure/loaders/get_loader_engine.py +18 -0
  106. cognee/infrastructure/loaders/supported_loaders.py +18 -0
  107. cognee/infrastructure/loaders/use_loader.py +21 -0
  108. cognee/infrastructure/loaders/utils/__init__.py +0 -0
  109. cognee/modules/data/methods/__init__.py +1 -0
  110. cognee/modules/data/methods/get_authorized_dataset.py +23 -0
  111. cognee/modules/data/models/Data.py +13 -3
  112. cognee/modules/data/processing/document_types/AudioDocument.py +2 -2
  113. cognee/modules/data/processing/document_types/ImageDocument.py +2 -2
  114. cognee/modules/data/processing/document_types/PdfDocument.py +4 -11
  115. cognee/modules/data/processing/document_types/UnstructuredDocument.py +2 -5
  116. cognee/modules/engine/utils/generate_edge_id.py +5 -0
  117. cognee/modules/graph/cognee_graph/CogneeGraph.py +45 -35
  118. cognee/modules/graph/methods/get_formatted_graph_data.py +8 -2
  119. cognee/modules/graph/utils/get_graph_from_model.py +93 -101
  120. cognee/modules/ingestion/data_types/TextData.py +8 -2
  121. cognee/modules/ingestion/save_data_to_file.py +1 -1
  122. cognee/modules/pipelines/exceptions/__init__.py +1 -0
  123. cognee/modules/pipelines/exceptions/exceptions.py +12 -0
  124. cognee/modules/pipelines/models/DataItemStatus.py +5 -0
  125. cognee/modules/pipelines/models/PipelineRunInfo.py +6 -0
  126. cognee/modules/pipelines/models/__init__.py +1 -0
  127. cognee/modules/pipelines/operations/pipeline.py +10 -2
  128. cognee/modules/pipelines/operations/run_tasks.py +252 -20
  129. cognee/modules/pipelines/operations/run_tasks_distributed.py +1 -1
  130. cognee/modules/retrieval/chunks_retriever.py +23 -1
  131. cognee/modules/retrieval/code_retriever.py +66 -9
  132. cognee/modules/retrieval/completion_retriever.py +11 -9
  133. cognee/modules/retrieval/context_providers/TripletSearchContextProvider.py +0 -2
  134. cognee/modules/retrieval/graph_completion_context_extension_retriever.py +0 -2
  135. cognee/modules/retrieval/graph_completion_cot_retriever.py +8 -9
  136. cognee/modules/retrieval/graph_completion_retriever.py +1 -1
  137. cognee/modules/retrieval/insights_retriever.py +4 -0
  138. cognee/modules/retrieval/natural_language_retriever.py +9 -15
  139. cognee/modules/retrieval/summaries_retriever.py +23 -1
  140. cognee/modules/retrieval/utils/brute_force_triplet_search.py +23 -4
  141. cognee/modules/retrieval/utils/completion.py +6 -9
  142. cognee/modules/retrieval/utils/description_to_codepart_search.py +2 -3
  143. cognee/modules/search/methods/search.py +5 -1
  144. cognee/modules/search/operations/__init__.py +1 -0
  145. cognee/modules/search/operations/select_search_type.py +42 -0
  146. cognee/modules/search/types/SearchType.py +1 -0
  147. cognee/modules/settings/get_settings.py +0 -8
  148. cognee/modules/settings/save_vector_db_config.py +1 -1
  149. cognee/shared/data_models.py +3 -1
  150. cognee/shared/logging_utils.py +0 -5
  151. cognee/tasks/chunk_naive_llm_classifier/chunk_naive_llm_classifier.py +2 -2
  152. cognee/tasks/documents/extract_chunks_from_documents.py +10 -12
  153. cognee/tasks/entity_completion/entity_extractors/llm_entity_extractor.py +4 -6
  154. cognee/tasks/graph/cascade_extract/utils/extract_content_nodes_and_relationship_names.py +4 -6
  155. cognee/tasks/graph/cascade_extract/utils/extract_edge_triplets.py +6 -7
  156. cognee/tasks/graph/cascade_extract/utils/extract_nodes.py +4 -7
  157. cognee/tasks/graph/extract_graph_from_code.py +3 -2
  158. cognee/tasks/graph/extract_graph_from_data.py +4 -3
  159. cognee/tasks/graph/infer_data_ontology.py +5 -6
  160. cognee/tasks/ingestion/data_item_to_text_file.py +79 -0
  161. cognee/tasks/ingestion/ingest_data.py +91 -61
  162. cognee/tasks/ingestion/resolve_data_directories.py +3 -0
  163. cognee/tasks/repo_processor/get_repo_file_dependencies.py +3 -0
  164. cognee/tasks/storage/index_data_points.py +1 -1
  165. cognee/tasks/storage/index_graph_edges.py +4 -1
  166. cognee/tasks/summarization/summarize_code.py +2 -3
  167. cognee/tasks/summarization/summarize_text.py +3 -2
  168. cognee/tests/test_cognee_server_start.py +12 -7
  169. cognee/tests/test_deduplication.py +2 -2
  170. cognee/tests/test_deletion.py +58 -17
  171. cognee/tests/test_graph_visualization_permissions.py +161 -0
  172. cognee/tests/test_neptune_analytics_graph.py +309 -0
  173. cognee/tests/test_neptune_analytics_hybrid.py +176 -0
  174. cognee/tests/{test_weaviate.py → test_neptune_analytics_vector.py} +86 -11
  175. cognee/tests/test_pgvector.py +5 -5
  176. cognee/tests/test_s3.py +1 -6
  177. cognee/tests/unit/infrastructure/databases/test_rate_limiter.py +11 -10
  178. cognee/tests/unit/infrastructure/databases/vector/__init__.py +0 -0
  179. cognee/tests/unit/infrastructure/mock_embedding_engine.py +1 -1
  180. cognee/tests/unit/infrastructure/test_embedding_rate_limiting_realistic.py +5 -5
  181. cognee/tests/unit/infrastructure/test_rate_limiting_realistic.py +6 -4
  182. cognee/tests/unit/infrastructure/test_rate_limiting_retry.py +1 -1
  183. cognee/tests/unit/interfaces/graph/get_graph_from_model_unit_test.py +61 -3
  184. cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py +84 -9
  185. cognee/tests/unit/modules/search/search_methods_test.py +55 -0
  186. {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/METADATA +13 -9
  187. {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/RECORD +203 -164
  188. cognee/infrastructure/databases/vector/pinecone/adapter.py +0 -8
  189. cognee/infrastructure/databases/vector/qdrant/QDrantAdapter.py +0 -514
  190. cognee/infrastructure/databases/vector/qdrant/__init__.py +0 -2
  191. cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py +0 -527
  192. cognee/infrastructure/databases/vector/weaviate_db/__init__.py +0 -1
  193. cognee/modules/data/extraction/extract_categories.py +0 -14
  194. cognee/tests/test_qdrant.py +0 -99
  195. distributed/Dockerfile +0 -34
  196. distributed/app.py +0 -4
  197. distributed/entrypoint.py +0 -71
  198. distributed/entrypoint.sh +0 -5
  199. distributed/modal_image.py +0 -11
  200. distributed/queues.py +0 -5
  201. distributed/tasks/queued_add_data_points.py +0 -13
  202. distributed/tasks/queued_add_edges.py +0 -13
  203. distributed/tasks/queued_add_nodes.py +0 -13
  204. distributed/test.py +0 -28
  205. distributed/utils.py +0 -19
  206. distributed/workers/data_point_saving_worker.py +0 -93
  207. distributed/workers/graph_saving_worker.py +0 -104
  208. /cognee/infrastructure/databases/{graph/memgraph → hybrid/neptune_analytics}/__init__.py +0 -0
  209. /cognee/infrastructure/{llm → databases/vector/embeddings}/embedding_rate_limiter.py +0 -0
  210. /cognee/infrastructure/{databases/vector/pinecone → llm/structured_output_framework}/__init__.py +0 -0
  211. /cognee/infrastructure/llm/{anthropic → structured_output_framework/baml/baml_src}/__init__.py +0 -0
  212. /cognee/infrastructure/llm/{gemini/__init__.py → structured_output_framework/baml/baml_src/extraction/extract_categories.py} +0 -0
  213. /cognee/infrastructure/llm/{generic_llm_api → structured_output_framework/baml/baml_src/extraction/knowledge_graph}/__init__.py +0 -0
  214. /cognee/infrastructure/llm/{ollama → structured_output_framework/litellm_instructor}/__init__.py +0 -0
  215. /cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/knowledge_graph/__init__.py +0 -0
  216. /cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/texts.json +0 -0
  217. /cognee/infrastructure/llm/{openai → structured_output_framework/litellm_instructor/llm}/__init__.py +0 -0
  218. {distributed → cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic}/__init__.py +0 -0
  219. {distributed/tasks → cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini}/__init__.py +0 -0
  220. /cognee/modules/data/{extraction/knowledge_graph → methods}/add_model_class_to_graph.py +0 -0
  221. {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/WHEEL +0 -0
  222. {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/licenses/LICENSE +0 -0
  223. {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/licenses/NOTICE.md +0 -0
@@ -4,43 +4,50 @@ from cognee.infrastructure.engine import DataPoint, Edge
4
4
  from cognee.modules.storage.utils import copy_model
5
5
 
6
6
 
7
- def _extract_field_info(field_value: Any) -> Tuple[str, Any, Optional[Edge]]:
8
- """Extract field type, actual value, and edge metadata from a field value."""
9
-
10
- # Handle tuple[Edge, DataPoint]
11
- if (
12
- isinstance(field_value, tuple)
13
- and len(field_value) == 2
14
- and isinstance(field_value[0], Edge)
15
- and isinstance(field_value[1], DataPoint)
16
- ):
17
- return "single_datapoint_with_edge", field_value[1], field_value[0]
18
-
19
- # Handle tuple[Edge, list[DataPoint]]
7
+ def _extract_field_data(field_value: Any) -> List[Tuple[Optional[Edge], List[DataPoint]]]:
8
+ """Extract edge metadata and datapoints from a field value."""
9
+ # Handle single DataPoint
10
+ if isinstance(field_value, DataPoint):
11
+ return [(None, [field_value])]
12
+
13
+ # Handle list - could contain DataPoints, edge tuples, or mixed
14
+ if isinstance(field_value, list) and len(field_value) > 0:
15
+ result = []
16
+ for item in field_value:
17
+ # Handle tuple[Edge, DataPoint or list[DataPoint]]
18
+ if isinstance(item, tuple) and len(item) == 2 and isinstance(item[0], Edge):
19
+ edge, data_value = item
20
+ if isinstance(data_value, DataPoint):
21
+ result.append((edge, [data_value]))
22
+ elif (
23
+ isinstance(data_value, list)
24
+ and len(data_value) > 0
25
+ and isinstance(data_value[0], DataPoint)
26
+ ):
27
+ result.append((edge, data_value))
28
+ # Handle single DataPoint in list
29
+ elif isinstance(item, DataPoint):
30
+ result.append((None, [item]))
31
+ return result
32
+
33
+ # Handle tuple[Edge, DataPoint or list[DataPoint]]
20
34
  if (
21
35
  isinstance(field_value, tuple)
22
36
  and len(field_value) == 2
23
37
  and isinstance(field_value[0], Edge)
24
- and isinstance(field_value[1], list)
25
- and len(field_value[1]) > 0
26
- and isinstance(field_value[1][0], DataPoint)
27
38
  ):
28
- return "list_datapoint_with_edge", field_value[1], field_value[0]
29
-
30
- # Handle single DataPoint
31
- if isinstance(field_value, DataPoint):
32
- return "single_datapoint", field_value, None
33
-
34
- # Handle list of DataPoints
35
- if (
36
- isinstance(field_value, list)
37
- and len(field_value) > 0
38
- and isinstance(field_value[0], DataPoint)
39
- ):
40
- return "list_datapoint", field_value, None
39
+ edge_metadata, data_value = field_value
40
+ if isinstance(data_value, DataPoint):
41
+ return [(edge_metadata, [data_value])]
42
+ elif (
43
+ isinstance(data_value, list)
44
+ and len(data_value) > 0
45
+ and isinstance(data_value[0], DataPoint)
46
+ ):
47
+ return [(edge_metadata, data_value)]
41
48
 
42
- # Regular property
43
- return "property", field_value, None
49
+ # Regular property or empty list
50
+ return []
44
51
 
45
52
 
46
53
  def _create_edge_properties(
@@ -80,30 +87,49 @@ def _get_relationship_key(field_name: str, edge_metadata: Optional[Edge]) -> str
80
87
 
81
88
  def _generate_property_key(data_point_id: str, relationship_key: str, target_id: str) -> str:
82
89
  """Generate a unique property key for visited_properties tracking."""
83
- return f"{data_point_id}{relationship_key}{target_id}"
90
+ return f"{data_point_id}_{relationship_key}_{target_id}"
84
91
 
85
92
 
86
93
  def _process_datapoint_field(
87
94
  data_point_id: str,
88
95
  field_name: str,
89
- datapoints: List[DataPoint],
90
- edge_metadata: Optional[Edge],
96
+ edge_datapoint_pairs: List[Tuple[Optional[Edge], List[DataPoint]]],
91
97
  visited_properties: Dict[str, bool],
92
98
  properties_to_visit: set,
93
99
  excluded_properties: set,
94
100
  ) -> None:
95
- """Process a field containing DataPoint(s), handling both single and list cases."""
101
+ """Process a field containing DataPoints, always working with lists."""
96
102
  excluded_properties.add(field_name)
97
- relationship_key = _get_relationship_key(field_name, edge_metadata)
98
103
 
99
- for index, datapoint in enumerate(datapoints):
100
- property_key = _generate_property_key(data_point_id, relationship_key, str(datapoint.id))
101
- if property_key in visited_properties:
104
+ for edge_metadata, datapoints in edge_datapoint_pairs:
105
+ relationship_key = _get_relationship_key(field_name, edge_metadata)
106
+
107
+ for datapoint in datapoints:
108
+ property_key = _generate_property_key(
109
+ data_point_id, relationship_key, str(datapoint.id)
110
+ )
111
+ if property_key in visited_properties:
112
+ continue
113
+
114
+ # Always use field_name since we're working with lists
115
+ properties_to_visit.add(field_name)
116
+
117
+
118
+ def _targets_generator(
119
+ data_point: DataPoint,
120
+ properties_to_visit: set,
121
+ ) -> Tuple[DataPoint, str, Optional[Edge]]:
122
+ """Generator that yields (target_datapoint, field_name, edge_metadata) tuples."""
123
+ for field_name in properties_to_visit:
124
+ field_value = getattr(data_point, field_name)
125
+ edge_datapoint_pairs = _extract_field_data(field_value)
126
+
127
+ if not edge_datapoint_pairs:
102
128
  continue
103
129
 
104
- # For single datapoint, use field_name; for list, use field_name.index
105
- field_identifier = field_name if len(datapoints) == 1 else f"{field_name}.{index}"
106
- properties_to_visit.add(field_identifier)
130
+ for edge_metadata, datapoints in edge_datapoint_pairs:
131
+ for target_datapoint in datapoints:
132
+ yield target_datapoint, field_name, edge_metadata
107
133
 
108
134
 
109
135
  async def get_graph_from_model(
@@ -143,26 +169,17 @@ async def get_graph_from_model(
143
169
  if field_name == "metadata":
144
170
  continue
145
171
 
146
- field_type, actual_value, edge_metadata = _extract_field_info(field_value)
172
+ edge_datapoint_pairs = _extract_field_data(field_value)
147
173
 
148
- if field_type == "property":
174
+ if not edge_datapoint_pairs:
175
+ # Regular property
149
176
  data_point_properties[field_name] = field_value
150
- elif field_type in ["single_datapoint", "single_datapoint_with_edge"]:
151
- _process_datapoint_field(
152
- data_point_id,
153
- field_name,
154
- [actual_value],
155
- edge_metadata,
156
- visited_properties,
157
- properties_to_visit,
158
- excluded_properties,
159
- )
160
- elif field_type in ["list_datapoint", "list_datapoint_with_edge"]:
177
+ else:
178
+ # DataPoint relationship
161
179
  _process_datapoint_field(
162
180
  data_point_id,
163
181
  field_name,
164
- actual_value,
165
- edge_metadata,
182
+ edge_datapoint_pairs,
166
183
  visited_properties,
167
184
  properties_to_visit,
168
185
  excluded_properties,
@@ -176,41 +193,15 @@ async def get_graph_from_model(
176
193
  nodes.append(SimpleDataPointModel(**data_point_properties))
177
194
  added_nodes[data_point_id] = True
178
195
 
179
- # Process all relationships
180
- for field_name_with_index in properties_to_visit:
181
- # Parse field name and index
182
- if "." in field_name_with_index:
183
- field_name, index_str = field_name_with_index.split(".")
184
- index = int(index_str)
185
- else:
186
- field_name, index = field_name_with_index, None
187
-
188
- # Get field value and extract edge metadata
189
- field_value = getattr(data_point, field_name)
190
- edge_metadata = None
191
-
192
- if (
193
- isinstance(field_value, tuple)
194
- and len(field_value) == 2
195
- and isinstance(field_value[0], Edge)
196
- ):
197
- edge_metadata, field_value = field_value
198
-
199
- # Get specific datapoint - handle both single and list cases
200
- if index is not None:
201
- # List case: extract specific item by index
202
- target_datapoint = field_value[index]
203
- elif isinstance(field_value, list):
204
- # Single datapoint case that was wrapped in a list
205
- target_datapoint = field_value[0]
206
- else:
207
- # True single datapoint case
208
- target_datapoint = field_value
196
+ # Process all relationships using generator
197
+ for target_datapoint, field_name, edge_metadata in _targets_generator(
198
+ data_point, properties_to_visit
199
+ ):
200
+ relationship_name = _get_relationship_key(field_name, edge_metadata)
209
201
 
210
202
  # Create edge if not already added
211
- edge_key = f"{data_point_id}{target_datapoint.id}{field_name}"
203
+ edge_key = f"{data_point_id}_{target_datapoint.id}_{field_name}"
212
204
  if edge_key not in added_edges:
213
- relationship_name = _get_relationship_key(field_name, edge_metadata)
214
205
  edge_properties = _create_edge_properties(
215
206
  data_point.id, target_datapoint.id, relationship_name, edge_metadata
216
207
  )
@@ -218,23 +209,24 @@ async def get_graph_from_model(
218
209
  added_edges[edge_key] = True
219
210
 
220
211
  # Mark property as visited - CRITICAL for preventing infinite loops
221
- relationship_key = _get_relationship_key(field_name, edge_metadata)
222
212
  property_key = _generate_property_key(
223
- data_point_id, relationship_key, str(target_datapoint.id)
213
+ data_point_id, relationship_name, str(target_datapoint.id)
224
214
  )
225
215
  visited_properties[property_key] = True
226
216
 
227
217
  # Recursively process target node if not already processed
228
- if str(target_datapoint.id) not in added_nodes:
229
- child_nodes, child_edges = await get_graph_from_model(
230
- target_datapoint,
231
- include_root=True,
232
- added_nodes=added_nodes,
233
- added_edges=added_edges,
234
- visited_properties=visited_properties,
235
- )
236
- nodes.extend(child_nodes)
237
- edges.extend(child_edges)
218
+ if str(target_datapoint.id) in added_nodes:
219
+ continue
220
+
221
+ child_nodes, child_edges = await get_graph_from_model(
222
+ target_datapoint,
223
+ include_root=True,
224
+ added_nodes=added_nodes,
225
+ added_edges=added_edges,
226
+ visited_properties=visited_properties,
227
+ )
228
+ nodes.extend(child_nodes)
229
+ edges.extend(child_edges)
238
230
 
239
231
  return nodes, edges
240
232
 
@@ -1,5 +1,6 @@
1
1
  from typing import BinaryIO
2
2
  from contextlib import asynccontextmanager
3
+ import hashlib
3
4
  from cognee.infrastructure.data.utils.extract_keywords import extract_keywords
4
5
  from .IngestionData import IngestionData
5
6
 
@@ -16,9 +17,9 @@ class TextData(IngestionData):
16
17
  self.data = data
17
18
 
18
19
  def get_identifier(self):
19
- keywords = extract_keywords(self.data)
20
+ metadata = self.get_metadata()
20
21
 
21
- return "text/plain" + "_" + "|".join(keywords)
22
+ return metadata["content_hash"]
22
23
 
23
24
  def get_metadata(self):
24
25
  self.ensure_metadata()
@@ -29,6 +30,11 @@ class TextData(IngestionData):
29
30
  if self.metadata is None:
30
31
  self.metadata = {}
31
32
 
33
+ data_contents = self.data.encode("utf-8")
34
+ hash_contents = hashlib.md5(data_contents).hexdigest()
35
+ self.metadata["name"] = "text_" + hash_contents + ".txt"
36
+ self.metadata["content_hash"] = hash_contents
37
+
32
38
  @asynccontextmanager
33
39
  async def get_data(self):
34
40
  yield self.data
@@ -1,7 +1,7 @@
1
- import hashlib
2
1
  from typing import BinaryIO, Union
3
2
  from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
4
3
  from .classify import classify
4
+ import hashlib
5
5
 
6
6
 
7
7
  async def save_data_to_file(data: Union[str, BinaryIO], filename: str = None):
@@ -0,0 +1 @@
1
+ from .exceptions import PipelineRunFailedError
@@ -0,0 +1,12 @@
1
+ from cognee.exceptions import CogneeApiError
2
+ from fastapi import status
3
+
4
+
5
+ class PipelineRunFailedError(CogneeApiError):
6
+ def __init__(
7
+ self,
8
+ message: str = "Pipeline run failed.",
9
+ name: str = "PipelineRunFailedError",
10
+ status_code: int = status.HTTP_422_UNPROCESSABLE_ENTITY,
11
+ ):
12
+ super().__init__(message, name, status_code)
@@ -0,0 +1,5 @@
1
+ import enum
2
+
3
+
4
+ class DataItemStatus(str, enum.Enum):
5
+ DATA_ITEM_PROCESSING_COMPLETED = "DATA_ITEM_PROCESSING_COMPLETED"
@@ -9,6 +9,7 @@ class PipelineRunInfo(BaseModel):
9
9
  dataset_id: UUID
10
10
  dataset_name: str
11
11
  payload: Optional[Any] = None
12
+ data_ingestion_info: Optional[list] = None
12
13
 
13
14
  model_config = {
14
15
  "arbitrary_types_allowed": True,
@@ -30,6 +31,11 @@ class PipelineRunCompleted(PipelineRunInfo):
30
31
  pass
31
32
 
32
33
 
34
+ class PipelineRunAlreadyCompleted(PipelineRunInfo):
35
+ status: str = "PipelineRunAlreadyCompleted"
36
+ pass
37
+
38
+
33
39
  class PipelineRunErrored(PipelineRunInfo):
34
40
  status: str = "PipelineRunErrored"
35
41
  pass
@@ -6,3 +6,4 @@ from .PipelineRunInfo import (
6
6
  PipelineRunCompleted,
7
7
  PipelineRunErrored,
8
8
  )
9
+ from .DataItemStatus import DataItemStatus
@@ -52,6 +52,7 @@ async def cognee_pipeline(
52
52
  pipeline_name: str = "custom_pipeline",
53
53
  vector_db_config: dict = None,
54
54
  graph_db_config: dict = None,
55
+ incremental_loading: bool = False,
55
56
  ):
56
57
  # Note: These context variables allow different value assignment for databases in Cognee
57
58
  # per async task, thread, process and etc.
@@ -69,7 +70,10 @@ async def cognee_pipeline(
69
70
  cognee_pipeline.first_run = True
70
71
 
71
72
  if cognee_pipeline.first_run:
72
- from cognee.infrastructure.llm.utils import test_llm_connection, test_embedding_connection
73
+ from cognee.infrastructure.llm.utils import (
74
+ test_llm_connection,
75
+ test_embedding_connection,
76
+ )
73
77
 
74
78
  # Test LLM and Embedding configuration once before running Cognee
75
79
  await test_llm_connection()
@@ -106,6 +110,7 @@ async def cognee_pipeline(
106
110
  data=data,
107
111
  pipeline_name=pipeline_name,
108
112
  context={"dataset": dataset},
113
+ incremental_loading=incremental_loading,
109
114
  ):
110
115
  yield run_info
111
116
 
@@ -117,6 +122,7 @@ async def run_pipeline(
117
122
  data=None,
118
123
  pipeline_name: str = "custom_pipeline",
119
124
  context: dict = None,
125
+ incremental_loading=False,
120
126
  ):
121
127
  check_dataset_name(dataset.name)
122
128
 
@@ -184,7 +190,9 @@ async def run_pipeline(
184
190
  if not isinstance(task, Task):
185
191
  raise ValueError(f"Task {task} is not an instance of Task")
186
192
 
187
- pipeline_run = run_tasks(tasks, dataset_id, data, user, pipeline_name, context)
193
+ pipeline_run = run_tasks(
194
+ tasks, dataset_id, data, user, pipeline_name, context, incremental_loading
195
+ )
188
196
 
189
197
  async for pipeline_run_info in pipeline_run:
190
198
  yield pipeline_run_info