cognee 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161)
  1. cognee/__init__.py +1 -0
  2. cognee/api/health.py +2 -12
  3. cognee/api/v1/add/add.py +46 -6
  4. cognee/api/v1/add/routers/get_add_router.py +5 -1
  5. cognee/api/v1/cognify/cognify.py +29 -9
  6. cognee/api/v1/datasets/datasets.py +11 -0
  7. cognee/api/v1/responses/default_tools.py +0 -1
  8. cognee/api/v1/responses/dispatch_function.py +1 -1
  9. cognee/api/v1/responses/routers/default_tools.py +0 -1
  10. cognee/api/v1/search/search.py +11 -9
  11. cognee/api/v1/settings/routers/get_settings_router.py +7 -1
  12. cognee/api/v1/ui/ui.py +47 -16
  13. cognee/api/v1/update/routers/get_update_router.py +1 -1
  14. cognee/api/v1/update/update.py +3 -3
  15. cognee/cli/_cognee.py +61 -10
  16. cognee/cli/commands/add_command.py +3 -3
  17. cognee/cli/commands/cognify_command.py +3 -3
  18. cognee/cli/commands/config_command.py +9 -7
  19. cognee/cli/commands/delete_command.py +3 -3
  20. cognee/cli/commands/search_command.py +3 -7
  21. cognee/cli/config.py +0 -1
  22. cognee/context_global_variables.py +5 -0
  23. cognee/exceptions/exceptions.py +1 -1
  24. cognee/infrastructure/databases/cache/__init__.py +2 -0
  25. cognee/infrastructure/databases/cache/cache_db_interface.py +79 -0
  26. cognee/infrastructure/databases/cache/config.py +44 -0
  27. cognee/infrastructure/databases/cache/get_cache_engine.py +67 -0
  28. cognee/infrastructure/databases/cache/redis/RedisAdapter.py +243 -0
  29. cognee/infrastructure/databases/exceptions/__init__.py +1 -0
  30. cognee/infrastructure/databases/exceptions/exceptions.py +18 -2
  31. cognee/infrastructure/databases/graph/get_graph_engine.py +1 -1
  32. cognee/infrastructure/databases/graph/graph_db_interface.py +5 -0
  33. cognee/infrastructure/databases/graph/kuzu/adapter.py +67 -44
  34. cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +13 -3
  35. cognee/infrastructure/databases/graph/neo4j_driver/deadlock_retry.py +1 -1
  36. cognee/infrastructure/databases/graph/neptune_driver/neptune_utils.py +1 -1
  37. cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +1 -1
  38. cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +21 -3
  39. cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +17 -10
  40. cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +17 -4
  41. cognee/infrastructure/databases/vector/embeddings/config.py +2 -3
  42. cognee/infrastructure/databases/vector/exceptions/exceptions.py +1 -1
  43. cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +0 -1
  44. cognee/infrastructure/files/exceptions.py +1 -1
  45. cognee/infrastructure/files/storage/LocalFileStorage.py +9 -9
  46. cognee/infrastructure/files/storage/S3FileStorage.py +11 -11
  47. cognee/infrastructure/files/utils/guess_file_type.py +6 -0
  48. cognee/infrastructure/llm/prompts/search_type_selector_prompt.txt +0 -5
  49. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +19 -9
  50. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +17 -5
  51. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +17 -5
  52. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +32 -0
  53. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/__init__.py +0 -0
  54. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +109 -0
  55. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +33 -8
  56. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +40 -18
  57. cognee/infrastructure/loaders/LoaderEngine.py +27 -7
  58. cognee/infrastructure/loaders/external/__init__.py +7 -0
  59. cognee/infrastructure/loaders/external/advanced_pdf_loader.py +2 -8
  60. cognee/infrastructure/loaders/external/beautiful_soup_loader.py +310 -0
  61. cognee/infrastructure/loaders/supported_loaders.py +7 -0
  62. cognee/modules/data/exceptions/exceptions.py +1 -1
  63. cognee/modules/data/methods/__init__.py +3 -0
  64. cognee/modules/data/methods/get_dataset_data.py +4 -1
  65. cognee/modules/data/methods/has_dataset_data.py +21 -0
  66. cognee/modules/engine/models/TableRow.py +0 -1
  67. cognee/modules/ingestion/save_data_to_file.py +9 -2
  68. cognee/modules/pipelines/exceptions/exceptions.py +1 -1
  69. cognee/modules/pipelines/operations/pipeline.py +12 -1
  70. cognee/modules/pipelines/operations/run_tasks.py +25 -197
  71. cognee/modules/pipelines/operations/run_tasks_data_item.py +260 -0
  72. cognee/modules/pipelines/operations/run_tasks_distributed.py +121 -38
  73. cognee/modules/retrieval/EntityCompletionRetriever.py +48 -8
  74. cognee/modules/retrieval/base_graph_retriever.py +3 -1
  75. cognee/modules/retrieval/base_retriever.py +3 -1
  76. cognee/modules/retrieval/chunks_retriever.py +5 -1
  77. cognee/modules/retrieval/code_retriever.py +20 -2
  78. cognee/modules/retrieval/completion_retriever.py +50 -9
  79. cognee/modules/retrieval/cypher_search_retriever.py +11 -1
  80. cognee/modules/retrieval/graph_completion_context_extension_retriever.py +47 -8
  81. cognee/modules/retrieval/graph_completion_cot_retriever.py +32 -1
  82. cognee/modules/retrieval/graph_completion_retriever.py +54 -10
  83. cognee/modules/retrieval/lexical_retriever.py +20 -2
  84. cognee/modules/retrieval/natural_language_retriever.py +10 -1
  85. cognee/modules/retrieval/summaries_retriever.py +5 -1
  86. cognee/modules/retrieval/temporal_retriever.py +62 -10
  87. cognee/modules/retrieval/user_qa_feedback.py +3 -2
  88. cognee/modules/retrieval/utils/completion.py +5 -0
  89. cognee/modules/retrieval/utils/description_to_codepart_search.py +1 -1
  90. cognee/modules/retrieval/utils/session_cache.py +156 -0
  91. cognee/modules/search/methods/get_search_type_tools.py +0 -5
  92. cognee/modules/search/methods/no_access_control_search.py +12 -1
  93. cognee/modules/search/methods/search.py +34 -2
  94. cognee/modules/search/types/SearchType.py +0 -1
  95. cognee/modules/settings/get_settings.py +23 -0
  96. cognee/modules/users/methods/get_authenticated_user.py +3 -1
  97. cognee/modules/users/methods/get_default_user.py +1 -6
  98. cognee/modules/users/roles/methods/create_role.py +2 -2
  99. cognee/modules/users/tenants/methods/create_tenant.py +2 -2
  100. cognee/shared/exceptions/exceptions.py +1 -1
  101. cognee/tasks/codingagents/coding_rule_associations.py +1 -2
  102. cognee/tasks/documents/exceptions/exceptions.py +1 -1
  103. cognee/tasks/graph/extract_graph_from_data.py +2 -0
  104. cognee/tasks/ingestion/data_item_to_text_file.py +3 -3
  105. cognee/tasks/ingestion/ingest_data.py +11 -5
  106. cognee/tasks/ingestion/save_data_item_to_storage.py +12 -1
  107. cognee/tasks/storage/add_data_points.py +3 -10
  108. cognee/tasks/storage/index_data_points.py +19 -14
  109. cognee/tasks/storage/index_graph_edges.py +25 -11
  110. cognee/tasks/web_scraper/__init__.py +34 -0
  111. cognee/tasks/web_scraper/config.py +26 -0
  112. cognee/tasks/web_scraper/default_url_crawler.py +446 -0
  113. cognee/tasks/web_scraper/models.py +46 -0
  114. cognee/tasks/web_scraper/types.py +4 -0
  115. cognee/tasks/web_scraper/utils.py +142 -0
  116. cognee/tasks/web_scraper/web_scraper_task.py +396 -0
  117. cognee/tests/cli_tests/cli_unit_tests/test_cli_utils.py +0 -1
  118. cognee/tests/integration/web_url_crawler/test_default_url_crawler.py +13 -0
  119. cognee/tests/integration/web_url_crawler/test_tavily_crawler.py +19 -0
  120. cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py +344 -0
  121. cognee/tests/subprocesses/reader.py +25 -0
  122. cognee/tests/subprocesses/simple_cognify_1.py +31 -0
  123. cognee/tests/subprocesses/simple_cognify_2.py +31 -0
  124. cognee/tests/subprocesses/writer.py +32 -0
  125. cognee/tests/tasks/descriptive_metrics/metrics_test_utils.py +0 -2
  126. cognee/tests/tasks/descriptive_metrics/neo4j_metrics_test.py +8 -3
  127. cognee/tests/tasks/entity_extraction/entity_extraction_test.py +89 -0
  128. cognee/tests/tasks/web_scraping/web_scraping_test.py +172 -0
  129. cognee/tests/test_add_docling_document.py +56 -0
  130. cognee/tests/test_chromadb.py +7 -11
  131. cognee/tests/test_concurrent_subprocess_access.py +76 -0
  132. cognee/tests/test_conversation_history.py +240 -0
  133. cognee/tests/test_kuzu.py +27 -15
  134. cognee/tests/test_lancedb.py +7 -11
  135. cognee/tests/test_library.py +32 -2
  136. cognee/tests/test_neo4j.py +24 -16
  137. cognee/tests/test_neptune_analytics_vector.py +7 -11
  138. cognee/tests/test_permissions.py +9 -13
  139. cognee/tests/test_pgvector.py +4 -4
  140. cognee/tests/test_remote_kuzu.py +8 -11
  141. cognee/tests/test_s3_file_storage.py +1 -1
  142. cognee/tests/test_search_db.py +6 -8
  143. cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py +89 -0
  144. cognee/tests/unit/modules/retrieval/conversation_history_test.py +154 -0
  145. {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/METADATA +22 -7
  146. {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/RECORD +155 -128
  147. {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/entry_points.txt +1 -0
  148. distributed/Dockerfile +0 -3
  149. distributed/entrypoint.py +21 -9
  150. distributed/signal.py +5 -0
  151. distributed/workers/data_point_saving_worker.py +64 -34
  152. distributed/workers/graph_saving_worker.py +71 -47
  153. cognee/infrastructure/databases/graph/memgraph/memgraph_adapter.py +0 -1116
  154. cognee/modules/retrieval/insights_retriever.py +0 -133
  155. cognee/tests/test_memgraph.py +0 -109
  156. cognee/tests/unit/modules/retrieval/insights_retriever_test.py +0 -251
  157. distributed/poetry.lock +0 -12238
  158. distributed/pyproject.toml +0 -185
  159. {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/WHEEL +0 -0
  160. {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/licenses/LICENSE +0 -0
  161. {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/licenses/NOTICE.md +0 -0
cognee/modules/pipelines/operations/run_tasks.py
@@ -4,35 +4,27 @@ import asyncio
 from uuid import UUID
 from typing import Any, List
 from functools import wraps
-from sqlalchemy import select
 
-import cognee.modules.ingestion as ingestion
 from cognee.infrastructure.databases.graph import get_graph_engine
 from cognee.infrastructure.databases.relational import get_relational_engine
 from cognee.modules.pipelines.operations.run_tasks_distributed import run_tasks_distributed
 from cognee.modules.users.models import User
-from cognee.modules.data.models import Data
-from cognee.infrastructure.files.utils.open_data_file import open_data_file
 from cognee.shared.logging_utils import get_logger
 from cognee.modules.users.methods import get_default_user
 from cognee.modules.pipelines.utils import generate_pipeline_id
 from cognee.modules.pipelines.exceptions import PipelineRunFailedError
-from cognee.tasks.ingestion import save_data_item_to_storage, resolve_data_directories
+from cognee.tasks.ingestion import resolve_data_directories
 from cognee.modules.pipelines.models.PipelineRunInfo import (
     PipelineRunCompleted,
     PipelineRunErrored,
     PipelineRunStarted,
-    PipelineRunYield,
-    PipelineRunAlreadyCompleted,
 )
-from cognee.modules.pipelines.models.DataItemStatus import DataItemStatus
-
 from cognee.modules.pipelines.operations import (
     log_pipeline_run_start,
     log_pipeline_run_complete,
     log_pipeline_run_error,
 )
-from .run_tasks_with_telemetry import run_tasks_with_telemetry
+from .run_tasks_data_item import run_tasks_data_item
 from ..tasks.task import Task
 
 
@@ -67,177 +59,8 @@ async def run_tasks(
     pipeline_name: str = "unknown_pipeline",
     context: dict = None,
     incremental_loading: bool = False,
+    data_per_batch: int = 20,
 ):
-    async def _run_tasks_data_item_incremental(
-        data_item,
-        dataset,
-        tasks,
-        pipeline_name,
-        pipeline_id,
-        pipeline_run_id,
-        context,
-        user,
-    ):
-        db_engine = get_relational_engine()
-        # If incremental_loading of data is set to True don't process documents already processed by pipeline
-        # If data is being added to Cognee for the first time calculate the id of the data
-        if not isinstance(data_item, Data):
-            file_path = await save_data_item_to_storage(data_item)
-            # Ingest data and add metadata
-            async with open_data_file(file_path) as file:
-                classified_data = ingestion.classify(file)
-                # data_id is the hash of file contents + owner id to avoid duplicate data
-                data_id = ingestion.identify(classified_data, user)
-        else:
-            # If data was already processed by Cognee get data id
-            data_id = data_item.id
-
-        # Check pipeline status, if Data already processed for pipeline before skip current processing
-        async with db_engine.get_async_session() as session:
-            data_point = (
-                await session.execute(select(Data).filter(Data.id == data_id))
-            ).scalar_one_or_none()
-            if data_point:
-                if (
-                    data_point.pipeline_status.get(pipeline_name, {}).get(str(dataset.id))
-                    == DataItemStatus.DATA_ITEM_PROCESSING_COMPLETED
-                ):
-                    yield {
-                        "run_info": PipelineRunAlreadyCompleted(
-                            pipeline_run_id=pipeline_run_id,
-                            dataset_id=dataset.id,
-                            dataset_name=dataset.name,
-                        ),
-                        "data_id": data_id,
-                    }
-                    return
-
-        try:
-            # Process data based on data_item and list of tasks
-            async for result in run_tasks_with_telemetry(
-                tasks=tasks,
-                data=[data_item],
-                user=user,
-                pipeline_name=pipeline_id,
-                context=context,
-            ):
-                yield PipelineRunYield(
-                    pipeline_run_id=pipeline_run_id,
-                    dataset_id=dataset.id,
-                    dataset_name=dataset.name,
-                    payload=result,
-                )
-
-            # Update pipeline status for Data element
-            async with db_engine.get_async_session() as session:
-                data_point = (
-                    await session.execute(select(Data).filter(Data.id == data_id))
-                ).scalar_one_or_none()
-                data_point.pipeline_status[pipeline_name] = {
-                    str(dataset.id): DataItemStatus.DATA_ITEM_PROCESSING_COMPLETED
-                }
-                await session.merge(data_point)
-                await session.commit()
-
-            yield {
-                "run_info": PipelineRunCompleted(
-                    pipeline_run_id=pipeline_run_id,
-                    dataset_id=dataset.id,
-                    dataset_name=dataset.name,
-                ),
-                "data_id": data_id,
-            }
-
-        except Exception as error:
-            # Temporarily swallow error and try to process rest of documents first, then re-raise error at end of data ingestion pipeline
-            logger.error(
-                f"Exception caught while processing data: {error}.\n Data processing failed for data item: {data_item}."
-            )
-            yield {
-                "run_info": PipelineRunErrored(
-                    pipeline_run_id=pipeline_run_id,
-                    payload=repr(error),
-                    dataset_id=dataset.id,
-                    dataset_name=dataset.name,
-                ),
-                "data_id": data_id,
-            }
-
-            if os.getenv("RAISE_INCREMENTAL_LOADING_ERRORS", "true").lower() == "true":
-                raise error
-
-    async def _run_tasks_data_item_regular(
-        data_item,
-        dataset,
-        tasks,
-        pipeline_id,
-        pipeline_run_id,
-        context,
-        user,
-    ):
-        # Process data based on data_item and list of tasks
-        async for result in run_tasks_with_telemetry(
-            tasks=tasks,
-            data=[data_item],
-            user=user,
-            pipeline_name=pipeline_id,
-            context=context,
-        ):
-            yield PipelineRunYield(
-                pipeline_run_id=pipeline_run_id,
-                dataset_id=dataset.id,
-                dataset_name=dataset.name,
-                payload=result,
-            )
-
-        yield {
-            "run_info": PipelineRunCompleted(
-                pipeline_run_id=pipeline_run_id,
-                dataset_id=dataset.id,
-                dataset_name=dataset.name,
-            )
-        }
-
-    async def _run_tasks_data_item(
-        data_item,
-        dataset,
-        tasks,
-        pipeline_name,
-        pipeline_id,
-        pipeline_run_id,
-        context,
-        user,
-        incremental_loading,
-    ):
-        # Go through async generator and return data item processing result. Result can be PipelineRunAlreadyCompleted when data item is skipped,
-        # PipelineRunCompleted when processing was successful and PipelineRunErrored if there were issues
-        result = None
-        if incremental_loading:
-            async for result in _run_tasks_data_item_incremental(
-                data_item=data_item,
-                dataset=dataset,
-                tasks=tasks,
-                pipeline_name=pipeline_name,
-                pipeline_id=pipeline_id,
-                pipeline_run_id=pipeline_run_id,
-                context=context,
-                user=user,
-            ):
-                pass
-        else:
-            async for result in _run_tasks_data_item_regular(
-                data_item=data_item,
-                dataset=dataset,
-                tasks=tasks,
-                pipeline_id=pipeline_id,
-                pipeline_run_id=pipeline_run_id,
-                context=context,
-                user=user,
-            ):
-                pass
-
-        return result
-
     if not user:
         user = await get_default_user()
 
@@ -266,24 +89,29 @@ async def run_tasks(
     if incremental_loading:
         data = await resolve_data_directories(data)
 
-    # Create async tasks per data item that will run the pipeline for the data item
-    data_item_tasks = [
-        asyncio.create_task(
-            _run_tasks_data_item(
-                data_item,
-                dataset,
-                tasks,
-                pipeline_name,
-                pipeline_id,
-                pipeline_run_id,
-                context,
-                user,
-                incremental_loading,
+    # Create and gather batches of async tasks of data items that will run the pipeline for the data item
+    results = []
+    for start in range(0, len(data), data_per_batch):
+        data_batch = data[start : start + data_per_batch]
+
+        data_item_tasks = [
+            asyncio.create_task(
+                run_tasks_data_item(
+                    data_item,
+                    dataset,
+                    tasks,
+                    pipeline_name,
+                    pipeline_id,
+                    pipeline_run_id,
+                    context,
+                    user,
+                    incremental_loading,
+                )
             )
-        )
-        for data_item in data
-    ]
-    results = await asyncio.gather(*data_item_tasks)
+            for data_item in data_batch
+        ]
+
+        results.extend(await asyncio.gather(*data_item_tasks))
 
     # Remove skipped data items from results
     results = [result for result in results if result]
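The reworked loop above no longer schedules one asyncio task per data item up front; it processes items in fixed-size batches controlled by the new data_per_batch parameter (default 20), so at most that many items are in flight at once. A minimal, self-contained sketch of the same batching pattern, with a hypothetical process_item coroutine standing in for run_tasks_data_item:

import asyncio
from typing import Any, List


async def process_item(item: Any) -> str:
    # Hypothetical stand-in for run_tasks_data_item: simulate per-item async work.
    await asyncio.sleep(0.01)
    return f"processed {item}"


async def process_in_batches(data: List[Any], data_per_batch: int = 20) -> List[str]:
    # Same shape as the new run_tasks loop: schedule at most `data_per_batch`
    # tasks at a time and accumulate results across batches, preserving order.
    results: List[str] = []
    for start in range(0, len(data), data_per_batch):
        batch = data[start : start + data_per_batch]
        tasks = [asyncio.create_task(process_item(item)) for item in batch]
        results.extend(await asyncio.gather(*tasks))
    return results


if __name__ == "__main__":
    print(asyncio.run(process_in_batches(list(range(45)), data_per_batch=20)))

Compared with gathering every item at once, this caps the concurrent work (database sessions, embedding and LLM calls) on large datasets, at the cost of not overlapping work across batch boundaries.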
cognee/modules/pipelines/operations/run_tasks_data_item.py
@@ -0,0 +1,260 @@
+"""
+Data item processing functions for pipeline operations.
+
+This module contains reusable functions for processing individual data items
+within pipeline operations, supporting both incremental and regular processing modes.
+"""
+
+import os
+from typing import Any, Dict, AsyncGenerator, Optional
+from sqlalchemy import select
+
+import cognee.modules.ingestion as ingestion
+from cognee.infrastructure.databases.relational import get_relational_engine
+from cognee.infrastructure.files.utils.open_data_file import open_data_file
+from cognee.shared.logging_utils import get_logger
+from cognee.modules.users.models import User
+from cognee.modules.data.models import Data, Dataset
+from cognee.tasks.ingestion import save_data_item_to_storage
+from cognee.modules.pipelines.models.PipelineRunInfo import (
+    PipelineRunCompleted,
+    PipelineRunErrored,
+    PipelineRunYield,
+    PipelineRunAlreadyCompleted,
+)
+from cognee.modules.pipelines.models.DataItemStatus import DataItemStatus
+from cognee.modules.pipelines.operations.run_tasks_with_telemetry import run_tasks_with_telemetry
+from ..tasks.task import Task
+
+logger = get_logger("run_tasks_data_item")
+
+
+async def run_tasks_data_item_incremental(
+    data_item: Any,
+    dataset: Dataset,
+    tasks: list[Task],
+    pipeline_name: str,
+    pipeline_id: str,
+    pipeline_run_id: str,
+    context: Optional[Dict[str, Any]],
+    user: User,
+) -> AsyncGenerator[Dict[str, Any], None]:
+    """
+    Process a single data item with incremental loading support.
+
+    This function handles incremental processing by checking if the data item
+    has already been processed for the given pipeline and dataset. If it has,
+    it skips processing and returns a completion status.
+
+    Args:
+        data_item: The data item to process
+        dataset: The dataset containing the data item
+        tasks: List of tasks to execute on the data item
+        pipeline_name: Name of the pipeline
+        pipeline_id: Unique identifier for the pipeline
+        pipeline_run_id: Unique identifier for this pipeline run
+        context: Optional context dictionary
+        user: User performing the operation
+
+    Yields:
+        Dict containing run_info and data_id for each processing step
+    """
+    db_engine = get_relational_engine()
+
+    # If incremental_loading of data is set to True don't process documents already processed by pipeline
+    # If data is being added to Cognee for the first time calculate the id of the data
+    if not isinstance(data_item, Data):
+        file_path = await save_data_item_to_storage(data_item)
+        # Ingest data and add metadata
+        async with open_data_file(file_path) as file:
+            classified_data = ingestion.classify(file)
+            # data_id is the hash of file contents + owner id to avoid duplicate data
+            data_id = ingestion.identify(classified_data, user)
+    else:
+        # If data was already processed by Cognee get data id
+        data_id = data_item.id
+
+    # Check pipeline status, if Data already processed for pipeline before skip current processing
+    async with db_engine.get_async_session() as session:
+        data_point = (
+            await session.execute(select(Data).filter(Data.id == data_id))
+        ).scalar_one_or_none()
+        if data_point:
+            if (
+                data_point.pipeline_status.get(pipeline_name, {}).get(str(dataset.id))
+                == DataItemStatus.DATA_ITEM_PROCESSING_COMPLETED
+            ):
+                yield {
+                    "run_info": PipelineRunAlreadyCompleted(
+                        pipeline_run_id=pipeline_run_id,
+                        dataset_id=dataset.id,
+                        dataset_name=dataset.name,
+                    ),
+                    "data_id": data_id,
+                }
+                return
+
+    try:
+        # Process data based on data_item and list of tasks
+        async for result in run_tasks_with_telemetry(
+            tasks=tasks,
+            data=[data_item],
+            user=user,
+            pipeline_name=pipeline_id,
+            context=context,
+        ):
+            yield PipelineRunYield(
+                pipeline_run_id=pipeline_run_id,
+                dataset_id=dataset.id,
+                dataset_name=dataset.name,
+                payload=result,
+            )
+
+        # Update pipeline status for Data element
+        async with db_engine.get_async_session() as session:
+            data_point = (
+                await session.execute(select(Data).filter(Data.id == data_id))
+            ).scalar_one_or_none()
+            status_for_pipeline = data_point.pipeline_status.setdefault(pipeline_name, {})
+            status_for_pipeline[str(dataset.id)] = DataItemStatus.DATA_ITEM_PROCESSING_COMPLETED
+            await session.merge(data_point)
+            await session.commit()
+
+        yield {
+            "run_info": PipelineRunCompleted(
+                pipeline_run_id=pipeline_run_id,
+                dataset_id=dataset.id,
+                dataset_name=dataset.name,
+            ),
+            "data_id": data_id,
+        }
+
+    except Exception as error:
+        # Temporarily swallow error and try to process rest of documents first, then re-raise error at end of data ingestion pipeline
+        logger.error(
+            f"Exception caught while processing data: {error}.\n Data processing failed for data item: {data_item}."
+        )
+        yield {
+            "run_info": PipelineRunErrored(
+                pipeline_run_id=pipeline_run_id,
+                payload=repr(error),
+                dataset_id=dataset.id,
+                dataset_name=dataset.name,
+            ),
+            "data_id": data_id,
+        }
+
+        if os.getenv("RAISE_INCREMENTAL_LOADING_ERRORS", "true").lower() == "true":
+            raise error
+
+
+async def run_tasks_data_item_regular(
+    data_item: Any,
+    dataset: Dataset,
+    tasks: list[Task],
+    pipeline_id: str,
+    pipeline_run_id: str,
+    context: Optional[Dict[str, Any]],
+    user: User,
+) -> AsyncGenerator[Dict[str, Any], None]:
+    """
+    Process a single data item in regular (non-incremental) mode.
+
+    This function processes a data item without checking for previous processing
+    status, executing all tasks on the data item.
+
+    Args:
+        data_item: The data item to process
+        dataset: The dataset containing the data item
+        tasks: List of tasks to execute on the data item
+        pipeline_id: Unique identifier for the pipeline
+        pipeline_run_id: Unique identifier for this pipeline run
+        context: Optional context dictionary
+        user: User performing the operation
+
+    Yields:
+        Dict containing run_info for each processing step
+    """
+    # Process data based on data_item and list of tasks
+    async for result in run_tasks_with_telemetry(
+        tasks=tasks,
+        data=[data_item],
+        user=user,
+        pipeline_name=pipeline_id,
+        context=context,
+    ):
+        yield PipelineRunYield(
+            pipeline_run_id=pipeline_run_id,
+            dataset_id=dataset.id,
+            dataset_name=dataset.name,
+            payload=result,
+        )
+
+    yield {
+        "run_info": PipelineRunCompleted(
+            pipeline_run_id=pipeline_run_id,
+            dataset_id=dataset.id,
+            dataset_name=dataset.name,
+        )
+    }
+
+
+async def run_tasks_data_item(
+    data_item: Any,
+    dataset: Dataset,
+    tasks: list[Task],
+    pipeline_name: str,
+    pipeline_id: str,
+    pipeline_run_id: str,
+    context: Optional[Dict[str, Any]],
+    user: User,
+    incremental_loading: bool,
+) -> Optional[Dict[str, Any]]:
+    """
+    Process a single data item, choosing between incremental and regular processing.
+
+    This is the main entry point for data item processing that delegates to either
+    incremental or regular processing based on the incremental_loading flag.
+
+    Args:
+        data_item: The data item to process
+        dataset: The dataset containing the data item
+        tasks: List of tasks to execute on the data item
+        pipeline_name: Name of the pipeline
+        pipeline_id: Unique identifier for the pipeline
+        pipeline_run_id: Unique identifier for this pipeline run
+        context: Optional context dictionary
+        user: User performing the operation
+        incremental_loading: Whether to use incremental processing
+
+    Returns:
+        Dict containing the final processing result, or None if processing was skipped
+    """
+    # Go through async generator and return data item processing result. Result can be PipelineRunAlreadyCompleted when data item is skipped,
+    # PipelineRunCompleted when processing was successful and PipelineRunErrored if there were issues
+    result = None
+    if incremental_loading:
+        async for result in run_tasks_data_item_incremental(
+            data_item=data_item,
+            dataset=dataset,
+            tasks=tasks,
+            pipeline_name=pipeline_name,
+            pipeline_id=pipeline_id,
+            pipeline_run_id=pipeline_run_id,
+            context=context,
+            user=user,
+        ):
+            pass
+    else:
+        async for result in run_tasks_data_item_regular(
+            data_item=data_item,
+            dataset=dataset,
+            tasks=tasks,
+            pipeline_id=pipeline_id,
+            pipeline_run_id=pipeline_run_id,
+            context=context,
+            user=user,
+        ):
+            pass
+
+    return result
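run_tasks_data_item above drains whichever generator it delegates to and returns only the last value yielded, which carries the final run status (PipelineRunAlreadyCompleted, PipelineRunCompleted, or PipelineRunErrored). A minimal sketch of that drain-and-return-last pattern, with a hypothetical generator standing in for the incremental/regular variants:

import asyncio
from typing import AsyncGenerator, Dict, Optional


async def item_steps() -> AsyncGenerator[Dict[str, str], None]:
    # Hypothetical stand-in for run_tasks_data_item_incremental/_regular:
    # intermediate yields carry task payloads, the final yield carries the run status.
    yield {"payload": "chunking"}
    yield {"payload": "graph extraction"}
    yield {"run_info": "PipelineRunCompleted", "data_id": "item-1"}


async def last_yielded(steps: AsyncGenerator[Dict[str, str], None]) -> Optional[Dict[str, str]]:
    # Same pattern as run_tasks_data_item: iterate to exhaustion, keep only the last value.
    result: Optional[Dict[str, str]] = None
    async for result in steps:
        pass
    return result


if __name__ == "__main__":
    print(asyncio.run(last_yielded(item_steps())))

Note also that the incremental variant now records completion via pipeline_status.setdefault(pipeline_name, {}) before setting the dataset entry, whereas the removed inline version replaced the whole per-pipeline dict and so dropped statuses previously recorded for other datasets.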