cognee 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff compares publicly released versions of the package as published to their respective public registries. It is provided for informational purposes only and reflects the changes between those package versions.
- cognee/__init__.py +1 -0
- cognee/api/health.py +2 -12
- cognee/api/v1/add/add.py +46 -6
- cognee/api/v1/add/routers/get_add_router.py +5 -1
- cognee/api/v1/cognify/cognify.py +29 -9
- cognee/api/v1/datasets/datasets.py +11 -0
- cognee/api/v1/responses/default_tools.py +0 -1
- cognee/api/v1/responses/dispatch_function.py +1 -1
- cognee/api/v1/responses/routers/default_tools.py +0 -1
- cognee/api/v1/search/search.py +11 -9
- cognee/api/v1/settings/routers/get_settings_router.py +7 -1
- cognee/api/v1/ui/ui.py +47 -16
- cognee/api/v1/update/routers/get_update_router.py +1 -1
- cognee/api/v1/update/update.py +3 -3
- cognee/cli/_cognee.py +61 -10
- cognee/cli/commands/add_command.py +3 -3
- cognee/cli/commands/cognify_command.py +3 -3
- cognee/cli/commands/config_command.py +9 -7
- cognee/cli/commands/delete_command.py +3 -3
- cognee/cli/commands/search_command.py +3 -7
- cognee/cli/config.py +0 -1
- cognee/context_global_variables.py +5 -0
- cognee/exceptions/exceptions.py +1 -1
- cognee/infrastructure/databases/cache/__init__.py +2 -0
- cognee/infrastructure/databases/cache/cache_db_interface.py +79 -0
- cognee/infrastructure/databases/cache/config.py +44 -0
- cognee/infrastructure/databases/cache/get_cache_engine.py +67 -0
- cognee/infrastructure/databases/cache/redis/RedisAdapter.py +243 -0
- cognee/infrastructure/databases/exceptions/__init__.py +1 -0
- cognee/infrastructure/databases/exceptions/exceptions.py +18 -2
- cognee/infrastructure/databases/graph/get_graph_engine.py +1 -1
- cognee/infrastructure/databases/graph/graph_db_interface.py +5 -0
- cognee/infrastructure/databases/graph/kuzu/adapter.py +67 -44
- cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +13 -3
- cognee/infrastructure/databases/graph/neo4j_driver/deadlock_retry.py +1 -1
- cognee/infrastructure/databases/graph/neptune_driver/neptune_utils.py +1 -1
- cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +1 -1
- cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +21 -3
- cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +17 -10
- cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +17 -4
- cognee/infrastructure/databases/vector/embeddings/config.py +2 -3
- cognee/infrastructure/databases/vector/exceptions/exceptions.py +1 -1
- cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +0 -1
- cognee/infrastructure/files/exceptions.py +1 -1
- cognee/infrastructure/files/storage/LocalFileStorage.py +9 -9
- cognee/infrastructure/files/storage/S3FileStorage.py +11 -11
- cognee/infrastructure/files/utils/guess_file_type.py +6 -0
- cognee/infrastructure/llm/prompts/search_type_selector_prompt.txt +0 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +19 -9
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +17 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +17 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +32 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/__init__.py +0 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +109 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +33 -8
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +40 -18
- cognee/infrastructure/loaders/LoaderEngine.py +27 -7
- cognee/infrastructure/loaders/external/__init__.py +7 -0
- cognee/infrastructure/loaders/external/advanced_pdf_loader.py +2 -8
- cognee/infrastructure/loaders/external/beautiful_soup_loader.py +310 -0
- cognee/infrastructure/loaders/supported_loaders.py +7 -0
- cognee/modules/data/exceptions/exceptions.py +1 -1
- cognee/modules/data/methods/__init__.py +3 -0
- cognee/modules/data/methods/get_dataset_data.py +4 -1
- cognee/modules/data/methods/has_dataset_data.py +21 -0
- cognee/modules/engine/models/TableRow.py +0 -1
- cognee/modules/ingestion/save_data_to_file.py +9 -2
- cognee/modules/pipelines/exceptions/exceptions.py +1 -1
- cognee/modules/pipelines/operations/pipeline.py +12 -1
- cognee/modules/pipelines/operations/run_tasks.py +25 -197
- cognee/modules/pipelines/operations/run_tasks_data_item.py +260 -0
- cognee/modules/pipelines/operations/run_tasks_distributed.py +121 -38
- cognee/modules/retrieval/EntityCompletionRetriever.py +48 -8
- cognee/modules/retrieval/base_graph_retriever.py +3 -1
- cognee/modules/retrieval/base_retriever.py +3 -1
- cognee/modules/retrieval/chunks_retriever.py +5 -1
- cognee/modules/retrieval/code_retriever.py +20 -2
- cognee/modules/retrieval/completion_retriever.py +50 -9
- cognee/modules/retrieval/cypher_search_retriever.py +11 -1
- cognee/modules/retrieval/graph_completion_context_extension_retriever.py +47 -8
- cognee/modules/retrieval/graph_completion_cot_retriever.py +32 -1
- cognee/modules/retrieval/graph_completion_retriever.py +54 -10
- cognee/modules/retrieval/lexical_retriever.py +20 -2
- cognee/modules/retrieval/natural_language_retriever.py +10 -1
- cognee/modules/retrieval/summaries_retriever.py +5 -1
- cognee/modules/retrieval/temporal_retriever.py +62 -10
- cognee/modules/retrieval/user_qa_feedback.py +3 -2
- cognee/modules/retrieval/utils/completion.py +5 -0
- cognee/modules/retrieval/utils/description_to_codepart_search.py +1 -1
- cognee/modules/retrieval/utils/session_cache.py +156 -0
- cognee/modules/search/methods/get_search_type_tools.py +0 -5
- cognee/modules/search/methods/no_access_control_search.py +12 -1
- cognee/modules/search/methods/search.py +34 -2
- cognee/modules/search/types/SearchType.py +0 -1
- cognee/modules/settings/get_settings.py +23 -0
- cognee/modules/users/methods/get_authenticated_user.py +3 -1
- cognee/modules/users/methods/get_default_user.py +1 -6
- cognee/modules/users/roles/methods/create_role.py +2 -2
- cognee/modules/users/tenants/methods/create_tenant.py +2 -2
- cognee/shared/exceptions/exceptions.py +1 -1
- cognee/tasks/codingagents/coding_rule_associations.py +1 -2
- cognee/tasks/documents/exceptions/exceptions.py +1 -1
- cognee/tasks/graph/extract_graph_from_data.py +2 -0
- cognee/tasks/ingestion/data_item_to_text_file.py +3 -3
- cognee/tasks/ingestion/ingest_data.py +11 -5
- cognee/tasks/ingestion/save_data_item_to_storage.py +12 -1
- cognee/tasks/storage/add_data_points.py +3 -10
- cognee/tasks/storage/index_data_points.py +19 -14
- cognee/tasks/storage/index_graph_edges.py +25 -11
- cognee/tasks/web_scraper/__init__.py +34 -0
- cognee/tasks/web_scraper/config.py +26 -0
- cognee/tasks/web_scraper/default_url_crawler.py +446 -0
- cognee/tasks/web_scraper/models.py +46 -0
- cognee/tasks/web_scraper/types.py +4 -0
- cognee/tasks/web_scraper/utils.py +142 -0
- cognee/tasks/web_scraper/web_scraper_task.py +396 -0
- cognee/tests/cli_tests/cli_unit_tests/test_cli_utils.py +0 -1
- cognee/tests/integration/web_url_crawler/test_default_url_crawler.py +13 -0
- cognee/tests/integration/web_url_crawler/test_tavily_crawler.py +19 -0
- cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py +344 -0
- cognee/tests/subprocesses/reader.py +25 -0
- cognee/tests/subprocesses/simple_cognify_1.py +31 -0
- cognee/tests/subprocesses/simple_cognify_2.py +31 -0
- cognee/tests/subprocesses/writer.py +32 -0
- cognee/tests/tasks/descriptive_metrics/metrics_test_utils.py +0 -2
- cognee/tests/tasks/descriptive_metrics/neo4j_metrics_test.py +8 -3
- cognee/tests/tasks/entity_extraction/entity_extraction_test.py +89 -0
- cognee/tests/tasks/web_scraping/web_scraping_test.py +172 -0
- cognee/tests/test_add_docling_document.py +56 -0
- cognee/tests/test_chromadb.py +7 -11
- cognee/tests/test_concurrent_subprocess_access.py +76 -0
- cognee/tests/test_conversation_history.py +240 -0
- cognee/tests/test_kuzu.py +27 -15
- cognee/tests/test_lancedb.py +7 -11
- cognee/tests/test_library.py +32 -2
- cognee/tests/test_neo4j.py +24 -16
- cognee/tests/test_neptune_analytics_vector.py +7 -11
- cognee/tests/test_permissions.py +9 -13
- cognee/tests/test_pgvector.py +4 -4
- cognee/tests/test_remote_kuzu.py +8 -11
- cognee/tests/test_s3_file_storage.py +1 -1
- cognee/tests/test_search_db.py +6 -8
- cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py +89 -0
- cognee/tests/unit/modules/retrieval/conversation_history_test.py +154 -0
- {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/METADATA +22 -7
- {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/RECORD +155 -128
- {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/entry_points.txt +1 -0
- distributed/Dockerfile +0 -3
- distributed/entrypoint.py +21 -9
- distributed/signal.py +5 -0
- distributed/workers/data_point_saving_worker.py +64 -34
- distributed/workers/graph_saving_worker.py +71 -47
- cognee/infrastructure/databases/graph/memgraph/memgraph_adapter.py +0 -1116
- cognee/modules/retrieval/insights_retriever.py +0 -133
- cognee/tests/test_memgraph.py +0 -109
- cognee/tests/unit/modules/retrieval/insights_retriever_test.py +0 -251
- distributed/poetry.lock +0 -12238
- distributed/pyproject.toml +0 -185
- {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/WHEEL +0 -0
- {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/licenses/LICENSE +0 -0
- {cognee-0.3.5.dist-info → cognee-0.3.7.dist-info}/licenses/NOTICE.md +0 -0
cognee/modules/pipelines/operations/run_tasks.py

@@ -4,35 +4,27 @@ import asyncio
 from uuid import UUID
 from typing import Any, List
 from functools import wraps
-from sqlalchemy import select
 
-import cognee.modules.ingestion as ingestion
 from cognee.infrastructure.databases.graph import get_graph_engine
 from cognee.infrastructure.databases.relational import get_relational_engine
 from cognee.modules.pipelines.operations.run_tasks_distributed import run_tasks_distributed
 from cognee.modules.users.models import User
-from cognee.modules.data.models import Data
-from cognee.infrastructure.files.utils.open_data_file import open_data_file
 from cognee.shared.logging_utils import get_logger
 from cognee.modules.users.methods import get_default_user
 from cognee.modules.pipelines.utils import generate_pipeline_id
 from cognee.modules.pipelines.exceptions import PipelineRunFailedError
-from cognee.tasks.ingestion import
+from cognee.tasks.ingestion import resolve_data_directories
 from cognee.modules.pipelines.models.PipelineRunInfo import (
     PipelineRunCompleted,
     PipelineRunErrored,
     PipelineRunStarted,
-    PipelineRunYield,
-    PipelineRunAlreadyCompleted,
 )
-from cognee.modules.pipelines.models.DataItemStatus import DataItemStatus
-
 from cognee.modules.pipelines.operations import (
     log_pipeline_run_start,
     log_pipeline_run_complete,
     log_pipeline_run_error,
 )
-from .
+from .run_tasks_data_item import run_tasks_data_item
 from ..tasks.task import Task
 
 
@@ -67,177 +59,8 @@ async def run_tasks(
     pipeline_name: str = "unknown_pipeline",
     context: dict = None,
     incremental_loading: bool = False,
+    data_per_batch: int = 20,
 ):
-    async def _run_tasks_data_item_incremental(
-        data_item,
-        dataset,
-        tasks,
-        pipeline_name,
-        pipeline_id,
-        pipeline_run_id,
-        context,
-        user,
-    ):
-        db_engine = get_relational_engine()
-        # If incremental_loading of data is set to True don't process documents already processed by pipeline
-        # If data is being added to Cognee for the first time calculate the id of the data
-        if not isinstance(data_item, Data):
-            file_path = await save_data_item_to_storage(data_item)
-            # Ingest data and add metadata
-            async with open_data_file(file_path) as file:
-                classified_data = ingestion.classify(file)
-            # data_id is the hash of file contents + owner id to avoid duplicate data
-            data_id = ingestion.identify(classified_data, user)
-        else:
-            # If data was already processed by Cognee get data id
-            data_id = data_item.id
-
-        # Check pipeline status, if Data already processed for pipeline before skip current processing
-        async with db_engine.get_async_session() as session:
-            data_point = (
-                await session.execute(select(Data).filter(Data.id == data_id))
-            ).scalar_one_or_none()
-            if data_point:
-                if (
-                    data_point.pipeline_status.get(pipeline_name, {}).get(str(dataset.id))
-                    == DataItemStatus.DATA_ITEM_PROCESSING_COMPLETED
-                ):
-                    yield {
-                        "run_info": PipelineRunAlreadyCompleted(
-                            pipeline_run_id=pipeline_run_id,
-                            dataset_id=dataset.id,
-                            dataset_name=dataset.name,
-                        ),
-                        "data_id": data_id,
-                    }
-                    return
-
-        try:
-            # Process data based on data_item and list of tasks
-            async for result in run_tasks_with_telemetry(
-                tasks=tasks,
-                data=[data_item],
-                user=user,
-                pipeline_name=pipeline_id,
-                context=context,
-            ):
-                yield PipelineRunYield(
-                    pipeline_run_id=pipeline_run_id,
-                    dataset_id=dataset.id,
-                    dataset_name=dataset.name,
-                    payload=result,
-                )
-
-            # Update pipeline status for Data element
-            async with db_engine.get_async_session() as session:
-                data_point = (
-                    await session.execute(select(Data).filter(Data.id == data_id))
-                ).scalar_one_or_none()
-                data_point.pipeline_status[pipeline_name] = {
-                    str(dataset.id): DataItemStatus.DATA_ITEM_PROCESSING_COMPLETED
-                }
-                await session.merge(data_point)
-                await session.commit()
-
-            yield {
-                "run_info": PipelineRunCompleted(
-                    pipeline_run_id=pipeline_run_id,
-                    dataset_id=dataset.id,
-                    dataset_name=dataset.name,
-                ),
-                "data_id": data_id,
-            }
-
-        except Exception as error:
-            # Temporarily swallow error and try to process rest of documents first, then re-raise error at end of data ingestion pipeline
-            logger.error(
-                f"Exception caught while processing data: {error}.\n Data processing failed for data item: {data_item}."
-            )
-            yield {
-                "run_info": PipelineRunErrored(
-                    pipeline_run_id=pipeline_run_id,
-                    payload=repr(error),
-                    dataset_id=dataset.id,
-                    dataset_name=dataset.name,
-                ),
-                "data_id": data_id,
-            }
-
-            if os.getenv("RAISE_INCREMENTAL_LOADING_ERRORS", "true").lower() == "true":
-                raise error
-
-    async def _run_tasks_data_item_regular(
-        data_item,
-        dataset,
-        tasks,
-        pipeline_id,
-        pipeline_run_id,
-        context,
-        user,
-    ):
-        # Process data based on data_item and list of tasks
-        async for result in run_tasks_with_telemetry(
-            tasks=tasks,
-            data=[data_item],
-            user=user,
-            pipeline_name=pipeline_id,
-            context=context,
-        ):
-            yield PipelineRunYield(
-                pipeline_run_id=pipeline_run_id,
-                dataset_id=dataset.id,
-                dataset_name=dataset.name,
-                payload=result,
-            )
-
-        yield {
-            "run_info": PipelineRunCompleted(
-                pipeline_run_id=pipeline_run_id,
-                dataset_id=dataset.id,
-                dataset_name=dataset.name,
-            )
-        }
-
-    async def _run_tasks_data_item(
-        data_item,
-        dataset,
-        tasks,
-        pipeline_name,
-        pipeline_id,
-        pipeline_run_id,
-        context,
-        user,
-        incremental_loading,
-    ):
-        # Go through async generator and return data item processing result. Result can be PipelineRunAlreadyCompleted when data item is skipped,
-        # PipelineRunCompleted when processing was successful and PipelineRunErrored if there were issues
-        result = None
-        if incremental_loading:
-            async for result in _run_tasks_data_item_incremental(
-                data_item=data_item,
-                dataset=dataset,
-                tasks=tasks,
-                pipeline_name=pipeline_name,
-                pipeline_id=pipeline_id,
-                pipeline_run_id=pipeline_run_id,
-                context=context,
-                user=user,
-            ):
-                pass
-        else:
-            async for result in _run_tasks_data_item_regular(
-                data_item=data_item,
-                dataset=dataset,
-                tasks=tasks,
-                pipeline_id=pipeline_id,
-                pipeline_run_id=pipeline_run_id,
-                context=context,
-                user=user,
-            ):
-                pass
-
-        return result
-
     if not user:
         user = await get_default_user()
 
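The removed block above updated a data item's status with `data_point.pipeline_status[pipeline_name] = {str(dataset.id): ...}`, which replaces the whole per-pipeline mapping. The relocated helper in the new `run_tasks_data_item.py` (shown further below) switches to `setdefault`, so completion records for other datasets under the same pipeline are kept. A minimal, self-contained illustration of that difference, with made-up pipeline and dataset names:

```python
# Illustrative only: shows why the status update moved to setdefault().
pipeline_status = {"cognify_pipeline": {"dataset-a": "DATA_ITEM_PROCESSING_COMPLETED"}}

# Old behaviour: assigning a fresh dict drops the record for dataset-a.
pipeline_status["cognify_pipeline"] = {"dataset-b": "DATA_ITEM_PROCESSING_COMPLETED"}
assert "dataset-a" not in pipeline_status["cognify_pipeline"]

# New behaviour: merge into the existing per-pipeline mapping.
pipeline_status = {"cognify_pipeline": {"dataset-a": "DATA_ITEM_PROCESSING_COMPLETED"}}
status_for_pipeline = pipeline_status.setdefault("cognify_pipeline", {})
status_for_pipeline["dataset-b"] = "DATA_ITEM_PROCESSING_COMPLETED"
assert set(pipeline_status["cognify_pipeline"]) == {"dataset-a", "dataset-b"}
```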
@@ -266,24 +89,29 @@ async def run_tasks(
     if incremental_loading:
         data = await resolve_data_directories(data)
 
-    # Create async tasks
-
-
-
-
-
-
-
-
-
-
-
-
+    # Create and gather batches of async tasks of data items that will run the pipeline for the data item
+    results = []
+    for start in range(0, len(data), data_per_batch):
+        data_batch = data[start : start + data_per_batch]
+
+        data_item_tasks = [
+            asyncio.create_task(
+                run_tasks_data_item(
+                    data_item,
+                    dataset,
+                    tasks,
+                    pipeline_name,
+                    pipeline_id,
+                    pipeline_run_id,
+                    context,
+                    user,
+                    incremental_loading,
+                )
             )
-
-
-
-
+            for data_item in data_batch
+        ]
+
+        results.extend(await asyncio.gather(*data_item_tasks))
 
     # Remove skipped data items from results
     results = [result for result in results if result]
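The added block batches the per-item work: it creates at most `data_per_batch` tasks at a time and awaits each batch before starting the next, so no more than `data_per_batch` data items are in flight concurrently. A standalone sketch of the same pattern, with a placeholder coroutine standing in for `run_tasks_data_item` (names and values here are illustrative, not taken from the package):

```python
import asyncio


async def process_item(item: int) -> int:
    await asyncio.sleep(0)  # stand-in for the per-item pipeline work
    return item * 2


async def run_in_batches(data: list[int], data_per_batch: int = 20) -> list[int]:
    results: list[int] = []
    for start in range(0, len(data), data_per_batch):
        batch = data[start : start + data_per_batch]
        tasks = [asyncio.create_task(process_item(item)) for item in batch]
        # Each batch is awaited before the next one starts, capping concurrency.
        results.extend(await asyncio.gather(*tasks))
    return results


print(asyncio.run(run_in_batches(list(range(50)), data_per_batch=20)))
```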
cognee/modules/pipelines/operations/run_tasks_data_item.py (new file)

@@ -0,0 +1,260 @@
+"""
+Data item processing functions for pipeline operations.
+
+This module contains reusable functions for processing individual data items
+within pipeline operations, supporting both incremental and regular processing modes.
+"""
+
+import os
+from typing import Any, Dict, AsyncGenerator, Optional
+from sqlalchemy import select
+
+import cognee.modules.ingestion as ingestion
+from cognee.infrastructure.databases.relational import get_relational_engine
+from cognee.infrastructure.files.utils.open_data_file import open_data_file
+from cognee.shared.logging_utils import get_logger
+from cognee.modules.users.models import User
+from cognee.modules.data.models import Data, Dataset
+from cognee.tasks.ingestion import save_data_item_to_storage
+from cognee.modules.pipelines.models.PipelineRunInfo import (
+    PipelineRunCompleted,
+    PipelineRunErrored,
+    PipelineRunYield,
+    PipelineRunAlreadyCompleted,
+)
+from cognee.modules.pipelines.models.DataItemStatus import DataItemStatus
+from cognee.modules.pipelines.operations.run_tasks_with_telemetry import run_tasks_with_telemetry
+from ..tasks.task import Task
+
+logger = get_logger("run_tasks_data_item")
+
+
+async def run_tasks_data_item_incremental(
+    data_item: Any,
+    dataset: Dataset,
+    tasks: list[Task],
+    pipeline_name: str,
+    pipeline_id: str,
+    pipeline_run_id: str,
+    context: Optional[Dict[str, Any]],
+    user: User,
+) -> AsyncGenerator[Dict[str, Any], None]:
+    """
+    Process a single data item with incremental loading support.
+
+    This function handles incremental processing by checking if the data item
+    has already been processed for the given pipeline and dataset. If it has,
+    it skips processing and returns a completion status.
+
+    Args:
+        data_item: The data item to process
+        dataset: The dataset containing the data item
+        tasks: List of tasks to execute on the data item
+        pipeline_name: Name of the pipeline
+        pipeline_id: Unique identifier for the pipeline
+        pipeline_run_id: Unique identifier for this pipeline run
+        context: Optional context dictionary
+        user: User performing the operation
+
+    Yields:
+        Dict containing run_info and data_id for each processing step
+    """
+    db_engine = get_relational_engine()
+
+    # If incremental_loading of data is set to True don't process documents already processed by pipeline
+    # If data is being added to Cognee for the first time calculate the id of the data
+    if not isinstance(data_item, Data):
+        file_path = await save_data_item_to_storage(data_item)
+        # Ingest data and add metadata
+        async with open_data_file(file_path) as file:
+            classified_data = ingestion.classify(file)
+        # data_id is the hash of file contents + owner id to avoid duplicate data
+        data_id = ingestion.identify(classified_data, user)
+    else:
+        # If data was already processed by Cognee get data id
+        data_id = data_item.id
+
+    # Check pipeline status, if Data already processed for pipeline before skip current processing
+    async with db_engine.get_async_session() as session:
+        data_point = (
+            await session.execute(select(Data).filter(Data.id == data_id))
+        ).scalar_one_or_none()
+        if data_point:
+            if (
+                data_point.pipeline_status.get(pipeline_name, {}).get(str(dataset.id))
+                == DataItemStatus.DATA_ITEM_PROCESSING_COMPLETED
+            ):
+                yield {
+                    "run_info": PipelineRunAlreadyCompleted(
+                        pipeline_run_id=pipeline_run_id,
+                        dataset_id=dataset.id,
+                        dataset_name=dataset.name,
+                    ),
+                    "data_id": data_id,
+                }
+                return
+
+    try:
+        # Process data based on data_item and list of tasks
+        async for result in run_tasks_with_telemetry(
+            tasks=tasks,
+            data=[data_item],
+            user=user,
+            pipeline_name=pipeline_id,
+            context=context,
+        ):
+            yield PipelineRunYield(
+                pipeline_run_id=pipeline_run_id,
+                dataset_id=dataset.id,
+                dataset_name=dataset.name,
+                payload=result,
+            )
+
+        # Update pipeline status for Data element
+        async with db_engine.get_async_session() as session:
+            data_point = (
+                await session.execute(select(Data).filter(Data.id == data_id))
+            ).scalar_one_or_none()
+            status_for_pipeline = data_point.pipeline_status.setdefault(pipeline_name, {})
+            status_for_pipeline[str(dataset.id)] = DataItemStatus.DATA_ITEM_PROCESSING_COMPLETED
+            await session.merge(data_point)
+            await session.commit()
+
+        yield {
+            "run_info": PipelineRunCompleted(
+                pipeline_run_id=pipeline_run_id,
+                dataset_id=dataset.id,
+                dataset_name=dataset.name,
+            ),
+            "data_id": data_id,
+        }
+
+    except Exception as error:
+        # Temporarily swallow error and try to process rest of documents first, then re-raise error at end of data ingestion pipeline
+        logger.error(
+            f"Exception caught while processing data: {error}.\n Data processing failed for data item: {data_item}."
+        )
+        yield {
+            "run_info": PipelineRunErrored(
+                pipeline_run_id=pipeline_run_id,
+                payload=repr(error),
+                dataset_id=dataset.id,
+                dataset_name=dataset.name,
+            ),
+            "data_id": data_id,
+        }
+
+        if os.getenv("RAISE_INCREMENTAL_LOADING_ERRORS", "true").lower() == "true":
+            raise error
+
+
+async def run_tasks_data_item_regular(
+    data_item: Any,
+    dataset: Dataset,
+    tasks: list[Task],
+    pipeline_id: str,
+    pipeline_run_id: str,
+    context: Optional[Dict[str, Any]],
+    user: User,
+) -> AsyncGenerator[Dict[str, Any], None]:
+    """
+    Process a single data item in regular (non-incremental) mode.
+
+    This function processes a data item without checking for previous processing
+    status, executing all tasks on the data item.
+
+    Args:
+        data_item: The data item to process
+        dataset: The dataset containing the data item
+        tasks: List of tasks to execute on the data item
+        pipeline_id: Unique identifier for the pipeline
+        pipeline_run_id: Unique identifier for this pipeline run
+        context: Optional context dictionary
+        user: User performing the operation
+
+    Yields:
+        Dict containing run_info for each processing step
+    """
+    # Process data based on data_item and list of tasks
+    async for result in run_tasks_with_telemetry(
+        tasks=tasks,
+        data=[data_item],
+        user=user,
+        pipeline_name=pipeline_id,
+        context=context,
+    ):
+        yield PipelineRunYield(
+            pipeline_run_id=pipeline_run_id,
+            dataset_id=dataset.id,
+            dataset_name=dataset.name,
+            payload=result,
+        )
+
+    yield {
+        "run_info": PipelineRunCompleted(
+            pipeline_run_id=pipeline_run_id,
+            dataset_id=dataset.id,
+            dataset_name=dataset.name,
+        )
+    }
+
+
+async def run_tasks_data_item(
+    data_item: Any,
+    dataset: Dataset,
+    tasks: list[Task],
+    pipeline_name: str,
+    pipeline_id: str,
+    pipeline_run_id: str,
+    context: Optional[Dict[str, Any]],
+    user: User,
+    incremental_loading: bool,
+) -> Optional[Dict[str, Any]]:
+    """
+    Process a single data item, choosing between incremental and regular processing.
+
+    This is the main entry point for data item processing that delegates to either
+    incremental or regular processing based on the incremental_loading flag.
+
+    Args:
+        data_item: The data item to process
+        dataset: The dataset containing the data item
+        tasks: List of tasks to execute on the data item
+        pipeline_name: Name of the pipeline
+        pipeline_id: Unique identifier for the pipeline
+        pipeline_run_id: Unique identifier for this pipeline run
+        context: Optional context dictionary
+        user: User performing the operation
+        incremental_loading: Whether to use incremental processing
+
+    Returns:
+        Dict containing the final processing result, or None if processing was skipped
+    """
+    # Go through async generator and return data item processing result. Result can be PipelineRunAlreadyCompleted when data item is skipped,
+    # PipelineRunCompleted when processing was successful and PipelineRunErrored if there were issues
+    result = None
+    if incremental_loading:
+        async for result in run_tasks_data_item_incremental(
+            data_item=data_item,
+            dataset=dataset,
+            tasks=tasks,
+            pipeline_name=pipeline_name,
+            pipeline_id=pipeline_id,
+            pipeline_run_id=pipeline_run_id,
+            context=context,
+            user=user,
+        ):
+            pass
+    else:
+        async for result in run_tasks_data_item_regular(
+            data_item=data_item,
+            dataset=dataset,
+            tasks=tasks,
+            pipeline_id=pipeline_id,
+            pipeline_run_id=pipeline_run_id,
+            context=context,
+            user=user,
+        ):
+            pass
+
+    return result
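`run_tasks_data_item` drains its helper generators with `async for result in ...: pass` and returns only the last yielded dict, which is why `run_tasks` receives a single final `run_info` per data item rather than every intermediate `PipelineRunYield`. A generic sketch of that drain-and-keep-last pattern (the generator and its payloads below are invented for illustration):

```python
import asyncio
from typing import AsyncGenerator, Optional


async def item_steps() -> AsyncGenerator[dict, None]:
    # Stand-in for run_tasks_data_item_incremental/_regular output.
    yield {"run_info": "started"}
    yield {"run_info": "completed", "data_id": "1234"}


async def last_yield() -> Optional[dict]:
    result = None
    async for result in item_steps():
        pass  # intermediate yields are ignored; only the final one is kept
    return result


print(asyncio.run(last_yield()))  # -> {'run_info': 'completed', 'data_id': '1234'}
```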