cognee 0.2.3.dev1__py3-none-any.whl → 0.3.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cognee/__init__.py +1 -0
- cognee/__main__.py +4 -0
- cognee/api/client.py +28 -3
- cognee/api/health.py +10 -13
- cognee/api/v1/add/add.py +20 -6
- cognee/api/v1/add/routers/get_add_router.py +12 -37
- cognee/api/v1/cloud/routers/__init__.py +1 -0
- cognee/api/v1/cloud/routers/get_checks_router.py +23 -0
- cognee/api/v1/cognify/code_graph_pipeline.py +14 -3
- cognee/api/v1/cognify/cognify.py +67 -105
- cognee/api/v1/cognify/routers/get_cognify_router.py +11 -3
- cognee/api/v1/datasets/routers/get_datasets_router.py +16 -5
- cognee/api/v1/memify/routers/__init__.py +1 -0
- cognee/api/v1/memify/routers/get_memify_router.py +100 -0
- cognee/api/v1/notebooks/routers/__init__.py +1 -0
- cognee/api/v1/notebooks/routers/get_notebooks_router.py +96 -0
- cognee/api/v1/responses/default_tools.py +4 -0
- cognee/api/v1/responses/dispatch_function.py +6 -1
- cognee/api/v1/responses/models.py +1 -1
- cognee/api/v1/search/routers/get_search_router.py +20 -1
- cognee/api/v1/search/search.py +17 -4
- cognee/api/v1/sync/__init__.py +17 -0
- cognee/api/v1/sync/routers/__init__.py +3 -0
- cognee/api/v1/sync/routers/get_sync_router.py +241 -0
- cognee/api/v1/sync/sync.py +877 -0
- cognee/api/v1/users/routers/get_auth_router.py +13 -1
- cognee/base_config.py +10 -1
- cognee/cli/__init__.py +10 -0
- cognee/cli/_cognee.py +180 -0
- cognee/cli/commands/__init__.py +1 -0
- cognee/cli/commands/add_command.py +80 -0
- cognee/cli/commands/cognify_command.py +128 -0
- cognee/cli/commands/config_command.py +225 -0
- cognee/cli/commands/delete_command.py +80 -0
- cognee/cli/commands/search_command.py +149 -0
- cognee/cli/config.py +33 -0
- cognee/cli/debug.py +21 -0
- cognee/cli/echo.py +45 -0
- cognee/cli/exceptions.py +23 -0
- cognee/cli/minimal_cli.py +97 -0
- cognee/cli/reference.py +26 -0
- cognee/cli/suppress_logging.py +12 -0
- cognee/eval_framework/corpus_builder/corpus_builder_executor.py +2 -2
- cognee/eval_framework/eval_config.py +1 -1
- cognee/infrastructure/databases/graph/config.py +10 -4
- cognee/infrastructure/databases/graph/get_graph_engine.py +4 -9
- cognee/infrastructure/databases/graph/kuzu/adapter.py +199 -2
- cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +138 -0
- cognee/infrastructure/databases/relational/__init__.py +2 -0
- cognee/infrastructure/databases/relational/get_async_session.py +15 -0
- cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +6 -1
- cognee/infrastructure/databases/relational/with_async_session.py +25 -0
- cognee/infrastructure/databases/vector/chromadb/ChromaDBAdapter.py +1 -1
- cognee/infrastructure/databases/vector/config.py +13 -6
- cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +6 -4
- cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +16 -7
- cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +5 -5
- cognee/infrastructure/databases/vector/embeddings/config.py +2 -2
- cognee/infrastructure/databases/vector/embeddings/embedding_rate_limiter.py +2 -6
- cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py +10 -7
- cognee/infrastructure/files/storage/LocalFileStorage.py +9 -0
- cognee/infrastructure/files/storage/S3FileStorage.py +5 -0
- cognee/infrastructure/files/storage/StorageManager.py +7 -1
- cognee/infrastructure/files/storage/storage.py +16 -0
- cognee/infrastructure/files/utils/get_data_file_path.py +14 -9
- cognee/infrastructure/files/utils/get_file_metadata.py +2 -1
- cognee/infrastructure/llm/LLMGateway.py +32 -5
- cognee/infrastructure/llm/config.py +6 -4
- cognee/infrastructure/llm/prompts/extract_query_time.txt +15 -0
- cognee/infrastructure/llm/prompts/generate_event_entity_prompt.txt +25 -0
- cognee/infrastructure/llm/prompts/generate_event_graph_prompt.txt +30 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/knowledge_graph/extract_content_graph.py +16 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/__init__.py +2 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/extract_event_entities.py +44 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/knowledge_graph/__init__.py +1 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/knowledge_graph/extract_content_graph.py +19 -15
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/knowledge_graph/extract_event_graph.py +46 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +3 -3
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +3 -3
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +2 -2
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +14 -8
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +6 -4
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +28 -4
- cognee/infrastructure/llm/tokenizer/Gemini/adapter.py +2 -2
- cognee/infrastructure/llm/tokenizer/HuggingFace/adapter.py +3 -3
- cognee/infrastructure/llm/tokenizer/Mistral/adapter.py +3 -3
- cognee/infrastructure/llm/tokenizer/TikToken/adapter.py +6 -6
- cognee/infrastructure/llm/utils.py +7 -7
- cognee/infrastructure/utils/run_sync.py +8 -1
- cognee/modules/chunking/models/DocumentChunk.py +4 -3
- cognee/modules/cloud/exceptions/CloudApiKeyMissingError.py +15 -0
- cognee/modules/cloud/exceptions/CloudConnectionError.py +15 -0
- cognee/modules/cloud/exceptions/__init__.py +2 -0
- cognee/modules/cloud/operations/__init__.py +1 -0
- cognee/modules/cloud/operations/check_api_key.py +25 -0
- cognee/modules/data/deletion/prune_system.py +1 -1
- cognee/modules/data/methods/__init__.py +2 -0
- cognee/modules/data/methods/check_dataset_name.py +1 -1
- cognee/modules/data/methods/create_authorized_dataset.py +19 -0
- cognee/modules/data/methods/get_authorized_dataset.py +11 -5
- cognee/modules/data/methods/get_authorized_dataset_by_name.py +16 -0
- cognee/modules/data/methods/get_dataset_data.py +1 -1
- cognee/modules/data/methods/load_or_create_datasets.py +2 -20
- cognee/modules/engine/models/Event.py +16 -0
- cognee/modules/engine/models/Interval.py +8 -0
- cognee/modules/engine/models/Timestamp.py +13 -0
- cognee/modules/engine/models/__init__.py +3 -0
- cognee/modules/engine/utils/__init__.py +2 -0
- cognee/modules/engine/utils/generate_event_datapoint.py +46 -0
- cognee/modules/engine/utils/generate_timestamp_datapoint.py +51 -0
- cognee/modules/graph/cognee_graph/CogneeGraph.py +2 -2
- cognee/modules/graph/methods/get_formatted_graph_data.py +3 -2
- cognee/modules/graph/utils/__init__.py +1 -0
- cognee/modules/graph/utils/resolve_edges_to_text.py +71 -0
- cognee/modules/memify/__init__.py +1 -0
- cognee/modules/memify/memify.py +118 -0
- cognee/modules/notebooks/methods/__init__.py +5 -0
- cognee/modules/notebooks/methods/create_notebook.py +26 -0
- cognee/modules/notebooks/methods/delete_notebook.py +13 -0
- cognee/modules/notebooks/methods/get_notebook.py +21 -0
- cognee/modules/notebooks/methods/get_notebooks.py +18 -0
- cognee/modules/notebooks/methods/update_notebook.py +17 -0
- cognee/modules/notebooks/models/Notebook.py +53 -0
- cognee/modules/notebooks/models/__init__.py +1 -0
- cognee/modules/notebooks/operations/__init__.py +1 -0
- cognee/modules/notebooks/operations/run_in_local_sandbox.py +55 -0
- cognee/modules/pipelines/__init__.py +1 -1
- cognee/modules/pipelines/exceptions/tasks.py +18 -0
- cognee/modules/pipelines/layers/__init__.py +1 -0
- cognee/modules/pipelines/layers/check_pipeline_run_qualification.py +59 -0
- cognee/modules/pipelines/layers/pipeline_execution_mode.py +127 -0
- cognee/modules/pipelines/layers/reset_dataset_pipeline_run_status.py +28 -0
- cognee/modules/pipelines/layers/resolve_authorized_user_dataset.py +34 -0
- cognee/modules/pipelines/layers/resolve_authorized_user_datasets.py +55 -0
- cognee/modules/pipelines/layers/setup_and_check_environment.py +41 -0
- cognee/modules/pipelines/layers/validate_pipeline_tasks.py +20 -0
- cognee/modules/pipelines/methods/__init__.py +2 -0
- cognee/modules/pipelines/methods/get_pipeline_runs_by_dataset.py +34 -0
- cognee/modules/pipelines/methods/reset_pipeline_run_status.py +16 -0
- cognee/modules/pipelines/operations/__init__.py +0 -1
- cognee/modules/pipelines/operations/log_pipeline_run_initiated.py +1 -1
- cognee/modules/pipelines/operations/pipeline.py +24 -138
- cognee/modules/pipelines/operations/run_tasks.py +17 -41
- cognee/modules/retrieval/base_feedback.py +11 -0
- cognee/modules/retrieval/base_graph_retriever.py +18 -0
- cognee/modules/retrieval/base_retriever.py +1 -1
- cognee/modules/retrieval/code_retriever.py +8 -0
- cognee/modules/retrieval/coding_rules_retriever.py +31 -0
- cognee/modules/retrieval/completion_retriever.py +9 -3
- cognee/modules/retrieval/context_providers/TripletSearchContextProvider.py +1 -0
- cognee/modules/retrieval/cypher_search_retriever.py +1 -9
- cognee/modules/retrieval/graph_completion_context_extension_retriever.py +29 -13
- cognee/modules/retrieval/graph_completion_cot_retriever.py +30 -13
- cognee/modules/retrieval/graph_completion_retriever.py +107 -56
- cognee/modules/retrieval/graph_summary_completion_retriever.py +5 -1
- cognee/modules/retrieval/insights_retriever.py +14 -3
- cognee/modules/retrieval/natural_language_retriever.py +0 -4
- cognee/modules/retrieval/summaries_retriever.py +1 -1
- cognee/modules/retrieval/temporal_retriever.py +152 -0
- cognee/modules/retrieval/user_qa_feedback.py +83 -0
- cognee/modules/retrieval/utils/brute_force_triplet_search.py +7 -32
- cognee/modules/retrieval/utils/completion.py +10 -3
- cognee/modules/retrieval/utils/extract_uuid_from_node.py +18 -0
- cognee/modules/retrieval/utils/models.py +40 -0
- cognee/modules/search/methods/get_search_type_tools.py +168 -0
- cognee/modules/search/methods/no_access_control_search.py +47 -0
- cognee/modules/search/methods/search.py +239 -118
- cognee/modules/search/types/SearchResult.py +21 -0
- cognee/modules/search/types/SearchType.py +3 -0
- cognee/modules/search/types/__init__.py +1 -0
- cognee/modules/search/utils/__init__.py +2 -0
- cognee/modules/search/utils/prepare_search_result.py +41 -0
- cognee/modules/search/utils/transform_context_to_graph.py +38 -0
- cognee/modules/settings/get_settings.py +2 -2
- cognee/modules/sync/__init__.py +1 -0
- cognee/modules/sync/methods/__init__.py +23 -0
- cognee/modules/sync/methods/create_sync_operation.py +53 -0
- cognee/modules/sync/methods/get_sync_operation.py +107 -0
- cognee/modules/sync/methods/update_sync_operation.py +248 -0
- cognee/modules/sync/models/SyncOperation.py +142 -0
- cognee/modules/sync/models/__init__.py +3 -0
- cognee/modules/users/__init__.py +0 -1
- cognee/modules/users/methods/__init__.py +4 -1
- cognee/modules/users/methods/create_user.py +26 -1
- cognee/modules/users/methods/get_authenticated_user.py +36 -42
- cognee/modules/users/methods/get_default_user.py +3 -1
- cognee/modules/users/permissions/methods/get_specific_user_permission_datasets.py +2 -1
- cognee/root_dir.py +19 -0
- cognee/shared/CodeGraphEntities.py +1 -0
- cognee/shared/logging_utils.py +143 -32
- cognee/shared/utils.py +0 -1
- cognee/tasks/codingagents/coding_rule_associations.py +127 -0
- cognee/tasks/graph/extract_graph_from_data.py +6 -2
- cognee/tasks/ingestion/save_data_item_to_storage.py +23 -0
- cognee/tasks/memify/__init__.py +2 -0
- cognee/tasks/memify/extract_subgraph.py +7 -0
- cognee/tasks/memify/extract_subgraph_chunks.py +11 -0
- cognee/tasks/repo_processor/get_local_dependencies.py +2 -0
- cognee/tasks/repo_processor/get_repo_file_dependencies.py +144 -47
- cognee/tasks/storage/add_data_points.py +33 -3
- cognee/tasks/temporal_graph/__init__.py +1 -0
- cognee/tasks/temporal_graph/add_entities_to_event.py +85 -0
- cognee/tasks/temporal_graph/enrich_events.py +34 -0
- cognee/tasks/temporal_graph/extract_events_and_entities.py +32 -0
- cognee/tasks/temporal_graph/extract_knowledge_graph_from_events.py +41 -0
- cognee/tasks/temporal_graph/models.py +49 -0
- cognee/tests/integration/cli/__init__.py +3 -0
- cognee/tests/integration/cli/test_cli_integration.py +331 -0
- cognee/tests/integration/documents/PdfDocument_test.py +2 -2
- cognee/tests/integration/documents/TextDocument_test.py +2 -4
- cognee/tests/integration/documents/UnstructuredDocument_test.py +5 -8
- cognee/tests/{test_deletion.py → test_delete_hard.py} +0 -37
- cognee/tests/test_delete_soft.py +85 -0
- cognee/tests/test_kuzu.py +2 -2
- cognee/tests/test_neo4j.py +2 -2
- cognee/tests/test_permissions.py +3 -3
- cognee/tests/test_relational_db_migration.py +7 -5
- cognee/tests/test_search_db.py +136 -23
- cognee/tests/test_temporal_graph.py +167 -0
- cognee/tests/unit/api/__init__.py +1 -0
- cognee/tests/unit/api/test_conditional_authentication_endpoints.py +246 -0
- cognee/tests/unit/cli/__init__.py +3 -0
- cognee/tests/unit/cli/test_cli_commands.py +483 -0
- cognee/tests/unit/cli/test_cli_edge_cases.py +625 -0
- cognee/tests/unit/cli/test_cli_main.py +173 -0
- cognee/tests/unit/cli/test_cli_runner.py +62 -0
- cognee/tests/unit/cli/test_cli_utils.py +127 -0
- cognee/tests/unit/modules/retrieval/chunks_retriever_test.py +18 -2
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_context_extension_test.py +12 -15
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +10 -15
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py +4 -3
- cognee/tests/unit/modules/retrieval/insights_retriever_test.py +4 -2
- cognee/tests/unit/modules/retrieval/rag_completion_retriever_test.py +18 -2
- cognee/tests/unit/modules/retrieval/temporal_retriever_test.py +225 -0
- cognee/tests/unit/modules/users/__init__.py +1 -0
- cognee/tests/unit/modules/users/test_conditional_authentication.py +277 -0
- cognee/tests/unit/processing/utils/utils_test.py +20 -1
- {cognee-0.2.3.dev1.dist-info → cognee-0.3.0.dev0.dist-info}/METADATA +13 -9
- {cognee-0.2.3.dev1.dist-info → cognee-0.3.0.dev0.dist-info}/RECORD +245 -135
- cognee-0.3.0.dev0.dist-info/entry_points.txt +2 -0
- cognee/infrastructure/databases/graph/networkx/adapter.py +0 -1017
- cognee/infrastructure/pipeline/models/Operation.py +0 -60
- cognee/notebooks/github_analysis_step_by_step.ipynb +0 -37
- cognee/tests/tasks/descriptive_metrics/networkx_metrics_test.py +0 -7
- cognee/tests/unit/modules/search/search_methods_test.py +0 -223
- /cognee/{infrastructure/databases/graph/networkx → api/v1/memify}/__init__.py +0 -0
- /cognee/{infrastructure/pipeline/models → tasks/codingagents}/__init__.py +0 -0
- {cognee-0.2.3.dev1.dist-info → cognee-0.3.0.dev0.dist-info}/WHEEL +0 -0
- {cognee-0.2.3.dev1.dist-info → cognee-0.3.0.dev0.dist-info}/licenses/LICENSE +0 -0
- {cognee-0.2.3.dev1.dist-info → cognee-0.3.0.dev0.dist-info}/licenses/NOTICE.md +0 -0
cognee/modules/pipelines/layers/pipeline_execution_mode.py
@@ -0,0 +1,127 @@
+import asyncio
+from typing import Any, AsyncIterable, AsyncGenerator, Callable, Dict, Union, Awaitable
+from cognee.modules.pipelines.models.PipelineRunInfo import PipelineRunCompleted, PipelineRunErrored
+from cognee.modules.pipelines.queues.pipeline_run_info_queues import push_to_queue
+
+AsyncGenLike = Union[
+    AsyncIterable[Any],
+    AsyncGenerator[Any, None],
+    Callable[..., AsyncIterable[Any]],
+    Callable[..., AsyncGenerator[Any, None]],
+]
+
+
+async def run_pipeline_blocking(pipeline: AsyncGenLike, **params) -> Dict[str, Any]:
+    """
+    Execute a pipeline synchronously (blocking until all results are consumed).
+
+    This function iterates through the given pipeline (an async generator/iterable)
+    until completion, aggregating the run information for each dataset.
+
+    Args:
+        pipeline (AsyncGenLike): The pipeline generator or callable producing async run information.
+        **params: Arbitrary keyword arguments to be passed to the pipeline if it is callable.
+
+    Returns:
+        Dict[str, Any]:
+            - If multiple datasets are processed, a mapping of dataset_id -> last run_info.
+            - If no dataset_id is present in run_info, the run_info itself is returned.
+    """
+    agen = pipeline(**params) if callable(pipeline) else pipeline
+
+    total_run_info: Dict[str, Any] = {}
+
+    async for run_info in agen:
+        dataset_id = getattr(run_info, "dataset_id", None)
+        if dataset_id:
+            total_run_info[dataset_id] = run_info
+        else:
+            total_run_info = run_info
+
+    return total_run_info
+
+
+async def run_pipeline_as_background_process(
+    pipeline: AsyncGenLike,
+    **params,
+) -> Dict[str, Any]:
+    """
+    Execute one or more pipelines as background tasks.
+
+    This function:
+    1. Starts pipelines for each dataset (if multiple datasets are provided).
+    2. Returns the initial "started" run information immediately.
+    3. Continues executing the pipelines in the background,
+       pushing run updates to a queue until each completes.
+
+    Args:
+        pipeline (AsyncGenLike): The pipeline generator or callable producing async run information.
+        **params: Arbitrary keyword arguments to be passed to the pipeline if it is callable.
+            Expected to include "datasets", which may be a single dataset ID (str)
+            or a list of dataset IDs.
+
+    Returns:
+        Dict[str, Any]: A mapping of dataset_id -> initial run_info (with payload removed for serialization).
+    """
+
+    datasets = params.get("datasets", None)
+
+    if isinstance(datasets, str):
+        datasets = [datasets]
+
+    pipeline_run_started_info = {}
+
+    async def handle_rest_of_the_run(pipeline_list):
+        # Execute all provided pipelines one by one to avoid database write conflicts
+        # TODO: Convert to async gather task instead of for loop when Queue mechanism for database is created
+        for pipeline in pipeline_list:
+            while True:
+                try:
+                    pipeline_run_info = await anext(pipeline)
+
+                    push_to_queue(pipeline_run_info.pipeline_run_id, pipeline_run_info)
+
+                    if isinstance(pipeline_run_info, PipelineRunCompleted) or isinstance(
+                        pipeline_run_info, PipelineRunErrored
+                    ):
+                        break
+                except StopAsyncIteration:
+                    break
+
+    # Start all pipelines to get started status
+    pipeline_list = []
+    for dataset in datasets:
+        call_params = dict(params)
+        if "datasets" in call_params:
+            call_params["datasets"] = dataset
+
+        pipeline_run = pipeline(**call_params) if callable(pipeline) else pipeline
+
+        # Save dataset Pipeline run started info
+        run_info = await anext(pipeline_run)
+        pipeline_run_started_info[run_info.dataset_id] = run_info
+
+        if pipeline_run_started_info[run_info.dataset_id].payload:
+            # Remove payload info to avoid serialization
+            # TODO: Handle payload serialization
+            pipeline_run_started_info[run_info.dataset_id].payload = []
+
+        pipeline_list.append(pipeline_run)
+
+    # Send all started pipelines to execute one by one in background
+    asyncio.create_task(handle_rest_of_the_run(pipeline_list=pipeline_list))
+
+    return pipeline_run_started_info
+
+
+def get_pipeline_executor(
+    run_in_background: bool = False,
+) -> Callable[..., Awaitable[Dict[str, Any]]]:
+    """
+    Return the appropriate pipeline runner.
+
+    Usage:
+        run_fn = get_pipeline_executor(run_in_background=True)
+        result = await run_fn(pipeline, **params)
+    """
+    return run_pipeline_as_background_process if run_in_background else run_pipeline_blocking
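
A minimal usage sketch for get_pipeline_executor above. The toy pipeline and FakeRunInfo are hypothetical stand-ins; only the executor function and its module path come from this diff:

import asyncio

from cognee.modules.pipelines.layers.pipeline_execution_mode import get_pipeline_executor


class FakeRunInfo:
    # Hypothetical stand-in carrying the one attribute run_pipeline_blocking reads.
    def __init__(self, dataset_id):
        self.dataset_id = dataset_id


async def toy_pipeline(**params):
    # Hypothetical pipeline generator; real cognee pipelines yield PipelineRunInfo objects.
    yield FakeRunInfo("dataset-a")
    yield FakeRunInfo("dataset-b")


async def main():
    run_fn = get_pipeline_executor(run_in_background=False)  # -> run_pipeline_blocking
    result = await run_fn(toy_pipeline)
    # Blocking mode aggregates the last run_info per dataset_id.
    print(sorted(result.keys()))  # ['dataset-a', 'dataset-b']


asyncio.run(main())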
cognee/modules/pipelines/layers/reset_dataset_pipeline_run_status.py
@@ -0,0 +1,28 @@
+from uuid import UUID
+from typing import Optional, List
+
+from cognee.modules.pipelines.methods import get_pipeline_runs_by_dataset, reset_pipeline_run_status
+from cognee.modules.pipelines.models.PipelineRun import PipelineRunStatus
+from cognee.modules.users.models import User
+
+
+async def reset_dataset_pipeline_run_status(
+    dataset_id: UUID, user: User, pipeline_names: Optional[list[str]] = None
+):
+    """Reset the status of all (or selected) pipeline runs for a dataset.
+
+    If *pipeline_names* is given, only runs whose *pipeline_name* is in
+    that list are touched.
+    """
+    related_pipeline_runs = await get_pipeline_runs_by_dataset(dataset_id)
+
+    for pipeline_run in related_pipeline_runs:
+        # Skip runs that are initiated
+        if pipeline_run.status is PipelineRunStatus.DATASET_PROCESSING_INITIATED:
+            continue
+
+        # If a name filter is provided, skip non-matching runs
+        if pipeline_names is not None and pipeline_run.pipeline_name not in pipeline_names:
+            continue
+
+        await reset_pipeline_run_status(user.id, dataset_id, pipeline_run.pipeline_name)
cognee/modules/pipelines/layers/resolve_authorized_user_dataset.py
@@ -0,0 +1,34 @@
+from uuid import UUID
+
+from cognee.api.v1.exceptions import DatasetNotFoundError
+from cognee.modules.users.models import User
+from cognee.modules.users.methods import get_default_user
+from cognee.modules.data.methods import (
+    create_authorized_dataset,
+    get_authorized_dataset,
+    get_authorized_dataset_by_name,
+)
+
+
+async def resolve_authorized_user_dataset(dataset_id: UUID, dataset_name: str, user: User):
+    if not user:
+        user = await get_default_user()
+
+    if dataset_id:
+        authorized_dataset = await get_authorized_dataset(user, dataset_id, "write")
+    elif dataset_name:
+        authorized_dataset = await get_authorized_dataset_by_name(dataset_name, user, "write")
+
+        if not authorized_dataset:
+            authorized_dataset = await create_authorized_dataset(
+                dataset_name=dataset_name, user=user
+            )
+    else:
+        raise ValueError("Either dataset_id or dataset_name must be provided.")
+
+    if not authorized_dataset:
+        raise DatasetNotFoundError(
+            message=f"Dataset ({str(dataset_id) or dataset_name}) not found."
+        )
+
+    return user, authorized_dataset
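
A behavior sketch for the function above; the dataset name "notes" is illustrative, and create-on-missing applies only to the name path, per the code in this hunk:

from cognee.modules.pipelines.layers.resolve_authorized_user_dataset import (
    resolve_authorized_user_dataset,
)


async def example():
    # dataset_id given  -> must already exist with write access, else DatasetNotFoundError
    # dataset_name only -> reused if found, otherwise created for the resolved user
    user, dataset = await resolve_authorized_user_dataset(
        dataset_id=None, dataset_name="notes", user=None  # None user falls back to the default user
    )
    return dataset.id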
cognee/modules/pipelines/layers/resolve_authorized_user_datasets.py
@@ -0,0 +1,55 @@
+from uuid import UUID
+from typing import Union, Tuple, List
+
+from cognee.modules.users.methods import get_default_user
+from cognee.modules.users.models import User
+from cognee.modules.data.models import Dataset
+from cognee.modules.data.exceptions import DatasetNotFoundError
+from cognee.modules.data.methods import (
+    get_authorized_existing_datasets,
+    load_or_create_datasets,
+    check_dataset_name,
+)
+
+
+async def resolve_authorized_user_datasets(
+    datasets: Union[str, UUID, list[str], list[UUID]], user: User = None
+) -> Tuple[User, List[Dataset]]:
+    """
+    Handles Dataset creation and authorization for Cognee.
+    Verifies that the provided user has the necessary permissions for the provided Datasets.
+    If a Dataset does not exist, it is created and the creating user is granted permission.
+
+    Args:
+        user: Cognee User the request is being processed for; if None, the default user is used.
+        datasets: Dataset names or Dataset UUIDs (in case the Datasets already exist).
+
+    Returns:
+        A tuple of the resolved user and the list of authorized Datasets.
+    """
+    # If no user is provided use default user
+    if user is None:
+        user = await get_default_user()
+
+    # Convert datasets to list
+    if isinstance(datasets, str) or isinstance(datasets, UUID):
+        datasets = [datasets]
+
+    # Get datasets user wants write permissions for (verify user has permissions if datasets are provided as well)
+    # NOTE: If a user wants to write to a dataset he does not own it must be provided through UUID
+    existing_datasets = await get_authorized_existing_datasets(datasets, "write", user)
+
+    if not datasets:
+        # Get datasets from database if none sent.
+        authorized_datasets = existing_datasets
+    else:
+        # If dataset matches an existing Dataset (by name or id), reuse it. Otherwise, create a new Dataset.
+        authorized_datasets = await load_or_create_datasets(datasets, existing_datasets, user)
+
+    if not authorized_datasets:
+        raise DatasetNotFoundError("There are no datasets to work with.")
+
+    for dataset in authorized_datasets:
+        check_dataset_name(dataset.name)
+
+    return user, authorized_datasets
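
A usage sketch, assuming only the module path from this diff; the dataset name is illustrative:

from cognee.modules.pipelines.layers.resolve_authorized_user_datasets import (
    resolve_authorized_user_datasets,
)


async def example():
    # A plain name is created on first use; a UUID must reference an existing,
    # writable dataset (per the NOTE in the hunk above).
    user, datasets = await resolve_authorized_user_datasets("my_dataset")
    return [dataset.name for dataset in datasets]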
cognee/modules/pipelines/layers/setup_and_check_environment.py
@@ -0,0 +1,41 @@
+import asyncio
+from cognee.context_global_variables import (
+    graph_db_config as context_graph_db_config,
+    vector_db_config as context_vector_db_config,
+)
+
+from cognee.infrastructure.databases.relational import (
+    create_db_and_tables as create_relational_db_and_tables,
+)
+from cognee.infrastructure.databases.vector.pgvector import (
+    create_db_and_tables as create_pgvector_db_and_tables,
+)
+
+_first_run_done = False
+_first_run_lock = asyncio.Lock()
+
+
+async def setup_and_check_environment(
+    vector_db_config: dict = None,
+    graph_db_config: dict = None,
+):
+    if vector_db_config:
+        context_vector_db_config.set(vector_db_config)
+    if graph_db_config:
+        context_graph_db_config.set(graph_db_config)
+
+    # Create tables for databases
+    await create_relational_db_and_tables()
+    await create_pgvector_db_and_tables()
+
+    global _first_run_done
+    async with _first_run_lock:
+        if not _first_run_done:
+            from cognee.infrastructure.llm.utils import (
+                test_llm_connection,
+                test_embedding_connection,
+            )
+
+            await test_llm_connection()
+            await test_embedding_connection()
+            _first_run_done = True
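
The module above guards the one-time LLM and embedding connection checks with a module-level flag and an asyncio.Lock. A generic sketch of that once-per-process pattern (names here are illustrative, not cognee APIs):

import asyncio

_done = False
_lock = asyncio.Lock()


async def run_once(setup):
    # The lock serializes concurrent callers; the flag makes repeat calls no-ops.
    global _done
    async with _lock:
        if not _done:
            await setup()
            _done = True


async def main():
    calls = []

    async def setup():
        calls.append(1)

    await asyncio.gather(*(run_once(setup) for _ in range(5)))
    assert calls == [1]  # setup ran exactly once despite five concurrent callers


asyncio.run(main())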
cognee/modules/pipelines/layers/validate_pipeline_tasks.py
@@ -0,0 +1,20 @@
+from ..tasks.task import Task
+from ..exceptions.tasks import WrongTaskTypeError
+
+
+def validate_pipeline_tasks(tasks: list[Task]):
+    """
+    Validates the tasks argument to ensure it is a list of Task class instances.
+
+    Args:
+        tasks (list[Task]): The list of tasks to be validated.
+    """
+
+    if not isinstance(tasks, list):
+        raise WrongTaskTypeError(f"tasks argument must be a list, got {type(tasks).__name__}.")
+
+    for task in tasks:
+        if not isinstance(task, Task):
+            raise WrongTaskTypeError(
+                f"tasks argument must be a list of Task class instances, got {type(task).__name__} in the list."
+            )
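
A quick illustration of the validator above; it assumes Task wraps a plain callable, as elsewhere in the cognee pipeline API:

from cognee.modules.pipelines.tasks.task import Task
from cognee.modules.pipelines.layers import validate_pipeline_tasks


def noop(data):
    return data


validate_pipeline_tasks([Task(noop)])  # OK: a list of Task instances

try:
    validate_pipeline_tasks([noop])  # a bare callable is not a Task
except Exception as error:
    print(error)  # WrongTaskTypeError complaining about the non-Task list item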
cognee/modules/pipelines/methods/get_pipeline_runs_by_dataset.py
@@ -0,0 +1,34 @@
+from uuid import UUID
+from typing import Optional
+from sqlalchemy import select, func
+from sqlalchemy.orm import aliased
+
+from cognee.infrastructure.databases.relational import get_relational_engine
+from ..models import PipelineRun
+
+
+async def get_pipeline_runs_by_dataset(dataset_id: UUID):
+    db_engine = get_relational_engine()
+
+    async with db_engine.get_async_session() as session:
+        query = (
+            select(
+                PipelineRun,
+                func.row_number()
+                .over(
+                    partition_by=(PipelineRun.dataset_id, PipelineRun.pipeline_name),
+                    order_by=PipelineRun.created_at.desc(),
+                )
+                .label("rn"),
+            )
+            .filter(PipelineRun.dataset_id == dataset_id)
+            .subquery()
+        )
+
+        aliased_pipeline_run = aliased(PipelineRun, query)
+
+        latest_run = select(aliased_pipeline_run).filter(query.c.rn == 1)
+
+        runs = (await session.execute(latest_run)).scalars().all()
+
+        return runs
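
The query above uses a row_number() window partitioned by (dataset_id, pipeline_name) to keep only the most recent run of each pipeline for the dataset. A pure-Python sketch of the same "latest run per pipeline" selection, with illustrative data:

runs = [
    {"pipeline_name": "add_pipeline", "created_at": 1},
    {"pipeline_name": "add_pipeline", "created_at": 2},
    {"pipeline_name": "cognify_pipeline", "created_at": 3},
]

latest = {}
for run in sorted(runs, key=lambda run: run["created_at"]):
    latest[run["pipeline_name"]] = run  # later runs overwrite earlier ones

print([run["created_at"] for run in latest.values()])  # [2, 3]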
cognee/modules/pipelines/methods/reset_pipeline_run_status.py
@@ -0,0 +1,16 @@
+from uuid import UUID
+from cognee.modules.pipelines.utils.generate_pipeline_id import generate_pipeline_id
+from cognee.modules.pipelines.operations.log_pipeline_run_initiated import (
+    log_pipeline_run_initiated,
+)
+
+
+async def reset_pipeline_run_status(user_id: UUID, dataset_id: UUID, pipeline_name: str):
+    pipeline_id = generate_pipeline_id(user_id, dataset_id, pipeline_name)
+
+    # Without this the pipeline status will be DATASET_PROCESSING_COMPLETED and will skip the execution.
+    await log_pipeline_run_initiated(
+        pipeline_id=pipeline_id,
+        pipeline_name=pipeline_name,
+        dataset_id=dataset_id,
+    )
cognee/modules/pipelines/operations/__init__.py
@@ -2,4 +2,3 @@ from .log_pipeline_run_initiated import log_pipeline_run_initiated
 from .log_pipeline_run_start import log_pipeline_run_start
 from .log_pipeline_run_complete import log_pipeline_run_complete
 from .log_pipeline_run_error import log_pipeline_run_error
-from .pipeline import cognee_pipeline
cognee/modules/pipelines/operations/log_pipeline_run_initiated.py
@@ -4,7 +4,7 @@ from cognee.modules.pipelines.models import PipelineRun, PipelineRunStatus
 from cognee.modules.pipelines.utils import generate_pipeline_run_id
 
 
-async def log_pipeline_run_initiated(pipeline_id:
+async def log_pipeline_run_initiated(pipeline_id: UUID, pipeline_name: str, dataset_id: UUID):
     pipeline_run = PipelineRun(
         pipeline_run_id=generate_pipeline_run_id(pipeline_id, dataset_id),
         pipeline_name=pipeline_name,
cognee/modules/pipelines/operations/pipeline.py
@@ -2,41 +2,23 @@ import asyncio
 from uuid import UUID
 from typing import Union
 
+from cognee.modules.pipelines.layers.setup_and_check_environment import (
+    setup_and_check_environment,
+)
+
 from cognee.shared.logging_utils import get_logger
 from cognee.modules.data.methods.get_dataset_data import get_dataset_data
 from cognee.modules.data.models import Data, Dataset
 from cognee.modules.pipelines.operations.run_tasks import run_tasks
-from cognee.modules.pipelines.
-from cognee.modules.pipelines.utils import generate_pipeline_id
-from cognee.modules.pipelines.operations.get_pipeline_status import get_pipeline_status
-from cognee.modules.pipelines.methods import get_pipeline_run_by_dataset
-
+from cognee.modules.pipelines.layers import validate_pipeline_tasks
 from cognee.modules.pipelines.tasks.task import Task
-from cognee.modules.users.methods import get_default_user
 from cognee.modules.users.models import User
-from cognee.modules.pipelines.operations import log_pipeline_run_initiated
 from cognee.context_global_variables import set_database_global_context_variables
-from cognee.modules.
-
-    get_authorized_existing_datasets,
-    load_or_create_datasets,
-    check_dataset_name,
+from cognee.modules.pipelines.layers.resolve_authorized_user_datasets import (
+    resolve_authorized_user_datasets,
 )
-
-
-    PipelineRunCompleted,
-    PipelineRunStarted,
-)
-
-from cognee.infrastructure.databases.relational import (
-    create_db_and_tables as create_relational_db_and_tables,
-)
-from cognee.infrastructure.databases.vector.pgvector import (
-    create_db_and_tables as create_pgvector_db_and_tables,
-)
-from cognee.context_global_variables import (
-    graph_db_config as context_graph_db_config,
-    vector_db_config as context_vector_db_config,
+from cognee.modules.pipelines.layers.check_pipeline_run_qualification import (
+    check_pipeline_run_qualification,
 )
 
 logger = get_logger("cognee.pipeline")
@@ -44,7 +26,7 @@ logger = get_logger("cognee.pipeline")
 update_status_lock = asyncio.Lock()
 
 
-async def cognee_pipeline(
+async def run_pipeline(
     tasks: list[Task],
     data=None,
     datasets: Union[str, list[str], list[UUID]] = None,
@@ -54,56 +36,13 @@ async def cognee_pipeline(
     graph_db_config: dict = None,
     incremental_loading: bool = False,
 ):
-
-
-    if vector_db_config:
-        context_vector_db_config.set(vector_db_config)
-    if graph_db_config:
-        context_graph_db_config.set(graph_db_config)
-
-    # Create tables for databases
-    await create_relational_db_and_tables()
-    await create_pgvector_db_and_tables()
-
-    # Initialize first_run attribute if it doesn't exist
-    if not hasattr(cognee_pipeline, "first_run"):
-        cognee_pipeline.first_run = True
-
-    if cognee_pipeline.first_run:
-        from cognee.infrastructure.llm.utils import (
-            test_llm_connection,
-            test_embedding_connection,
-        )
-
-        # Test LLM and Embedding configuration once before running Cognee
-        await test_llm_connection()
-        await test_embedding_connection()
-        cognee_pipeline.first_run = False  # Update flag after first run
+    validate_pipeline_tasks(tasks)
+    await setup_and_check_environment(vector_db_config, graph_db_config)
 
-
-    if user is None:
-        user = await get_default_user()
+    user, authorized_datasets = await resolve_authorized_user_datasets(datasets, user)
 
-
-
-        datasets = [datasets]
-
-    # Get datasets user wants write permissions for (verify user has permissions if datasets are provided as well)
-    # NOTE: If a user wants to write to a dataset he does not own it must be provided through UUID
-    existing_datasets = await get_authorized_existing_datasets(datasets, "write", user)
-
-    if not datasets:
-        # Get datasets from database if none sent.
-        datasets = existing_datasets
-    else:
-        # If dataset matches an existing Dataset (by name or id), reuse it. Otherwise, create a new Dataset.
-        datasets = await load_or_create_datasets(datasets, existing_datasets, user)
-
-    if not datasets:
-        raise DatasetNotFoundError("There are no datasets to work with.")
-
-    for dataset in datasets:
-        async for run_info in run_pipeline(
+    for dataset in authorized_datasets:
+        async for run_info in run_pipeline_per_dataset(
             dataset=dataset,
             user=user,
             tasks=tasks,
@@ -115,7 +54,7 @@ async def cognee_pipeline(
             yield run_info
 
 
-async def run_pipeline(
+async def run_pipeline_per_dataset(
     dataset: Dataset,
     user: User,
     tasks: list[Task],
@@ -124,74 +63,21 @@ async def run_pipeline(
     context: dict = None,
     incremental_loading=False,
 ):
-    check_dataset_name(dataset.name)
-
     # Will only be used if ENABLE_BACKEND_ACCESS_CONTROL is set to True
     await set_database_global_context_variables(dataset.id, dataset.owner_id)
 
-    # Ugly hack, but no easier way to do this.
-    if pipeline_name == "add_pipeline":
-        pipeline_id = generate_pipeline_id(user.id, dataset.id, pipeline_name)
-        # Refresh the add pipeline status so data is added to a dataset.
-        # Without this the app_pipeline status will be DATASET_PROCESSING_COMPLETED and will skip the execution.
-
-        await log_pipeline_run_initiated(
-            pipeline_id=pipeline_id,
-            pipeline_name="add_pipeline",
-            dataset_id=dataset.id,
-        )
-
-        # Refresh the cognify pipeline status after we add new files.
-        # Without this the cognify_pipeline status will be DATASET_PROCESSING_COMPLETED and will skip the execution.
-        await log_pipeline_run_initiated(
-            pipeline_id=pipeline_id,
-            pipeline_name="cognify_pipeline",
-            dataset_id=dataset.id,
-        )
-
-    dataset_id = dataset.id
-
     if not data:
-        data: list[Data] = await get_dataset_data(dataset_id=
-
-    # async with update_status_lock: TODO: Add UI lock to prevent multiple backend requests
-    if isinstance(dataset, Dataset):
-        task_status = await get_pipeline_status([dataset_id], pipeline_name)
-    else:
-        task_status = [
-            PipelineRunStatus.DATASET_PROCESSING_COMPLETED
-        ]  # TODO: this is a random assignment, find permanent solution
-
-    if str(dataset_id) in task_status:
-        if task_status[str(dataset_id)] == PipelineRunStatus.DATASET_PROCESSING_STARTED:
-            logger.info("Dataset %s is already being processed.", dataset_id)
-            pipeline_run = await get_pipeline_run_by_dataset(dataset_id, pipeline_name)
-            yield PipelineRunStarted(
-                pipeline_run_id=pipeline_run.pipeline_run_id,
-                dataset_id=dataset.id,
-                dataset_name=dataset.name,
-                payload=data,
-            )
-            return
-        elif task_status[str(dataset_id)] == PipelineRunStatus.DATASET_PROCESSING_COMPLETED:
-            logger.info("Dataset %s is already processed.", dataset_id)
-            pipeline_run = await get_pipeline_run_by_dataset(dataset_id, pipeline_name)
-            yield PipelineRunCompleted(
-                pipeline_run_id=pipeline_run.pipeline_run_id,
-                dataset_id=dataset.id,
-                dataset_name=dataset.name,
-            )
-            return
-
-    if not isinstance(tasks, list):
-        raise ValueError("Tasks must be a list")
+        data: list[Data] = await get_dataset_data(dataset_id=dataset.id)
 
-
-
-
+    process_pipeline_status = await check_pipeline_run_qualification(dataset, data, pipeline_name)
+    if process_pipeline_status:
+        # If pipeline was already processed or is currently being processed
+        # return status information to async generator and finish execution
+        yield process_pipeline_status
+        return
 
     pipeline_run = run_tasks(
-        tasks,
+        tasks, dataset.id, data, user, pipeline_name, context, incremental_loading
     )
 
     async for pipeline_run_info in pipeline_run:
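
Taken together, the pipeline.py hunks above reduce run_pipeline to a composition of the new layer functions. A condensed, non-authoritative restatement of the refactored control flow (parameters trimmed for readability; the real code also threads data, pipeline_name, context, and incremental_loading through):

from cognee.modules.pipelines.layers import validate_pipeline_tasks
from cognee.modules.pipelines.layers.setup_and_check_environment import setup_and_check_environment
from cognee.modules.pipelines.layers.resolve_authorized_user_datasets import resolve_authorized_user_datasets
from cognee.modules.pipelines.operations.pipeline import run_pipeline_per_dataset


async def run_pipeline_sketch(tasks, datasets=None, user=None):
    validate_pipeline_tasks(tasks)                # fail fast on malformed task lists
    await setup_and_check_environment()           # DB tables + one-time connection checks
    user, authorized = await resolve_authorized_user_datasets(datasets, user)
    for dataset in authorized:
        async for run_info in run_pipeline_per_dataset(dataset=dataset, user=user, tasks=tasks):
            yield run_info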
cognee/modules/pipelines/operations/run_tasks.py
@@ -266,48 +266,24 @@ async def run_tasks(
     if incremental_loading:
         data = await resolve_data_directories(data)
 
-        #
-
-
-
-
-
-
-
-
-
-
-
-
-
-        # )
-        # )
-        # for data_item in data
-        # ]
-        # results = await asyncio.gather(*data_item_tasks)
-        # # Remove skipped data items from results
-        # results = [result for result in results if result]
-
-        ### TEMP sync data item handling
-        results = []
-        # Run the pipeline for each data_item sequentially, one after the other
-        for data_item in data:
-            result = await _run_tasks_data_item(
-                data_item,
-                dataset,
-                tasks,
-                pipeline_name,
-                pipeline_id,
-                pipeline_run_id,
-                context,
-                user,
-                incremental_loading,
+        # Create async tasks per data item that will run the pipeline for the data item
+        data_item_tasks = [
+            asyncio.create_task(
+                _run_tasks_data_item(
+                    data_item,
+                    dataset,
+                    tasks,
+                    pipeline_name,
+                    pipeline_id,
+                    pipeline_run_id,
+                    context,
+                    user,
+                    incremental_loading,
+                )
             )
-
-
-
-            results.append(result)
-        ### END
+            for data_item in data
+        ]
+        results = await asyncio.gather(*data_item_tasks)
 
         # Remove skipped data items from results
         results = [result for result in results if result]
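
The run_tasks hunk swaps the temporary sequential per-item loop for concurrent asyncio tasks gathered at the end. A toy sketch of the difference this makes (illustrative only, no cognee APIs involved):

import asyncio


async def process(item: str) -> str:
    await asyncio.sleep(0.1)  # stand-in for per-item pipeline work
    return item.upper()


async def main():
    items = ["a", "b", "c"]

    # Sequential (the removed "TEMP sync" handling): ~0.3s total
    sequential = [await process(item) for item in items]

    # Concurrent (the restored gather-based handling): ~0.1s total
    tasks = [asyncio.create_task(process(item)) for item in items]
    concurrent = await asyncio.gather(*tasks)

    assert sequential == list(concurrent)


asyncio.run(main())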
cognee/modules/retrieval/base_feedback.py
@@ -0,0 +1,11 @@
+from abc import ABC, abstractmethod
+from typing import Any
+
+
+class BaseFeedback(ABC):
+    """Base class for all user feedback operations."""
+
+    @abstractmethod
+    async def add_feedback(self, feedback_text: str) -> Any:
+        """Add user feedback to the system."""
+        pass
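
A minimal sketch of satisfying the new abstract base; the in-memory class is hypothetical, only the BaseFeedback contract comes from this diff:

from typing import Any

from cognee.modules.retrieval.base_feedback import BaseFeedback


class InMemoryFeedback(BaseFeedback):
    """Hypothetical implementation that keeps feedback in a list."""

    def __init__(self):
        self._entries: list[str] = []

    async def add_feedback(self, feedback_text: str) -> Any:
        self._entries.append(feedback_text)
        return feedback_text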