recurvedata-lib 0.1.487 (py2.py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of recurvedata-lib might be problematic.
- recurvedata/__init__.py +0 -0
- recurvedata/__version__.py +1 -0
- recurvedata/client/__init__.py +3 -0
- recurvedata/client/client.py +150 -0
- recurvedata/client/server_client.py +91 -0
- recurvedata/config.py +99 -0
- recurvedata/connectors/__init__.py +20 -0
- recurvedata/connectors/_register.py +46 -0
- recurvedata/connectors/base.py +111 -0
- recurvedata/connectors/config_schema.py +1575 -0
- recurvedata/connectors/connectors/__init__.py +0 -0
- recurvedata/connectors/connectors/aliyun_access_key.py +30 -0
- recurvedata/connectors/connectors/auth.py +44 -0
- recurvedata/connectors/connectors/azure_blob.py +89 -0
- recurvedata/connectors/connectors/azure_synapse.py +79 -0
- recurvedata/connectors/connectors/bigquery.py +359 -0
- recurvedata/connectors/connectors/clickhouse.py +219 -0
- recurvedata/connectors/connectors/dingtalk.py +61 -0
- recurvedata/connectors/connectors/doris.py +215 -0
- recurvedata/connectors/connectors/es.py +62 -0
- recurvedata/connectors/connectors/feishu.py +65 -0
- recurvedata/connectors/connectors/ftp.py +50 -0
- recurvedata/connectors/connectors/generic.py +49 -0
- recurvedata/connectors/connectors/google_cloud_storage.py +115 -0
- recurvedata/connectors/connectors/google_service_account.py +225 -0
- recurvedata/connectors/connectors/hive.py +207 -0
- recurvedata/connectors/connectors/impala.py +210 -0
- recurvedata/connectors/connectors/jenkins.py +51 -0
- recurvedata/connectors/connectors/mail.py +89 -0
- recurvedata/connectors/connectors/microsoft_fabric.py +284 -0
- recurvedata/connectors/connectors/mongo.py +79 -0
- recurvedata/connectors/connectors/mssql.py +131 -0
- recurvedata/connectors/connectors/mysql.py +191 -0
- recurvedata/connectors/connectors/n8n.py +141 -0
- recurvedata/connectors/connectors/oss.py +74 -0
- recurvedata/connectors/connectors/owncloud.py +36 -0
- recurvedata/connectors/connectors/phoenix.py +36 -0
- recurvedata/connectors/connectors/postgres.py +230 -0
- recurvedata/connectors/connectors/python.py +50 -0
- recurvedata/connectors/connectors/redshift.py +187 -0
- recurvedata/connectors/connectors/s3.py +93 -0
- recurvedata/connectors/connectors/sftp.py +87 -0
- recurvedata/connectors/connectors/slack.py +35 -0
- recurvedata/connectors/connectors/spark.py +99 -0
- recurvedata/connectors/connectors/starrocks.py +175 -0
- recurvedata/connectors/connectors/tencent_cos.py +40 -0
- recurvedata/connectors/connectors/tidb.py +49 -0
- recurvedata/connectors/const.py +315 -0
- recurvedata/connectors/datasource.py +189 -0
- recurvedata/connectors/dbapi.py +469 -0
- recurvedata/connectors/fs.py +66 -0
- recurvedata/connectors/ftp.py +40 -0
- recurvedata/connectors/object_store.py +60 -0
- recurvedata/connectors/pigeon.py +172 -0
- recurvedata/connectors/proxy.py +104 -0
- recurvedata/connectors/service.py +223 -0
- recurvedata/connectors/utils.py +47 -0
- recurvedata/consts.py +49 -0
- recurvedata/core/__init__.py +0 -0
- recurvedata/core/config.py +46 -0
- recurvedata/core/configurable.py +27 -0
- recurvedata/core/consts.py +2 -0
- recurvedata/core/templating.py +206 -0
- recurvedata/core/tracing.py +223 -0
- recurvedata/core/transformer.py +186 -0
- recurvedata/core/translation.py +91 -0
- recurvedata/dbt/client.py +97 -0
- recurvedata/dbt/consts.py +99 -0
- recurvedata/dbt/cosmos_utils.py +275 -0
- recurvedata/dbt/error_codes.py +18 -0
- recurvedata/dbt/schemas.py +98 -0
- recurvedata/dbt/service.py +451 -0
- recurvedata/dbt/utils.py +246 -0
- recurvedata/error_codes.py +71 -0
- recurvedata/exceptions.py +72 -0
- recurvedata/executors/__init__.py +4 -0
- recurvedata/executors/cli/__init__.py +7 -0
- recurvedata/executors/cli/connector.py +117 -0
- recurvedata/executors/cli/dbt.py +118 -0
- recurvedata/executors/cli/main.py +82 -0
- recurvedata/executors/cli/parameters.py +18 -0
- recurvedata/executors/client.py +190 -0
- recurvedata/executors/consts.py +50 -0
- recurvedata/executors/debug_executor.py +100 -0
- recurvedata/executors/executor.py +300 -0
- recurvedata/executors/link_executor.py +189 -0
- recurvedata/executors/models.py +34 -0
- recurvedata/executors/schemas.py +222 -0
- recurvedata/executors/service/__init__.py +0 -0
- recurvedata/executors/service/connector.py +380 -0
- recurvedata/executors/utils.py +172 -0
- recurvedata/filestorage/__init__.py +11 -0
- recurvedata/filestorage/_factory.py +33 -0
- recurvedata/filestorage/backends/__init__.py +0 -0
- recurvedata/filestorage/backends/fsspec.py +45 -0
- recurvedata/filestorage/backends/local.py +67 -0
- recurvedata/filestorage/backends/oss.py +56 -0
- recurvedata/filestorage/interface.py +84 -0
- recurvedata/operators/__init__.py +10 -0
- recurvedata/operators/base.py +28 -0
- recurvedata/operators/config.py +21 -0
- recurvedata/operators/context.py +255 -0
- recurvedata/operators/dbt_operator/__init__.py +2 -0
- recurvedata/operators/dbt_operator/model_pipeline_link_operator.py +55 -0
- recurvedata/operators/dbt_operator/operator.py +353 -0
- recurvedata/operators/link_operator/__init__.py +1 -0
- recurvedata/operators/link_operator/operator.py +120 -0
- recurvedata/operators/models.py +55 -0
- recurvedata/operators/notify_operator/__init__.py +1 -0
- recurvedata/operators/notify_operator/operator.py +180 -0
- recurvedata/operators/operator.py +119 -0
- recurvedata/operators/python_operator/__init__.py +1 -0
- recurvedata/operators/python_operator/operator.py +132 -0
- recurvedata/operators/sensor_operator/__init__.py +1 -0
- recurvedata/operators/sensor_operator/airflow_utils.py +63 -0
- recurvedata/operators/sensor_operator/operator.py +172 -0
- recurvedata/operators/spark_operator/__init__.py +1 -0
- recurvedata/operators/spark_operator/operator.py +200 -0
- recurvedata/operators/spark_operator/spark_sample.py +47 -0
- recurvedata/operators/sql_operator/__init__.py +1 -0
- recurvedata/operators/sql_operator/operator.py +90 -0
- recurvedata/operators/task.py +211 -0
- recurvedata/operators/transfer_operator/__init__.py +40 -0
- recurvedata/operators/transfer_operator/const.py +10 -0
- recurvedata/operators/transfer_operator/dump_aliyun_sls.py +82 -0
- recurvedata/operators/transfer_operator/dump_sheet_task_base.py +292 -0
- recurvedata/operators/transfer_operator/dump_task_cass.py +155 -0
- recurvedata/operators/transfer_operator/dump_task_dbapi.py +209 -0
- recurvedata/operators/transfer_operator/dump_task_es.py +113 -0
- recurvedata/operators/transfer_operator/dump_task_feishu_sheet.py +114 -0
- recurvedata/operators/transfer_operator/dump_task_ftp.py +234 -0
- recurvedata/operators/transfer_operator/dump_task_google_sheet.py +66 -0
- recurvedata/operators/transfer_operator/dump_task_mongodb.py +168 -0
- recurvedata/operators/transfer_operator/dump_task_oss.py +285 -0
- recurvedata/operators/transfer_operator/dump_task_python.py +212 -0
- recurvedata/operators/transfer_operator/dump_task_s3.py +270 -0
- recurvedata/operators/transfer_operator/dump_task_sftp.py +229 -0
- recurvedata/operators/transfer_operator/load_task_aliyun_oss.py +107 -0
- recurvedata/operators/transfer_operator/load_task_azure_blob.py +115 -0
- recurvedata/operators/transfer_operator/load_task_azure_synapse.py +90 -0
- recurvedata/operators/transfer_operator/load_task_clickhouse.py +167 -0
- recurvedata/operators/transfer_operator/load_task_doris.py +164 -0
- recurvedata/operators/transfer_operator/load_task_email.py +188 -0
- recurvedata/operators/transfer_operator/load_task_es.py +86 -0
- recurvedata/operators/transfer_operator/load_task_filebrowser.py +151 -0
- recurvedata/operators/transfer_operator/load_task_ftp.py +19 -0
- recurvedata/operators/transfer_operator/load_task_google_bigquery.py +90 -0
- recurvedata/operators/transfer_operator/load_task_google_cloud_storage.py +127 -0
- recurvedata/operators/transfer_operator/load_task_google_sheet.py +130 -0
- recurvedata/operators/transfer_operator/load_task_hive.py +158 -0
- recurvedata/operators/transfer_operator/load_task_microsoft_fabric.py +105 -0
- recurvedata/operators/transfer_operator/load_task_mssql.py +153 -0
- recurvedata/operators/transfer_operator/load_task_mysql.py +157 -0
- recurvedata/operators/transfer_operator/load_task_owncloud.py +135 -0
- recurvedata/operators/transfer_operator/load_task_postgresql.py +109 -0
- recurvedata/operators/transfer_operator/load_task_qcloud_cos.py +119 -0
- recurvedata/operators/transfer_operator/load_task_recurve_data_prep.py +75 -0
- recurvedata/operators/transfer_operator/load_task_redshift.py +95 -0
- recurvedata/operators/transfer_operator/load_task_s3.py +150 -0
- recurvedata/operators/transfer_operator/load_task_sftp.py +90 -0
- recurvedata/operators/transfer_operator/load_task_starrocks.py +169 -0
- recurvedata/operators/transfer_operator/load_task_yicrowds.py +97 -0
- recurvedata/operators/transfer_operator/mixin.py +31 -0
- recurvedata/operators/transfer_operator/operator.py +231 -0
- recurvedata/operators/transfer_operator/task.py +223 -0
- recurvedata/operators/transfer_operator/utils.py +134 -0
- recurvedata/operators/ui.py +80 -0
- recurvedata/operators/utils/__init__.py +51 -0
- recurvedata/operators/utils/file_factory.py +150 -0
- recurvedata/operators/utils/fs.py +10 -0
- recurvedata/operators/utils/lineage.py +265 -0
- recurvedata/operators/web_init.py +15 -0
- recurvedata/pigeon/connector/__init__.py +294 -0
- recurvedata/pigeon/connector/_registry.py +17 -0
- recurvedata/pigeon/connector/aliyun_oss.py +80 -0
- recurvedata/pigeon/connector/awss3.py +123 -0
- recurvedata/pigeon/connector/azure_blob.py +176 -0
- recurvedata/pigeon/connector/azure_synapse.py +51 -0
- recurvedata/pigeon/connector/cass.py +151 -0
- recurvedata/pigeon/connector/clickhouse.py +403 -0
- recurvedata/pigeon/connector/clickhouse_native.py +351 -0
- recurvedata/pigeon/connector/dbapi.py +571 -0
- recurvedata/pigeon/connector/doris.py +166 -0
- recurvedata/pigeon/connector/es.py +176 -0
- recurvedata/pigeon/connector/feishu.py +1135 -0
- recurvedata/pigeon/connector/ftp.py +163 -0
- recurvedata/pigeon/connector/google_bigquery.py +283 -0
- recurvedata/pigeon/connector/google_cloud_storage.py +130 -0
- recurvedata/pigeon/connector/hbase_phoenix.py +108 -0
- recurvedata/pigeon/connector/hdfs.py +204 -0
- recurvedata/pigeon/connector/hive_impala.py +383 -0
- recurvedata/pigeon/connector/microsoft_fabric.py +95 -0
- recurvedata/pigeon/connector/mongodb.py +56 -0
- recurvedata/pigeon/connector/mssql.py +467 -0
- recurvedata/pigeon/connector/mysql.py +175 -0
- recurvedata/pigeon/connector/owncloud.py +92 -0
- recurvedata/pigeon/connector/postgresql.py +267 -0
- recurvedata/pigeon/connector/power_bi.py +179 -0
- recurvedata/pigeon/connector/qcloud_cos.py +79 -0
- recurvedata/pigeon/connector/redshift.py +123 -0
- recurvedata/pigeon/connector/sftp.py +73 -0
- recurvedata/pigeon/connector/sqlite.py +42 -0
- recurvedata/pigeon/connector/starrocks.py +144 -0
- recurvedata/pigeon/connector/tableau.py +162 -0
- recurvedata/pigeon/const.py +21 -0
- recurvedata/pigeon/csv.py +172 -0
- recurvedata/pigeon/docs/datasources-example.json +82 -0
- recurvedata/pigeon/docs/images/pigeon_design.png +0 -0
- recurvedata/pigeon/docs/lightweight-data-sync-solution.md +111 -0
- recurvedata/pigeon/dumper/__init__.py +171 -0
- recurvedata/pigeon/dumper/aliyun_sls.py +415 -0
- recurvedata/pigeon/dumper/base.py +141 -0
- recurvedata/pigeon/dumper/cass.py +213 -0
- recurvedata/pigeon/dumper/dbapi.py +346 -0
- recurvedata/pigeon/dumper/es.py +112 -0
- recurvedata/pigeon/dumper/ftp.py +64 -0
- recurvedata/pigeon/dumper/mongodb.py +103 -0
- recurvedata/pigeon/handler/__init__.py +4 -0
- recurvedata/pigeon/handler/base.py +153 -0
- recurvedata/pigeon/handler/csv_handler.py +290 -0
- recurvedata/pigeon/loader/__init__.py +87 -0
- recurvedata/pigeon/loader/base.py +83 -0
- recurvedata/pigeon/loader/csv_to_azure_synapse.py +214 -0
- recurvedata/pigeon/loader/csv_to_clickhouse.py +152 -0
- recurvedata/pigeon/loader/csv_to_doris.py +215 -0
- recurvedata/pigeon/loader/csv_to_es.py +51 -0
- recurvedata/pigeon/loader/csv_to_google_bigquery.py +169 -0
- recurvedata/pigeon/loader/csv_to_hive.py +468 -0
- recurvedata/pigeon/loader/csv_to_microsoft_fabric.py +242 -0
- recurvedata/pigeon/loader/csv_to_mssql.py +174 -0
- recurvedata/pigeon/loader/csv_to_mysql.py +180 -0
- recurvedata/pigeon/loader/csv_to_postgresql.py +248 -0
- recurvedata/pigeon/loader/csv_to_redshift.py +240 -0
- recurvedata/pigeon/loader/csv_to_starrocks.py +233 -0
- recurvedata/pigeon/meta.py +116 -0
- recurvedata/pigeon/row_factory.py +42 -0
- recurvedata/pigeon/schema/__init__.py +124 -0
- recurvedata/pigeon/schema/types.py +13 -0
- recurvedata/pigeon/sync.py +283 -0
- recurvedata/pigeon/transformer.py +146 -0
- recurvedata/pigeon/utils/__init__.py +134 -0
- recurvedata/pigeon/utils/bloomfilter.py +181 -0
- recurvedata/pigeon/utils/date_time.py +323 -0
- recurvedata/pigeon/utils/escape.py +15 -0
- recurvedata/pigeon/utils/fs.py +266 -0
- recurvedata/pigeon/utils/json.py +44 -0
- recurvedata/pigeon/utils/keyed_tuple.py +85 -0
- recurvedata/pigeon/utils/mp.py +156 -0
- recurvedata/pigeon/utils/sql.py +328 -0
- recurvedata/pigeon/utils/timing.py +155 -0
- recurvedata/provider_manager.py +0 -0
- recurvedata/providers/__init__.py +0 -0
- recurvedata/providers/dbapi/__init__.py +0 -0
- recurvedata/providers/flywheel/__init__.py +0 -0
- recurvedata/providers/mysql/__init__.py +0 -0
- recurvedata/schedulers/__init__.py +1 -0
- recurvedata/schedulers/airflow.py +974 -0
- recurvedata/schedulers/airflow_db_process.py +331 -0
- recurvedata/schedulers/airflow_operators.py +61 -0
- recurvedata/schedulers/airflow_plugin.py +9 -0
- recurvedata/schedulers/airflow_trigger_dag_patch.py +117 -0
- recurvedata/schedulers/base.py +99 -0
- recurvedata/schedulers/cli.py +228 -0
- recurvedata/schedulers/client.py +56 -0
- recurvedata/schedulers/consts.py +52 -0
- recurvedata/schedulers/debug_celery.py +62 -0
- recurvedata/schedulers/model.py +63 -0
- recurvedata/schedulers/schemas.py +97 -0
- recurvedata/schedulers/service.py +20 -0
- recurvedata/schedulers/system_dags.py +59 -0
- recurvedata/schedulers/task_status.py +279 -0
- recurvedata/schedulers/utils.py +73 -0
- recurvedata/schema/__init__.py +0 -0
- recurvedata/schema/field.py +88 -0
- recurvedata/schema/schema.py +55 -0
- recurvedata/schema/types.py +17 -0
- recurvedata/schema.py +0 -0
- recurvedata/server/__init__.py +0 -0
- recurvedata/server/app.py +7 -0
- recurvedata/server/connector/__init__.py +0 -0
- recurvedata/server/connector/api.py +79 -0
- recurvedata/server/connector/schemas.py +28 -0
- recurvedata/server/data_service/__init__.py +0 -0
- recurvedata/server/data_service/api.py +126 -0
- recurvedata/server/data_service/client.py +18 -0
- recurvedata/server/data_service/consts.py +1 -0
- recurvedata/server/data_service/schemas.py +68 -0
- recurvedata/server/data_service/service.py +218 -0
- recurvedata/server/dbt/__init__.py +0 -0
- recurvedata/server/dbt/api.py +116 -0
- recurvedata/server/error_code.py +49 -0
- recurvedata/server/exceptions.py +19 -0
- recurvedata/server/executor/__init__.py +0 -0
- recurvedata/server/executor/api.py +37 -0
- recurvedata/server/executor/schemas.py +30 -0
- recurvedata/server/executor/service.py +220 -0
- recurvedata/server/main.py +32 -0
- recurvedata/server/schedulers/__init__.py +0 -0
- recurvedata/server/schedulers/api.py +252 -0
- recurvedata/server/schedulers/schemas.py +50 -0
- recurvedata/server/schemas.py +50 -0
- recurvedata/utils/__init__.py +15 -0
- recurvedata/utils/_typer.py +61 -0
- recurvedata/utils/attrdict.py +19 -0
- recurvedata/utils/command_helper.py +20 -0
- recurvedata/utils/compat.py +12 -0
- recurvedata/utils/compression.py +203 -0
- recurvedata/utils/crontab.py +42 -0
- recurvedata/utils/crypto_util.py +305 -0
- recurvedata/utils/dataclass.py +11 -0
- recurvedata/utils/date_time.py +464 -0
- recurvedata/utils/dispatch.py +114 -0
- recurvedata/utils/email_util.py +104 -0
- recurvedata/utils/files.py +386 -0
- recurvedata/utils/helpers.py +170 -0
- recurvedata/utils/httputil.py +117 -0
- recurvedata/utils/imports.py +132 -0
- recurvedata/utils/json.py +80 -0
- recurvedata/utils/log.py +117 -0
- recurvedata/utils/log_capture.py +153 -0
- recurvedata/utils/mp.py +178 -0
- recurvedata/utils/normalizer.py +102 -0
- recurvedata/utils/redis_lock.py +474 -0
- recurvedata/utils/registry.py +54 -0
- recurvedata/utils/shell.py +15 -0
- recurvedata/utils/singleton.py +33 -0
- recurvedata/utils/sql.py +6 -0
- recurvedata/utils/timeout.py +28 -0
- recurvedata/utils/tracing.py +14 -0
- recurvedata_lib-0.1.487.dist-info/METADATA +605 -0
- recurvedata_lib-0.1.487.dist-info/RECORD +333 -0
- recurvedata_lib-0.1.487.dist-info/WHEEL +5 -0
- recurvedata_lib-0.1.487.dist-info/entry_points.txt +6 -0
recurvedata/schedulers/airflow_db_process.py
@@ -0,0 +1,331 @@
+import datetime
+import logging
+from typing import Any, Optional
+
+from airflow.api.common.mark_tasks import set_dag_run_state_to_failed, set_dag_run_state_to_success
+from airflow.api.common.trigger_dag import trigger_dag
+from airflow.models import DAG, DagModel, DagRun, TaskInstance
+from airflow.models.serialized_dag import SerializedDagModel
+from airflow.utils.session import create_session, provide_session
+from airflow.utils.state import TaskInstanceState
+from sqlalchemy import Index, Table
+from sqlalchemy.orm import Session
+from sqlalchemy.schema import CreateIndex
+
+from recurvedata.utils.date_time import to_local_datetime, utcnow
+
+logger = logging.getLogger(__name__)
+
+
+class AirflowDbService:
+    @classmethod
+    def update_dag(cls, dag: DAG):
+        logger.info(f"start sync dag {dag.dag_id} to serialized_dag")
+        DAG.sync_to_db(dag)
+        SerializedDagModel.write_dag(dag)
+        logger.info(f"finish sync {dag.dag_id} to serialized_dag")
+
+    @classmethod
+    def activate_dag(cls, dag: DAG):
+        with create_session() as session:
+            d = session.query(DagModel).filter(DagModel.dag_id == dag.dag_id).one_or_none()
+            if not d:
+                raise ValueError(f"dag not exists: {dag.dag_id}")
+
+            if d.is_paused is False:
+                logger.info(f"{dag.dag_id} is active, no need to activate")
+                return
+
+            logger.info(f"start activate_dag dag {dag.dag_id}")
+
+            d.is_paused = False
+            session.merge(d)
+            session.commit()
+
+            logger.info(f"finish activate_dag dag {dag.dag_id}")
+
+    @classmethod
+    def deactivate_dag(cls, dag: DAG):
+        with create_session() as session:
+            d = session.query(DagModel).filter(DagModel.dag_id == dag.dag_id).one_or_none()
+            if not d:
+                raise ValueError(f"dag not exists: {dag.dag_id}")
+
+            if d.is_paused is True:
+                logger.info(f"{dag.dag_id} is deactive, no need to deactivate")
+                return
+
+            logger.info(f"start deactivate_dag dag {dag.dag_id}")
+
+            d.is_paused = True
+            session.merge(d)
+            session.commit()
+
+            logger.info(f"finish deactivate_dag dag {dag.dag_id}")
+
+    @classmethod
+    def delete_dag(cls, dag_id: str, job_name: str):
+        with create_session() as session:
+            d: DagModel = session.query(DagModel).filter(DagModel.dag_id == dag_id).one_or_none()
+            if not d:
+                logger.warning(f"dag not exists: {dag_id}")
+                return
+
+            logger.info(f"start delete_dag dag {job_name} {dag_id}")
+            d.is_paused = True
+            d.is_active = False
+            session.merge(d)
+            session.commit()
+
+            logger.info(f"finish delete_dag dag {job_name} {dag_id}")
+            # todo(chenjingmeng): delete dag
+
+    @classmethod
+    def trigger_job_run(
+        cls,
+        dag: DAG,
+        execution_date: datetime.datetime,
+        include_past: bool,
+        include_future: bool,
+        run_type: str,
+        conf: dict[str, Any] | None = None,
+    ):
+        execution_date_ds = execution_date.isoformat()
+        run_id = f"{run_type}__{execution_date_ds}"
+        reference_date = to_local_datetime(execution_date_ds)
+        current_date = utcnow()
+        airflow_current_date = dag.previous_schedule(current_date)
+
+        if include_past:
+            airflow_start_date = dag.start_date or dag.default_args.get("start_date")
+            if airflow_start_date:
+                tmp_date = dag.previous_schedule(reference_date)
+                while tmp_date >= airflow_start_date:
+                    cls._trigger_run_if_not_exists(
+                        dag.dag_id, run_id=f"{run_type}__{tmp_date.isoformat()}", execution_date=tmp_date, conf=conf
+                    )
+                    tmp_date = dag.previous_schedule(tmp_date)
+
+        if include_future:
+            tmp_date = dag.following_schedule(reference_date)
+            while tmp_date <= airflow_current_date:
+                cls._trigger_run_if_not_exists(
+                    dag.dag_id, run_id=f"{run_type}__{tmp_date.isoformat()}", execution_date=tmp_date, conf=conf
+                )
+                tmp_date = dag.following_schedule(tmp_date)
+
+        cls._trigger_run_if_not_exists(dag.dag_id, run_id, execution_date=execution_date, conf=conf)
+
+    @staticmethod
+    def _trigger_run_if_not_exists(
+        dag_id: str, run_id: str, execution_date: datetime.datetime, conf: dict[str, Any] | None = None
+    ):
+        import recurvedata.schedulers.airflow_trigger_dag_patch  # noqa
+
+        with create_session() as session:
+            existing_run = session.query(DagRun).filter(DagRun.dag_id == dag_id, DagRun.run_id == run_id).first()
+            if existing_run:
+                logger.info(f"Skipping existing run for {dag_id} at {run_id}")
+                return
+        logger.info(f"start trigger dag_run for {dag_id} at {run_id}")
+        trigger_dag(dag_id, run_id=run_id, execution_date=execution_date, conf=conf, replace_microseconds=False)
+        logger.info(f"finished trigger dag_run for {dag_id} at {run_id} execution_date: {execution_date}")
+
+    @staticmethod
+    @provide_session
+    def _get_rerun_earliest_execution_date(dag: DAG, session: Session = None) -> Optional[datetime.datetime]:
+        earliest_dag_run = (
+            session.query(DagRun).filter(DagRun.dag_id == dag.dag_id).order_by(DagRun.execution_date).first()
+        )
+        return earliest_dag_run and earliest_dag_run.execution_date
+
+    @classmethod
+    def rerun_job_run(
+        cls,
+        dag: DAG,
+        run_id: str | None,
+        min_execution_date: datetime.datetime | None,
+        max_execution_date: datetime.datetime | None,
+        failed_only: bool,
+    ):
+        drs: list[DagRun] = DagRun.find(
+            dag_id=dag.dag_id,
+            run_id=run_id,
+            execution_start_date=min_execution_date,
+            execution_end_date=max_execution_date,
+        )
+        if not drs:
+            logger.info(f"skip rerun, no dag_run found for {dag.dag_id} at {run_id}")
+            return
+        clear_start_date = min([dr.execution_date for dr in drs])
+        clear_end_date = max([dr.execution_date for dr in drs])
+
+        logger.info(
+            f"prepare to clear dag_run for {dag.dag_id}, start_date: {clear_start_date}, end_date: {clear_end_date}, failed_only: {failed_only}"
+        )
+
+        dag.clear(
+            start_date=clear_start_date,
+            end_date=clear_end_date,
+            only_failed=failed_only,
+        )
+
+    @classmethod
+    def rerun_task_run(
+        cls,
+        dag: DAG,
+        run_id: str,
+        node_key: str,
+        min_execution_date: datetime.datetime | None,
+        max_execution_date: datetime.datetime | None,
+        include_upstream: bool,
+        include_downstream: bool,
+        failed_only: bool,
+    ):
+        drs: list[DagRun] = DagRun.find(
+            dag_id=dag.dag_id,
+            run_id=run_id,
+            execution_start_date=min_execution_date,
+            execution_end_date=max_execution_date,
+        )
+        if not drs:
+            logger.info(f"skip rerun, no dag_run found for {dag.dag_id} at {run_id}")
+            return
+        clear_start_date = min([dr.execution_date for dr in drs])
+        clear_end_date = max([dr.execution_date for dr in drs])
+
+        clear_task_ids: list[str] = []
+        for task_id in dag.task_dict.keys():
+            if task_id.startswith(node_key):
+                clear_task_ids.append(task_id)
+
+        expanded_task_ids = set(clear_task_ids)
+        if include_upstream or include_downstream:
+            if include_upstream:
+                for task_id in clear_task_ids:
+                    if task_id in dag.task_dict:
+                        task = dag.task_dict[task_id]
+                        upstream_task_ids = [t.task_id for t in task.upstream_list]
+                        expanded_task_ids.update(upstream_task_ids)
+
+            if include_downstream:
+                for task_id in clear_task_ids:
+                    if task_id in dag.task_dict:
+                        task = dag.task_dict[task_id]
+                        downstream_task_ids = [t.task_id for t in task.downstream_list]
+                        expanded_task_ids.update(downstream_task_ids)
+
+            clear_task_ids = list(expanded_task_ids)
+
+        logger.info(
+            f"prepare to clear task: {dag.dag_id}, {clear_task_ids} start_date: {clear_start_date}, end_date: {clear_end_date}, failed_only: {failed_only}"
+        )
+
+        clear_cnt = dag.clear(
+            task_ids=clear_task_ids,
+            start_date=clear_start_date,
+            end_date=clear_end_date,
+            only_failed=failed_only,
+        )
+        logger.info(f"finish clear task: {dag.dag_id}, {clear_task_ids}, total clear: {clear_cnt} task_instances")
+
+    @classmethod
+    def init_airflow_tables(cls):
+        from airflow.settings import engine
+        from airflow.utils.db import reflect_tables
+
+        def _is_index_exists(session: Session, table_name: str, index_name: str) -> bool:
+            query = f"""
+                SELECT EXISTS (
+                    SELECT 1
+                    FROM pg_indexes
+                    WHERE tablename = {table_name!r}
+                    AND indexname = {index_name!r}
+                )
+            """
+            result = session.execute(query)
+            return result.scalar()
+
+        with create_session() as session:
+            metadata = reflect_tables(tables=["dag_run", "task_instance"], session=session)
+            dag_run = Table("dag_run", metadata, autoload_with=engine)
+            task_instance = Table("task_instance", metadata, autoload_with=engine)
+
+            dag_run_updated_at_idx = Index("ix_dag_run_updated_at", dag_run.c.updated_at)
+            task_instance_updated_at_idx = Index("ix_task_instance_updated_at", task_instance.c.updated_at)
+
+            with engine.connect():
+                if not _is_index_exists(session, "dag_run", "ix_dag_run_updated_at"):
+                    logger.info("start creating index on dag_run.updated_at")
+                    session.execute(CreateIndex(dag_run_updated_at_idx))
+                    logger.info("Created index on dag_run.updated_at")
+                else:
+                    logger.info("Skipped creating index on dag_run.updated_at")
+
+                if not _is_index_exists(session, "task_instance", "ix_task_instance_updated_at"):
+                    logger.info("start creating index on task_instance.updated_at")
+                    session.execute(CreateIndex(task_instance_updated_at_idx))
+                    logger.info("Created index on task_instance.updated_at")
+                else:
+                    logger.info("Skipped creating index on task_instance.updated_at")
+
+    @classmethod
+    def mark_dag_run_success(cls, dag: DAG, run_id: str = None, whole_dag: bool = False):
+        if not run_id:
+            if not whole_dag:
+                logger.info("mark_dag_run need a run_id, skip mark_dag_run")
+                return
+            run_ids = cls._get_dag_run_ids(dag)
+            for run_id in run_ids:
+                cls.mark_dag_run_success(dag, run_id)
+            return
+        logger.info(f"start mark dag run {dag.dag_id} {run_id} to success")
+        set_dag_run_state_to_success(dag=dag, run_id=run_id, commit=True)
+
+    @classmethod
+    def mark_dag_run_failed(cls, dag: DAG, run_id: str = None, whole_dag: bool = False):
+        """
+        Mark not-yet-running tasks as skipped, mark running tasks as failed,
+        and keep finished tasks unchanged.
+        """
+        if not run_id:
+            if not whole_dag:
+                logger.info("mark_dag_run need a run_id, skip mark_dag_run")
+                return
+            run_ids = cls._get_dag_run_ids(dag)
+            for run_id in run_ids:
+                cls.mark_dag_run_failed(dag, run_id)
+            return
+        logger.info(f"start mark dag run {dag.dag_id} {run_id} to failed")
+        set_dag_run_state_to_failed(dag=dag, run_id=run_id, commit=True)
+
+    @staticmethod
+    @provide_session
+    def _get_dag_run_ids(dag: DAG, session: Session = None) -> list[str]:
+        query = session.query(DagRun.run_id).filter(DagRun.dag_id == dag.dag_id)
+        return [res[0] for res in query.all()]
+
+    @classmethod
+    @provide_session
+    def delete_whole_dag_dr_ti(cls, dag: DAG, session: Session = None):
+        logger.info(f"start delete whole dag_run and task_instance for {dag.dag_id}")
+        for model in (TaskInstance, DagRun):
+            session.query(model).filter(model.dag_id == dag.dag_id).delete(synchronize_session="fetch")
+        logger.info(f"finish deleted whole dag_run and task_instance for {dag.dag_id}")
+
+    @staticmethod
+    @provide_session
+    def _set_task_run_state(dag: DAG, run_id: str, node_key: str, state: TaskInstanceState, session: Session = None):
+        logger.info(f"start set task_run {dag.dag_id} {run_id} {node_key} to {state}")
+        dag.set_task_instance_state(
+            task_id=node_key,
+            run_id=run_id,
+            state=state,
+            session=session,
+        )
+        logger.info(f"finish set task_run {dag.dag_id} {run_id} {node_key} to {state}")
+
+    @staticmethod
+    def terminate_task_run(dag: DAG, run_id: str, node_key: str):
+        AirflowDbService._set_task_run_state(dag, run_id, node_key, TaskInstanceState.FAILED)
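
For orientation, a minimal usage sketch of the service above; the dag_id and run_id values are illustrative assumptions, not values shipped in the package:

# Hypothetical driver for AirflowDbService (identifiers are illustrative).
from airflow.models import DagBag

from recurvedata.schedulers.airflow_db_process import AirflowDbService

dag = DagBag().get_dag("example_job_dag")  # assumed dag_id, for illustration only

# Unpause the DAG, then mark one run failed: running tasks become failed,
# not-yet-run tasks become skipped, finished tasks keep their state.
AirflowDbService.activate_dag(dag)
AirflowDbService.mark_dag_run_failed(dag, run_id="manual__2024-01-01T00:00:00+00:00")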
recurvedata/schedulers/airflow_operators.py
@@ -0,0 +1,61 @@
+import logging
+
+from airflow.exceptions import AirflowSkipException
+from airflow.models import TaskInstance
+from airflow.operators.bash import BashOperator
+from airflow.utils.context import Context
+from airflow.utils.task_instance_session import get_current_task_instance_session
+from sqlalchemy.orm.attributes import flag_modified
+
+from recurvedata.executors.utils import read_meta_file
+
+logger = logging.getLogger(__name__)
+
+
+class RecurveBashOperator(BashOperator):
+    def execute(self, context: Context):
+        try:
+            res = super().execute(context)
+            self.update_meta_to_task_instance_executor_config(context)
+            return res
+        except Exception:
+            self.update_meta_to_task_instance_executor_config(context)
+            raise
+
+    @staticmethod
+    def read_meta_file(context: Context) -> dict:
+        return read_meta_file(
+            context["dag"].dag_id, context["ti"].task_id, context["next_execution_date"] or context["execution_date"]
+        )
+
+    def update_meta_to_task_instance_executor_config(self, context: Context):
+        meta = self.read_meta_file(context)
+        if not meta:
+            return
+        logger.debug(f"update_meta_to_task_instance_executor_config: {str(meta)}")
+        session = get_current_task_instance_session()
+        task_instance = TaskInstance.get_task_instance(
+            dag_id=context["dag"].dag_id,
+            task_id=context["ti"].task_id,
+            run_id=context["dag_run"].run_id,
+            map_index=-1,
+            session=session,
+        )
+        if task_instance:
+            task_instance.executor_config.update(meta)
+            flag_modified(task_instance, "executor_config")
+
+
+class SkipSelfBashOperator(BashOperator):
+    ui_color = "#e8f7e4"
+
+    def execute(self, context):
+        raise AirflowSkipException("This task is skipped")
+
+
+class LinkNodeBashOperator(RecurveBashOperator):
+    ui_color = "#8DEEEE"
+
+
+class LinkErrorBashOperator(BashOperator):
+    ui_color = "red"  # not used
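
A hedged sketch of how these operators could be wired into a DAG definition; the dag_id, schedule, and commands are assumptions for illustration, and the schedule kwarg assumes Airflow 2.4+:

# Hypothetical DAG wiring (identifiers are illustrative, not from the package).
import pendulum
from airflow.models.dag import DAG

from recurvedata.schedulers.airflow_operators import RecurveBashOperator, SkipSelfBashOperator

with DAG(
    dag_id="example_recurve_job",
    start_date=pendulum.datetime(2024, 1, 1, tz="UTC"),
    schedule=None,
) as dag:
    # Runs the command, then copies the node's meta file into
    # TaskInstance.executor_config on success or failure.
    run = RecurveBashOperator(task_id="node_a", bash_command="echo run")
    # Always raises AirflowSkipException, regardless of the command.
    skip = SkipSelfBashOperator(task_id="node_b", bash_command="true")
    run >> skip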
recurvedata/schedulers/airflow_trigger_dag_patch.py
@@ -0,0 +1,117 @@
+"""
+Monkey patch for airflow/api/common/trigger_dag.py.
+
+Airflow's native _trigger_dag creates a dag run with data_interval_end = execution_date,
+which causes plan run errors (one data_interval_end may map to multiple run_ids).
+"""
+
+import json
+import logging
+from datetime import datetime
+
+import airflow.api.common.trigger_dag
+from airflow.exceptions import DagNotFound, DagRunAlreadyExists
+from airflow.models import DagBag, DagRun
+from airflow.models.dag import DAG
+from airflow.timetables.base import DataInterval
+from airflow.timetables.interval import CronDataIntervalTimetable
+from airflow.utils import timezone
+from airflow.utils.state import DagRunState
+from airflow.utils.types import DagRunType
+
+logger = logging.getLogger(__name__)
+
+
+def _recurve_get_next_data_interval(mannual_data_interval: DataInterval, dag: DAG):
+    if not isinstance(dag.timetable, CronDataIntervalTimetable):
+        return mannual_data_interval
+    next_data_interval_end = dag.timetable._get_next(mannual_data_interval.end)
+    return DataInterval(start=mannual_data_interval.end, end=next_data_interval_end)
+
+
+def _recurve_trigger_dag(
+    dag_id: str,
+    dag_bag: DagBag,
+    run_id: str | None = None,
+    conf: dict | str | None = None,
+    execution_date: datetime | None = None,
+    replace_microseconds: bool = True,
+) -> list[DagRun | None]:
+    """
+    Triggers DAG run.
+
+    :param dag_id: DAG ID
+    :param dag_bag: DAG Bag model
+    :param run_id: ID of the dag_run
+    :param conf: configuration
+    :param execution_date: date of execution
+    :param replace_microseconds: whether microseconds should be zeroed
+    :return: list of triggered dags
+    """
+    logger.info("start call _recurve_trigger_dag")
+    dag = dag_bag.get_dag(dag_id)  # prefetch dag if it is stored serialized
+
+    if dag is None or dag_id not in dag_bag.dags:
+        raise DagNotFound(f"Dag id {dag_id} not found")
+
+    execution_date = execution_date or timezone.utcnow()
+
+    if not timezone.is_localized(execution_date):
+        raise ValueError("The execution_date should be localized")
+
+    if replace_microseconds:
+        execution_date = execution_date.replace(microsecond=0)
+
+    if dag.default_args and "start_date" in dag.default_args:
+        min_dag_start_date = dag.default_args["start_date"]
+        if min_dag_start_date and execution_date < min_dag_start_date:
+            raise ValueError(
+                f"The execution_date [{execution_date.isoformat()}] should be >= start_date "
+                f"[{min_dag_start_date.isoformat()}] from DAG's default_args"
+            )
+    logical_date = timezone.coerce_datetime(execution_date)
+
+    data_interval = dag.timetable.infer_manual_data_interval(run_after=logical_date)
+
+    # recurve update start #
+    recurve_external_trigger = True
+    inferred_run_type = DagRunType.from_run_id(run_id)
+    if inferred_run_type == DagRunType.SCHEDULED:
+        new_data_interval = _recurve_get_next_data_interval(data_interval, dag)
+        logger.info(f"adjust data interval: {data_interval} -> {new_data_interval}")
+        data_interval = new_data_interval
+        recurve_external_trigger = False
+    # recurve update end #
+
+    run_id = run_id or dag.timetable.generate_run_id(
+        run_type=DagRunType.MANUAL, logical_date=logical_date, data_interval=data_interval
+    )
+    dag_run = DagRun.find_duplicate(dag_id=dag_id, execution_date=execution_date, run_id=run_id)
+
+    if dag_run:
+        raise DagRunAlreadyExists(dag_run=dag_run, execution_date=execution_date, run_id=run_id)
+
+    run_conf = None
+    if conf:
+        run_conf = conf if isinstance(conf, dict) else json.loads(conf)
+
+    # recurve update start #
+    dag_runs = []
+    dags_to_run = [dag, *dag.subdags]
+    for _dag in dags_to_run:
+        dag_run = _dag.create_dagrun(
+            run_id=run_id,
+            execution_date=execution_date,
+            state=DagRunState.QUEUED,
+            conf=run_conf,
+            external_trigger=recurve_external_trigger,
+            dag_hash=dag_bag.dags_hash.get(dag_id),
+            data_interval=data_interval,
+        )
+        dag_runs.append(dag_run)
+    # recurve update end #
+
+    return dag_runs
+
+
+logger.info("monkey patch airflow.api.common.trigger_dag._trigger_dag")
+airflow.api.common.trigger_dag._trigger_dag = _recurve_trigger_dag
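
The patch is applied as an import side effect, which is how _trigger_run_if_not_exists in airflow_db_process.py above picks it up; a minimal sketch, with an assumed dag_id and run_id:

# Importing the module rebinds airflow.api.common.trigger_dag._trigger_dag,
# so later trigger_dag() calls flow through the recurve variant.
import recurvedata.schedulers.airflow_trigger_dag_patch  # noqa: F401

from airflow.api.common.trigger_dag import trigger_dag

# A "scheduled__" run_id makes the patch shift the data interval forward and
# clear external_trigger; other run_ids keep the inferred manual interval.
trigger_dag(
    "example_job_dag",
    run_id="scheduled__2024-01-01T00:00:00+00:00",
    replace_microseconds=False,
)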
recurvedata/schedulers/base.py
@@ -0,0 +1,99 @@
+import datetime
+import logging
+from dataclasses import dataclass
+from functools import cached_property
+from typing import ClassVar, Optional
+
+import pendulum
+
+from recurvedata.schedulers.client import SchedulerClient
+from recurvedata.schedulers.schemas import JobListResponse
+from recurvedata.utils.dataclass import init_dataclass_from_dict
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class DagSchema:
+    id: int  # recurve job_id
+    name: str  # recurve job_name
+    project_id: int
+    project_name: str
+    workflow_id: int
+    workflow_name: str
+    workflow_version: str
+    graph: list[tuple[str, str]]  # [(upstream_node_key, downstream_node_key),]
+    nodes: list
+    schedule_type: str
+    schedule_interval: str
+    timezone: str
+
+    owner_username: str
+    # scheduler_args: dict
+    start_date: Optional[datetime.datetime] = None
+    end_date: Optional[datetime.datetime] = None
+    scheduler_settings: Optional[dict] = None
+    retries: Optional[int] = None
+    retry_delay: Optional[int] = None
+
+    # attr for modeling pipeline
+    skip_data_tests: bool = False
+
+    @property
+    def job_id(self):
+        return self.id
+
+
+class SchedulerBase(object):
+    DEFAULT_DAG_OWNER: ClassVar[str] = "recurve"
+
+    def __init__(self, sharding_size: int = 1, sharding_key: int = 0):
+        self.sharding_size = sharding_size
+        self.sharding_key = sharding_key
+        self.client: SchedulerClient = self.init_client()
+
+    @cached_property
+    def localtz(self):  # todo: move to CONF
+        return pendulum.timezone("Asia/Shanghai")
+
+    @classmethod
+    def init_client(cls) -> SchedulerClient:
+        return SchedulerClient()
+
+    def list_scheduler_dag(self):
+        """
+        Fetch every matching dag definition from the sdk.
+        :return:
+        """
+
+        jobs: JobListResponse = self.client.list_jobs(sharding_size=self.sharding_size, sharding_key=self.sharding_key)
+
+        for job in jobs.jobs:
+            dag = init_dataclass_from_dict(DagSchema, job.model_dump())
+            yield dag
+
+    def create_dag(self, row: DagSchema):
+        """
+        Build the object for the corresponding scheduler (airflow/...).
+        :param row:
+        :return:
+        """
+        try:
+            return self.create_dag_impl(row)
+        except Exception as e:
+            logger.exception("failed to generate dag %s: %s", row.id, e)
+            return  # todo: add new client api to notify
+
+    def create_dag_impl(self, row: DagSchema):
+        pass
+
+    def execute(self, *args, **kwargs):
+        """
+        Entry point.
+        :param args:
+        :param kwargs:
+        :return:
+        """
+        for row in self.list_scheduler_dag():
+            pass
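
For context, a hypothetical minimal subclass showing the contract that create_dag_impl and execute define; the package's real implementation lives in recurvedata/schedulers/airflow.py (listed above, not shown in this hunk):

# Hypothetical subclass sketch, only to illustrate the SchedulerBase contract.
from recurvedata.schedulers.base import DagSchema, SchedulerBase


class PrintScheduler(SchedulerBase):
    def create_dag_impl(self, row: DagSchema):
        # A real scheduler would translate row.graph/row.nodes into
        # scheduler-native objects (e.g. an Airflow DAG); this stub just prints.
        print(f"would build dag for job {row.job_id} ({row.name})")

    def execute(self, *args, **kwargs):
        # Iterate the sharded job list and materialize each dag.
        for row in self.list_scheduler_dag():
            self.create_dag(row)


# PrintScheduler(sharding_size=1, sharding_key=0).execute()  # needs a reachable SchedulerClient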