dltaf 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/__init__.py +23 -0
- cli/generate_dags.py +413 -0
- cli/show_lineage.py +146 -0
- dag_builder/__init__.py +30 -0
- dag_builder/builder.py +134 -0
- dag_builder/dag_builder_core.py +253 -0
- dag_builder/dependency_resolver.py +286 -0
- dag_builder/manifest_loader.py +157 -0
- dag_builder/task_factory.py +772 -0
- dlt_pipelines/__init__.py +0 -0
- dlt_pipelines/mongodb_runtime/__init__.py +1 -0
- dlt_pipelines/mongodb_runtime/mongodb/__init__.py +116 -0
- dlt_pipelines/mongodb_runtime/mongodb/helpers.py +624 -0
- dlt_utils/__init__.py +7 -0
- dlt_utils/airflow_dlt_runner.py +78 -0
- dlt_utils/clickhouse_helpers.py +292 -0
- dlt_utils/manifest_runner.py +1008 -0
- dlt_utils/naming.py +47 -0
- dlt_utils/oracle_serialization.py +49 -0
- dlt_utils/vault_env.py +336 -0
- dltaf/__init__.py +27 -0
- dltaf/cli.py +133 -0
- dltaf/examples/__init__.py +0 -0
- dltaf/examples/manifests/__init__.py +0 -0
- dltaf/examples/manifests/smoke_mongodb_catalog.yaml +25 -0
- dltaf/examples/manifests/smoke_oracle_custom_sql.yaml +28 -0
- dltaf/examples/manifests/smoke_sql_database_catalog.yaml +26 -0
- dltaf/examples/sql/__init__.py +0 -0
- dltaf/examples/sql/oracle_smoke_query.sql +1 -0
- dltaf/plugins.py +392 -0
- dltaf-0.1.0.dist-info/METADATA +242 -0
- dltaf-0.1.0.dist-info/RECORD +39 -0
- dltaf-0.1.0.dist-info/WHEEL +5 -0
- dltaf-0.1.0.dist-info/entry_points.txt +8 -0
- dltaf-0.1.0.dist-info/licenses/LICENSE +159 -0
- dltaf-0.1.0.dist-info/top_level.txt +6 -0
- lineage/__init__.py +40 -0
- lineage/dependency_graph.py +298 -0
- lineage/manifest_dependency_resolver.py +522 -0
cli/__init__.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""CLI утилиты для управления dlt пайплайнами и Airflow DAG'ами.
|
|
2
|
+
|
|
3
|
+
Этот пакет содержит производственные command-line инструменты для работы с dlt:
|
|
4
|
+
- show_lineage: Визуализация и анализ зависимостей между пайплайнами (lineage graph)
|
|
5
|
+
- generate_dags: Автоматическая генерация Airflow DAG файлов из YAML манифестов
|
|
6
|
+
|
|
7
|
+
Примеры использования:
|
|
8
|
+
# Показать lineage (граф зависимостей пайплайнов)
|
|
9
|
+
python -m cli.show_lineage
|
|
10
|
+
python -m cli.show_lineage --format json # JSON формат для программной обработки
|
|
11
|
+
python -m cli.show_lineage --format mermaid # Mermaid диаграмма для визуализации
|
|
12
|
+
|
|
13
|
+
# Генерация Airflow DAG файлов из YAML манифестов
|
|
14
|
+
python -m cli.generate_dags # инкрементальная генерация
|
|
15
|
+
python -m cli.generate_dags --clean # удалить старые DAG'и, сгенерировать заново
|
|
16
|
+
|
|
17
|
+
# Через установленные CLI команды (после pip install -e .):
|
|
18
|
+
dlt-show-lineage
|
|
19
|
+
dlt-show-lineage --format mermaid
|
|
20
|
+
dlt-generate-dags --clean
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
__all__ = ["show_lineage", "generate_dags"]
|
cli/generate_dags.py
ADDED
|
@@ -0,0 +1,413 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""CLI for generating Airflow DAG files from YAML manifests."""
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import subprocess
|
|
6
|
+
import sys
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any, Dict, List, Tuple
|
|
9
|
+
|
|
10
|
+
from lineage import ManifestDependencyResolver, DependencyGraph
|
|
11
|
+
|
|
12
|
+
_PACKAGE_ROOT = Path(__file__).resolve().parent.parent
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _resolve_path(path: Path, *, base: Path = _PACKAGE_ROOT) -> Path:
|
|
16
|
+
"""Разрешает путь: относительный — от base, абсолютный — как есть."""
|
|
17
|
+
p = Path(path)
|
|
18
|
+
if not p.is_absolute():
|
|
19
|
+
p = base / p
|
|
20
|
+
return p.resolve()
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# Templates for the generated DAG files.
# They are rendered with str.format(), so braces that must appear literally in
# the generated code are doubled ({{ and }}).
# SIMPLE_DAG_TEMPLATE is used for a pipeline that has no dependencies.
SIMPLE_DAG_TEMPLATE = '''"""DAG для {pipeline_name}.

⚠️ АВТОМАТИЧЕСКИ СГЕНЕРИРОВАН из YAML манифеста для CI/CD валидации.
Не редактируйте вручную — изменения будут перезаписаны при следующей генерации!

Простой независимый пайплайн без зависимостей.
{description}
"""

import sys
from pathlib import Path
from datetime import timedelta

# Добавить корневую директорию проекта в PYTHONPATH для импорта модулей
_dag_file_path = Path(__file__).resolve()
_project_root = _dag_file_path.parent.parent
if str(_project_root) not in sys.path:
    sys.path.insert(0, str(_project_root))

from airflow import DAG
from dag_builder import build_dag_from_yaml
import pendulum

default_args = {{
    "owner": "data-platform",
    "retries": {retries},
    "retry_delay": timedelta(minutes={retry_delay_minutes}),
    "execution_timeout": timedelta(hours={execution_timeout_hours}),
}}

with DAG(
    dag_id="{dag_id}",
    default_args=default_args,
    start_date=pendulum.datetime(2024, 1, 1, tz="UTC"),
    schedule="{schedule}",
    catchup=False,
    max_active_runs=1,
    tags={tags},
    doc_md=__doc__,
) as dag:
    build_dag_from_yaml(
        dag=dag,
        manifest_path="{manifest_filename}",
    )
'''
|
|
69
|
+
|
|
70
|
+
WITH_DEPS_DAG_TEMPLATE = '''"""DAG для {pipeline_name} с автоматической загрузкой зависимостей.
|
|
71
|
+
|
|
72
|
+
⚠️ АВТОМАТИЧЕСКИ СГЕНЕРИРОВАН из YAML манифеста для CI/CD валидации.
|
|
73
|
+
Не редактируйте вручную — изменения будут перезаписаны при следующей генерации!
|
|
74
|
+
|
|
75
|
+
Этот DAG автоматически загружает все зависимости:
|
|
76
|
+
{dependencies_list}
|
|
77
|
+
|
|
78
|
+
Все зависимости автоматически разрешаются и выстраиваются в граф
|
|
79
|
+
через native Airflow dependencies (>>).
|
|
80
|
+
{description}
|
|
81
|
+
"""
|
|
82
|
+
|
|
83
|
+
import sys
|
|
84
|
+
from pathlib import Path
|
|
85
|
+
from datetime import timedelta
|
|
86
|
+
|
|
87
|
+
# Добавить корневую директорию проекта в PYTHONPATH для импорта модулей
|
|
88
|
+
_dag_file_path = Path(__file__).resolve()
|
|
89
|
+
_project_root = _dag_file_path.parent.parent
|
|
90
|
+
if str(_project_root) not in sys.path:
|
|
91
|
+
sys.path.insert(0, str(_project_root))
|
|
92
|
+
|
|
93
|
+
from airflow import DAG
|
|
94
|
+
from dag_builder import build_dag_from_yaml
|
|
95
|
+
import pendulum
|
|
96
|
+
|
|
97
|
+
default_args = {{
|
|
98
|
+
"owner": "data-platform",
|
|
99
|
+
"retries": {retries},
|
|
100
|
+
"retry_delay": timedelta(minutes={retry_delay_minutes}),
|
|
101
|
+
"execution_timeout": timedelta(hours={execution_timeout_hours}),
|
|
102
|
+
}}
|
|
103
|
+
|
|
104
|
+
with DAG(
|
|
105
|
+
dag_id="{dag_id}",
|
|
106
|
+
default_args=default_args,
|
|
107
|
+
start_date=pendulum.datetime(2024, 1, 1, tz="UTC"),
|
|
108
|
+
schedule="{schedule}",
|
|
109
|
+
catchup=False,
|
|
110
|
+
max_active_runs=1,
|
|
111
|
+
tags={tags},
|
|
112
|
+
doc_md=__doc__,
|
|
113
|
+
) as dag:
|
|
114
|
+
build_dag_from_yaml(
|
|
115
|
+
dag=dag,
|
|
116
|
+
manifest_path="{manifest_filename}",
|
|
117
|
+
)
|
|
118
|
+
'''
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
class DAGGenerator:
    """Generate Airflow DAG files from YAML manifests (one DAG file per manifest)."""

    # Fallback cron schedules, keyed by the first "_"-separated token of the
    # source segment of the pipeline name. Used when the manifest sets no schedule.
    _SCHEDULE_DEFAULTS = {
        "postgres": "0 3 * * *",
        "oracle": "0 2 * * *",
        "mongodb": "0 5 * * *",
        "pkb": "0 6 * * *",
    }

    def __init__(
        self,
        manifests_dir: Path,
        output_dir: Path,
        clean: bool = False,
    ):
        """Store the settings and create the manifest dependency resolver.

        Args:
            manifests_dir: Directory handed to ``ManifestDependencyResolver``.
            output_dir: Directory the generated DAG files are written to.
            clean: When true, ``generate`` deletes previously generated DAG
                files before writing new ones.
        """
        self.manifests_dir = manifests_dir
        self.output_dir = output_dir
        self.clean = clean

        self.resolver = ManifestDependencyResolver(manifests_dir)
        # Remains None until generate() has built the graph successfully.
        self.graph: DependencyGraph = None

    def generate(self) -> Tuple[int, int]:
        """Generate the DAG files.

        Returns:
            ``(generated_count, skipped_count)``. If the dependency graph
            cannot be built, the error is printed and ``(0, 0)`` is returned.
        """
        print(f"Scanning manifests in: {self.manifests_dir}")

        # Build the dependency graph first; nothing is written if this fails.
        try:
            self.graph = self.resolver.build_graph()
        except Exception as e:
            print(f"ERROR: failed to build dependency graph: {e}")
            return 0, 0

        print(f"Found manifests: {len(self.resolver._manifest_map)}")

        if self.clean:
            self._clean_old_dags()

        # Simple mode: one manifest produces one DAG file.
        return self._generate_simple_dags()

    def _clean_old_dags(self) -> None:
        """Delete previously generated DAG files, keeping those tracked in git."""
        print("\nCleaning previously generated DAG files...")

        def _is_tracked_in_git(dag_file: Path) -> bool:
            """Return True when ``git ls-files`` reports *dag_file* as tracked."""
            try:
                # Run git from the file's own directory so the repository is
                # found from the file location, not from the caller's cwd.
                # With the caller's cwd, running the tool from outside the
                # repository made git fail and tracked files were deleted.
                result = subprocess.run(
                    ["git", "ls-files", "--error-unmatch", dag_file.name],
                    capture_output=True,
                    text=True,
                    timeout=5,
                    cwd=dag_file.parent,
                )
                return result.returncode == 0
            except (FileNotFoundError, subprocess.TimeoutExpired):
                # git is not installed or did not answer in time.
                return False

        patterns = ["dlt__*.py", "standalone_*.py", "domain__*.py", "simple__*.py"]
        deleted = 0

        for pattern in patterns:
            for dag_file in self.output_dir.glob(pattern):
                if _is_tracked_in_git(dag_file):
                    print(f" protected: {dag_file.name} (tracked in git)")
                    continue

                print(f" delete: {dag_file.name}")
                dag_file.unlink()
                deleted += 1

        print(f"Removed files: {deleted}")

    def _generate_simple_dags(self) -> Tuple[int, int]:
        """Write one DAG file per manifest.

        Returns:
            ``(generated_count, skipped_count)``; a pipeline whose rendering or
            writing raises is reported and counted as skipped.
        """
        print("\nGenerating DAG files (1 manifest = 1 DAG)...")

        generated = 0
        skipped = 0

        for pipeline_name, manifest_path in self.resolver._manifest_map.items():
            try:
                # Pipelines with dependencies get the "with deps" template
                # and a "_with_deps" file name suffix.
                if self.resolver.get_dependencies(pipeline_name):
                    dag_content = self._render_with_deps_dag(pipeline_name, manifest_path)
                    dag_filename = f"{pipeline_name}_with_deps.py"
                else:
                    dag_content = self._render_simple_dag(pipeline_name, manifest_path)
                    dag_filename = f"{pipeline_name}.py"

                output_path = self.output_dir / dag_filename
                output_path.write_text(dag_content, encoding="utf-8")

                print(f" ok {dag_filename}")
                generated += 1

            except Exception as e:
                # Best effort: one broken manifest must not stop the others.
                print(f" error {pipeline_name}: {e}")
                skipped += 1

        return generated, skipped

    def _load_manifest_config(self, manifest_path: Path) -> Dict[str, Any]:
        """Load a YAML manifest and return its top-level mapping.

        Raises:
            ValueError: If the document's top level is not a mapping.
        """
        import yaml

        with open(manifest_path, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f)

        if not isinstance(data, dict):
            raise ValueError(f"Манифест должен быть YAML объектом: {manifest_path}")

        return data

    def _extract_airflow_params(self, manifest_data: Dict[str, Any], pipeline_name: str) -> Dict[str, Any]:
        """Extract Airflow parameters from a manifest, falling back to defaults.

        Sections that are missing or explicitly null (``airflow``,
        ``default_args``, ``task``) are treated as empty mappings.

        Args:
            manifest_data: Parsed manifest mapping.
            pipeline_name: Pipeline name; its ``"__"``-separated segments
                supply the fallback schedule and tags.

        Returns:
            Dict with keys ``schedule``, ``tags``, ``retries``,
            ``retry_delay_minutes``, ``execution_timeout_hours`` and
            ``default_args``.
        """
        # "or {}" also covers a key that is present but null in the YAML.
        airflow_config = manifest_data.get("airflow") or {}

        # Source and dataset are taken from the pipeline name segments.
        # NOTE(review): the source segment is only used when the name has at
        # least four segments - confirm against the naming convention.
        parts = pipeline_name.split("__")
        source = parts[1] if len(parts) >= 4 else "unknown"
        dataset = parts[4] if len(parts) >= 5 else "unknown"
        source_kind = source.split("_")[0]

        schedule = airflow_config.get("schedule")
        if not schedule:
            schedule = self._SCHEDULE_DEFAULTS.get(source_kind, "0 * * * *")

        tags = airflow_config.get("tags")
        if not tags:
            tags = ["dlt", source_kind, dataset]
        elif isinstance(tags, str):
            # A scalar "tags: foo" means one tag, not one tag per character.
            tags = [tags]

        default_args = airflow_config.get("default_args") or {}
        task_config = airflow_config.get("task") or {}

        return {
            "schedule": schedule,
            "tags": tags,
            "retries": default_args.get("retries", 2),
            "retry_delay_minutes": default_args.get("retry_delay_minutes", 5),
            "execution_timeout_hours": task_config.get("execution_timeout_hours", 2),
            "default_args": default_args,
        }

    @staticmethod
    def _format_tags(tags: List[str]) -> str:
        """Render *tags* as the source text of a Python list of string literals.

        Each tag is emitted as a double-quoted, escaped literal so that a quote
        or backslash inside a tag cannot break the generated DAG file.
        """
        import json

        literals = [json.dumps(str(tag), ensure_ascii=False) for tag in tags]
        return f"[{', '.join(literals)}]"

    def _render_simple_dag(self, pipeline_name: str, manifest_path: Path) -> str:
        """Render the DAG source for a pipeline without dependencies."""
        manifest_data = self._load_manifest_config(manifest_path)
        airflow_params = self._extract_airflow_params(manifest_data, pipeline_name)

        return SIMPLE_DAG_TEMPLATE.format(
            pipeline_name=pipeline_name,
            description="",
            dag_id=pipeline_name,
            manifest_filename=manifest_path.name,
            schedule=airflow_params["schedule"],
            tags=self._format_tags(airflow_params["tags"]),
            retries=airflow_params["retries"],
            retry_delay_minutes=airflow_params["retry_delay_minutes"],
            execution_timeout_hours=airflow_params["execution_timeout_hours"],
        )

    def _render_with_deps_dag(self, pipeline_name: str, manifest_path: Path) -> str:
        """Render the DAG source for a pipeline that has dependencies."""
        manifest_data = self._load_manifest_config(manifest_path)
        airflow_params = self._extract_airflow_params(manifest_data, pipeline_name)

        # The docstring of the generated file lists the pipeline itself
        # followed by all of its transitive dependencies.
        all_deps = self._get_transitive_dependencies(pipeline_name)
        deps_list = "\n".join([f"- {dep}" for dep in [pipeline_name] + all_deps])

        # Add the "with-dependencies" tag unless the manifest already has it.
        tags = list(airflow_params["tags"])
        if "with-dependencies" not in tags:
            tags.append("with-dependencies")

        return WITH_DEPS_DAG_TEMPLATE.format(
            pipeline_name=pipeline_name,
            dependencies_list=deps_list,
            description=f"\nВАЖНО: Запускается после зависимостей ({airflow_params['schedule']}).",
            dag_id=pipeline_name,
            manifest_filename=manifest_path.name,
            schedule=airflow_params["schedule"],
            tags=self._format_tags(tags),
            retries=airflow_params["retries"],
            retry_delay_minutes=airflow_params["retry_delay_minutes"],
            execution_timeout_hours=airflow_params["execution_timeout_hours"],
        )

    def _get_transitive_dependencies(self, pipeline_name: str) -> List[str]:
        """Return all transitive dependencies of *pipeline_name*.

        Dependencies are collected depth-first (each dependency is recorded
        after its own dependencies) and the collected list is returned reversed.
        Each name appears once.
        """
        collected: List[str] = []
        recorded = set()  # mirrors `collected` for O(1) membership tests
        visited = set()

        def visit(name: str) -> None:
            if name in visited:
                return
            visited.add(name)

            for dep in self.resolver.get_dependencies(name):
                visit(dep)
                if dep not in recorded:
                    recorded.add(dep)
                    collected.append(dep)

        visit(pipeline_name)
        return collected[::-1]
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
def main():
    """Parse the command line, generate the DAG files and return an exit code.

    Returns:
        ``0`` when every manifest produced a DAG file; ``1`` when the manifests
        directory does not exist, the dependency graph could not be built, or
        at least one pipeline was skipped.
    """
    parser = argparse.ArgumentParser(
        description="Генерация Airflow DAG файлов из YAML манифестов"
    )
    parser.add_argument(
        "--manifests-dir",
        type=Path,
        default=Path("dlt_pipelines/manifests"),
        help="Директория с YAML манифестами (default: dlt_pipelines/manifests)",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=Path("dags"),
        help="Директория для генерации DAG файлов (default: dags)",
    )
    parser.add_argument(
        "--clean",
        action="store_true",
        help="Удалить старые DAG файлы перед генерацией",
    )

    args = parser.parse_args()

    # Relative paths are resolved against the package root, not the cwd.
    manifests_dir = _resolve_path(args.manifests_dir)
    output_dir = _resolve_path(args.output_dir)

    if not manifests_dir.exists():
        print(f"ОШИБКА: Директория с манифестами не найдена: {manifests_dir}")
        return 1

    output_dir.mkdir(parents=True, exist_ok=True)

    print("=" * 70)
    print("Генератор Airflow DAG файлов")
    print("=" * 70)

    generator = DAGGenerator(
        manifests_dir=manifests_dir,
        output_dir=output_dir,
        clean=args.clean,
    )

    generated, skipped = generator.generate()

    # generate() reports a failed graph build as (0, 0) and leaves
    # generator.graph as None. Without this check that failure would print the
    # success banner and exit 0. generate() has already printed the error.
    if generator.graph is None:
        return 1

    print("\n" + "=" * 70)
    print("OK Генерация завершена!")
    print(f" Создано DAG'ов: {generated}")
    print(f" Пропущено: {skipped}")
    print("=" * 70)

    return 0 if skipped == 0 else 1


if __name__ == "__main__":
    sys.exit(main())
|
cli/show_lineage.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""CLI for rendering lineage and dependency reports for dltaf manifests."""
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import json
|
|
6
|
+
import sys
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
# Добавить родительскую директорию в путь для импортов
|
|
10
|
+
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
11
|
+
|
|
12
|
+
from lineage import ManifestDependencyResolver, CyclicDependencyError
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def format_text_report(resolver: ManifestDependencyResolver) -> str:
    """Return the resolver's lineage report as text.

    Failures are not raised: a cyclic dependency yields an error message that
    includes the cycle, any other exception yields a plain error message.
    """
    try:
        report = resolver.get_lineage_report()
    except CyclicDependencyError as exc:
        cycle_path = " -> ".join(exc.cycle)
        return f"ERROR: {exc}\n\nCycle: {cycle_path}"
    except Exception as exc:
        return f"ERROR: {exc}"
    return report
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def format_json_report(resolver: ManifestDependencyResolver) -> str:
    """Return the dependency report as a JSON string.

    On success the document has ``status``, ``total_pipelines``, ``pipelines``
    and ``execution_order``. On failure it has ``status: "error"`` with the
    error kind and message (plus the cycle for a cyclic dependency).
    """

    def _dump(payload) -> str:
        # Single place for the serialisation settings used by every branch.
        return json.dumps(payload, indent=2, ensure_ascii=False)

    try:
        graph = resolver.build_graph(validate=True)

        total = len(resolver._manifest_map)
        topological = graph.topological_sort()
        levels = graph.get_execution_order()

        pipelines = {}
        for name in resolver._manifest_map:
            node = graph.get_node(name)
            pipelines[name] = {
                "dependencies": list(node.dependencies) if node else [],
                "dependents": list(graph.get_dependents(name)),
                "transitive_dependencies": list(graph.get_transitive_dependencies(name)),
                "manifest_path": str(resolver.get_manifest_path(name)),
            }

        return _dump({
            "status": "success",
            "total_pipelines": total,
            "pipelines": pipelines,
            "execution_order": {
                "topological": topological,
                "levels": levels,
            },
        })

    except CyclicDependencyError as exc:
        return _dump({
            "status": "error",
            "error": "cyclic_dependency",
            "message": str(exc),
            "cycle": exc.cycle,
        })
    except Exception as exc:
        return _dump({
            "status": "error",
            "error": type(exc).__name__,
            "message": str(exc),
        })
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def format_mermaid_diagram(resolver: ManifestDependencyResolver) -> str:
    """Return the dependency graph as Mermaid ``graph TD`` text.

    Failures are not raised; an explanatory message is returned instead.
    """

    def _node_id(name: str) -> str:
        # Node identifiers have "-" and "." replaced with "_".
        return name.replace("-", "_").replace(".", "_")

    try:
        graph = resolver.build_graph(validate=True)

        lines = ["graph TD"]

        # Nodes: one per known pipeline, labelled with the original name.
        lines += [f" {_node_id(name)}[{name}]" for name in resolver._manifest_map]

        # Edges: "dep --> pipeline" means dep runs BEFORE pipeline.
        lines.extend(
            f" {_node_id(dep)} --> {_node_id(name)}"
            for name in resolver._manifest_map
            for dep in graph.get_dependencies(name)
        )

        return "\n".join(lines)

    except CyclicDependencyError as exc:
        cycle_path = " -> ".join(exc.cycle)
        return f"Could not generate diagram: cyclic dependency detected\n\nCycle: {cycle_path}"
    except Exception as exc:
        return f"Could not generate diagram: {exc}"
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def main():
    """Parse the command line, print the requested report and exit.

    Exits with status 0 when the dependency graph validates, and with status 1
    when the manifests directory is missing, validation fails, or any other
    error occurs.
    """
    parser = argparse.ArgumentParser(
        description="Render lineage and dependency reports for dltaf manifests",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    parser.add_argument(
        "--manifests-dir",
        type=Path,
        default=Path(__file__).resolve().parent.parent / "dlt_pipelines" / "manifests",
        help="Directory with YAML manifests (default: dlt_pipelines/manifests)",
    )

    parser.add_argument(
        "--format",
        choices=["text", "json", "mermaid"],
        default="text",
        help="Output format (default: text)",
    )

    args = parser.parse_args()

    if not args.manifests_dir.exists():
        print(f"ERROR: manifests directory not found: {args.manifests_dir}", file=sys.stderr)
        sys.exit(1)

    # Output format name -> function that renders the report.
    renderers = {
        "text": format_text_report,
        "json": format_json_report,
        "mermaid": format_mermaid_diagram,
    }

    try:
        resolver = ManifestDependencyResolver(args.manifests_dir)

        render = renderers.get(args.format)
        if render is not None:
            print(render(resolver))

        # The renderers return error text instead of raising, so the graph
        # is validated once more here to choose the exit status.
        try:
            resolver.build_graph(validate=True)
        except (CyclicDependencyError, Exception):
            exit_status = 1
        else:
            exit_status = 0
        sys.exit(exit_status)

    except Exception as e:
        print(f"ERROR: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
|
dag_builder/__init__.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""Модуль для динамического построения Airflow DAG из YAML манифестов.
|
|
2
|
+
|
|
3
|
+
Этот модуль предоставляет функцию build_dag_from_yaml() для создания DAG'ов
|
|
4
|
+
с автоматическим управлением зависимостями между пайплайнами.
|
|
5
|
+
|
|
6
|
+
Основные компоненты:
|
|
7
|
+
- build_dag_from_yaml: Главная функция для построения DAG из манифестов
|
|
8
|
+
- ManifestLoader: Загрузчик и валидатор YAML манифестов
|
|
9
|
+
- DependencyResolver: Резолвер зависимостей между манифестами
|
|
10
|
+
- TaskFactory: Фабрика для создания Airflow задач
|
|
11
|
+
- DAGBuilder: Построитель графа задач в DAG
|
|
12
|
+
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from dag_builder.builder import build_dag_from_yaml
|
|
16
|
+
from dag_builder.manifest_loader import ManifestLoader
|
|
17
|
+
from dag_builder.dependency_resolver import DependencyResolver
|
|
18
|
+
from dag_builder.task_factory import TaskFactory
|
|
19
|
+
from dag_builder.dag_builder_core import DAGBuilder
|
|
20
|
+
|
|
21
|
+
__all__ = [
|
|
22
|
+
# Главная функция
|
|
23
|
+
"build_dag_from_yaml",
|
|
24
|
+
|
|
25
|
+
# Компоненты
|
|
26
|
+
"ManifestLoader",
|
|
27
|
+
"DependencyResolver",
|
|
28
|
+
"TaskFactory",
|
|
29
|
+
"DAGBuilder",
|
|
30
|
+
]
|