data-collection-framework 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_collection_framework-0.1.0.dist-info/METADATA +19 -0
- data_collection_framework-0.1.0.dist-info/RECORD +44 -0
- data_collection_framework-0.1.0.dist-info/WHEEL +5 -0
- data_collection_framework-0.1.0.dist-info/entry_points.txt +2 -0
- data_collection_framework-0.1.0.dist-info/top_level.txt +1 -0
- dcf/__init__.py +4 -0
- dcf/cli.py +841 -0
- dcf/config/__init__.py +4 -0
- dcf/config/loader.py +77 -0
- dcf/config/models.py +240 -0
- dcf/engine/__init__.py +6 -0
- dcf/engine/fetcher.py +118 -0
- dcf/engine/iterator.py +96 -0
- dcf/engine/projector.py +56 -0
- dcf/engine/runner.py +90 -0
- dcf/engine/transforms.py +41 -0
- dcf/gcp/__init__.py +0 -0
- dcf/gcp/_collector_utils.py +87 -0
- dcf/gcp/auth.py +1 -0
- dcf/gcp/batch_deploy.py +548 -0
- dcf/gcp/bootstrap.py +131 -0
- dcf/gcp/gcloud.py +42 -0
- dcf/gcp/terraform.py +151 -0
- dcf/infra/modules/batch_collector/gcp/airflow/main.tf +194 -0
- dcf/infra/modules/batch_collector/gcp/airflow/outputs.tf +9 -0
- dcf/infra/modules/batch_collector/gcp/airflow/variables.tf +52 -0
- dcf/infra/modules/batch_collector/gcp/main.tf +70 -0
- dcf/infra/modules/batch_collector/gcp/outputs.tf +4 -0
- dcf/infra/modules/batch_collector/gcp/variables.tf +40 -0
- dcf/infra/modules/batch_collector/local/airflow/main.tf +64 -0
- dcf/infra/modules/batch_collector/local/airflow/outputs.tf +9 -0
- dcf/infra/modules/batch_collector/local/airflow/variables.tf +59 -0
- dcf/infra/modules/batch_collector/local/main.tf +32 -0
- dcf/infra/modules/batch_collector/local/outputs.tf +4 -0
- dcf/infra/modules/batch_collector/local/variables.tf +25 -0
- dcf/infra/templates/airflow.Dockerfile.tftpl +6 -0
- dcf/infra/templates/batch_collector.Dockerfile.tftpl +14 -0
- dcf/infra/templates/docker-compose.yml.tftpl +76 -0
- dcf/local_deploy.py +756 -0
- dcf/project.py +23 -0
- dcf/spark_session.py +66 -0
- dcf/warehouse_reader.py +323 -0
- dcf/writer/__init__.py +3 -0
- dcf/writer/iceberg.py +315 -0
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
# Toolchain constraints for the local Airflow module: Terraform 1.x plus the
# two utility providers used here (file rendering and command execution).
terraform {
  required_providers {
    null = {
      source  = "hashicorp/null"
      version = "~> 3.0"
    }
    local = {
      source  = "hashicorp/local"
      version = "~> 2.0"
    }
  }

  required_version = ">= 1.0"
}
|
|
14
|
+
|
|
15
|
+
# Renders the Airflow Dockerfile template into the build context so that the
# image build below has a Dockerfile to work from.
# NOTE(review): the template is resolved relative to this module directory
# (path.module/templates); presumably the deploy tooling stages the templates
# next to the module before running Terraform - confirm.
resource "local_file" "dockerfile" {
  filename = "${var.build_context}/Dockerfile"

  content = templatefile(
    "${path.module}/templates/airflow.Dockerfile.tftpl",
    { target = "local" }
  )
}
|
|
21
|
+
|
|
22
|
+
# Builds the local Airflow image from the Dockerfile rendered above.
resource "null_resource" "build" {
  depends_on = [local_file.dockerfile]

  # Rebuild when the template content changes OR when the image is retagged.
  # Without the image_tag trigger a tag change would update the compose file
  # to reference an image that was never built.
  triggers = {
    content_hash = var.content_hash
    image_tag    = var.image_tag
  }

  provisioner "local-exec" {
    # Both arguments are double-quoted so that a build context path containing
    # spaces is passed to docker as a single argument.
    command = "docker build -t \"${var.image_tag}\" \"${var.build_context}\""
  }
}
|
|
33
|
+
|
|
34
|
+
# Renders docker-compose.yml for the local Airflow stack.
# The rendered file embeds the database password, the admin password and the
# Fernet key in plain text, so it is written owner-read/write only instead of
# the provider default mode (0777).
resource "local_file" "compose" {
  content = templatefile("${path.module}/templates/docker-compose.yml.tftpl", {
    image_tag      = var.image_tag
    dag_dir        = var.dag_dir
    docker_socket  = var.docker_socket
    db_password    = var.db_password
    admin_password = var.admin_password
    fernet_key     = var.fernet_key
    webserver_port = var.webserver_port
  })
  filename        = var.compose_file_path
  file_permission = "0600"
}
|
|
46
|
+
|
|
47
|
+
# Starts the local Airflow stack with docker compose and tears it down
# (including named volumes) when the resource is destroyed.
resource "null_resource" "up" {
  depends_on = [local_file.compose, null_resource.build]

  triggers = {
    content_hash = var.content_hash
    compose_hash = sha256(local_file.compose.content)
    # Stored in triggers because a destroy-time provisioner may only reference
    # values through self.
    compose_file_path = var.compose_file_path
  }

  provisioner "local-exec" {
    # The path is double-quoted so a compose file location containing spaces
    # is passed as a single argument.
    command = "docker compose -f \"${var.compose_file_path}\" up -d"
  }

  provisioner "local-exec" {
    when    = destroy
    command = "docker compose -f \"${self.triggers.compose_file_path}\" down --volumes"
  }
}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# Convenience URL built from the configured host port.
output "webserver_url" {
  value       = "http://localhost:${var.webserver_port}"
  description = "URL of the local Airflow webserver"
}
|
|
5
|
+
|
|
6
|
+
# Echoes the compose file location back to the caller unchanged.
output "compose_file" {
  value       = var.compose_file_path
  description = "Absolute path to the generated docker-compose.yml"
}
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# Tag applied by the image build and referenced by every Airflow service.
variable "image_tag" {
  description = "Docker image tag for the Airflow image (e.g. dcf-airflow-local:latest)"
  type        = string
}
|
|
5
|
+
|
|
6
|
+
# Directory that receives the rendered Dockerfile and is handed to docker build.
variable "build_context" {
  description = "Absolute path to the Airflow build context directory"
  type        = string
}
|
|
10
|
+
|
|
11
|
+
# Opaque change marker supplied by the caller; used as a provisioner trigger.
variable "content_hash" {
  description = "SHA256 of Airflow Dockerfile template — triggers rebuild when template changes"
  type        = string
}
|
|
15
|
+
|
|
16
|
+
# Host directory passed to the compose template as the DAG mount source.
variable "dag_dir" {
  description = "Absolute host path to the DAGs directory (mounted read-only into scheduler)"
  type        = string
}
|
|
20
|
+
|
|
21
|
+
# NOTE(review): this variable does not appear to be passed to either template
# by the module's main.tf as visible in this diff - confirm whether it is
# still needed or consumed elsewhere.
variable "warehouse_path" {
  description = "Absolute host path to the warehouse (for DockerOperator volume mounts)"
  type        = string
}
|
|
25
|
+
|
|
26
|
+
# Host-side Docker socket handed to the compose template; defaults to the
# standard Linux location.
variable "docker_socket" {
  description = "Host Docker socket path"
  type        = string
  default     = "/var/run/docker.sock"
}
|
|
31
|
+
|
|
32
|
+
# Marked sensitive so Terraform redacts it from plan/apply output.
variable "db_password" {
  description = "PostgreSQL password for the local Airflow database"
  type        = string
  sensitive   = true
}
|
|
37
|
+
|
|
38
|
+
# Marked sensitive so Terraform redacts it from plan/apply output.
variable "admin_password" {
  description = "Airflow webserver admin password"
  type        = string
  sensitive   = true
}
|
|
43
|
+
|
|
44
|
+
# Marked sensitive so Terraform redacts it from plan/apply output.
variable "fernet_key" {
  description = "Airflow fernet key for encrypting connection passwords"
  type        = string
  sensitive   = true
}
|
|
49
|
+
|
|
50
|
+
# Destination of the rendered compose file; also used by the up/down commands.
variable "compose_file_path" {
  description = "Absolute path where the generated docker-compose.yml will be written"
  type        = string
}
|
|
54
|
+
|
|
55
|
+
# Host port published for the Airflow webserver (container port 8080).
variable "webserver_port" {
  description = "Host port to expose the Airflow webserver on"
  type        = number
  default     = 8090

  # Reject impossible ports at plan time instead of failing later when
  # docker compose tries to publish the port.
  validation {
    condition     = var.webserver_port >= 1 && var.webserver_port <= 65535 && floor(var.webserver_port) == var.webserver_port
    error_message = "The webserver_port value must be a whole number between 1 and 65535."
  }
}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# Toolchain constraints for the local batch-collector module: Terraform 1.x
# plus the providers used for file rendering and command execution.
terraform {
  required_providers {
    null = {
      source  = "hashicorp/null"
      version = "~> 3.0"
    }
    local = {
      source  = "hashicorp/local"
      version = "~> 2.0"
    }
  }

  required_version = ">= 1.0"
}
|
|
14
|
+
|
|
15
|
+
# Renders the collector Dockerfile template into the build context; the
# java_enabled flag is forwarded to the template.
# NOTE(review): the template is resolved relative to this module directory
# (path.module/templates); presumably the deploy tooling stages the templates
# next to the module before running Terraform - confirm.
resource "local_file" "dockerfile" {
  filename = "${var.build_context}/Dockerfile"

  content = templatefile(
    "${path.module}/templates/batch_collector.Dockerfile.tftpl",
    { java_enabled = var.java_enabled }
  )
}
|
|
21
|
+
|
|
22
|
+
# Builds the collector image from the Dockerfile rendered above.
resource "null_resource" "build" {
  depends_on = [local_file.dockerfile]

  # Rebuild when the build context changes OR when the image is retagged;
  # otherwise a new tag would never actually be built.
  triggers = {
    content_hash = var.content_hash
    image_tag    = var.image_tag
  }

  provisioner "local-exec" {
    # Both arguments are double-quoted so that a build context path containing
    # spaces is passed to docker as a single argument.
    command = "docker build -t \"${var.image_tag}\" \"${var.build_context}\""
  }
}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# NOTE(review): not referenced by the module's main.tf as visible in this
# diff - confirm where the collector name is consumed.
variable "collector_name" {
  description = "dcf collector name (e.g. github_repos)"
  type        = string
}
|
|
5
|
+
|
|
6
|
+
# Directory that receives the rendered Dockerfile and is handed to docker build.
variable "build_context" {
  description = "Absolute path to the stable build context directory"
  type        = string
}
|
|
10
|
+
|
|
11
|
+
# Tag applied to the collector image by the build step.
variable "image_tag" {
  description = "Docker image tag (e.g. dcf-local/github_repos:latest)"
  type        = string
}
|
|
15
|
+
|
|
16
|
+
# Opaque change marker supplied by the caller; used as the build trigger.
variable "content_hash" {
  description = "SHA256 of build context files — triggers rebuild when changed"
  type        = string
}
|
|
20
|
+
|
|
21
|
+
# Forwarded to the Dockerfile template, where it gates the OpenJDK install.
variable "java_enabled" {
  description = "Install OpenJDK in the container (required for local Iceberg/Spark)"
  type        = bool
  default     = true
}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# Collector image template, rendered by Terraform with a java_enabled flag.
# NOTE(review): the wheel ships a package named dcf, while this template copies
# a ddt directory and runs a ddt command - confirm the names are still correct.
FROM python:3.12-slim
%{ if java_enabled ~}
# Optional JRE layer; the apt lists are removed in the same layer to keep the
# image small.
RUN apt-get update && apt-get install -y --no-install-recommends \
    openjdk-21-jre-headless && rm -rf /var/lib/apt/lists/*
%{ endif ~}
WORKDIR /app
# Install the package before copying pipelines and connectors so that edits to
# those directories do not invalidate the dependency layer.
COPY pyproject.toml .
COPY ddt/ ./ddt/
RUN pip install --no-cache-dir -e .
COPY pipelines/ ./pipelines/
COPY connectors/ ./connectors/
COPY project.yml .
# Expected to be overridden at run time with the pipeline to execute.
ENV PIPELINE_NAME=""
# exec replaces the shell so the pipeline process receives stop signals
# directly; the variable is quoted so it is always passed as one argument.
CMD ["sh", "-c", "exec ddt run \"$PIPELINE_NAME\""]
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# Local Airflow stack: Postgres, a one-shot init job, the scheduler and the
# webserver. This file is a Terraform template; the placeholders are filled in
# when the module renders it.
version: "3.8"

# Environment shared by every Airflow container (merged in through the anchor).
x-airflow-env: &airflow-env
  # The password is percent-encoded so characters such as "@", "/" or ":" cannot
  # corrupt the connection URL. urlencode writes spaces as "+", which is
  # rewritten to %20 because URL passwords are not form-decoded.
  AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:${replace(urlencode(db_password), "+", "%20")}@postgres/airflow
  AIRFLOW__CORE__EXECUTOR: LocalExecutor
  AIRFLOW__CORE__FERNET_KEY: ${fernet_key}
  # NOTE(review): the webserver secret reuses the Fernet key; consider giving
  # it a dedicated secret.
  AIRFLOW__WEBSERVER__SECRET_KEY: ${fernet_key}
  AIRFLOW__CORE__LOAD_EXAMPLES: "false"
  AIRFLOW__SCHEDULER__DAG_DIR_LIST_INTERVAL: "30"
  AIRFLOW__API__AUTH_BACKENDS: "airflow.api.auth.backend.basic_auth"

services:
  postgres:
    image: postgres:15
    environment:
      POSTGRES_USER: airflow
      # Emitted as a quoted scalar with dollar signs doubled, so neither the
      # YAML parser nor Compose interpolation can alter the password.
      POSTGRES_PASSWORD: ${jsonencode(replace(db_password, "$", "$$"))}
      POSTGRES_DB: airflow
    volumes:
      - postgres_data:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD", "pg_isready", "-U", "airflow"]
      interval: 5s
      retries: 10

  # One-shot job: migrates the metadata database and creates the admin user.
  airflow-init:
    image: ${image_tag}
    depends_on:
      postgres:
        condition: service_healthy
    environment:
      <<: *airflow-env
      # The admin password travels through the environment so the shell never
      # parses it as part of the command line.
      DCF_ADMIN_PASSWORD: ${jsonencode(replace(admin_password, "$", "$$"))}
    entrypoint:
      - bash
      - -c
      # The doubled dollar sign is the Compose escape for a literal one, so
      # bash expands the variable at run time. All lines share one indent so
      # the folded scalar collapses into a single command line.
      - >-
        airflow db migrate &&
        airflow users create
        --username admin
        --password "$$DCF_ADMIN_PASSWORD"
        --firstname Admin
        --lastname User
        --role Admin
        --email admin@ddt.local

  airflow-scheduler:
    image: ${image_tag}
    depends_on:
      airflow-init:
        condition: service_completed_successfully
    environment:
      <<: *airflow-env
    volumes:
      - ${dag_dir}:/opt/airflow/dags:ro
      # Host Docker socket, made available inside the scheduler container.
      - ${docker_socket}:/var/run/docker.sock
    command: scheduler
    restart: on-failure

  airflow-webserver:
    image: ${image_tag}
    depends_on:
      airflow-init:
        condition: service_completed_successfully
    environment:
      <<: *airflow-env
    ports:
      - "${webserver_port}:8080"
    command: webserver
    healthcheck:
      test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
      interval: 10s
      retries: 15
    restart: on-failure

volumes:
  postgres_data:
|