data-collection-framework 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. data_collection_framework-0.1.0.dist-info/METADATA +19 -0
  2. data_collection_framework-0.1.0.dist-info/RECORD +44 -0
  3. data_collection_framework-0.1.0.dist-info/WHEEL +5 -0
  4. data_collection_framework-0.1.0.dist-info/entry_points.txt +2 -0
  5. data_collection_framework-0.1.0.dist-info/top_level.txt +1 -0
  6. dcf/__init__.py +4 -0
  7. dcf/cli.py +841 -0
  8. dcf/config/__init__.py +4 -0
  9. dcf/config/loader.py +77 -0
  10. dcf/config/models.py +240 -0
  11. dcf/engine/__init__.py +6 -0
  12. dcf/engine/fetcher.py +118 -0
  13. dcf/engine/iterator.py +96 -0
  14. dcf/engine/projector.py +56 -0
  15. dcf/engine/runner.py +90 -0
  16. dcf/engine/transforms.py +41 -0
  17. dcf/gcp/__init__.py +0 -0
  18. dcf/gcp/_collector_utils.py +87 -0
  19. dcf/gcp/auth.py +1 -0
  20. dcf/gcp/batch_deploy.py +548 -0
  21. dcf/gcp/bootstrap.py +131 -0
  22. dcf/gcp/gcloud.py +42 -0
  23. dcf/gcp/terraform.py +151 -0
  24. dcf/infra/modules/batch_collector/gcp/airflow/main.tf +194 -0
  25. dcf/infra/modules/batch_collector/gcp/airflow/outputs.tf +9 -0
  26. dcf/infra/modules/batch_collector/gcp/airflow/variables.tf +52 -0
  27. dcf/infra/modules/batch_collector/gcp/main.tf +70 -0
  28. dcf/infra/modules/batch_collector/gcp/outputs.tf +4 -0
  29. dcf/infra/modules/batch_collector/gcp/variables.tf +40 -0
  30. dcf/infra/modules/batch_collector/local/airflow/main.tf +64 -0
  31. dcf/infra/modules/batch_collector/local/airflow/outputs.tf +9 -0
  32. dcf/infra/modules/batch_collector/local/airflow/variables.tf +59 -0
  33. dcf/infra/modules/batch_collector/local/main.tf +32 -0
  34. dcf/infra/modules/batch_collector/local/outputs.tf +4 -0
  35. dcf/infra/modules/batch_collector/local/variables.tf +25 -0
  36. dcf/infra/templates/airflow.Dockerfile.tftpl +6 -0
  37. dcf/infra/templates/batch_collector.Dockerfile.tftpl +14 -0
  38. dcf/infra/templates/docker-compose.yml.tftpl +76 -0
  39. dcf/local_deploy.py +756 -0
  40. dcf/project.py +23 -0
  41. dcf/spark_session.py +66 -0
  42. dcf/warehouse_reader.py +323 -0
  43. dcf/writer/__init__.py +3 -0
  44. dcf/writer/iceberg.py +315 -0
@@ -0,0 +1,64 @@
1
+ terraform {  # Version constraints for the Terraform CLI and the providers this module uses.
2
+ required_version = ">= 1.0"
3
+ required_providers {
4
+ local = {  # supplies the local_file resources below
5
+ source = "hashicorp/local"
6
+ version = "~> 2.0"
7
+ }
8
+ null = {  # supplies the null_resource resources that host the local-exec provisioners
9
+ source = "hashicorp/null"
10
+ version = "~> 3.0"
11
+ }
12
+ }
13
+ }
14
+
15
+ resource "local_file" "dockerfile" {  # Renders the Airflow Dockerfile template into the build context.
16
+ content = templatefile("${path.module}/templates/airflow.Dockerfile.tftpl", {  # NOTE(review): expects a templates/ directory inside this module's own directory - confirm the deploy step places it there
17
+ target = "local"  # fixed value handed to the template as its "target" variable
18
+ })
19
+ filename = "${var.build_context}/Dockerfile"  # written where the docker build in null_resource.build reads it
20
+ }
21
+
22
+ resource "null_resource" "build" {  # Builds the Airflow image with the local Docker CLI.
23
+ depends_on = [local_file.dockerfile]  # the Dockerfile must exist before the build starts
24
+
25
+ triggers = {
26
+ content_hash = var.content_hash  # a changed hash replaces this resource, which re-runs the build
27
+ }
28
+
29
+ provisioner "local-exec" {
30
+ command = "docker build -t \"${var.image_tag}\" \"${var.build_context}\""  # quoted so a tag or context path containing spaces stays a single argument
31
+ }
32
+ }
33
+
34
+ resource "local_file" "compose" {  # Renders docker-compose.yml from its template and writes it to var.compose_file_path.
35
+ content = templatefile("${path.module}/templates/docker-compose.yml.tftpl", {  # NOTE(review): expects a templates/ directory inside this module's own directory - confirm
36
+ image_tag = var.image_tag
37
+ dag_dir = var.dag_dir
38
+ docker_socket = var.docker_socket
39
+ db_password = var.db_password  # sensitive input, rendered into the file in clear text
40
+ admin_password = var.admin_password  # sensitive input, rendered into the file in clear text
41
+ fernet_key = var.fernet_key  # sensitive input, rendered into the file in clear text
42
+ webserver_port = var.webserver_port
43
+ })
44
+ filename = var.compose_file_path  # NOTE(review): the file holds secrets and no file_permission is set - confirm the resulting mode is acceptable
45
+ }
46
+
47
+ resource "null_resource" "up" {  # Starts the compose stack on create and tears it down on destroy.
48
+ depends_on = [local_file.compose, null_resource.build]  # needs both the rendered compose file and the built image
49
+
50
+ triggers = {  # any change below replaces the resource and re-runs "up"
51
+ content_hash = var.content_hash
52
+ compose_hash = sha256(local_file.compose.content)  # re-run when the rendered compose content changes
53
+ compose_file_path = var.compose_file_path  # kept in triggers so the destroy provisioner can read it through self
54
+ }
55
+
56
+ provisioner "local-exec" {
57
+ command = "docker compose -f \"${var.compose_file_path}\" up -d"  # quoted so a path containing spaces stays a single argument
58
+ }
59
+
60
+ provisioner "local-exec" {
61
+ when = destroy
62
+ command = "docker compose -f \"${self.triggers.compose_file_path}\" down --volumes"  # destroy-time provisioners may only reference self; --volumes also removes the stack's named volumes
63
+ }
64
+ }
@@ -0,0 +1,9 @@
1
+ output "webserver_url" {  # Built from var.webserver_port; always points at localhost.
2
+ description = "URL of the local Airflow webserver"
3
+ value = "http://localhost:${var.webserver_port}"
4
+ }
5
+
6
+ output "compose_file" {  # Echoes var.compose_file_path unchanged.
7
+ description = "Absolute path to the generated docker-compose.yml"
8
+ value = var.compose_file_path
9
+ }
@@ -0,0 +1,59 @@
1
+ variable "image_tag" {  # Used as the docker build tag and passed to the compose template as image_tag.
2
+ type = string
3
+ description = "Docker image tag for the Airflow image (e.g. dcf-airflow-local:latest)"
4
+ }
5
+
6
+ variable "build_context" {  # The rendered Dockerfile is written to <build_context>/Dockerfile.
7
+ type = string
8
+ description = "Absolute path to the Airflow build context directory"
9
+ }
10
+
11
+ variable "content_hash" {  # NOTE: also part of null_resource.up's triggers, so a change re-runs "docker compose up" as well as the build.
12
+ type = string
13
+ description = "SHA256 of Airflow Dockerfile template — triggers rebuild when template changes"
14
+ }
15
+
16
+ variable "dag_dir" {  # Passed to the compose template as dag_dir.
17
+ type = string
18
+ description = "Absolute host path to the DAGs directory (mounted read-only into scheduler)"
19
+ }
20
+
21
+ variable "warehouse_path" {  # NOTE(review): not referenced by this module's main.tf nor passed to the compose template - confirm it is still needed.
22
+ type = string
23
+ description = "Absolute host path to the warehouse (for DockerOperator volume mounts)"
24
+ }
25
+
26
+ variable "docker_socket" {  # Passed to the compose template as docker_socket.
27
+ type = string
28
+ default = "/var/run/docker.sock"
29
+ description = "Host Docker socket path"
30
+ }
31
+
32
+ variable "db_password" {  # Passed to the compose template; marked sensitive so Terraform redacts it in plan output.
33
+ type = string
34
+ sensitive = true
35
+ description = "PostgreSQL password for the local Airflow database"
36
+ }
37
+
38
+ variable "admin_password" {  # Passed to the compose template; marked sensitive so Terraform redacts it in plan output.
39
+ type = string
40
+ sensitive = true
41
+ description = "Airflow webserver admin password"
42
+ }
43
+
44
+ variable "fernet_key" {  # Passed to the compose template; marked sensitive so Terraform redacts it in plan output.
45
+ type = string
46
+ sensitive = true
47
+ description = "Airflow fernet key for encrypting connection passwords"
48
+ }
49
+
50
+ variable "compose_file_path" {  # Target of local_file.compose and the -f argument of both docker compose commands.
51
+ type = string
52
+ description = "Absolute path where the generated docker-compose.yml will be written"
53
+ }
54
+
55
+ variable "webserver_port" {  # Passed to the compose template and used to build the webserver_url output.
56
+ type = number
57
+ default = 8090
58
+ description = "Host port to expose the Airflow webserver on"
59
+ }
@@ -0,0 +1,32 @@
1
+ terraform {  # Version constraints for the Terraform CLI and the providers this module uses.
2
+ required_version = ">= 1.0"
3
+ required_providers {
4
+ local = {  # supplies the local_file resource below
5
+ source = "hashicorp/local"
6
+ version = "~> 2.0"
7
+ }
8
+ null = {  # supplies the null_resource that hosts the local-exec provisioner
9
+ source = "hashicorp/null"
10
+ version = "~> 3.0"
11
+ }
12
+ }
13
+ }
14
+
15
+ resource "local_file" "dockerfile" {  # Renders the batch collector Dockerfile template into the build context.
16
+ content = templatefile("${path.module}/templates/batch_collector.Dockerfile.tftpl", {  # NOTE(review): expects a templates/ directory inside this module's own directory - confirm the deploy step places it there
17
+ java_enabled = var.java_enabled  # handed to the template as its "java_enabled" variable
18
+ })
19
+ filename = "${var.build_context}/Dockerfile"  # written where the docker build in null_resource.build reads it
20
+ }
21
+
22
+ resource "null_resource" "build" {  # Builds the collector image with the local Docker CLI.
23
+ depends_on = [local_file.dockerfile]  # the Dockerfile must exist before the build starts
24
+
25
+ triggers = {
26
+ content_hash = var.content_hash  # a changed hash replaces this resource, which re-runs the build
27
+ }
28
+
29
+ provisioner "local-exec" {
30
+ command = "docker build -t \"${var.image_tag}\" \"${var.build_context}\""  # quoted so a tag or context path containing spaces stays a single argument
31
+ }
32
+ }
@@ -0,0 +1,4 @@
1
+ output "image_tag" {  # Echoes var.image_tag unchanged; it does not depend on null_resource.build.
2
+ description = "Docker image tag that was built"
3
+ value = var.image_tag
4
+ }
@@ -0,0 +1,25 @@
1
+ variable "collector_name" {  # NOTE(review): not referenced by this module's main.tf or outputs - confirm it is still needed.
2
+ type = string
3
+ description = "dcf collector name (e.g. github_repos)"
4
+ }
5
+
6
+ variable "build_context" {  # The rendered Dockerfile is written to <build_context>/Dockerfile.
7
+ type = string
8
+ description = "Absolute path to the stable build context directory"
9
+ }
10
+
11
+ variable "image_tag" {  # Used as the docker build tag and returned by the image_tag output.
12
+ type = string
13
+ description = "Docker image tag (e.g. dcf-local/github_repos:latest)"
14
+ }
15
+
16
+ variable "content_hash" {  # Sole trigger of null_resource.build.
17
+ type = string
18
+ description = "SHA256 of build context files — triggers rebuild when changed"
19
+ }
20
+
21
+ variable "java_enabled" {  # Passed to the Dockerfile template as java_enabled.
22
+ type = bool
23
+ default = true
24
+ description = "Install OpenJDK in the container (required for local Iceberg/Spark)"
25
+ }
@@ -0,0 +1,6 @@
1
+ FROM apache/airflow:2.10.4-python3.12
2
+ %{ if target == "local" ~}
3
+ RUN pip install --no-cache-dir "apache-airflow==$${AIRFLOW_VERSION}" apache-airflow-providers-docker
4
+ %{ else ~}
5
+ RUN pip install --no-cache-dir "apache-airflow==$${AIRFLOW_VERSION}" apache-airflow-providers-google
6
+ %{ endif ~}
@@ -0,0 +1,14 @@
1
+ FROM python:3.12-slim
2
+ %{ if java_enabled ~}
3
+ RUN apt-get update && apt-get install -y --no-install-recommends \
4
+ openjdk-21-jre-headless && rm -rf /var/lib/apt/lists/*
5
+ %{ endif ~}
6
+ WORKDIR /app
7
+ COPY pyproject.toml .
8
+ COPY ddt/ ./ddt/
9
+ RUN pip install --no-cache-dir -e .
10
+ COPY pipelines/ ./pipelines/
11
+ COPY connectors/ ./connectors/
12
+ COPY project.yml .
13
+ ENV PIPELINE_NAME=""
14
+ CMD ["sh", "-c", "exec ddt run $PIPELINE_NAME"]
@@ -0,0 +1,76 @@
1
+ version: "3.8"  # NOTE(review): the Compose Specification treats the top-level version key as obsolete - confirm it is still wanted
2
+
3
+ x-airflow-env: &airflow-env  # Shared environment, merged into each Airflow service through the airflow-env alias.
4
+ AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:${db_password}@postgres/airflow  # NOTE(review): password is inserted verbatim; characters such as "@" or "/" would need percent-encoding
5
+ AIRFLOW__CORE__EXECUTOR: LocalExecutor
6
+ AIRFLOW__CORE__FERNET_KEY: ${fernet_key}
7
+ AIRFLOW__WEBSERVER__SECRET_KEY: ${fernet_key}  # NOTE(review): reuses the fernet key as the webserver secret key - confirm this sharing is intended
8
+ AIRFLOW__CORE__LOAD_EXAMPLES: "false"
9
+ AIRFLOW__SCHEDULER__DAG_DIR_LIST_INTERVAL: "30"
10
+ AIRFLOW__API__AUTH_BACKENDS: "airflow.api.auth.backend.basic_auth"
11
+
12
+ services:
13
+ postgres:  # Database the Airflow services connect to (host "postgres", database "airflow").
14
+ image: postgres:15
15
+ environment:
16
+ POSTGRES_USER: airflow
17
+ POSTGRES_PASSWORD: ${db_password}  # same value that is embedded in the SQLAlchemy connection string
18
+ POSTGRES_DB: airflow
19
+ volumes:
20
+ - postgres_data:/var/lib/postgresql/data  # named volume declared at the bottom of the file
21
+ healthcheck:  # airflow-init waits on this through "condition: service_healthy"
22
+ test: ["CMD", "pg_isready", "-U", "airflow"]
23
+ interval: 5s
24
+ retries: 10
25
+
26
+ airflow-init:  # One-shot service: migrates the metadata database, then creates the admin user.
27
+ image: ${image_tag}
28
+ depends_on:
29
+ postgres:
30
+ condition: service_healthy  # do not start until the postgres healthcheck passes
31
+ environment:
32
+ <<: *airflow-env
33
+ entrypoint:  # the folded scalar below is joined into one bash -c command line; the password is single-quoted so spaces and shell metacharacters in it are taken literally (a password containing a single quote is still unsupported)
34
+ - bash
35
+ - -c
36
+ - >-
37
+ airflow db migrate &&
38
+ airflow users create
39
+ --username admin
40
+ --password '${admin_password}'
41
+ --firstname Admin
42
+ --lastname User
43
+ --role Admin
44
+ --email admin@ddt.local
45
+
46
+ airflow-scheduler:  # Runs the Airflow scheduler once airflow-init has finished successfully.
47
+ image: ${image_tag}
48
+ depends_on:
49
+ airflow-init:
50
+ condition: service_completed_successfully
51
+ environment:
52
+ <<: *airflow-env
53
+ volumes:
54
+ - ${dag_dir}:/opt/airflow/dags:ro  # host DAG directory, mounted read-only
55
+ - ${docker_socket}:/var/run/docker.sock  # NOTE(review): gives this container access to the host Docker daemon - confirm that is acceptable
56
+ command: scheduler
57
+ restart: on-failure
58
+
59
+ airflow-webserver:  # Runs the Airflow webserver once airflow-init has finished successfully.
60
+ image: ${image_tag}
61
+ depends_on:
62
+ airflow-init:
63
+ condition: service_completed_successfully
64
+ environment:
65
+ <<: *airflow-env
66
+ ports:
67
+ - "${webserver_port}:8080"  # host port -> container port 8080
68
+ command: webserver
69
+ healthcheck:  # probes the /health endpoint from inside the container
70
+ test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
71
+ interval: 10s
72
+ retries: 15
73
+ restart: on-failure
74
+
75
+ volumes:  # Named volumes used by the services above.
76
+ postgres_data:  # mounted at /var/lib/postgresql/data in the postgres service