data-collection-framework 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_collection_framework-0.1.0.dist-info/METADATA +19 -0
- data_collection_framework-0.1.0.dist-info/RECORD +44 -0
- data_collection_framework-0.1.0.dist-info/WHEEL +5 -0
- data_collection_framework-0.1.0.dist-info/entry_points.txt +2 -0
- data_collection_framework-0.1.0.dist-info/top_level.txt +1 -0
- dcf/__init__.py +4 -0
- dcf/cli.py +841 -0
- dcf/config/__init__.py +4 -0
- dcf/config/loader.py +77 -0
- dcf/config/models.py +240 -0
- dcf/engine/__init__.py +6 -0
- dcf/engine/fetcher.py +118 -0
- dcf/engine/iterator.py +96 -0
- dcf/engine/projector.py +56 -0
- dcf/engine/runner.py +90 -0
- dcf/engine/transforms.py +41 -0
- dcf/gcp/__init__.py +0 -0
- dcf/gcp/_collector_utils.py +87 -0
- dcf/gcp/auth.py +1 -0
- dcf/gcp/batch_deploy.py +548 -0
- dcf/gcp/bootstrap.py +131 -0
- dcf/gcp/gcloud.py +42 -0
- dcf/gcp/terraform.py +151 -0
- dcf/infra/modules/batch_collector/gcp/airflow/main.tf +194 -0
- dcf/infra/modules/batch_collector/gcp/airflow/outputs.tf +9 -0
- dcf/infra/modules/batch_collector/gcp/airflow/variables.tf +52 -0
- dcf/infra/modules/batch_collector/gcp/main.tf +70 -0
- dcf/infra/modules/batch_collector/gcp/outputs.tf +4 -0
- dcf/infra/modules/batch_collector/gcp/variables.tf +40 -0
- dcf/infra/modules/batch_collector/local/airflow/main.tf +64 -0
- dcf/infra/modules/batch_collector/local/airflow/outputs.tf +9 -0
- dcf/infra/modules/batch_collector/local/airflow/variables.tf +59 -0
- dcf/infra/modules/batch_collector/local/main.tf +32 -0
- dcf/infra/modules/batch_collector/local/outputs.tf +4 -0
- dcf/infra/modules/batch_collector/local/variables.tf +25 -0
- dcf/infra/templates/airflow.Dockerfile.tftpl +6 -0
- dcf/infra/templates/batch_collector.Dockerfile.tftpl +14 -0
- dcf/infra/templates/docker-compose.yml.tftpl +76 -0
- dcf/local_deploy.py +756 -0
- dcf/project.py +23 -0
- dcf/spark_session.py +66 -0
- dcf/warehouse_reader.py +323 -0
- dcf/writer/__init__.py +3 -0
- dcf/writer/iceberg.py +315 -0
dcf/gcp/bootstrap.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
import base64
|
|
2
|
+
import json
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
from google.api_core.exceptions import Conflict, NotFound
|
|
6
|
+
from google.auth.credentials import Credentials
|
|
7
|
+
from google.cloud import secretmanager, storage
|
|
8
|
+
from googleapiclient import discovery
|
|
9
|
+
from googleapiclient.errors import HttpError
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
# Account id (the part before "@") of the service account managed by this module.
_SA_ACCOUNT_ID = "dcf-lake"
# Secret Manager secret id under which the service-account key JSON is stored.
_SECRET_ID = "dcf-lake-sa-key"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def create_state_bucket(project_id: str, region: str, credentials: Credentials) -> str:
    """Create the GCS bucket used for Terraform state. Returns bucket name.

    The bucket is named ``dcf-tf-state-<project_id>``, is created in
    ``region`` and gets object versioning enabled right after creation.
    A 409 Conflict is treated as "bucket already exists" and is not an error;
    in that case the existing bucket is left untouched.

    Raises:
        RuntimeError: when the API rejects the request because billing is
            disabled, or with a 403 for any other reason (e.g. missing IAM
            permission). The original API error is chained as the cause.
        Exception: any other API error is re-raised unchanged.
    """
    from google.api_core.exceptions import Forbidden

    bucket_name = f"dcf-tf-state-{project_id}"
    client = storage.Client(project=project_id, credentials=credentials)
    try:
        bucket = client.create_bucket(bucket_name, location=region)
        bucket.versioning_enabled = True
        bucket.patch()
        logger.info("Created Terraform state bucket %s", bucket_name)
    except Conflict:
        logger.info("Terraform state bucket %s already exists", bucket_name)
    except Exception as e:
        if "billing" in str(e).lower():
            raise RuntimeError(
                f"Billing is not enabled for project '{project_id}'. "
                "Enable it at: https://console.cloud.google.com/billing"
            ) from e
        if isinstance(e, Forbidden):
            # A 403 without any billing hint is usually an IAM problem; do not
            # claim that billing is the cause, surface the real API message.
            raise RuntimeError(
                f"Permission denied while creating bucket '{bucket_name}' in "
                f"project '{project_id}': {e}. Check that billing is enabled and "
                "that the credentials are allowed to create buckets."
            ) from e
        raise
    return bucket_name
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def create_service_account(project_id: str, credentials: Credentials) -> str:
    """Ensure the dcf-lake service account exists and return its email.

    An HTTP 409 from the IAM API means the account is already there and is
    only logged; every other HttpError propagates to the caller.
    """
    iam = discovery.build("iam", "v1", credentials=credentials, cache_discovery=False)
    sa_email = f"{_SA_ACCOUNT_ID}@{project_id}.iam.gserviceaccount.com"
    try:
        iam.projects().serviceAccounts().create(
            name=f"projects/{project_id}",
            body={
                "accountId": _SA_ACCOUNT_ID,
                "serviceAccount": {"displayName": "dcf Lake Service Account"},
            },
        ).execute()
    except HttpError as err:
        if err.resp.status != 409:
            raise
        logger.info("Service account %s already exists", sa_email)
    else:
        logger.info("Created service account %s", sa_email)
    return sa_email
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def create_service_account_key(project_id: str, sa_email: str, credentials: Credentials) -> dict:
    """Mint a new JSON key for ``sa_email`` and return it as a dict.

    The IAM API returns the key file base64-encoded in ``privateKeyData``;
    it is decoded and parsed before being handed back.
    """
    iam = discovery.build("iam", "v1", credentials=credentials, cache_discovery=False)
    resource_name = f"projects/{project_id}/serviceAccounts/{sa_email}"
    response = (
        iam.projects()
        .serviceAccounts()
        .keys()
        .create(
            name=resource_name,
            body={"privateKeyType": "TYPE_GOOGLE_CREDENTIALS_FILE"},
        )
        .execute()
    )
    key_json = base64.b64decode(response["privateKeyData"]).decode()
    key_data = json.loads(key_json)
    logger.info("Created SA key for %s", sa_email)
    return key_data
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def store_key_in_secret_manager(project_id: str, key_data: dict, credentials: Credentials) -> str:
    """
    Persist the SA key in Secret Manager under the id 'dcf-lake-sa-key'.

    The secret is created with automatic replication when missing (a Conflict
    from the API means it already exists); in both cases ``key_data`` is then
    added as a new secret version, serialized as JSON.
    Returns the full secret resource name (projects/<id>/secrets/<secret id>).
    """
    client = secretmanager.SecretManagerServiceClient(credentials=credentials)
    parent = f"projects/{project_id}"
    secret_name = f"{parent}/secrets/{_SECRET_ID}"

    create_request = {
        "parent": parent,
        "secret_id": _SECRET_ID,
        "secret": {"replication": {"automatic": {}}},
    }
    try:
        client.create_secret(request=create_request)
    except Conflict:
        logger.info("Secret %s already exists, adding new version", _SECRET_ID)
    else:
        logger.info("Created Secret Manager secret %s", _SECRET_ID)

    payload = json.dumps(key_data).encode()
    client.add_secret_version(
        request={"parent": secret_name, "payload": {"data": payload}}
    )
    logger.info("Stored SA key in Secret Manager")
    return secret_name
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def delete_secret(secret_name: str, credentials: Credentials) -> None:
    """Remove the Secret Manager secret ``secret_name``; a missing secret is only logged."""
    client = secretmanager.SecretManagerServiceClient(credentials=credentials)
    try:
        client.delete_secret(request={"name": secret_name})
    except NotFound:
        logger.info("Secret %s not found, skipping", secret_name)
    else:
        logger.info("Deleted secret %s", secret_name)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def delete_service_account(project_id: str, sa_email: str, credentials: Credentials) -> None:
    """Remove the service account ``sa_email`` from ``project_id``.

    An HTTP 404 (account does not exist) is logged and ignored; any other
    HttpError is re-raised.
    """
    iam = discovery.build("iam", "v1", credentials=credentials, cache_discovery=False)
    try:
        iam.projects().serviceAccounts().delete(
            name=f"projects/{project_id}/serviceAccounts/{sa_email}",
        ).execute()
    except HttpError as err:
        if err.resp.status != 404:
            raise
        logger.info("Service account %s not found, skipping", sa_email)
    else:
        logger.info("Deleted service account %s", sa_email)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def fetch_service_account_key(project_id: str, secret_name: str) -> dict:
    """Read the newest SA key version from Secret Manager and return it parsed.

    Credentials come from ``get_credentials()`` of the sibling ``gcloud``
    module. ``secret_name`` is the full secret resource name; ``project_id``
    is accepted but not used by this function.
    """
    from .gcloud import get_credentials

    client = secretmanager.SecretManagerServiceClient(credentials=get_credentials())
    latest_version = f"{secret_name}/versions/latest"
    response = client.access_secret_version(request={"name": latest_version})
    raw_key = response.payload.data.decode()
    return json.loads(raw_key)
|
dcf/gcp/gcloud.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
import shutil
|
|
2
|
+
import subprocess
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
import google.auth
|
|
6
|
+
from google.auth.exceptions import DefaultCredentialsError
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
# OAuth scope requested for the Application Default Credentials.
_SCOPES = ["https://www.googleapis.com/auth/cloud-platform"]
# Shown in the error message when the gcloud CLI cannot be found.
_INSTALL_URL = "https://cloud.google.com/sdk/docs/install"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def get_credentials():
    """
    Return ADC credentials scoped to cloud-platform.

    Resolution order:
    1. Existing ADC (GOOGLE_APPLICATION_CREDENTIALS env var or gcloud ADC file)
    2. If not configured but gcloud is installed, run `gcloud auth application-default login`
       (opens a browser on the local machine) then retry.
    3. If gcloud is not installed, raise RuntimeError with install instructions.

    Raises:
        RuntimeError: no ADC is configured and the gcloud CLI is not on PATH.
        subprocess.CalledProcessError: the interactive gcloud login exits non-zero.
        DefaultCredentialsError: ADC still cannot be resolved after the login.
    """
    try:
        creds, _ = google.auth.default(scopes=_SCOPES)
        return creds
    except DefaultCredentialsError:
        pass

    gcloud = shutil.which("gcloud")
    if not gcloud:
        raise RuntimeError(
            "No Google credentials found and gcloud CLI is not installed.\n"
            f"Install it at: {_INSTALL_URL}\n"
            "Then run: gcloud auth application-default login"
        )

    logger.info("ADC not configured — running gcloud auth application-default login")
    # Launch the path that shutil.which() resolved rather than the bare name:
    # which() honours PATHEXT, so launchers such as gcloud.cmd on Windows are
    # found, whereas spawning plain "gcloud" without a shell would fail there.
    subprocess.run([gcloud, "auth", "application-default", "login"], check=True)

    creds, _ = google.auth.default(scopes=_SCOPES)
    return creds
|
dcf/gcp/terraform.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
3
|
+
import os
|
|
4
|
+
import shutil
|
|
5
|
+
import subprocess
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from google.api_core.exceptions import NotFound, Forbidden
|
|
9
|
+
from google.cloud import storage
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
# Directory inside the installed package that holds the Terraform sources
# copied into the work directory before each run.
# NOTE(review): the packaged file list only shows modules under
# infra/modules/batch_collector/..., not infra/modules/gcp — confirm that this
# directory exists, otherwise the "*.tf" glob copies nothing.
_MODULE_DIR = Path(__file__).parent.parent.joinpath("infra", "modules", "gcp")
# Per-user working directory in which terraform is executed.
_WORK_DIR = Path.home().joinpath(".dcf", "terraform")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def provision(
    project_id: str,
    region: str,
    sa_email: str,
    tf_state_bucket: str,
) -> str:
    """
    Copy the Terraform module into the work directory, then init and apply it.

    State is kept in ``tf_state_bucket`` under the prefix ``terraform/state``.
    Terraform inherits the current process environment (plus TF_INPUT=0 and a
    plugin cache inside the work directory), so it authenticates the same way
    the calling process does.
    Returns the ``warehouse_bucket`` value from the terraform outputs.
    Raises RuntimeError when a terraform command exits non-zero.
    """
    plugin_cache = _WORK_DIR / ".plugin-cache"

    _WORK_DIR.mkdir(parents=True, exist_ok=True)
    for source in _MODULE_DIR.glob("*.tf"):
        shutil.copy2(source, _WORK_DIR / source.name)
    plugin_cache.mkdir(exist_ok=True)

    # Never prompt for input, and reuse downloaded providers between runs.
    env = dict(os.environ)
    env["TF_INPUT"] = "0"
    env["TF_PLUGIN_CACHE_DIR"] = str(plugin_cache)

    init_cmd = [
        "terraform", "init", "-reconfigure",
        f"-backend-config=bucket={tf_state_bucket}",
        "-backend-config=prefix=terraform/state",
    ]
    _run(init_cmd, _WORK_DIR, env)

    # Adopt resources that already exist so that apply does not hit a 409.
    _import_existing_resources(project_id, _WORK_DIR, env)

    apply_cmd = [
        "terraform", "apply", "-auto-approve",
        f"-var=project_id={project_id}",
        f"-var=region={region}",
        f"-var=sa_email={sa_email}",
    ]
    _run(apply_cmd, _WORK_DIR, env)

    return _read_output(_WORK_DIR, env, "warehouse_bucket")
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def destroy(
    project_id: str,
    region: str,
    sa_email: str,
    tf_state_bucket: str,
) -> None:
    """Tear down everything terraform manages via ``terraform destroy``.

    The module files are copied into the work directory and the backend is
    re-initialised against ``tf_state_bucket`` first, so this also works on a
    machine that never ran ``provision``. Raises RuntimeError when a terraform
    command exits non-zero.
    """
    plugin_cache = _WORK_DIR / ".plugin-cache"

    _WORK_DIR.mkdir(parents=True, exist_ok=True)
    for source in _MODULE_DIR.glob("*.tf"):
        shutil.copy2(source, _WORK_DIR / source.name)
    plugin_cache.mkdir(exist_ok=True)

    # Never prompt for input, and reuse downloaded providers between runs.
    env = dict(os.environ)
    env["TF_INPUT"] = "0"
    env["TF_PLUGIN_CACHE_DIR"] = str(plugin_cache)

    init_cmd = [
        "terraform", "init", "-reconfigure",
        f"-backend-config=bucket={tf_state_bucket}",
        "-backend-config=prefix=terraform/state",
    ]
    _run(init_cmd, _WORK_DIR, env)

    destroy_cmd = [
        "terraform", "destroy", "-auto-approve",
        f"-var=project_id={project_id}",
        f"-var=region={region}",
        f"-var=sa_email={sa_email}",
    ]
    _run(destroy_cmd, _WORK_DIR, env)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def _import_existing_resources(project_id: str, work_dir: Path, env: dict) -> None:
    """Adopt a pre-existing warehouse bucket into Terraform state.

    Looks up the bucket ``dcf-warehouse-<project_id>``; when the lookup raises
    NotFound or Forbidden nothing is imported. Otherwise ``terraform import``
    is run for ``google_storage_bucket.warehouse``. A failing import is only
    logged and never raised.
    """
    # NOTE(review): unlike apply/destroy, the import below is run without any
    # -var arguments — confirm the module's variables do not need to be set
    # for `terraform import` to succeed.
    bucket_name = f"dcf-warehouse-{project_id}"
    client = storage.Client(project=project_id)
    try:
        client.get_bucket(bucket_name)
    except (NotFound, Forbidden):
        # Bucket is not there yet (or not visible): apply will create it.
        return

    proc = subprocess.run(
        ["terraform", "import", "google_storage_bucket.warehouse", bucket_name],
        cwd=str(work_dir),
        env=env,
        capture_output=True,
        text=True,
    )
    if proc.returncode == 0:
        logger.info("Imported existing warehouse bucket '%s' into Terraform state", bucket_name)
        return

    combined_output = proc.stdout + proc.stderr
    if "already managed by Terraform" in combined_output:
        logger.info("Warehouse bucket '%s' already in Terraform state", bucket_name)
        return

    # Unexpected failure: keep going, the subsequent apply may still succeed.
    logger.warning("terraform import returned non-zero: %s", proc.stderr[-500:])
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _run(cmd: list[str], cwd: Path, env: dict) -> None:
    """Execute one terraform command in ``cwd`` with output captured.

    On a non-zero exit the full stdout/stderr are logged and a RuntimeError
    carrying the last 2000 characters of stderr is raised.
    """
    proc = subprocess.run(cmd, cwd=str(cwd), env=env, capture_output=True, text=True)
    if proc.returncode == 0:
        logger.info("terraform %s OK", cmd[1])
        return

    logger.error(
        "Terraform command failed: %s\nSTDOUT: %s\nSTDERR: %s",
        " ".join(cmd), proc.stdout, proc.stderr,
    )
    raise RuntimeError(
        f"terraform {cmd[1]} failed (exit {proc.returncode}): {proc.stderr[-2000:]}"
    )
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _read_output(work_dir: Path, env: dict, key: str) -> str:
    """Return the value of terraform output ``key`` from ``terraform output -json``.

    Raises RuntimeError when the command fails or when ``key`` is not among
    the reported outputs.
    """
    proc = subprocess.run(
        ["terraform", "output", "-json"],
        cwd=str(work_dir),
        env=env,
        capture_output=True,
        text=True,
    )
    if proc.returncode != 0:
        raise RuntimeError(f"terraform output failed: {proc.stderr}")

    outputs = json.loads(proc.stdout)
    if key in outputs:
        return outputs[key]["value"]
    raise RuntimeError(f"'{key}' not in terraform output. Got: {list(outputs)}")
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
# Terraform and provider version constraints for this module.
terraform {
  required_version = ">= 1.0"

  required_providers {
    google = {
      source  = "hashicorp/google"
      version = "~> 5.0"
    }
    local = {
      source  = "hashicorp/local"
      version = "~> 2.0"
    }
    null = {
      source  = "hashicorp/null"
      version = "~> 3.0"
    }
  }
}
|
|
18
|
+
|
|
19
|
+
# Default project and region for every google_* resource in this module.
provider "google" {
  project = var.project_id
  region  = var.region
}
|
|
23
|
+
|
|
24
|
+
# Renders the Airflow Dockerfile template (with target = "gcp") into the build context.
resource "local_file" "dockerfile" {
  content = templatefile("${path.module}/templates/airflow.Dockerfile.tftpl", {
    target = "gcp"
  })
  filename = "${var.build_context}/Dockerfile"
}
|
|
30
|
+
|
|
31
|
+
# Builds and pushes the image by running `gcloud builds submit` on the machine
# that executes terraform. The resource is keyed on var.content_hash, so a new
# hash re-runs the build.
resource "null_resource" "build" {
  depends_on = [local_file.dockerfile]

  triggers = {
    content_hash = var.content_hash
  }

  provisioner "local-exec" {
    command = "gcloud builds submit --project ${var.project_id} --region ${var.region} --tag ${var.image_uri} --timeout 600s ${var.build_context}"
  }
}
|
|
42
|
+
|
|
43
|
+
# Cloud SQL PostgreSQL 15 instance for the Airflow metadata database.
# Deletion protection and backups are both switched off.
resource "google_sql_database_instance" "airflow_db" {
  name             = "dcf-airflow-db"
  database_version = "POSTGRES_15"
  region           = var.region

  deletion_protection = false

  settings {
    tier = "db-f1-micro"

    backup_configuration {
      enabled = false
    }
  }
}
|
|
58
|
+
|
|
59
|
+
# The "airflow" database inside the Cloud SQL instance.
resource "google_sql_database" "airflow" {
  name     = "airflow"
  instance = google_sql_database_instance.airflow_db.name
}
|
|
63
|
+
|
|
64
|
+
# Database user "airflow"; its password is supplied through var.db_password.
resource "google_sql_user" "airflow" {
  name     = "airflow"
  instance = google_sql_database_instance.airflow_db.name
  password = var.db_password
}
|
|
69
|
+
|
|
70
|
+
# Grants the service account roles/cloudsql.client on the project.
resource "google_project_iam_member" "cloudsql_client" {
  project = var.project_id
  role    = "roles/cloudsql.client"
  member  = "serviceAccount:${var.sa_email}"
}
|
|
75
|
+
|
|
76
|
+
# Grants the service account roles/storage.objectViewer on the project.
resource "google_project_iam_member" "storage_viewer" {
  project = var.project_id
  role    = "roles/storage.objectViewer"
  member  = "serviceAccount:${var.sa_email}"
}
|
|
81
|
+
|
|
82
|
+
# Connection details shared by the Cloud Run service definition.
locals {
  db_conn_name = google_sql_database_instance.airflow_db.connection_name

  # SQLAlchemy URL that reaches PostgreSQL through the /cloudsql unix socket directory.
  # NOTE(review): the database password is embedded in this URL, which is later
  # passed as a plain environment variable value — confirm this is acceptable.
  db_url = "postgresql+psycopg2://airflow:${var.db_password}@/airflow?host=/cloudsql/${local.db_conn_name}"
}
|
|
86
|
+
|
|
87
|
+
# Cloud Run service that runs `airflow standalone` from the built image, pinned
# to exactly one instance, with the warehouse bucket and the Cloud SQL instance
# attached as volumes.
resource "google_cloud_run_v2_service" "airflow" {
  depends_on = [null_resource.build, google_sql_database_instance.airflow_db]

  name     = "dcf-airflow"
  location = var.region

  template {
    service_account = var.sa_email

    scaling {
      min_instance_count = 1
      max_instance_count = 1
    }

    # Warehouse bucket, mounted read-only as the DAG folder (see volume_mounts).
    volumes {
      name = "dags"
      gcs {
        bucket    = var.warehouse_bucket
        read_only = true
      }
    }

    # Cloud SQL instance, exposed to the container under /cloudsql.
    volumes {
      name = "cloudsql"
      cloud_sql_instance {
        instances = [local.db_conn_name]
      }
    }

    containers {
      image = var.image_uri

      ports {
        container_port = 8080
      }

      volume_mounts {
        name       = "dags"
        mount_path = "/opt/airflow/dags"
      }

      volume_mounts {
        name       = "cloudsql"
        mount_path = "/cloudsql"
      }

      env {
        name  = "AIRFLOW__DATABASE__SQL_ALCHEMY_CONN"
        value = local.db_url
      }

      env {
        name  = "AIRFLOW__CORE__EXECUTOR"
        value = "LocalExecutor"
      }

      env {
        name  = "AIRFLOW__CORE__FERNET_KEY"
        value = var.fernet_key
      }

      # NOTE(review): the webserver secret key reuses var.fernet_key — confirm
      # that sharing one value for both settings is intended.
      env {
        name  = "AIRFLOW__WEBSERVER__SECRET_KEY"
        value = var.fernet_key
      }

      env {
        name  = "AIRFLOW__CORE__LOAD_EXAMPLES"
        value = "false"
      }

      env {
        name  = "AIRFLOW__SCHEDULER__DAG_DIR_LIST_INTERVAL"
        value = "30"
      }

      env {
        name  = "_AIRFLOW_WWW_USER_CREATE"
        value = "true"
      }

      env {
        name  = "_AIRFLOW_WWW_USER_USERNAME"
        value = "admin"
      }

      env {
        name  = "_AIRFLOW_WWW_USER_PASSWORD"
        value = var.admin_password
      }

      resources {
        limits = {
          memory = "2Gi"
          cpu    = "1"
        }
      }

      command = ["airflow", "standalone"]
    }
  }

  # Terraform ignores later changes to the container image of this service.
  lifecycle {
    ignore_changes = [
      template[0].containers[0].image,
    ]
  }
}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# Values exported by the Airflow module.
output "webserver_url" {
  description = "HTTPS URL of the Cloud Run Airflow service"
  value       = google_cloud_run_v2_service.airflow.uri
}

output "service_name" {
  description = "Name of the Cloud Run Airflow service"
  value       = google_cloud_run_v2_service.airflow.name
}
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# ---- Image build inputs ----

variable "image_uri" {
  type        = string
  description = "Artifact Registry URI for the Airflow image"
}

variable "build_context" {
  type        = string
  description = "Absolute host path to the Airflow build context directory"
}

variable "content_hash" {
  type        = string
  description = "SHA256 of Airflow Dockerfile template — triggers Cloud Build rebuild"
}

# ---- GCP placement and identity ----

variable "project_id" {
  type        = string
  description = "GCP project ID"
}

variable "region" {
  type        = string
  description = "GCP region"
}

variable "sa_email" {
  type        = string
  description = "Service account email for Cloud Run Airflow service"
}

variable "warehouse_bucket" {
  type        = string
  description = "GCS bucket where DAGs are stored at airflow/dags/"
}

# ---- Secrets (marked sensitive so Terraform redacts them in its output) ----

variable "db_password" {
  type        = string
  sensitive   = true
  description = "PostgreSQL password for Cloud SQL Airflow database"
}

variable "admin_password" {
  type        = string
  sensitive   = true
  description = "Airflow webserver admin password"
}

variable "fernet_key" {
  type        = string
  sensitive   = true
  description = "Airflow fernet key for encrypting connection passwords"
}
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# Terraform and provider version constraints for this module.
terraform {
  required_version = ">= 1.0"

  required_providers {
    google = {
      source  = "hashicorp/google"
      version = "~> 5.0"
    }
    local = {
      source  = "hashicorp/local"
      version = "~> 2.0"
    }
    null = {
      source  = "hashicorp/null"
      version = "~> 3.0"
    }
  }
}
|
|
18
|
+
|
|
19
|
+
# Default project and region for every google_* resource in this module.
provider "google" {
  project = var.project_id
  region  = var.region
}
|
|
23
|
+
|
|
24
|
+
# Renders the batch-collector Dockerfile template into the build context;
# var.java_enabled is passed through to the template.
resource "local_file" "dockerfile" {
  content = templatefile("${path.module}/templates/batch_collector.Dockerfile.tftpl", {
    java_enabled = var.java_enabled
  })
  filename = "${var.build_context}/Dockerfile"
}
|
|
30
|
+
|
|
31
|
+
# Builds and pushes the image by running `gcloud builds submit` on the machine
# that executes terraform. The resource is keyed on var.content_hash, so a new
# hash re-runs the build.
resource "null_resource" "build" {
  depends_on = [local_file.dockerfile]

  triggers = {
    content_hash = var.content_hash
  }

  provisioner "local-exec" {
    command = "gcloud builds submit --project ${var.project_id} --region ${var.region} --tag ${var.image_uri} --timeout 600s ${var.build_context}"
  }
}
|
|
42
|
+
|
|
43
|
+
# Cloud Run job for one collector. The job name is "dcf-job-" followed by the
# collector name with underscores replaced by hyphens; failed tasks are not
# retried (max_retries = 0).
resource "google_cloud_run_v2_job" "collector" {
  depends_on = [null_resource.build]

  name     = "dcf-job-${replace(var.collector_name, "_", "-")}"
  location = var.region

  template {
    template {
      service_account = var.sa_email
      max_retries     = 0

      containers {
        image = var.image_uri

        # The collector name is handed to the container through its environment.
        env {
          name  = "COLLECTOR_NAME"
          value = var.collector_name
        }

        resources {
          limits = {
            memory = "512Mi"
          }
        }
      }
    }
  }
}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# ---- GCP placement ----

variable "project_id" {
  type        = string
  description = "GCP project ID"
}

variable "region" {
  type        = string
  description = "GCP region"
}

# ---- Job definition ----

variable "collector_name" {
  type        = string
  description = "dcf collector name (e.g. github_repos)"
}

variable "image_uri" {
  type        = string
  description = "Container image URI for the Cloud Run job"
}

variable "sa_email" {
  type        = string
  description = "Service account email for the Cloud Run job"
}

# ---- Image build inputs ----

variable "build_context" {
  type        = string
  description = "Absolute path to the stable build context directory"
}

variable "content_hash" {
  type        = string
  description = "SHA256 of build context files — triggers Cloud Build rebuild when changed"
}

variable "java_enabled" {
  type        = bool
  default     = false
  description = "Install OpenJDK in the container (false for GCP — uses PyArrow direct write)"
}
|