data-collection-framework 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. data_collection_framework-0.1.0.dist-info/METADATA +19 -0
  2. data_collection_framework-0.1.0.dist-info/RECORD +44 -0
  3. data_collection_framework-0.1.0.dist-info/WHEEL +5 -0
  4. data_collection_framework-0.1.0.dist-info/entry_points.txt +2 -0
  5. data_collection_framework-0.1.0.dist-info/top_level.txt +1 -0
  6. dcf/__init__.py +4 -0
  7. dcf/cli.py +841 -0
  8. dcf/config/__init__.py +4 -0
  9. dcf/config/loader.py +77 -0
  10. dcf/config/models.py +240 -0
  11. dcf/engine/__init__.py +6 -0
  12. dcf/engine/fetcher.py +118 -0
  13. dcf/engine/iterator.py +96 -0
  14. dcf/engine/projector.py +56 -0
  15. dcf/engine/runner.py +90 -0
  16. dcf/engine/transforms.py +41 -0
  17. dcf/gcp/__init__.py +0 -0
  18. dcf/gcp/_collector_utils.py +87 -0
  19. dcf/gcp/auth.py +1 -0
  20. dcf/gcp/batch_deploy.py +548 -0
  21. dcf/gcp/bootstrap.py +131 -0
  22. dcf/gcp/gcloud.py +42 -0
  23. dcf/gcp/terraform.py +151 -0
  24. dcf/infra/modules/batch_collector/gcp/airflow/main.tf +194 -0
  25. dcf/infra/modules/batch_collector/gcp/airflow/outputs.tf +9 -0
  26. dcf/infra/modules/batch_collector/gcp/airflow/variables.tf +52 -0
  27. dcf/infra/modules/batch_collector/gcp/main.tf +70 -0
  28. dcf/infra/modules/batch_collector/gcp/outputs.tf +4 -0
  29. dcf/infra/modules/batch_collector/gcp/variables.tf +40 -0
  30. dcf/infra/modules/batch_collector/local/airflow/main.tf +64 -0
  31. dcf/infra/modules/batch_collector/local/airflow/outputs.tf +9 -0
  32. dcf/infra/modules/batch_collector/local/airflow/variables.tf +59 -0
  33. dcf/infra/modules/batch_collector/local/main.tf +32 -0
  34. dcf/infra/modules/batch_collector/local/outputs.tf +4 -0
  35. dcf/infra/modules/batch_collector/local/variables.tf +25 -0
  36. dcf/infra/templates/airflow.Dockerfile.tftpl +6 -0
  37. dcf/infra/templates/batch_collector.Dockerfile.tftpl +14 -0
  38. dcf/infra/templates/docker-compose.yml.tftpl +76 -0
  39. dcf/local_deploy.py +756 -0
  40. dcf/project.py +23 -0
  41. dcf/spark_session.py +66 -0
  42. dcf/warehouse_reader.py +323 -0
  43. dcf/writer/__init__.py +3 -0
  44. dcf/writer/iceberg.py +315 -0
dcf/gcp/bootstrap.py ADDED
@@ -0,0 +1,131 @@
1
+ import base64
2
+ import json
3
+ import logging
4
+
5
+ from google.api_core.exceptions import Conflict, NotFound
6
+ from google.auth.credentials import Credentials
7
+ from google.cloud import secretmanager, storage
8
+ from googleapiclient import discovery
9
+ from googleapiclient.errors import HttpError
10
+
11
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Account ID (the part before "@" in the email) of the service account
# created and deleted by the functions in this module.
_SA_ACCOUNT_ID = "dcf-lake"
# Secret Manager secret ID under which the service account key is stored.
_SECRET_ID = "dcf-lake-sa-key"
15
+
16
+
17
def create_state_bucket(project_id: str, region: str, credentials: Credentials) -> str:
    """Create the versioned GCS bucket used for Terraform state.

    The bucket is named ``dcf-tf-state-<project_id>``. When the API reports a
    conflict the bucket is assumed to exist already and is left untouched.

    Args:
        project_id: GCP project that owns the bucket.
        region: Location passed to the bucket on creation.
        credentials: Credentials used to build the storage client.

    Returns:
        The bucket name.

    Raises:
        RuntimeError: If the error text mentions billing, or the API answers
            with 403 Forbidden. The original exception is chained as the cause.
        Exception: Any other error is re-raised unchanged.
    """
    # Local import: the module-level import only brings in Conflict and NotFound.
    from google.api_core.exceptions import Forbidden

    bucket_name = f"dcf-tf-state-{project_id}"
    client = storage.Client(project=project_id, credentials=credentials)
    try:
        bucket = client.create_bucket(bucket_name, location=region)
        # Versioning is switched on with a follow-up patch after creation.
        bucket.versioning_enabled = True
        bucket.patch()
        logger.info("Created Terraform state bucket %s", bucket_name)
    except Conflict:
        logger.info("Terraform state bucket %s already exists", bucket_name)
    except Exception as e:
        if "billing" in str(e).lower():
            raise RuntimeError(
                f"Billing is not enabled for project '{project_id}'. "
                f"Enable it at: https://console.cloud.google.com/billing"
            ) from e
        if isinstance(e, Forbidden):
            # A 403 that does not mention billing is usually a permission
            # problem; do not claim billing is disabled, surface the real error.
            raise RuntimeError(
                f"Permission denied while creating Terraform state bucket "
                f"'{bucket_name}' in project '{project_id}': {e}. "
                f"Check that billing is enabled "
                f"(https://console.cloud.google.com/billing) and that the "
                f"credentials in use are allowed to create buckets."
            ) from e
        raise
    return bucket_name
37
+
38
+
39
def create_service_account(project_id: str, credentials: Credentials) -> str:
    """Ensure the dcf-lake service account exists and return its email.

    An HTTP 409 from the IAM API is treated as "already exists"; any other
    ``HttpError`` propagates to the caller.

    Args:
        project_id: GCP project in which the account is created.
        credentials: Credentials used to build the IAM API client.

    Returns:
        The service account email address.
    """
    iam = discovery.build("iam", "v1", credentials=credentials, cache_discovery=False)
    email = f"{_SA_ACCOUNT_ID}@{project_id}.iam.gserviceaccount.com"
    request_body = {
        "accountId": _SA_ACCOUNT_ID,
        "serviceAccount": {"displayName": "dcf Lake Service Account"},
    }
    create_request = iam.projects().serviceAccounts().create(
        name=f"projects/{project_id}",
        body=request_body,
    )
    try:
        create_request.execute()
    except HttpError as err:
        if err.resp.status != 409:
            raise
        logger.info("Service account %s already exists", email)
    else:
        logger.info("Created service account %s", email)
    return email
58
+
59
+
60
def create_service_account_key(project_id: str, sa_email: str, credentials: Credentials) -> dict:
    """Create a new JSON key for a service account.

    Args:
        project_id: GCP project that owns the service account.
        sa_email: Email of the service account to create a key for.
        credentials: Credentials used to build the IAM API client.

    Returns:
        The key file contents, base64-decoded and parsed from JSON.
    """
    iam = discovery.build("iam", "v1", credentials=credentials, cache_discovery=False)
    keys_api = iam.projects().serviceAccounts().keys()
    response = keys_api.create(
        name=f"projects/{project_id}/serviceAccounts/{sa_email}",
        body={"privateKeyType": "TYPE_GOOGLE_CREDENTIALS_FILE"},
    ).execute()
    # The API returns the key file base64-encoded in "privateKeyData".
    decoded_json = base64.b64decode(response["privateKeyData"]).decode()
    key_dict = json.loads(decoded_json)
    logger.info("Created SA key for %s", sa_email)
    return key_dict
70
+
71
+
72
def store_key_in_secret_manager(project_id: str, key_data: dict, credentials: Credentials) -> str:
    """Store a service account key as a new version of the dcf-lake-sa-key secret.

    The secret is created first; a ``Conflict`` from that call means it
    already exists, and a new version is added either way.

    Args:
        project_id: GCP project that holds the secret.
        key_data: Key contents, serialized to JSON for the payload.
        credentials: Credentials used to build the Secret Manager client.

    Returns:
        The full secret resource name (``projects/<id>/secrets/<secret id>``).
    """
    sm_client = secretmanager.SecretManagerServiceClient(credentials=credentials)
    project_path = f"projects/{project_id}"
    secret_path = f"{project_path}/secrets/{_SECRET_ID}"

    create_request = {
        "parent": project_path,
        "secret_id": _SECRET_ID,
        "secret": {"replication": {"automatic": {}}},
    }
    try:
        sm_client.create_secret(request=create_request)
    except Conflict:
        logger.info("Secret %s already exists, adding new version", _SECRET_ID)
    else:
        logger.info("Created Secret Manager secret %s", _SECRET_ID)

    payload_bytes = json.dumps(key_data).encode()
    sm_client.add_secret_version(request={
        "parent": secret_path,
        "payload": {"data": payload_bytes},
    })
    logger.info("Stored SA key in Secret Manager")
    return secret_path
98
+
99
+
100
def delete_secret(secret_name: str, credentials: Credentials) -> None:
    """Delete a Secret Manager secret; a missing secret is logged and ignored.

    Args:
        secret_name: Full resource name of the secret to delete.
        credentials: Credentials used to build the Secret Manager client.
    """
    sm_client = secretmanager.SecretManagerServiceClient(credentials=credentials)
    try:
        sm_client.delete_secret(request={"name": secret_name})
    except NotFound:
        logger.info("Secret %s not found, skipping", secret_name)
        return
    logger.info("Deleted secret %s", secret_name)
108
+
109
+
110
def delete_service_account(project_id: str, sa_email: str, credentials: Credentials) -> None:
    """Delete a service account; an HTTP 404 is logged and ignored.

    Args:
        project_id: GCP project that owns the service account.
        sa_email: Email of the service account to delete.
        credentials: Credentials used to build the IAM API client.

    Raises:
        HttpError: For any API failure other than 404.
    """
    iam = discovery.build("iam", "v1", credentials=credentials, cache_discovery=False)
    resource_name = f"projects/{project_id}/serviceAccounts/{sa_email}"
    try:
        iam.projects().serviceAccounts().delete(name=resource_name).execute()
    except HttpError as err:
        if err.resp.status != 404:
            raise
        logger.info("Service account %s not found, skipping", sa_email)
    else:
        logger.info("Deleted service account %s", sa_email)
123
+
124
+
125
def fetch_service_account_key(project_id: str, secret_name: str) -> dict:
    """Read the latest version of a secret and parse it as a JSON key.

    Credentials come from ``get_credentials()`` in the sibling ``gcloud``
    module rather than from a parameter.

    Args:
        project_id: Accepted for signature compatibility; not read here.
        secret_name: Full secret resource name, without a version suffix.

    Returns:
        The secret payload decoded and parsed from JSON.
    """
    # Imported inside the function, as in the original module layout.
    from .gcloud import get_credentials

    sm_client = secretmanager.SecretManagerServiceClient(credentials=get_credentials())
    latest_version = f"{secret_name}/versions/latest"
    response = sm_client.access_secret_version(request={"name": latest_version})
    raw_json = response.payload.data.decode()
    return json.loads(raw_json)
dcf/gcp/gcloud.py ADDED
@@ -0,0 +1,42 @@
1
+ import shutil
2
+ import subprocess
3
+ import logging
4
+
5
+ import google.auth
6
+ from google.auth.exceptions import DefaultCredentialsError
7
+
8
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# OAuth scopes requested when loading Application Default Credentials.
_SCOPES = ["https://www.googleapis.com/auth/cloud-platform"]
# URL included in the error raised when the gcloud CLI cannot be found.
_INSTALL_URL = "https://cloud.google.com/sdk/docs/install"
12
+
13
+
14
def get_credentials():
    """
    Return ADC credentials scoped to cloud-platform.

    Resolution order:
    1. Existing ADC, as found by ``google.auth.default``.
    2. If ADC is not configured but gcloud is installed, run
       `gcloud auth application-default login` and then retry.
    3. If gcloud is not installed, raise RuntimeError with install instructions.

    Raises:
        RuntimeError: No credentials were found and gcloud is not on PATH.
        subprocess.CalledProcessError: The gcloud login command exited non-zero.
        DefaultCredentialsError: ADC is still unavailable after the login.
    """
    try:
        creds, _ = google.auth.default(scopes=_SCOPES)
        return creds
    except DefaultCredentialsError:
        pass

    gcloud = shutil.which("gcloud")
    if not gcloud:
        raise RuntimeError(
            "No Google credentials found and gcloud CLI is not installed.\n"
            f"Install it at: {_INSTALL_URL}\n"
            "Then run: gcloud auth application-default login"
        )

    logger.info("ADC not configured — running gcloud auth application-default login")
    # Run the path shutil.which resolved, not the bare name: on Windows the
    # CLI is "gcloud.cmd", which subprocess cannot start from the name "gcloud".
    # It also guarantees the binary executed is the one that was just found.
    subprocess.run([gcloud, "auth", "application-default", "login"], check=True)

    creds, _ = google.auth.default(scopes=_SCOPES)
    return creds
dcf/gcp/terraform.py ADDED
@@ -0,0 +1,151 @@
1
+ import json
2
+ import logging
3
+ import os
4
+ import shutil
5
+ import subprocess
6
+ from pathlib import Path
7
+
8
+ from google.api_core.exceptions import NotFound, Forbidden
9
+ from google.cloud import storage
10
+
11
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Directory inside the package from which *.tf files are copied before each run.
# NOTE(review): this resolves to <package>/infra/modules/gcp — confirm that
# directory actually ships with the package, otherwise nothing is copied.
_MODULE_DIR = Path(__file__).parent.parent / "infra" / "modules" / "gcp"
# Per-user working directory in which the terraform commands are executed.
_WORK_DIR = Path.home() / ".dcf" / "terraform"
15
+
16
+
17
def provision(
    project_id: str,
    region: str,
    sa_email: str,
    tf_state_bucket: str,
) -> str:
    """
    Run terraform init + apply in the working directory.

    Steps: copy every ``*.tf`` file from the module directory into the work
    directory, run ``terraform init -reconfigure`` against the given state
    bucket, import an already-existing warehouse bucket, then apply.
    No credentials are passed explicitly; the terraform processes inherit
    the current environment.

    Returns the ``warehouse_bucket`` value from terraform output.
    Raises RuntimeError when a terraform command exits non-zero.
    """
    plugin_cache = _WORK_DIR / ".plugin-cache"

    _WORK_DIR.mkdir(parents=True, exist_ok=True)
    for module_file in _MODULE_DIR.glob("*.tf"):
        shutil.copy2(module_file, _WORK_DIR / module_file.name)

    # TF_INPUT=0 stops terraform from prompting for missing input.
    tf_env = dict(os.environ)
    tf_env["TF_INPUT"] = "0"
    tf_env["TF_PLUGIN_CACHE_DIR"] = str(plugin_cache)
    plugin_cache.mkdir(exist_ok=True)

    init_cmd = [
        "terraform", "init", "-reconfigure",
        f"-backend-config=bucket={tf_state_bucket}",
        "-backend-config=prefix=terraform/state",
    ]
    _run(init_cmd, _WORK_DIR, tf_env)

    # Import any already-existing resources so apply doesn't fail with 409
    _import_existing_resources(project_id, _WORK_DIR, tf_env)

    var_flags = [
        f"-var=project_id={project_id}",
        f"-var=region={region}",
        f"-var=sa_email={sa_email}",
    ]
    _run(["terraform", "apply", "-auto-approve", *var_flags], _WORK_DIR, tf_env)

    return _read_output(_WORK_DIR, tf_env, "warehouse_bucket")
64
+
65
+
66
def destroy(
    project_id: str,
    region: str,
    sa_email: str,
    tf_state_bucket: str,
) -> None:
    """
    Run terraform init + destroy in the working directory.

    The ``*.tf`` files are copied into the work directory and the backend is
    re-initialised against the given state bucket before destroying.
    Raises RuntimeError when a terraform command exits non-zero.
    """
    plugin_cache = _WORK_DIR / ".plugin-cache"

    _WORK_DIR.mkdir(parents=True, exist_ok=True)
    for module_file in _MODULE_DIR.glob("*.tf"):
        shutil.copy2(module_file, _WORK_DIR / module_file.name)

    # TF_INPUT=0 stops terraform from prompting for missing input.
    tf_env = dict(os.environ)
    tf_env["TF_INPUT"] = "0"
    tf_env["TF_PLUGIN_CACHE_DIR"] = str(plugin_cache)
    plugin_cache.mkdir(exist_ok=True)

    init_cmd = [
        "terraform", "init", "-reconfigure",
        f"-backend-config=bucket={tf_state_bucket}",
        "-backend-config=prefix=terraform/state",
    ]
    _run(init_cmd, _WORK_DIR, tf_env)

    var_flags = [
        f"-var=project_id={project_id}",
        f"-var=region={region}",
        f"-var=sa_email={sa_email}",
    ]
    _run(["terraform", "destroy", "-auto-approve", *var_flags], _WORK_DIR, tf_env)
103
+
104
+
105
def _import_existing_resources(project_id: str, work_dir: Path, env: dict) -> None:
    """Import an existing warehouse bucket into Terraform state before apply.

    Looks up ``dcf-warehouse-<project_id>``. If the lookup raises NotFound or
    Forbidden nothing is imported. Otherwise ``terraform import`` is run; a
    failure is only logged, never raised.

    Args:
        project_id: GCP project used for the lookup and the bucket name.
        work_dir: Directory in which terraform is executed.
        env: Environment passed to the terraform subprocess.
    """
    bucket_name = f"dcf-warehouse-{project_id}"
    gcs = storage.Client(project=project_id)
    try:
        gcs.get_bucket(bucket_name)
    except (NotFound, Forbidden):
        # Treated as "bucket doesn't exist yet" — apply will create it.
        return

    # Bucket exists; import it so terraform apply doesn't try to create it again
    import_cmd = ["terraform", "import", "google_storage_bucket.warehouse", bucket_name]
    proc = subprocess.run(
        import_cmd,
        cwd=str(work_dir),
        env=env,
        capture_output=True,
        text=True,
    )
    if proc.returncode == 0:
        logger.info("Imported existing warehouse bucket '%s' into Terraform state", bucket_name)
        return

    combined_output = proc.stdout + proc.stderr
    if "already managed by Terraform" in combined_output:
        logger.info("Warehouse bucket '%s' already in Terraform state", bucket_name)
        return

    # Import failed for an unexpected reason — log and continue; apply may still succeed
    logger.warning("terraform import returned non-zero: %s", proc.stderr[-500:])
126
+
127
+
128
def _run(cmd: list[str], cwd: Path, env: dict) -> None:
    """Run a terraform command, raising RuntimeError if it exits non-zero.

    Args:
        cmd: Full argument list; ``cmd[1]`` (the subcommand) is used in messages.
        cwd: Directory in which the command is executed.
        env: Environment passed to the subprocess.

    Raises:
        RuntimeError: On a non-zero exit; carries the last 2000 characters of stderr.
    """
    proc = subprocess.run(cmd, cwd=str(cwd), env=env, capture_output=True, text=True)
    if proc.returncode == 0:
        logger.info("terraform %s OK", cmd[1])
        return

    # The full output goes to the log; the exception gets only the stderr tail.
    logger.error(
        "Terraform command failed: %s\nSTDOUT: %s\nSTDERR: %s",
        " ".join(cmd), proc.stdout, proc.stderr,
    )
    raise RuntimeError(
        f"terraform {cmd[1]} failed (exit {proc.returncode}): {proc.stderr[-2000:]}"
    )
139
+
140
+
141
def _read_output(work_dir: Path, env: dict, key: str) -> str:
    """Return the value of one entry from ``terraform output -json``.

    Args:
        work_dir: Directory in which terraform is executed.
        env: Environment passed to the subprocess.
        key: Name of the output to read.

    Raises:
        RuntimeError: If the command exits non-zero or ``key`` is absent.
    """
    proc = subprocess.run(
        ["terraform", "output", "-json"],
        cwd=str(work_dir),
        env=env,
        capture_output=True,
        text=True,
    )
    if proc.returncode != 0:
        raise RuntimeError(f"terraform output failed: {proc.stderr}")

    outputs = json.loads(proc.stdout)
    if key in outputs:
        return outputs[key]["value"]
    raise RuntimeError(f"'{key}' not in terraform output. Got: {list(outputs)}")
@@ -0,0 +1,194 @@
1
# Airflow on Cloud Run, backed by a Cloud SQL PostgreSQL instance.

terraform {
  required_version = ">= 1.0"

  required_providers {
    google = {
      source  = "hashicorp/google"
      version = "~> 5.0"
    }
    local = {
      source  = "hashicorp/local"
      version = "~> 2.0"
    }
    null = {
      source  = "hashicorp/null"
      version = "~> 3.0"
    }
  }
}

provider "google" {
  project = var.project_id
  region  = var.region
}

# Render the Airflow Dockerfile from its template into the build context.
resource "local_file" "dockerfile" {
  filename = "${var.build_context}/Dockerfile"
  content = templatefile("${path.module}/templates/airflow.Dockerfile.tftpl", {
    target = "gcp"
  })
}

# Submit the build context to Cloud Build; replaced (and re-run) whenever
# content_hash changes.
resource "null_resource" "build" {
  depends_on = [local_file.dockerfile]

  triggers = {
    content_hash = var.content_hash
  }

  provisioner "local-exec" {
    command = "gcloud builds submit --project ${var.project_id} --region ${var.region} --tag ${var.image_uri} --timeout 600s ${var.build_context}"
  }
}

# PostgreSQL instance for the Airflow metadata database.
# Deletion protection and backups are both switched off.
resource "google_sql_database_instance" "airflow_db" {
  name             = "dcf-airflow-db"
  region           = var.region
  database_version = "POSTGRES_15"

  deletion_protection = false

  settings {
    tier = "db-f1-micro"

    backup_configuration {
      enabled = false
    }
  }
}

resource "google_sql_database" "airflow" {
  name     = "airflow"
  instance = google_sql_database_instance.airflow_db.name
}

resource "google_sql_user" "airflow" {
  name     = "airflow"
  instance = google_sql_database_instance.airflow_db.name
  password = var.db_password
}

# Project-level roles granted to the service account the Airflow service runs as.
resource "google_project_iam_member" "cloudsql_client" {
  project = var.project_id
  member  = "serviceAccount:${var.sa_email}"
  role    = "roles/cloudsql.client"
}

resource "google_project_iam_member" "storage_viewer" {
  project = var.project_id
  member  = "serviceAccount:${var.sa_email}"
  role    = "roles/storage.objectViewer"
}

locals {
  db_conn_name = google_sql_database_instance.airflow_db.connection_name

  # SQLAlchemy URL that reaches the database through the /cloudsql socket directory.
  # NOTE(review): var.db_password is interpolated as-is and ends up in a plain
  # container env value below.
  db_url = "postgresql+psycopg2://airflow:${var.db_password}@/airflow?host=/cloudsql/${local.db_conn_name}"
}

# Single-instance Cloud Run service running `airflow standalone`.
resource "google_cloud_run_v2_service" "airflow" {
  depends_on = [null_resource.build, google_sql_database_instance.airflow_db]

  name     = "dcf-airflow"
  location = var.region

  template {
    service_account = var.sa_email

    # Pinned to exactly one instance.
    scaling {
      min_instance_count = 1
      max_instance_count = 1
    }

    # Warehouse bucket, mounted read-only as the DAGs folder.
    volumes {
      name = "dags"
      gcs {
        bucket    = var.warehouse_bucket
        read_only = true
      }
    }

    # Cloud SQL connection, mounted at /cloudsql.
    volumes {
      name = "cloudsql"
      cloud_sql_instance {
        instances = [local.db_conn_name]
      }
    }

    containers {
      image   = var.image_uri
      command = ["airflow", "standalone"]

      ports {
        container_port = 8080
      }

      volume_mounts {
        name       = "dags"
        mount_path = "/opt/airflow/dags"
      }

      volume_mounts {
        name       = "cloudsql"
        mount_path = "/cloudsql"
      }

      env {
        name  = "AIRFLOW__DATABASE__SQL_ALCHEMY_CONN"
        value = local.db_url
      }

      env {
        name  = "AIRFLOW__CORE__EXECUTOR"
        value = "LocalExecutor"
      }

      env {
        name  = "AIRFLOW__CORE__FERNET_KEY"
        value = var.fernet_key
      }

      # The webserver secret key is set to the same value as the fernet key.
      env {
        name  = "AIRFLOW__WEBSERVER__SECRET_KEY"
        value = var.fernet_key
      }

      env {
        name  = "AIRFLOW__CORE__LOAD_EXAMPLES"
        value = "false"
      }

      env {
        name  = "AIRFLOW__SCHEDULER__DAG_DIR_LIST_INTERVAL"
        value = "30"
      }

      env {
        name  = "_AIRFLOW_WWW_USER_CREATE"
        value = "true"
      }

      env {
        name  = "_AIRFLOW_WWW_USER_USERNAME"
        value = "admin"
      }

      env {
        name  = "_AIRFLOW_WWW_USER_PASSWORD"
        value = var.admin_password
      }

      resources {
        limits = {
          memory = "2Gi"
          cpu    = "1"
        }
      }
    }
  }

  # Later changes to the container image are ignored by Terraform.
  lifecycle {
    ignore_changes = [
      template[0].containers[0].image,
    ]
  }
}
@@ -0,0 +1,9 @@
1
# Outputs of the Cloud Run Airflow module.

output "webserver_url" {
  value       = google_cloud_run_v2_service.airflow.uri
  description = "HTTPS URL of the Cloud Run Airflow service"
}

output "service_name" {
  value       = google_cloud_run_v2_service.airflow.name
  description = "Name of the Cloud Run Airflow service"
}
@@ -0,0 +1,52 @@
1
# Inputs of the Cloud Run Airflow module. None of them has a default.

# --- Image build ---

variable "image_uri" {
  description = "Artifact Registry URI for the Airflow image"
  type        = string
}

variable "build_context" {
  description = "Absolute host path to the Airflow build context directory"
  type        = string
}

variable "content_hash" {
  description = "SHA256 of Airflow Dockerfile template — triggers Cloud Build rebuild"
  type        = string
}

# --- Project and identity ---

variable "project_id" {
  description = "GCP project ID"
  type        = string
}

variable "region" {
  description = "GCP region"
  type        = string
}

variable "sa_email" {
  description = "Service account email for Cloud Run Airflow service"
  type        = string
}

variable "warehouse_bucket" {
  description = "GCS bucket where DAGs are stored at airflow/dags/"
  type        = string
}

# --- Secrets (marked sensitive) ---

variable "db_password" {
  description = "PostgreSQL password for Cloud SQL Airflow database"
  type        = string
  sensitive   = true
}

variable "admin_password" {
  description = "Airflow webserver admin password"
  type        = string
  sensitive   = true
}

variable "fernet_key" {
  description = "Airflow fernet key for encrypting connection passwords"
  type        = string
  sensitive   = true
}
@@ -0,0 +1,70 @@
1
# Batch collector packaged as a Cloud Run job.

terraform {
  required_version = ">= 1.0"

  required_providers {
    google = {
      source  = "hashicorp/google"
      version = "~> 5.0"
    }
    local = {
      source  = "hashicorp/local"
      version = "~> 2.0"
    }
    null = {
      source  = "hashicorp/null"
      version = "~> 3.0"
    }
  }
}

provider "google" {
  project = var.project_id
  region  = var.region
}

# Render the collector Dockerfile from its template into the build context.
resource "local_file" "dockerfile" {
  filename = "${var.build_context}/Dockerfile"
  content = templatefile("${path.module}/templates/batch_collector.Dockerfile.tftpl", {
    java_enabled = var.java_enabled
  })
}

# Submit the build context to Cloud Build; replaced (and re-run) whenever
# content_hash changes.
resource "null_resource" "build" {
  depends_on = [local_file.dockerfile]

  triggers = {
    content_hash = var.content_hash
  }

  provisioner "local-exec" {
    command = "gcloud builds submit --project ${var.project_id} --region ${var.region} --tag ${var.image_uri} --timeout 600s ${var.build_context}"
  }
}

resource "google_cloud_run_v2_job" "collector" {
  depends_on = [null_resource.build]

  # Underscores in the collector name are replaced with hyphens for the job name.
  name     = "dcf-job-${replace(var.collector_name, "_", "-")}"
  location = var.region

  template {
    template {
      service_account = var.sa_email
      max_retries     = 0

      containers {
        image = var.image_uri

        # The collector name is handed to the container through its environment.
        env {
          name  = "COLLECTOR_NAME"
          value = var.collector_name
        }

        resources {
          limits = {
            memory = "512Mi"
          }
        }
      }
    }
  }
}
@@ -0,0 +1,4 @@
1
# Output of the batch collector module.
output "job_name" {
  value       = google_cloud_run_v2_job.collector.name
  description = "Name of the provisioned Cloud Run job"
}
@@ -0,0 +1,40 @@
1
# Inputs of the batch collector module. Only java_enabled has a default.

variable "project_id" {
  description = "GCP project ID"
  type        = string
}

variable "region" {
  description = "GCP region"
  type        = string
}

variable "collector_name" {
  description = "dcf collector name (e.g. github_repos)"
  type        = string
}

variable "image_uri" {
  description = "Container image URI for the Cloud Run job"
  type        = string
}

variable "sa_email" {
  description = "Service account email for the Cloud Run job"
  type        = string
}

variable "build_context" {
  description = "Absolute path to the stable build context directory"
  type        = string
}

variable "content_hash" {
  description = "SHA256 of build context files — triggers Cloud Build rebuild when changed"
  type        = string
}

# Passed to the Dockerfile template.
variable "java_enabled" {
  description = "Install OpenJDK in the container (false for GCP — uses PyArrow direct write)"
  type        = bool
  default     = false
}