batch-analytics 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,169 @@
1
+ """
2
+ Configuration for batch analytics pipeline: Extract, Transform, Log stages.
3
+ """
4
+
5
+ import os
6
+ from dataclasses import dataclass, field
7
+ from pathlib import Path
8
+
9
+
10
@dataclass
class ClickHouseConfig:
    """ClickHouse connection settings sourced from ``CLICKHOUSE_*`` env vars.

    Every default goes through ``default_factory`` so the environment is read
    when an instance is constructed, not once when the module is imported.
    This keeps all fields consistent with ``host`` and means a malformed
    ``CLICKHOUSE_HTTP_PORT`` raises ``ValueError`` at construction time
    instead of breaking the import of the module.
    """

    # CLICKHOUSE_HOST: server host name.
    host: str = field(
        default_factory=lambda: os.environ.get(
            "CLICKHOUSE_HOST",
            "my-simple-cluster-clickhouse-headless.default.svc.cluster.local",
        )
    )
    # CLICKHOUSE_HTTP_PORT: HTTP port, parsed as an integer.
    port: int = field(
        default_factory=lambda: int(os.environ.get("CLICKHOUSE_HTTP_PORT", "8123"))
    )
    # CLICKHOUSE_DB: database name.
    database: str = field(
        default_factory=lambda: os.environ.get("CLICKHOUSE_DB", "example_db")
    )
    # CLICKHOUSE_USER: login user.
    user: str = field(
        default_factory=lambda: os.environ.get("CLICKHOUSE_USER", "default")
    )
    # CLICKHOUSE_PASSWORD: login password; empty string means "no password".
    password: str = field(
        default_factory=lambda: os.environ.get("CLICKHOUSE_PASSWORD", "")
    )
    # CLICKHOUSE_PROTOCOL: protocol name handed to the native connector.
    protocol: str = field(
        default_factory=lambda: os.environ.get("CLICKHOUSE_PROTOCOL", "http")
    )

    @property
    def jdbc_url(self) -> str:
        """Return the ``jdbc:ch://host:port/database`` URL for this server."""
        return f"jdbc:ch://{self.host}:{self.port}/{self.database}"

    @property
    def jdbc_properties(self) -> dict:
        """Return JDBC connection properties.

        The ``password`` key is only included when a non-empty password is
        configured.
        """
        props = {"user": self.user, "driver": "com.clickhouse.jdbc.ClickHouseDriver"}
        if self.password:
            props["password"] = self.password
        return props
36
+
37
+
38
@dataclass
class ExtractConfig:
    """Extract stage configuration.

    Defaults are resolved from the environment when an instance is created
    (via ``default_factory``) rather than once at import time, so changes to
    the environment made after import are honoured.
    """

    # Source table(s) - comma-separated for multiple, or single table name
    source_tables: str = field(
        default_factory=lambda: os.environ.get(
            "BATCH_SOURCE_TABLES",
            "batch_details_table,manufacturing_table,yield_table,temperature_table",
        )
    )
    # Use ClickHouse native connector (format) vs JDBC fallback.
    # Only the case-insensitive string "true" enables it.
    use_native_connector: bool = field(
        default_factory=lambda: os.environ.get(
            "BATCH_USE_NATIVE_CONNECTOR", "false"
        ).lower()
        == "true"
    )
51
+
52
+
53
@dataclass
class TransformConfig:
    """Transform stage configuration.

    Defaults are resolved from the environment when an instance is created
    (via ``default_factory``) rather than once at import time.
    """

    # Columns to use for deduplication (comma-separated); empty = use all columns
    dedup_columns: str = field(
        default_factory=lambda: os.environ.get("BATCH_DEDUP_COLUMNS", "")
    )
    # Staging output path (local or S3)
    staging_path: str = field(
        default_factory=lambda: os.environ.get(
            "BATCH_STAGING_PATH",
            "/tmp/analytics_stage",
        )
    )
    # Output format for load_staged when reading (parquet/delta/clickhouse).
    # Stage job always writes to ClickHouse; use clickhouse for analytics to
    # read from the staged table.
    staging_format: str = field(
        default_factory=lambda: os.environ.get("BATCH_STAGING_FORMAT", "clickhouse")
    )
    # Staging table name in ClickHouse (when format=clickhouse)
    staging_table: str = field(
        default_factory=lambda: os.environ.get(
            "BATCH_STAGING_TABLE", "analytics_staging"
        )
    )
    # Extract anchor_id from add_dimension column
    # (e.g. {'anchor_id':'GP/GPH(D)/II(W)/250019'})
    add_dimension_column: str = field(
        default_factory=lambda: os.environ.get(
            "BATCH_ADD_DIMENSION_COLUMN", "add_dimension"
        )
    )
    anchor_id_column: str = field(
        default_factory=lambda: os.environ.get("BATCH_ANCHOR_ID_COLUMN", "anchor_id")
    )
72
+
73
+
74
@dataclass
class LogConfig:
    """Log stage configuration.

    Defaults are resolved from the environment when an instance is created
    (via ``default_factory``), so a non-numeric ``BATCH_LOG_RETENTION`` raises
    ``ValueError`` at construction time rather than when the module is
    imported.
    """

    # Directory that receives the run log files.
    log_path: str = field(
        default_factory=lambda: os.environ.get(
            "BATCH_LOG_PATH",
            "/tmp/analytics_logs",
        )
    )
    # Retention: number of runs to keep
    retention_runs: int = field(
        default_factory=lambda: int(os.environ.get("BATCH_LOG_RETENTION", "30"))
    )
84
+
85
+
86
@dataclass
class OutputConfig:
    """
    Output destination for analytics results (injected by analytics_runner).

    Env vars: OUTPUT_TYPE, OUTPUT_S3_PATH, OUTPUT_CLICKHOUSE_*, TASK_ID

    Defaults are resolved from the environment when an instance is created
    (via ``default_factory``) rather than once at import time, so values
    injected into the environment after import are picked up.
    """

    # Destination kind; "local" is the default.
    type: str = field(default_factory=lambda: os.environ.get("OUTPUT_TYPE", "local"))
    s3_path: str = field(default_factory=lambda: os.environ.get("OUTPUT_S3_PATH", ""))
    clickhouse_database: str = field(
        default_factory=lambda: os.environ.get(
            "OUTPUT_CLICKHOUSE_DATABASE", "example_db"
        )
    )
    clickhouse_table: str = field(
        default_factory=lambda: os.environ.get(
            "OUTPUT_CLICKHOUSE_TABLE", "analytics_results"
        )
    )
    # Task ID from spark_runner (for S3 key prefix, ClickHouse task_id column)
    task_id: str = field(default_factory=lambda: os.environ.get("TASK_ID", ""))
100
+
101
+
102
@dataclass
class AnalyticsConfig:
    """Analytics modules configuration.

    Defaults are resolved from the environment when an instance is created
    (via ``default_factory``). Numeric settings are parsed at that point, so a
    malformed value raises ``ValueError`` on construction instead of breaking
    the import of the module.
    """

    # Module 1: Linear regression - X and Y column names
    lr_x_column: str = field(
        default_factory=lambda: os.environ.get("BATCH_LR_X_COLUMN", "x")
    )
    lr_y_column: str = field(
        default_factory=lambda: os.environ.get("BATCH_LR_Y_COLUMN", "y")
    )
    # Groups to compare slopes (e.g. "product,batch_no")
    lr_group_columns: str = field(
        default_factory=lambda: os.environ.get("BATCH_LR_GROUP_COLUMNS", "")
    )

    # Module 2: Correlation - feature columns (comma-separated)
    corr_features: str = field(
        default_factory=lambda: os.environ.get("BATCH_CORR_FEATURES", "")
    )
    corr_threshold: float = field(
        default_factory=lambda: float(os.environ.get("BATCH_CORR_THRESHOLD", "0.8"))
    )

    # Module 3: PCA + Clustering
    pca_features: str = field(
        default_factory=lambda: os.environ.get("BATCH_PCA_FEATURES", "")
    )
    pca_variance_threshold: float = field(
        default_factory=lambda: float(
            os.environ.get("BATCH_PCA_VARIANCE_THRESHOLD", "0.95")
        )
    )
    cluster_k: int = field(
        default_factory=lambda: int(os.environ.get("BATCH_CLUSTER_K", "3"))
    )

    # Module 4: T-test - compare means of two groups
    # Mode 1: value column + group column (2 levels)
    ttest_value_column: str = field(
        default_factory=lambda: os.environ.get("BATCH_TTEST_VALUE_COLUMN", "")
    )
    ttest_group_column: str = field(
        default_factory=lambda: os.environ.get("BATCH_TTEST_GROUP_COLUMN", "")
    )
    # Mode 2: two numeric columns
    ttest_col_a: str = field(
        default_factory=lambda: os.environ.get("BATCH_TTEST_COL_A", "")
    )
    ttest_col_b: str = field(
        default_factory=lambda: os.environ.get("BATCH_TTEST_COL_B", "")
    )
133
+
134
+
135
@dataclass
class SparkK8sConfig:
    """Spark on Kubernetes configuration.

    Defaults are resolved from the environment when an instance is created
    (via ``default_factory``). Integer settings are parsed at that point, so a
    malformed value raises ``ValueError`` on construction instead of breaking
    the import of the module.
    """

    # Set SPARK_MASTER=k8s://https://kubernetes.default.svc:443 to enable K8s
    master: str = field(
        default_factory=lambda: os.environ.get("SPARK_MASTER", "local[*]")
    )
    container_image: str = field(
        default_factory=lambda: os.environ.get(
            "SPARK_K8S_IMAGE", "sudhakso/spark:3.5.0"
        )
    )
    namespace: str = field(
        default_factory=lambda: os.environ.get("SPARK_K8S_NAMESPACE", "default")
    )
    service_account: str = field(
        default_factory=lambda: os.environ.get(
            "SPARK_K8S_SERVICE_ACCOUNT", "spark-sa"
        )
    )
    deploy_mode: str = field(
        default_factory=lambda: os.environ.get("SPARK_DEPLOY_MODE", "client")
    )
    # Driver/executor resources
    driver_memory: str = field(
        default_factory=lambda: os.environ.get("SPARK_DRIVER_MEMORY", "512m")
    )
    driver_memory_overhead: str = field(
        default_factory=lambda: os.environ.get(
            "SPARK_DRIVER_MEMORY_OVERHEAD", "256m"
        )
    )
    executor_instances: int = field(
        default_factory=lambda: int(os.environ.get("SPARK_EXECUTOR_INSTANCES", "1"))
    )
    executor_cores: int = field(
        default_factory=lambda: int(os.environ.get("SPARK_EXECUTOR_CORES", "1"))
    )
    executor_memory: str = field(
        default_factory=lambda: os.environ.get("SPARK_EXECUTOR_MEMORY", "512m")
    )
    executor_memory_overhead: str = field(
        default_factory=lambda: os.environ.get(
            "SPARK_EXECUTOR_MEMORY_OVERHEAD", "128m"
        )
    )
    # S3 (optional; set for s3a:// paths)
    s3_access_key: str = field(
        default_factory=lambda: os.environ.get("AWS_ACCESS_KEY_ID", "")
    )
    s3_secret_key: str = field(
        default_factory=lambda: os.environ.get("AWS_SECRET_ACCESS_KEY", "")
    )
    s3_endpoint: str = field(
        default_factory=lambda: os.environ.get("AWS_ENDPOINT", "s3.amazonaws.com")
    )
    s3_region: str = field(
        default_factory=lambda: os.environ.get("AWS_REGION", "us-east-2")
    )
157
+
158
+
159
@dataclass
class BatchAnalyticsConfig:
    """Top-level pipeline configuration that bundles one section per concern.

    Each section is built by calling its own config class with no arguments
    when a ``BatchAnalyticsConfig`` is created; pass a pre-built section to
    override it.
    """

    # Connection and cluster settings.
    clickhouse: ClickHouseConfig = field(default_factory=ClickHouseConfig)
    spark_k8s: SparkK8sConfig = field(default_factory=SparkK8sConfig)
    # Per-stage settings.
    extract: ExtractConfig = field(default_factory=ExtractConfig)
    transform: TransformConfig = field(default_factory=TransformConfig)
    log: LogConfig = field(default_factory=LogConfig)
    # Result destination and analytics module parameters.
    output: OutputConfig = field(default_factory=OutputConfig)
    analytics: AnalyticsConfig = field(default_factory=AnalyticsConfig)
@@ -0,0 +1,118 @@
1
+ """
2
+ Extract stage: Load data from ClickHouse using Spark ClickHouse connector or JDBC.
3
+ """
4
+
5
+ import logging
6
+ from typing import Any
7
+
8
+ from pyspark.sql import DataFrame, SparkSession
9
+
10
+ from .config import BatchAnalyticsConfig
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
def _read_via_format(spark: SparkSession, cfg: BatchAnalyticsConfig, table: str) -> DataFrame | None:
    """
    Read from ClickHouse using the native format API (clickhouse-spark-runtime).
    Requires: com.clickhouse.spark:clickhouse-spark-runtime in spark.jars.packages

    Args:
        spark: Active Spark session.
        cfg: Pipeline configuration; only ``cfg.clickhouse`` is read.
        table: Name of the table to load.

    Returns:
        The loaded DataFrame, or ``None`` when the native connector raised, in
        which case a warning is logged and the caller is expected to fall back
        to JDBC.
    """
    try:
        ch = cfg.clickhouse
        reader = (
            spark.read.format("clickhouse")
            .option("host", ch.host)
            .option("protocol", ch.protocol)
            .option("http_port", str(ch.port))
            .option("database", ch.database)
            .option("table", table)
            .option("user", ch.user)
        )
        # Forward the password when one is configured, mirroring
        # ClickHouseConfig.jdbc_properties. Without it a password-protected
        # server rejects every native read and the pipeline silently degrades
        # to the JDBC path.
        if ch.password:
            reader = reader.option("password", ch.password)
        return reader.load()
    except Exception as e:
        # Deliberately broad: any connector failure should trigger the JDBC
        # fallback rather than abort the extract.
        logger.warning(
            "Native ClickHouse connector failed for table %s: %s. Falling back to JDBC.",
            table,
            e,
        )
        return None
39
+
40
+
41
def _read_via_jdbc(spark: SparkSession, cfg: BatchAnalyticsConfig, table: str) -> DataFrame:
    """Load ``table`` through Spark's JDBC reader using the ClickHouse JDBC settings in ``cfg``."""
    reader = spark.read
    clickhouse = cfg.clickhouse
    url = clickhouse.jdbc_url
    connection_props = clickhouse.jdbc_properties
    return reader.jdbc(url, table, properties=connection_props)
48
+
49
+
50
def extract_table(
    spark: SparkSession,
    table: str,
    config: BatchAnalyticsConfig,
) -> DataFrame:
    """
    Load one ClickHouse table as a DataFrame.

    The native connector is tried first when ``config.extract.use_native_connector``
    is set; if it is disabled, or it yields ``None``, the table is read over JDBC.
    The row count is logged, which runs a ``count()`` on the result.
    """
    frame = None
    if config.extract.use_native_connector:
        frame = _read_via_format(spark, config, table)
    if frame is None:
        # Native path disabled or failed: JDBC is the fallback.
        frame = _read_via_jdbc(spark, config, table)

    logger.info("Extracted table %s: %d rows", table, frame.count())
    return frame
68
+
69
+
70
def extract_all(
    spark: SparkSession,
    config: BatchAnalyticsConfig,
) -> dict[str, DataFrame]:
    """
    Load every table listed in ``config.extract.source_tables``.

    The setting is split on commas; surrounding whitespace is trimmed and
    empty entries are dropped.

    Returns:
        Mapping of table name to its DataFrame, in configured order.

    Raises:
        ValueError: If no table name remains after cleaning the setting.
    """
    stripped = (piece.strip() for piece in config.extract.source_tables.split(","))
    names = [name for name in stripped if name]
    if not names:
        raise ValueError("No source tables configured in BATCH_SOURCE_TABLES")

    return {name: extract_table(spark, name, config) for name in names}
88
+
89
+
90
def extract_unified(
    spark: SparkSession,
    config: BatchAnalyticsConfig,
    join_keys: list[str] | None = None,
    primary_table: str | None = None,
) -> DataFrame:
    """
    Extract the configured source tables and reduce them to a single DataFrame.

    Resolution order:
    - exactly one table: that table is returned as-is;
    - several tables and ``join_keys`` given: the first table is left-joined
      with each remaining table on those keys, in configured order;
    - several tables and no ``join_keys``: ``primary_table`` is returned when
      it names an extracted table, otherwise the first table.
    """
    tables = extract_all(spark, config)
    frames = list(tables.values())
    first = frames[0]

    if len(frames) == 1:
        return first

    if join_keys:
        joined = first
        for right in frames[1:]:
            joined = joined.join(right, on=join_keys, how="left")
        return joined

    # Several tables without join keys: prefer the requested primary table.
    if primary_table and primary_table in tables:
        return tables[primary_table]
    return first
@@ -0,0 +1,300 @@
1
+ """
2
+ Batch analytics job runner: orchestrates Extract, Transform, Log stages and analytics modules.
3
+ """
4
+
5
+ import argparse
6
+ import logging
7
+ import os
8
+ import socket
9
+ import sys
10
+ import uuid
11
+ from pathlib import Path
12
+
13
+ from pyspark.sql import SparkSession
14
+
15
+ from .config import BatchAnalyticsConfig
16
+ from .extract import extract_unified
17
+ from .log import log_dataframe_summary, log_run
18
+ from .modules import DEFAULT_MODULES, MODULE_REGISTRY, VALID_MODULES
19
+ from .output import write_analytics_output
20
+ from .transform import load_staged, stage_to_clickhouse, transform
21
+
22
+ logging.basicConfig(
23
+ level=logging.INFO,
24
+ format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
25
+ )
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
def create_spark_session(
    app_name: str = "BatchAnalytics",
    clickhouse_jars: str | None = None,
    config: BatchAnalyticsConfig | None = None,
) -> SparkSession:
    """
    Build (or reuse) a SparkSession for the pipeline.

    Adaptive query execution is always enabled. When the configured master
    starts with ``k8s://`` the Kubernetes settings, driver/executor resources
    and, if both S3 keys are set, the s3a credentials are applied as well.

    Args:
        app_name: Spark application name.
        clickhouse_jars: Maven coordinates added to ``spark.jars.packages``.
        config: Pipeline config; a default ``BatchAnalyticsConfig`` is used when omitted.
    """
    k8s = (config or BatchAnalyticsConfig()).spark_k8s
    on_k8s = k8s.master.startswith("k8s://")

    builder = SparkSession.builder.appName(app_name)
    builder = builder.config("spark.sql.adaptive.enabled", "true")
    builder = builder.config("spark.sql.adaptive.coalescePartitions.enabled", "true")

    # JARs: ClickHouse JDBC + hadoop-aws (for S3 when on K8s)
    packages = [clickhouse_jars] if clickhouse_jars else []
    if on_k8s:
        packages.append("org.apache.hadoop:hadoop-aws:3.3.4")
    if packages:
        builder = builder.config("spark.jars.packages", ",".join(packages))

    if not on_k8s:
        return builder.getOrCreate()

    local_hostname = socket.gethostname()
    k8s_options = {
        "spark.kubernetes.container.image": k8s.container_image,
        "spark.kubernetes.namespace": k8s.namespace,
        "spark.kubernetes.authenticate.serviceAccountName": k8s.service_account,
        "spark.driver.host": socket.gethostbyname(local_hostname),
        "spark.driver.bindAddress": "0.0.0.0",
        # The driver pod name is taken from HOSTNAME, else the local host name.
        "spark.kubernetes.driver.pod.name": os.environ.get("HOSTNAME", local_hostname),
        "spark.submit.deployMode": k8s.deploy_mode,
        "spark.driver.extraJavaOptions": (
            "-Djava.net.preferIPv4Stack=true -XX:+UseG1GC -XX:+HeapDumpOnOutOfMemoryError"
        ),
        "spark.driver.memory": k8s.driver_memory,
        "spark.driver.memoryOverhead": k8s.driver_memory_overhead,
        "spark.executor.instances": str(k8s.executor_instances),
        "spark.executor.cores": str(k8s.executor_cores),
        "spark.executor.memory": k8s.executor_memory,
        "spark.executor.memoryOverhead": k8s.executor_memory_overhead,
        "spark.kubernetes.executor.serviceAccountName": k8s.service_account,
        "spark.kubernetes.container.image.pullPolicy": "IfNotPresent",
    }
    if k8s.s3_access_key and k8s.s3_secret_key:
        # s3a settings are only applied when both credentials are present.
        k8s_options.update(
            {
                "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
                "spark.hadoop.fs.s3a.access.key": k8s.s3_access_key,
                "spark.hadoop.fs.s3a.secret.key": k8s.s3_secret_key,
                "spark.hadoop.fs.s3a.endpoint": k8s.s3_endpoint,
                "spark.hadoop.fs.s3a.endpoint.region": k8s.s3_region,
            }
        )

    builder = builder.master(k8s.master)
    for option_key, option_value in k8s_options.items():
        builder = builder.config(option_key, option_value)
    logger.info("Spark on Kubernetes: master=%s", k8s.master)

    return builder.getOrCreate()
93
+
94
+
95
def run_pipeline(
    config: BatchAnalyticsConfig | None = None,
    spark: SparkSession | None = None,
    run_extract: bool = True,
    run_transform: bool = True,
    run_stage: bool = True,
    run_analytics: bool = True,
    modules: list[str] | None = None,
) -> dict:
    """
    Execute the pipeline stages that are switched on and collect their results.

    Args:
        config: Pipeline config (default: BatchAnalyticsConfig())
        spark: SparkSession (created if None)
        run_extract: Extract from ClickHouse; when False the staged data is loaded instead
        run_transform: Apply the transform stage (clean, dedupe, extract anchor_id)
        run_stage: Write the transformed data to the ClickHouse staging table
        run_analytics: Run the analytics modules and write their output
        modules: Which analytics to run: ["lr", "corr", "pca", "ttest"] or None for all

    Returns:
        Dict with ``run_id``, per-stage results under ``stages`` and module
        outputs under ``analytics``.
    """
    config = config or BatchAnalyticsConfig()
    run_id = str(uuid.uuid4())[:8]

    if spark is None:
        spark = create_spark_session(
            app_name="BatchAnalytics",
            clickhouse_jars="com.clickhouse:clickhouse-jdbc:0.4.6:all",
            config=config,
        )

    stages: dict = {}
    analytics: dict = {}
    result = {"run_id": run_id, "stages": stages, "analytics": analytics}

    # ----- Extract -----
    if not run_extract:
        logger.info("Stage: Extract (skipped, loading from stage)")
        df_raw = load_staged(spark, config)
        stages["extract"] = {"skipped": True, "loaded_from_stage": True}
    else:
        logger.info("Stage: Extract")
        df_raw = extract_unified(spark, config)
        stages["extract"] = {
            "row_count": df_raw.count(),
            "columns": [f.name for f in df_raw.schema.fields],
        }

    # ----- Transform -----
    if run_transform:
        logger.info("Stage: Transform")
        df_transformed = transform(df_raw, config)
        stages["transform"] = {"row_count": df_transformed.count()}
    else:
        # With extract skipped, df_raw is already the staged data.
        df_transformed = df_raw
        if run_extract and run_analytics:
            # Freshly extracted data still gets anchor_id extraction and
            # de-duplication before analytics, even with transform skipped.
            from .transform import extract_anchor_id, remove_duplicates

            df_transformed = extract_anchor_id(df_raw, config)
            raw_dedup = config.transform.dedup_columns
            dedup_cols = None
            if raw_dedup:
                dedup_cols = [c.strip() for c in raw_dedup.split(",") if c.strip()]
            df_transformed = remove_duplicates(df_transformed, key_columns=dedup_cols)
        stages["transform"] = {"skipped": True}

    # ----- Stage (to ClickHouse) - separate job before analytics -----
    if run_stage:
        logger.info("Stage: Stage to ClickHouse")
        stage_to_clickhouse(spark, df_transformed, config)
        destination = f"{config.clickhouse.database}.{config.transform.staging_table}"
        stages["stage"] = {
            "destination": destination,
            "row_count": df_transformed.count(),
        }
    else:
        stages["stage"] = {"skipped": True}

    # ----- Log stage metrics -----
    log_run(
        run_id,
        "pipeline",
        stages,
        output_dir=config.log.log_path,
        extra={"config_summary": {"staging_format": config.transform.staging_format}},
    )

    # ----- Analytics: runs on the in-memory transformed/loaded DataFrame -----
    if run_analytics:
        for mod in modules or DEFAULT_MODULES:
            if mod not in MODULE_REGISTRY:
                logger.warning("Unknown module %r, skipping. Valid: %s", mod, VALID_MODULES)
                continue
            run_fn, result_key = MODULE_REGISTRY[mod]
            try:
                outcome = run_fn(spark, df_transformed, config)
            except Exception as e:
                # A failing module is recorded and must not stop the others.
                logger.exception("%s failed: %s", result_key, e)
                outcome = {"error": str(e)}
            analytics[result_key] = outcome

        output_cfg = config.output
        if output_cfg.type == "local":
            output_path = config.log.log_path
        else:
            output_path = output_cfg.s3_path
        write_analytics_output(
            run_id=run_id,
            task_id=output_cfg.task_id or run_id,
            artifacts=analytics,
            output_type=output_cfg.type,
            path=output_path,
            database=output_cfg.clickhouse_database,
            table=output_cfg.clickhouse_table,
            host=config.clickhouse.host,
            port=config.clickhouse.port,
            user=config.clickhouse.user,
            password=config.clickhouse.password,
        )

    return result
220
+
221
+
222
def main() -> int:
    """Parse command-line flags, run the pipeline accordingly and return exit code 0."""
    parser = argparse.ArgumentParser(description="Batch analytics: Extract, Transform, Log + analytics")

    # Each stage gets an enabling flag (on by default) and a --no-<stage>
    # counterpart that writes False to the same destination.
    stage_flags = (
        ("extract", "Run extract stage (default: true)"),
        ("transform", "Run transform stage (default: true)"),
        ("stage", "Run stage step: write transformed data to ClickHouse (default: true)"),
        ("analytics", "Run analytics modules (default: true)"),
    )
    for flag, help_text in stage_flags:
        parser.add_argument(f"--{flag}", action="store_true", default=True, help=help_text)
        parser.add_argument(f"--no-{flag}", action="store_false", dest=flag)

    parser.add_argument(
        "--modules",
        nargs="+",
        choices=VALID_MODULES,
        default=None,
        help="Analytics modules to run (default: all). Must match catalog module_arg.",
    )
    parser.add_argument(
        "--from-stage",
        action="store_true",
        help="Load from staged ClickHouse table, run analytics only (implies --no-extract --no-transform --no-stage)",
    )
    args = parser.parse_args()

    # --from-stage switches off every stage that precedes analytics.
    use_pipeline_stages = not args.from_stage
    result = run_pipeline(
        run_extract=args.extract and use_pipeline_stages,
        run_transform=args.transform and use_pipeline_stages,
        run_stage=args.stage and use_pipeline_stages,
        run_analytics=args.analytics,
        modules=args.modules,
    )

    logger.info("Run completed: %s", result["run_id"])
    return 0
297
+
298
+
299
# Script entry point: exit with the code returned by main().
if __name__ == "__main__":
    raise SystemExit(main())
batch_analytics/log.py ADDED
@@ -0,0 +1,101 @@
1
+ """
2
+ Log stage: Persist run metadata, metrics, and analytics results for audit and debugging.
3
+ """
4
+
5
+ import json
6
+ import logging
7
+ from datetime import datetime
8
+ from pathlib import Path
9
+
10
+ from pyspark.sql import SparkSession
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
+ def log_run(
16
+ run_id: str,
17
+ stage: str,
18
+ metrics: dict,
19
+ output_dir: str | Path | None = None,
20
+ extra: dict | None = None,
21
+ ) -> Path:
22
+ """
23
+ Write run log (metadata + metrics) to a JSON file.
24
+ Returns the path of the written file.
25
+ """
26
+ output_dir = Path(output_dir) if output_dir else Path("/tmp/analytics_logs")
27
+ output_dir.mkdir(parents=True, exist_ok=True)
28
+
29
+ log_data = {
30
+ "run_id": run_id,
31
+ "stage": stage,
32
+ "timestamp": datetime.utcnow().isoformat() + "Z",
33
+ "metrics": metrics,
34
+ **(extra or {}),
35
+ }
36
+
37
+ log_file = output_dir / f"{run_id}_{stage}.json"
38
+ with open(log_file, "w") as f:
39
+ json.dump(log_data, f, indent=2)
40
+
41
+ logger.info("Logged %s to %s", stage, log_file)
42
+ return log_file
43
+
44
+
45
def log_dataframe_summary(
    spark: SparkSession,
    df_name: str,
    df,
) -> dict:
    """Summarise a DataFrame: name, row count, column count and up to 20 column names.

    When the schema has more than 20 fields, ``"..."`` is appended after the
    first 20 names. ``spark`` is accepted but not used.
    """
    max_columns = 20
    row_count = df.count()
    fields = df.schema.fields

    shown = [column.name for column in fields[:max_columns]]
    if len(fields) > max_columns:
        shown.append("...")

    summary = {"name": df_name, "row_count": row_count}
    summary["column_count"] = len(fields)
    summary["columns"] = shown
    return summary
61
+
62
+
63
def log_analytics_artifacts(
    run_id: str,
    artifacts: dict,
    output_dir: str | Path,
) -> list[Path]:
    """
    Persist each analytics artifact as its own JSON file.

    One file named ``<run_id>_analytics_<name>.json`` is written per entry of
    ``artifacts`` into ``output_dir`` (created if missing). Values pass
    through ``_serialize_for_json`` before being dumped.

    Returns:
        Paths of the written files, in the iteration order of ``artifacts``.
    """
    target = Path(output_dir)
    target.mkdir(parents=True, exist_ok=True)

    written: list[Path] = []
    for artifact_name, payload in artifacts.items():
        destination = target / f"{run_id}_analytics_{artifact_name}.json"
        with open(destination, "w") as handle:
            json.dump(_serialize_for_json(payload), handle, indent=2)
        written.append(destination)
    return written
83
+
84
+
85
+ def _serialize_for_json(obj):
86
+ """Convert numpy/Python types to JSON-serializable forms."""
87
+ import numpy as np
88
+
89
+ if isinstance(obj, dict):
90
+ return {k: _serialize_for_json(v) for k, v in obj.items()}
91
+ if isinstance(obj, (list, tuple)):
92
+ return [_serialize_for_json(x) for x in obj]
93
+ if isinstance(obj, np.ndarray):
94
+ return obj.tolist()
95
+ if isinstance(obj, (np.floating, np.float32, np.float64)):
96
+ return float(obj)
97
+ if isinstance(obj, (np.integer, np.int32, np.int64)):
98
+ return int(obj)
99
+ if isinstance(obj, np.bool_):
100
+ return bool(obj)
101
+ return obj