batch-analytics 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- batch_analytics/__init__.py +44 -0
- batch_analytics/__main__.py +5 -0
- batch_analytics/analytics/__init__.py +19 -0
- batch_analytics/analytics/correlation.py +113 -0
- batch_analytics/analytics/linear_regression.py +136 -0
- batch_analytics/analytics/pca_clustering.py +143 -0
- batch_analytics/analytics/t_test.py +184 -0
- batch_analytics/config.py +169 -0
- batch_analytics/extract.py +118 -0
- batch_analytics/job_runner.py +300 -0
- batch_analytics/log.py +101 -0
- batch_analytics/modules.py +24 -0
- batch_analytics/output/__init__.py +22 -0
- batch_analytics/output/base.py +97 -0
- batch_analytics/output/clickhouse.py +89 -0
- batch_analytics/output/local.py +36 -0
- batch_analytics/output/s3.py +82 -0
- batch_analytics/transform.py +184 -0
- batch_analytics-0.1.0.dist-info/METADATA +80 -0
- batch_analytics-0.1.0.dist-info/RECORD +23 -0
- batch_analytics-0.1.0.dist-info/WHEEL +5 -0
- batch_analytics-0.1.0.dist-info/entry_points.txt +2 -0
- batch_analytics-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Configuration for batch analytics pipeline: Extract, Transform, Log stages.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class ClickHouseConfig:
    """ClickHouse connection settings.

    Every default is read from the environment when an instance is created
    (via ``default_factory``) instead of once at import time, so variables
    exported after this module was imported are still honoured. Explicit
    constructor arguments always win over the environment.
    """

    host: str = field(
        default_factory=lambda: os.environ.get(
            "CLICKHOUSE_HOST",
            "my-simple-cluster-clickhouse-headless.default.svc.cluster.local",
        )
    )
    # A non-numeric CLICKHOUSE_HTTP_PORT raises ValueError when the config is
    # constructed rather than while the module is being imported.
    port: int = field(
        default_factory=lambda: int(os.environ.get("CLICKHOUSE_HTTP_PORT", "8123"))
    )
    database: str = field(
        default_factory=lambda: os.environ.get("CLICKHOUSE_DB", "example_db")
    )
    user: str = field(
        default_factory=lambda: os.environ.get("CLICKHOUSE_USER", "default")
    )
    # repr=False keeps the credential out of repr() and therefore out of logs.
    password: str = field(
        default_factory=lambda: os.environ.get("CLICKHOUSE_PASSWORD", ""),
        repr=False,
    )
    protocol: str = field(
        default_factory=lambda: os.environ.get("CLICKHOUSE_PROTOCOL", "http")
    )

    @property
    def jdbc_url(self) -> str:
        """Return the JDBC URL built from host, port and database."""
        return f"jdbc:ch://{self.host}:{self.port}/{self.database}"

    @property
    def jdbc_properties(self) -> dict:
        """Return JDBC connection properties.

        The ``password`` key is only present when a non-empty password is
        configured.
        """
        props = {"user": self.user, "driver": "com.clickhouse.jdbc.ClickHouseDriver"}
        if self.password:
            props["password"] = self.password
        return props
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class ExtractConfig:
    """Extract stage configuration.

    Defaults are read from the environment each time an instance is created,
    not once at import time, so late-set variables are honoured.
    """

    # Source table(s) - comma-separated for multiple, or single table name
    source_tables: str = field(
        default_factory=lambda: os.environ.get(
            "BATCH_SOURCE_TABLES",
            "batch_details_table,manufacturing_table,yield_table,temperature_table",
        )
    )
    # Use ClickHouse native connector (format) vs JDBC fallback.
    # Only the case-insensitive string "true" enables it.
    use_native_connector: bool = field(
        default_factory=lambda: os.environ.get(
            "BATCH_USE_NATIVE_CONNECTOR", "false"
        ).lower()
        == "true"
    )
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@dataclass
class TransformConfig:
    """Transform stage configuration.

    Defaults are read from the environment each time an instance is created,
    not once at import time, so late-set variables are honoured.
    """

    # Columns to use for deduplication (comma-separated); empty = use all columns
    dedup_columns: str = field(
        default_factory=lambda: os.environ.get("BATCH_DEDUP_COLUMNS", "")
    )
    # Staging output path (local or S3)
    staging_path: str = field(
        default_factory=lambda: os.environ.get(
            "BATCH_STAGING_PATH",
            "/tmp/analytics_stage",
        )
    )
    # Output format for load_staged when reading (parquet/delta/clickhouse).
    # Stage job always writes to ClickHouse; use clickhouse for analytics to read from staged table.
    staging_format: str = field(
        default_factory=lambda: os.environ.get("BATCH_STAGING_FORMAT", "clickhouse")
    )
    # Staging table name in ClickHouse (when format=clickhouse)
    staging_table: str = field(
        default_factory=lambda: os.environ.get(
            "BATCH_STAGING_TABLE", "analytics_staging"
        )
    )
    # Extract anchor_id from add_dimension column (e.g. {'anchor_id':'GP/GPH(D)/II(W)/250019'})
    add_dimension_column: str = field(
        default_factory=lambda: os.environ.get(
            "BATCH_ADD_DIMENSION_COLUMN", "add_dimension"
        )
    )
    anchor_id_column: str = field(
        default_factory=lambda: os.environ.get("BATCH_ANCHOR_ID_COLUMN", "anchor_id")
    )
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
@dataclass
class LogConfig:
    """Log stage configuration.

    Defaults are read from the environment each time an instance is created,
    so an invalid ``BATCH_LOG_RETENTION`` raises ``ValueError`` at
    construction instead of breaking the module import.
    """

    log_path: str = field(
        default_factory=lambda: os.environ.get(
            "BATCH_LOG_PATH",
            "/tmp/analytics_logs",
        )
    )
    # Retention: number of runs to keep
    retention_runs: int = field(
        default_factory=lambda: int(os.environ.get("BATCH_LOG_RETENTION", "30"))
    )
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
@dataclass
class OutputConfig:
    """
    Output destination for analytics results (injected by analytics_runner).

    Env vars: OUTPUT_TYPE, OUTPUT_S3_PATH, OUTPUT_CLICKHOUSE_*, TASK_ID

    Defaults are read from the environment each time an instance is created,
    so values injected after this module was imported are still picked up.
    """

    type: str = field(default_factory=lambda: os.environ.get("OUTPUT_TYPE", "local"))
    s3_path: str = field(default_factory=lambda: os.environ.get("OUTPUT_S3_PATH", ""))
    clickhouse_database: str = field(
        default_factory=lambda: os.environ.get(
            "OUTPUT_CLICKHOUSE_DATABASE", "example_db"
        )
    )
    clickhouse_table: str = field(
        default_factory=lambda: os.environ.get(
            "OUTPUT_CLICKHOUSE_TABLE", "analytics_results"
        )
    )
    # Task ID from spark_runner (for S3 key prefix, ClickHouse task_id column)
    task_id: str = field(default_factory=lambda: os.environ.get("TASK_ID", ""))
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
@dataclass
class AnalyticsConfig:
    """Analytics modules configuration.

    Defaults are read from the environment each time an instance is created;
    malformed numeric variables raise ``ValueError`` at construction rather
    than while the module is being imported.
    """

    # Module 1: Linear regression - X and Y column names
    lr_x_column: str = field(
        default_factory=lambda: os.environ.get("BATCH_LR_X_COLUMN", "x")
    )
    lr_y_column: str = field(
        default_factory=lambda: os.environ.get("BATCH_LR_Y_COLUMN", "y")
    )
    # Groups to compare slopes (e.g. "product,batch_no")
    lr_group_columns: str = field(
        default_factory=lambda: os.environ.get("BATCH_LR_GROUP_COLUMNS", "")
    )

    # Module 2: Correlation - feature columns (comma-separated)
    corr_features: str = field(
        default_factory=lambda: os.environ.get(
            "BATCH_CORR_FEATURES",
            "",
        )
    )
    corr_threshold: float = field(
        default_factory=lambda: float(os.environ.get("BATCH_CORR_THRESHOLD", "0.8"))
    )

    # Module 3: PCA + Clustering
    pca_features: str = field(
        default_factory=lambda: os.environ.get("BATCH_PCA_FEATURES", "")
    )
    pca_variance_threshold: float = field(
        default_factory=lambda: float(
            os.environ.get("BATCH_PCA_VARIANCE_THRESHOLD", "0.95")
        )
    )
    cluster_k: int = field(
        default_factory=lambda: int(os.environ.get("BATCH_CLUSTER_K", "3"))
    )

    # Module 4: T-test - compare means of two groups
    # Mode 1: value column + group column (2 levels)
    ttest_value_column: str = field(
        default_factory=lambda: os.environ.get("BATCH_TTEST_VALUE_COLUMN", "")
    )
    ttest_group_column: str = field(
        default_factory=lambda: os.environ.get("BATCH_TTEST_GROUP_COLUMN", "")
    )
    # Mode 2: two numeric columns
    ttest_col_a: str = field(
        default_factory=lambda: os.environ.get("BATCH_TTEST_COL_A", "")
    )
    ttest_col_b: str = field(
        default_factory=lambda: os.environ.get("BATCH_TTEST_COL_B", "")
    )
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
@dataclass
class SparkK8sConfig:
    """Spark on Kubernetes configuration.

    Defaults are read from the environment each time an instance is created,
    not once at import time. The S3 secret key is excluded from ``repr()`` so
    it does not end up in logs.
    """

    # Set SPARK_MASTER=k8s://https://kubernetes.default.svc:443 to enable K8s
    master: str = field(
        default_factory=lambda: os.environ.get("SPARK_MASTER", "local[*]")
    )
    container_image: str = field(
        default_factory=lambda: os.environ.get(
            "SPARK_K8S_IMAGE", "sudhakso/spark:3.5.0"
        )
    )
    namespace: str = field(
        default_factory=lambda: os.environ.get("SPARK_K8S_NAMESPACE", "default")
    )
    service_account: str = field(
        default_factory=lambda: os.environ.get(
            "SPARK_K8S_SERVICE_ACCOUNT", "spark-sa"
        )
    )
    deploy_mode: str = field(
        default_factory=lambda: os.environ.get("SPARK_DEPLOY_MODE", "client")
    )
    # Driver/executor resources
    driver_memory: str = field(
        default_factory=lambda: os.environ.get("SPARK_DRIVER_MEMORY", "512m")
    )
    driver_memory_overhead: str = field(
        default_factory=lambda: os.environ.get(
            "SPARK_DRIVER_MEMORY_OVERHEAD", "256m"
        )
    )
    executor_instances: int = field(
        default_factory=lambda: int(os.environ.get("SPARK_EXECUTOR_INSTANCES", "1"))
    )
    executor_cores: int = field(
        default_factory=lambda: int(os.environ.get("SPARK_EXECUTOR_CORES", "1"))
    )
    executor_memory: str = field(
        default_factory=lambda: os.environ.get("SPARK_EXECUTOR_MEMORY", "512m")
    )
    executor_memory_overhead: str = field(
        default_factory=lambda: os.environ.get(
            "SPARK_EXECUTOR_MEMORY_OVERHEAD", "128m"
        )
    )
    # S3 (optional; set for s3a:// paths)
    s3_access_key: str = field(
        default_factory=lambda: os.environ.get("AWS_ACCESS_KEY_ID", "")
    )
    # repr=False keeps the secret out of repr() and therefore out of logs.
    s3_secret_key: str = field(
        default_factory=lambda: os.environ.get("AWS_SECRET_ACCESS_KEY", ""),
        repr=False,
    )
    s3_endpoint: str = field(
        default_factory=lambda: os.environ.get("AWS_ENDPOINT", "s3.amazonaws.com")
    )
    s3_region: str = field(
        default_factory=lambda: os.environ.get("AWS_REGION", "us-east-2")
    )
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
@dataclass
class BatchAnalyticsConfig:
    """Top-level pipeline configuration bundling one section per concern.

    Each section is built by calling its config class with no arguments when
    this aggregate is instantiated; pass an explicit section to override it.
    """

    # Connection and cluster settings.
    clickhouse: ClickHouseConfig = field(default_factory=ClickHouseConfig)
    spark_k8s: SparkK8sConfig = field(default_factory=SparkK8sConfig)

    # Per-stage settings.
    extract: ExtractConfig = field(default_factory=ExtractConfig)
    transform: TransformConfig = field(default_factory=TransformConfig)
    log: LogConfig = field(default_factory=LogConfig)

    # Result destination and analytics module parameters.
    output: OutputConfig = field(default_factory=OutputConfig)
    analytics: AnalyticsConfig = field(default_factory=AnalyticsConfig)
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Extract stage: Load data from ClickHouse using Spark ClickHouse connector or JDBC.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from pyspark.sql import DataFrame, SparkSession
|
|
9
|
+
|
|
10
|
+
from .config import BatchAnalyticsConfig
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _read_via_format(spark: SparkSession, cfg: BatchAnalyticsConfig, table: str) -> DataFrame | None:
    """
    Read from ClickHouse using the native format API (clickhouse-spark-runtime).
    Requires: com.clickhouse.spark:clickhouse-spark-runtime in spark.jars.packages

    Args:
        spark: Session used to build the reader.
        cfg: Pipeline config; only the ``clickhouse`` section is read.
        table: Name of the table to load.

    Returns:
        The loaded DataFrame, or ``None`` when the native read raised, in
        which case a warning is logged and the caller is expected to fall
        back to JDBC.
    """
    ch = cfg.clickhouse
    try:
        reader = (
            spark.read.format("clickhouse")
            .option("host", ch.host)
            .option("protocol", ch.protocol)
            .option("http_port", str(ch.port))
            .option("database", ch.database)
            .option("table", table)
            .option("user", ch.user)
        )
        # Forward the password when one is configured (mirrors jdbc_properties);
        # without it, password-protected servers reject the native read and
        # every call silently degrades to the JDBC fallback.
        if ch.password:
            reader = reader.option("password", ch.password)
        return reader.load()
    except Exception as e:
        # Deliberate best-effort: any connector failure triggers the fallback.
        logger.warning(
            "Native ClickHouse connector failed for table %s: %s. Falling back to JDBC.",
            table,
            e,
        )
        return None
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _read_via_jdbc(spark: SparkSession, cfg: BatchAnalyticsConfig, table: str) -> DataFrame:
    """Load ``table`` through Spark's JDBC reader.

    The URL and connection properties both come from ``cfg.clickhouse``.
    """
    clickhouse = cfg.clickhouse
    jdbc_reader = spark.read.jdbc
    return jdbc_reader(
        clickhouse.jdbc_url,
        table,
        properties=clickhouse.jdbc_properties,
    )
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def extract_table(
    spark: SparkSession,
    table: str,
    config: BatchAnalyticsConfig,
) -> DataFrame:
    """
    Extract a single table from ClickHouse.
    Uses native connector if configured, otherwise JDBC.

    Args:
        spark: Session used for the read.
        table: Name of the table to extract.
        config: Pipeline config (``extract`` and ``clickhouse`` sections).

    Returns:
        The table as a DataFrame. When the native connector is enabled but
        its read returns ``None`` (failure), JDBC is used instead.
    """
    df = None
    if config.extract.use_native_connector:
        df = _read_via_format(spark, config, table)
    if df is None:
        # Either JDBC was requested or the native read failed.
        df = _read_via_jdbc(spark, config, table)

    # df.count() executes a Spark job over the whole table, so only pay for
    # it when the INFO message will actually be emitted.
    if logger.isEnabledFor(logging.INFO):
        logger.info("Extracted table %s: %d rows", table, df.count())
    return df
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def extract_all(
    spark: SparkSession,
    config: BatchAnalyticsConfig,
) -> dict[str, DataFrame]:
    """
    Extract every table listed in ``config.extract.source_tables``.

    The setting is split on commas; surrounding whitespace is stripped and
    empty entries are dropped.

    Returns:
        Mapping of table name to its extracted DataFrame, in listed order.

    Raises:
        ValueError: If no table name remains after parsing.
    """
    names: list[str] = []
    for piece in config.extract.source_tables.split(","):
        cleaned = piece.strip()
        if cleaned:
            names.append(cleaned)

    if not names:
        raise ValueError("No source tables configured in BATCH_SOURCE_TABLES")

    return {name: extract_table(spark, name, config) for name in names}
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def extract_unified(
    spark: SparkSession,
    config: BatchAnalyticsConfig,
    join_keys: list[str] | None = None,
    primary_table: str | None = None,
) -> DataFrame:
    """
    Extract and unify source tables into one DataFrame.
    - Single table: returns it directly.
    - Multiple tables + join_keys: joins on those keys (left join, first table base).
    - Multiple tables, no join_keys: returns the primary (or first) table.
    Use primary_table to choose which table to use for analytics.

    Args:
        spark: Session used for extraction.
        config: Pipeline config passed through to ``extract_all``.
        join_keys: Column names to left-join all tables on; falsy disables joining.
        primary_table: Table to return when no join is requested. If it is
            not among the extracted tables a warning is logged and the first
            table is returned instead.

    Returns:
        The single, joined, or selected DataFrame as described above.
    """
    all_dfs = extract_all(spark, config)
    frames = list(all_dfs.values())

    if len(frames) == 1:
        return frames[0]

    if join_keys:
        base = frames[0]
        for other in frames[1:]:
            base = base.join(other, on=join_keys, how="left")
        return base

    # Multiple tables, no join: use primary or first
    if primary_table:
        if primary_table in all_dfs:
            return all_dfs[primary_table]
        # Surface the misconfiguration instead of silently analysing a
        # different table than the caller asked for.
        logger.warning(
            "Primary table %r is not among the extracted tables (%s); using the first table.",
            primary_table,
            ", ".join(all_dfs),
        )
    return frames[0]
|
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Batch analytics job runner: orchestrates Extract, Transform, Log stages and analytics modules.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import logging
|
|
7
|
+
import os
|
|
8
|
+
import socket
|
|
9
|
+
import sys
|
|
10
|
+
import uuid
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
from pyspark.sql import SparkSession
|
|
14
|
+
|
|
15
|
+
from .config import BatchAnalyticsConfig
|
|
16
|
+
from .extract import extract_unified
|
|
17
|
+
from .log import log_dataframe_summary, log_run
|
|
18
|
+
from .modules import DEFAULT_MODULES, MODULE_REGISTRY, VALID_MODULES
|
|
19
|
+
from .output import write_analytics_output
|
|
20
|
+
from .transform import load_staged, stage_to_clickhouse, transform
|
|
21
|
+
|
|
22
|
+
# Configure root logging at import so every stage logs with the same layout,
# then bind this module's logger.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def create_spark_session(
    app_name: str = "BatchAnalytics",
    clickhouse_jars: str | None = None,
    config: BatchAnalyticsConfig | None = None,
) -> SparkSession:
    """
    Build (or reuse, via ``getOrCreate``) a SparkSession.

    Kubernetes-specific settings are applied only when the configured master
    starts with ``k8s://``; otherwise only the adaptive-execution flags and
    any requested packages are set.

    Args:
        app_name: Spark application name.
        clickhouse_jars: Maven coordinate(s) added to ``spark.jars.packages``.
        config: Pipeline config; a default one is created when omitted.
    """
    config = config or BatchAnalyticsConfig()
    k8s = config.spark_k8s
    on_k8s = k8s.master.startswith("k8s://")

    builder = SparkSession.builder.appName(app_name)
    builder = builder.config("spark.sql.adaptive.enabled", "true")
    builder = builder.config("spark.sql.adaptive.coalescePartitions.enabled", "true")

    # JARs: ClickHouse JDBC + hadoop-aws (for S3 when on K8s)
    packages = [clickhouse_jars] if clickhouse_jars else []
    if on_k8s:
        packages.append("org.apache.hadoop:hadoop-aws:3.3.4")
    if packages:
        builder = builder.config("spark.jars.packages", ",".join(packages))

    if not on_k8s:
        return builder.getOrCreate()

    driver_host = socket.gethostbyname(socket.gethostname())
    builder = builder.master(k8s.master)
    k8s_settings = [
        ("spark.kubernetes.container.image", k8s.container_image),
        ("spark.kubernetes.namespace", k8s.namespace),
        ("spark.kubernetes.authenticate.serviceAccountName", k8s.service_account),
        ("spark.driver.host", driver_host),
        ("spark.driver.bindAddress", "0.0.0.0"),
        (
            "spark.kubernetes.driver.pod.name",
            os.environ.get("HOSTNAME", socket.gethostname()),
        ),
        ("spark.submit.deployMode", k8s.deploy_mode),
        (
            "spark.driver.extraJavaOptions",
            "-Djava.net.preferIPv4Stack=true -XX:+UseG1GC -XX:+HeapDumpOnOutOfMemoryError",
        ),
        ("spark.driver.memory", k8s.driver_memory),
        ("spark.driver.memoryOverhead", k8s.driver_memory_overhead),
        ("spark.executor.instances", str(k8s.executor_instances)),
        ("spark.executor.cores", str(k8s.executor_cores)),
        ("spark.executor.memory", k8s.executor_memory),
        ("spark.executor.memoryOverhead", k8s.executor_memory_overhead),
        ("spark.kubernetes.executor.serviceAccountName", k8s.service_account),
        ("spark.kubernetes.container.image.pullPolicy", "IfNotPresent"),
    ]
    for key, value in k8s_settings:
        builder = builder.config(key, value)

    # NOTE(review): the flattened source lost its indentation; the S3 settings
    # are assumed to belong to the Kubernetes branch because they sit between
    # the K8s builder chain and the K8s log line - confirm against upstream.
    if k8s.s3_access_key and k8s.s3_secret_key:
        s3_settings = [
            ("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
            ("spark.hadoop.fs.s3a.access.key", k8s.s3_access_key),
            ("spark.hadoop.fs.s3a.secret.key", k8s.s3_secret_key),
            ("spark.hadoop.fs.s3a.endpoint", k8s.s3_endpoint),
            ("spark.hadoop.fs.s3a.endpoint.region", k8s.s3_region),
        ]
        for key, value in s3_settings:
            builder = builder.config(key, value)

    logger.info("Spark on Kubernetes: master=%s", k8s.master)
    return builder.getOrCreate()
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def run_pipeline(
    config: BatchAnalyticsConfig | None = None,
    spark: SparkSession | None = None,
    run_extract: bool = True,
    run_transform: bool = True,
    run_stage: bool = True,
    run_analytics: bool = True,
    modules: list[str] | None = None,
) -> dict:
    """
    Run the full pipeline or selected stages.

    Args:
        config: Pipeline config (default: BatchAnalyticsConfig())
        spark: SparkSession (created if None)
        run_extract: Whether to extract from ClickHouse
        run_transform: Whether to transform (clean, dedupe, extract anchor_id)
        run_stage: Whether to stage transformed data to ClickHouse (separate job)
        run_analytics: Whether to run analytics (requires staged data in ClickHouse)
        modules: Which analytics to run: ["lr", "corr", "pca", "ttest"] or None for all

    Returns:
        Dict with run_id, stage results, analytics outputs
    """
    config = config or BatchAnalyticsConfig()
    run_id = str(uuid.uuid4())[:8]

    if spark is None:
        spark = create_spark_session(
            app_name="BatchAnalytics",
            clickhouse_jars="com.clickhouse:clickhouse-jdbc:0.4.6:all",
            config=config,
        )

    stages: dict = {}
    analytics: dict = {}
    result = {"run_id": run_id, "stages": stages, "analytics": analytics}

    # ----- Extract -----
    if run_extract:
        logger.info("Stage: Extract")
        df_raw = extract_unified(spark, config)
        stages["extract"] = {
            "row_count": df_raw.count(),
            "columns": [f.name for f in df_raw.schema.fields],
        }
    else:
        logger.info("Stage: Extract (skipped, loading from stage)")
        df_raw = load_staged(spark, config)
        stages["extract"] = {"skipped": True, "loaded_from_stage": True}

    # ----- Transform -----
    if run_transform:
        logger.info("Stage: Transform")
        df_transformed = transform(df_raw, config)
        stages["transform"] = {"row_count": df_transformed.count()}
    else:
        # df_raw already comes from the staged table when extract was skipped.
        df_transformed = df_raw
        # NOTE(review): nesting reconstructed from a flattened source - the
        # anchor-id/dedup fallback is assumed to sit inside this else branch.
        if run_extract and run_analytics:
            from .transform import extract_anchor_id, remove_duplicates

            df_transformed = extract_anchor_id(df_raw, config)
            dedup_spec = config.transform.dedup_columns
            dedup_cols = None
            if dedup_spec:
                dedup_cols = [c.strip() for c in dedup_spec.split(",") if c.strip()]
            df_transformed = remove_duplicates(df_transformed, key_columns=dedup_cols)
        stages["transform"] = {"skipped": True}

    # ----- Stage (to ClickHouse) - separate job before analytics -----
    if run_stage:
        logger.info("Stage: Stage to ClickHouse")
        stage_to_clickhouse(spark, df_transformed, config)
        stages["stage"] = {
            "destination": f"{config.clickhouse.database}.{config.transform.staging_table}",
            "row_count": df_transformed.count(),
        }
    else:
        stages["stage"] = {"skipped": True}

    # Analytics always works on the in-memory frame produced above.
    df_for_analytics = df_transformed

    # ----- Log stage metrics -----
    log_run(
        run_id,
        "pipeline",
        stages,
        output_dir=config.log.log_path,
        extra={"config_summary": {"staging_format": config.transform.staging_format}},
    )

    # ----- Analytics (runs after stage; must run extract+transform+stage first, or use --from-stage) -----
    if not run_analytics:
        return result

    modules = modules or DEFAULT_MODULES
    for mod in modules:
        if mod not in MODULE_REGISTRY:
            logger.warning("Unknown module %r, skipping. Valid: %s", mod, VALID_MODULES)
            continue
        run_fn, result_key = MODULE_REGISTRY[mod]
        try:
            analytics[result_key] = run_fn(spark, df_for_analytics, config)
        except Exception as e:
            # One failing module must not abort the others; record the error.
            logger.exception("%s failed: %s", result_key, e)
            analytics[result_key] = {"error": str(e)}

    # NOTE(review): writing the output is assumed to happen only when
    # analytics ran (indentation was lost in the source) - confirm.
    if config.output.type == "local":
        output_path = config.log.log_path
    else:
        output_path = config.output.s3_path
    write_analytics_output(
        run_id=run_id,
        task_id=config.output.task_id or run_id,
        artifacts=analytics,
        output_type=config.output.type,
        path=output_path,
        database=config.output.clickhouse_database,
        table=config.output.clickhouse_table,
        host=config.clickhouse.host,
        port=config.clickhouse.port,
        user=config.clickhouse.user,
        password=config.clickhouse.password,
    )

    return result
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def main() -> int:
    """Parse command-line flags, run the pipeline, and return exit status 0.

    Each stage has an on/off flag pair (``--X`` / ``--no-X``), all enabled by
    default. ``--from-stage`` turns off extract, transform and stage so only
    analytics runs against previously staged data.
    """
    parser = argparse.ArgumentParser(description="Batch analytics: Extract, Transform, Log + analytics")

    # (destination, help text) for every toggleable stage, in display order.
    stage_flags = (
        ("extract", "Run extract stage (default: true)"),
        ("transform", "Run transform stage (default: true)"),
        ("stage", "Run stage step: write transformed data to ClickHouse (default: true)"),
        ("analytics", "Run analytics modules (default: true)"),
    )
    for dest, help_text in stage_flags:
        parser.add_argument(
            f"--{dest}",
            action="store_true",
            default=True,
            help=help_text,
        )
        parser.add_argument(
            f"--no-{dest}",
            action="store_false",
            dest=dest,
        )

    parser.add_argument(
        "--modules",
        nargs="+",
        choices=VALID_MODULES,
        default=None,
        help="Analytics modules to run (default: all). Must match catalog module_arg.",
    )
    parser.add_argument(
        "--from-stage",
        action="store_true",
        help="Load from staged ClickHouse table, run analytics only (implies --no-extract --no-transform --no-stage)",
    )
    args = parser.parse_args()

    if args.from_stage:
        args.extract = args.transform = args.stage = False

    result = run_pipeline(
        run_extract=args.extract,
        run_transform=args.transform,
        run_stage=args.stage,
        run_analytics=args.analytics,
        modules=args.modules,
    )

    logger.info("Run completed: %s", result["run_id"])
    return 0
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
batch_analytics/log.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Log stage: Persist run metadata, metrics, and analytics results for audit and debugging.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from pyspark.sql import SparkSession
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def log_run(
    run_id: str,
    stage: str,
    metrics: dict,
    output_dir: str | Path | None = None,
    extra: dict | None = None,
) -> Path:
    """
    Write run log (metadata + metrics) to a JSON file.

    Args:
        run_id: Identifier of the run; used in the file name.
        stage: Stage label; used in the file name and stored in the log.
        metrics: JSON-serializable metrics stored under ``"metrics"``.
        output_dir: Target directory (created if missing); defaults to
            ``/tmp/analytics_logs`` when falsy.
        extra: Optional top-level entries merged into the log; they are
            applied last, so a colliding key overrides the built-in one.

    Returns:
        The path of the written file (``<run_id>_<stage>.json``).
    """
    # Local import: this module's header only imports ``datetime`` itself.
    from datetime import timezone

    output_dir = Path(output_dir) if output_dir else Path("/tmp/analytics_logs")
    output_dir.mkdir(parents=True, exist_ok=True)

    # datetime.utcnow() is deprecated (Python 3.12+). Take an aware UTC time
    # and drop tzinfo so the text keeps the same "<naive ISO>Z" shape.
    now_utc = datetime.now(timezone.utc).replace(tzinfo=None)

    log_data = {
        "run_id": run_id,
        "stage": stage,
        "timestamp": now_utc.isoformat() + "Z",
        "metrics": metrics,
        **(extra or {}),
    }

    log_file = output_dir / f"{run_id}_{stage}.json"
    # Explicit encoding so the file does not depend on the platform locale.
    with open(log_file, "w", encoding="utf-8") as f:
        json.dump(log_data, f, indent=2)

    logger.info("Logged %s to %s", stage, log_file)
    return log_file
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def log_dataframe_summary(
    spark: SparkSession,
    df_name: str,
    df,
) -> dict:
    """Build a small summary dict describing ``df``.

    The result holds the given name, the row count, the total column count,
    and the first 20 column names (followed by ``"..."`` when more exist).
    ``spark`` is accepted but not used by this function.
    """
    row_total = df.count()
    fields = df.schema.fields
    column_total = len(fields)

    # Cap the listed names so very wide frames stay readable in the log.
    shown = [fld.name for fld in fields[:20]]
    if column_total > 20:
        shown.append("...")

    summary = {
        "name": df_name,
        "row_count": row_total,
        "column_count": column_total,
        "columns": shown,
    }
    return summary
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def log_analytics_artifacts(
    run_id: str,
    artifacts: dict,
    output_dir: str | Path,
) -> list[Path]:
    """
    Dump each analytics artifact to its own JSON file.

    One file named ``<run_id>_analytics_<name>.json`` is written per entry of
    ``artifacts`` after passing the value through ``_serialize_for_json``.
    The directory is created when missing.

    Returns:
        Paths of the written files, in the iteration order of ``artifacts``.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    written: list[Path] = []
    for artifact_name, payload in artifacts.items():
        destination = target_dir / f"{run_id}_analytics_{artifact_name}.json"
        with open(destination, "w") as handle:
            json.dump(_serialize_for_json(payload), handle, indent=2)
        written.append(destination)

    return written
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _serialize_for_json(obj):
|
|
86
|
+
"""Convert numpy/Python types to JSON-serializable forms."""
|
|
87
|
+
import numpy as np
|
|
88
|
+
|
|
89
|
+
if isinstance(obj, dict):
|
|
90
|
+
return {k: _serialize_for_json(v) for k, v in obj.items()}
|
|
91
|
+
if isinstance(obj, (list, tuple)):
|
|
92
|
+
return [_serialize_for_json(x) for x in obj]
|
|
93
|
+
if isinstance(obj, np.ndarray):
|
|
94
|
+
return obj.tolist()
|
|
95
|
+
if isinstance(obj, (np.floating, np.float32, np.float64)):
|
|
96
|
+
return float(obj)
|
|
97
|
+
if isinstance(obj, (np.integer, np.int32, np.int64)):
|
|
98
|
+
return int(obj)
|
|
99
|
+
if isinstance(obj, np.bool_):
|
|
100
|
+
return bool(obj)
|
|
101
|
+
return obj
|