batch-analytics 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- batch_analytics/__init__.py +44 -0
- batch_analytics/__main__.py +5 -0
- batch_analytics/analytics/__init__.py +19 -0
- batch_analytics/analytics/correlation.py +113 -0
- batch_analytics/analytics/linear_regression.py +136 -0
- batch_analytics/analytics/pca_clustering.py +143 -0
- batch_analytics/analytics/t_test.py +184 -0
- batch_analytics/config.py +169 -0
- batch_analytics/extract.py +118 -0
- batch_analytics/job_runner.py +300 -0
- batch_analytics/log.py +101 -0
- batch_analytics/modules.py +24 -0
- batch_analytics/output/__init__.py +22 -0
- batch_analytics/output/base.py +97 -0
- batch_analytics/output/clickhouse.py +89 -0
- batch_analytics/output/local.py +36 -0
- batch_analytics/output/s3.py +82 -0
- batch_analytics/transform.py +184 -0
- batch_analytics-0.1.0.dist-info/METADATA +80 -0
- batch_analytics-0.1.0.dist-info/RECORD +23 -0
- batch_analytics-0.1.0.dist-info/WHEEL +5 -0
- batch_analytics-0.1.0.dist-info/entry_points.txt +2 -0
- batch_analytics-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Module registry for batch analytics. Maps --modules short names to run functions.
|
|
3
|
+
|
|
4
|
+
Must stay in sync with analytics_runner catalog module_arg for each Spark method.
|
|
5
|
+
See analytics_runner/catalog/analytics_catalog.yaml.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from .analytics import (
|
|
9
|
+
run_linear_regression,
|
|
10
|
+
run_correlation,
|
|
11
|
+
run_pca_clustering,
|
|
12
|
+
run_t_test,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
# Registry keyed by the --modules short name. Each value is a
# (run_fn, result_key) pair: the function that runs the module and the key
# its result is filed under.
MODULE_REGISTRY = dict(
    lr=(run_linear_regression, "linear_regression"),
    corr=(run_correlation, "correlation"),
    pca=(run_pca_clustering, "pca_clustering"),
    ttest=(run_t_test, "t_test"),
)

# Accepted short names, in registry order.
VALID_MODULES = list(MODULE_REGISTRY)
# Independent copy so changing the defaults never alters VALID_MODULES.
DEFAULT_MODULES = list(VALID_MODULES)
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Output drivers for analytics results: local, S3, ClickHouse.
|
|
3
|
+
|
|
4
|
+
Configuration via env vars (injected by analytics_runner):
|
|
5
|
+
- OUTPUT_TYPE: local | s3 | clickhouse
|
|
6
|
+
- OUTPUT_S3_PATH: s3://bucket/prefix/ (when type=s3)
|
|
7
|
+
- OUTPUT_CLICKHOUSE_DATABASE, OUTPUT_CLICKHOUSE_TABLE (when type=clickhouse)
|
|
8
|
+
- TASK_ID: task identifier (injected by spark_runner)
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from .base import OutputDriver, write_analytics_output
|
|
12
|
+
from .local import LocalOutputDriver
|
|
13
|
+
from .s3 import S3OutputDriver
|
|
14
|
+
from .clickhouse import ClickHouseOutputDriver
|
|
15
|
+
|
|
16
|
+
# Public names re-exported by the output package: the driver interface, the
# write orchestration function, and the three concrete drivers.
__all__ = [
    "OutputDriver",
    "write_analytics_output",
    "LocalOutputDriver",
    "S3OutputDriver",
    "ClickHouseOutputDriver",
]
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Base output driver interface and write orchestration.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
from abc import ABC, abstractmethod
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _serialize_for_json(obj: Any) -> Any:
    """Recursively convert NumPy values into plain Python values for ``json``.

    Dicts are rebuilt with converted keys and values, lists and tuples become
    lists, arrays become nested lists via ``ndarray.tolist()``, and NumPy
    bool/integer/floating scalars become ``bool``/``int``/``float``. Anything
    else is returned unchanged.

    Args:
        obj: Arbitrary (possibly nested) analytics result.

    Returns:
        A structure of the same shape built from plain Python values.
    """
    # Imported lazily so importing this module does not require NumPy.
    import numpy as np

    def _plain_scalar(value: Any) -> Any:
        """Map a NumPy scalar to its Python equivalent; pass others through."""
        if isinstance(value, np.bool_):
            return bool(value)
        if isinstance(value, np.integer):
            return int(value)
        if isinstance(value, np.floating):
            return float(value)
        return value

    if isinstance(obj, dict):
        # Keys are converted too: json.dumps rejects NumPy scalar keys such
        # as np.int64 because they are not instances of int.
        return {_plain_scalar(k): _serialize_for_json(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [_serialize_for_json(x) for x in obj]
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    return _plain_scalar(obj)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class OutputDriver(ABC):
    """Abstract destination for analytics results.

    Concrete drivers implement :meth:`write`; the class cannot be
    instantiated until they do.
    """

    @abstractmethod
    def write(self, run_id: str, task_id: str, artifacts: dict[str, Any]) -> list[str]:
        """Persist the given artifacts and report where they were written.

        Args:
            run_id: Unique run identifier.
            task_id: Task identifier from spark_runner.
            artifacts: Mapping of module name to its result,
                e.g. ``{"linear_regression": {...}}``.

        Returns:
            Written locations (paths, keys, or identifiers).
        """
        ...
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def write_analytics_output(
    run_id: str,
    task_id: str,
    artifacts: dict[str, Any],
    output_type: str,
    **driver_kwargs: Any,
) -> list[str]:
    """Write analytics results using the configured output driver.

    Args:
        run_id: Run identifier.
        task_id: Task identifier (from TASK_ID env or run_id fallback).
        artifacts: Analytics module outputs; when empty nothing is written.
        output_type: ``local`` | ``s3`` | ``clickhouse`` (case-insensitive,
            surrounding whitespace ignored). An empty value means ``local``;
            an unknown value logs a warning and falls back to ``local``.
        **driver_kwargs: Driver-specific config (path, database, table, etc.),
            forwarded unchanged to the driver constructor.

    Returns:
        List of written locations; empty when there are no artifacts.

    Raises:
        Exception: Whatever the driver's ``write`` raises, re-raised after
            being logged.
    """
    if not artifacts:
        logger.debug("No analytics artifacts to write")
        return []

    output_type = (output_type or "local").lower().strip()

    # The drivers are imported here rather than at module level: each driver
    # module imports this one for OutputDriver, so a top-level import would
    # be circular. Without these imports the driver names are undefined.
    if output_type == "s3":
        from .s3 import S3OutputDriver

        driver = S3OutputDriver(**driver_kwargs)
    elif output_type == "clickhouse":
        from .clickhouse import ClickHouseOutputDriver

        driver = ClickHouseOutputDriver(**driver_kwargs)
    else:
        from .local import LocalOutputDriver

        if output_type != "local":
            logger.warning("Unknown OUTPUT_TYPE=%r, falling back to local", output_type)
        driver = LocalOutputDriver(**driver_kwargs)

    try:
        locations = driver.write(run_id, task_id, artifacts)
        logger.info("Wrote analytics output to %s: %s", output_type, locations)
        return locations
    except Exception as e:
        logger.exception("Failed to write analytics output: %s", e)
        raise
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ClickHouse output driver: inserts analytics results into a ClickHouse table.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from .base import OutputDriver, _serialize_for_json
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _create_table_sql(database: str, table: str) -> str:
    """Build the CREATE TABLE IF NOT EXISTS statement for the results table.

    The table has one row per (task_id, run_id, module) with the result stored
    as a string and a ``created_at`` column defaulting to ``now()``.
    ``database`` and ``table`` are interpolated as-is.
    """
    target = f"{database}.{table}"
    return f"""
    CREATE TABLE IF NOT EXISTS {target} (
        task_id String,
        run_id String,
        module String,
        result String,
        created_at DateTime DEFAULT now()
    ) ENGINE = MergeTree()
    ORDER BY (task_id, run_id, module)
    """
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class ClickHouseOutputDriver(OutputDriver):
    """Insert analytics artifacts into a ClickHouse table as JSON strings."""

    def __init__(
        self,
        database: str,
        table: str,
        # Quoted so the union is not evaluated at definition time; the bare
        # ``str | None`` form raises TypeError on Python 3.9.
        host: "str | None" = None,
        port: int = 8123,
        user: str = "default",
        password: str = "",
        **kwargs: Any,
    ) -> None:
        """Store connection settings; no connection is opened here.

        Args:
            database: Target database name.
            table: Target table name.
            host: ClickHouse host; ``localhost`` is used when not set.
            port: Port passed to the client.
            user: User name passed to the client.
            password: Password; an empty value is passed to the client as None.
            **kwargs: Accepted and ignored.
        """
        self.database = database
        self.table = table
        self.host = host
        self.port = port
        self.user = user
        self.password = password

    def _client(self):
        """Create a clickhouse-connect client from the stored settings.

        Raises:
            ImportError: If clickhouse-connect is not installed.
        """
        try:
            import clickhouse_connect
        except ImportError as e:
            raise ImportError(
                "ClickHouse output requires clickhouse-connect. "
                "Install with: pip install batch-analytics[clickhouse]"
            ) from e

        return clickhouse_connect.get_client(
            host=self.host or "localhost",
            port=self.port,
            username=self.user,
            password=self.password or None,
        )

    def write(
        self,
        run_id: str,
        task_id: str,
        artifacts: dict[str, Any],
    ) -> list[str]:
        """Create the table if needed and insert one row per artifact.

        Args:
            run_id: Run identifier stored in the ``run_id`` column.
            task_id: Task identifier stored in the ``task_id`` column.
            artifacts: Mapping of module name to result; each result is
                JSON-encoded into the ``result`` column.

        Returns:
            A single-element list holding ``"<database>.<table>"``.
        """
        location = f"{self.database}.{self.table}"
        client = self._client()
        try:
            # Ensure table exists
            client.command(_create_table_sql(self.database, self.table).strip())

            rows: list[tuple[str, str, str, str]] = [
                (task_id, run_id, module, json.dumps(_serialize_for_json(data)))
                for module, data in artifacts.items()
            ]

            client.insert(
                location,
                rows,
                column_names=["task_id", "run_id", "module", "result"],
            )
        finally:
            # Release the connection even when the command or insert fails.
            client.close()

        logger.info("Inserted %d analytics rows into %s", len(rows), location)
        return [location]
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Local output driver: writes analytics results to a local directory (BATCH_LOG_PATH).
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from .base import OutputDriver, _serialize_for_json
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class LocalOutputDriver(OutputDriver):
    """Write analytics artifacts to local filesystem."""

    # The annotation is quoted so the union is not evaluated at definition
    # time; the bare ``str | Path`` form raises TypeError on Python 3.9.
    def __init__(self, path: "str | Path" = "/tmp/analytics_logs", **kwargs: Any) -> None:
        """Remember the output directory and create it if it is missing.

        Args:
            path: Directory the JSON files are written into.
            **kwargs: Accepted and ignored.
        """
        self.path = Path(path)
        self.path.mkdir(parents=True, exist_ok=True)

    def write(
        self,
        run_id: str,
        task_id: str,
        artifacts: dict[str, Any],
    ) -> list[str]:
        """Write each artifact to ``<path>/<run_id>_analytics_<name>.json``.

        Args:
            run_id: Run identifier, used as the file-name prefix.
            task_id: Not used by this driver.
            artifacts: Mapping of module name to result.

        Returns:
            The written file paths as strings, in artifact order.
        """
        locations: list[str] = []
        for name, data in artifacts.items():
            filepath = self.path / f"{run_id}_analytics_{name}.json"
            # Explicit encoding so the files do not depend on the platform
            # default text encoding.
            with open(filepath, "w", encoding="utf-8") as f:
                json.dump(_serialize_for_json(data), f, indent=2)
            locations.append(str(filepath))

        logger.info("Wrote %d analytics artifacts to %s", len(locations), self.path)
        return locations
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
"""
|
|
2
|
+
S3 output driver: uploads analytics results as JSON to S3.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from .base import OutputDriver, _serialize_for_json
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class S3OutputDriver(OutputDriver):
    """Upload analytics artifacts to S3 as JSON objects."""

    def __init__(
        self,
        path: str,
        # Quoted so the unions are not evaluated at definition time; the bare
        # ``str | None`` form raises TypeError on Python 3.9.
        region: "str | None" = None,
        endpoint: "str | None" = None,
        **kwargs: Any,
    ) -> None:
        """
        Args:
            path: S3 path (e.g. s3://bucket/prefix/); a trailing slash is
                added when missing.
            region: Region name passed to the client when set.
            endpoint: Custom endpoint for S3-compatible storage.
            **kwargs: Accepted and ignored.
        """
        self.path = path.rstrip("/") + "/"
        self.region = region
        self.endpoint = endpoint

    def _client(self):
        """Create a boto3 S3 client; boto3 is imported lazily.

        Raises:
            ImportError: If boto3 is not installed.
        """
        try:
            import boto3
            from botocore.config import Config
        except ImportError as e:
            raise ImportError(
                "S3 output requires boto3. Install with: pip install batch-analytics[s3]"
            ) from e

        config = Config(signature_version="s3v4")
        client_kwargs: dict[str, Any] = {}
        if self.region:
            client_kwargs["region_name"] = self.region
        if self.endpoint:
            client_kwargs["endpoint_url"] = self.endpoint

        return boto3.client("s3", config=config, **client_kwargs)

    def _parse_s3_path(self) -> tuple[str, str]:
        """Split ``s3://bucket/prefix`` into ``(bucket, prefix)``.

        The prefix ends with ``/`` when non-empty and is ``""`` for a
        bucket-only path, so keys never start with a stray ``/``.

        Raises:
            ValueError: If the path does not start with ``s3://`` or has no
                bucket name.
        """
        if not self.path.startswith("s3://"):
            raise ValueError(f"Invalid S3 path: {self.path}")
        bucket, _, raw_prefix = self.path[len("s3://"):].partition("/")
        if not bucket:
            raise ValueError(f"Invalid S3 path: {self.path}")
        prefix = raw_prefix.rstrip("/")
        # Only append the separator to a real prefix; "s3://bucket/" used to
        # produce the prefix "/" and therefore keys with a leading slash.
        return bucket, f"{prefix}/" if prefix else ""

    def write(
        self,
        run_id: str,
        task_id: str,
        artifacts: dict[str, Any],
    ) -> list[str]:
        """Upload each artifact to ``<prefix><task_id>/<run_id>_analytics_<name>.json``.

        Args:
            run_id: Run identifier, used as the object-name prefix.
            task_id: Task identifier, used as the key sub-folder.
            artifacts: Mapping of module name to result.

        Returns:
            The ``s3://`` URLs of the uploaded objects, in artifact order.

        Raises:
            ValueError: If the configured path is not a valid S3 path.
        """
        bucket, prefix = self._parse_s3_path()
        client = self._client()

        # Use task_id for path: prefix/task_id/run_id_analytics_module.json
        key_prefix = f"{prefix}{task_id}/"

        locations: list[str] = []
        for name, data in artifacts.items():
            key = f"{key_prefix}{run_id}_analytics_{name}.json"
            # Encode explicitly so the uploaded bytes are always UTF-8.
            body = json.dumps(_serialize_for_json(data), indent=2).encode("utf-8")
            client.put_object(Bucket=bucket, Key=key, Body=body, ContentType="application/json")
            locations.append(f"s3://{bucket}/{key}")

        logger.info("Uploaded %d analytics artifacts to s3://%s/%s", len(locations), bucket, key_prefix)
        return locations
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Transform stage: Clean data (remove duplicates), extract add_dimension, and stage.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from typing import Sequence
|
|
7
|
+
|
|
8
|
+
from pyspark.sql import DataFrame, SparkSession
|
|
9
|
+
from pyspark.sql.functions import coalesce, col, get_json_object, regexp_extract
|
|
10
|
+
|
|
11
|
+
from .config import BatchAnalyticsConfig
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def extract_anchor_id(
    df: DataFrame,
    config: BatchAnalyticsConfig,
) -> DataFrame:
    """
    Add a column holding the anchor_id found in the add_dimension column.

    Two encodings are tried, in order: JSON ({"anchor_id":"value"}) and
    Python-dict style ({'anchor_id':'value'}). The source and output column
    names come from config.transform. When the source column is absent the
    DataFrame is returned unchanged.
    """
    source_name = config.transform.add_dimension_column
    target_name = config.transform.anchor_id_column

    if source_name not in df.columns:
        logger.debug("Column %s not found, skipping anchor_id extraction", source_name)
        return df

    source = col(source_name)
    candidates = [
        # Valid JSON: {"anchor_id":"GP/GPH(D)/II(W)/250019"}
        get_json_object(source, "$.anchor_id"),
        # Python-dict style: {'anchor_id':'GP/GPH(D)/II(W)/250019'}
        regexp_extract(source, r"'anchor_id'\s*:\s*'([^']*)'", 1),
    ]
    # First non-null candidate wins.
    return df.withColumn(target_name, coalesce(*candidates))
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def remove_duplicates(
    df: DataFrame,
    # Quoted so the union is not evaluated at definition time; the bare
    # ``Sequence[str] | None`` form raises TypeError on Python 3.9.
    key_columns: "Sequence[str] | None" = None,
) -> DataFrame:
    """
    Remove duplicate rows and log how many were dropped.

    If key_columns is provided (and non-empty), one row is kept per distinct
    key via dropDuplicates; which row of a duplicate group survives is decided
    by Spark. Otherwise exact row duplicates are dropped.

    Note: counting before and after triggers two Spark actions on top of
    whatever the caller does with the result.

    Args:
        df: Input DataFrame.
        key_columns: Column names that define a duplicate, or None.

    Returns:
        The de-duplicated DataFrame.
    """
    before_count = df.count()
    if key_columns:
        df_cleaned = df.dropDuplicates(key_columns)
    else:
        df_cleaned = df.distinct()
    after_count = df_cleaned.count()
    logger.info(
        "Deduplication: %d -> %d rows (removed %d duplicates)",
        before_count,
        after_count,
        before_count - after_count,
    )
    return df_cleaned
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def transform(
    df: DataFrame,
    config: BatchAnalyticsConfig,
) -> DataFrame:
    """
    Run the in-memory transformation: anchor_id extraction, then de-duplication.

    Nothing is persisted here; call stage_to_clickhouse() separately to write
    the result. config.transform.dedup_columns is read as a comma-separated
    list of key columns; when it is empty, exact duplicates are removed.
    """
    with_anchor = extract_anchor_id(df, config)

    if config.transform.dedup_columns:
        stripped = (part.strip() for part in config.transform.dedup_columns.split(","))
        keys = [name for name in stripped if name]
    else:
        keys = None

    return remove_duplicates(with_anchor, key_columns=keys)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def stage_to_clickhouse(
    spark: SparkSession,
    df: DataFrame,
    config: BatchAnalyticsConfig,
) -> None:
    """
    Overwrite the ClickHouse staging table with the transformed data.

    The "clickhouse" Spark data source is tried first; if it raises for any
    reason, the write is retried over JDBC. The row count is computed up
    front and used only for the final log line.
    """
    row_count = df.count()
    try:
        native = df.write.format("clickhouse")
        native = native.option("host", config.clickhouse.host)
        native = native.option("database", config.clickhouse.database)
        native = native.option("table", config.transform.staging_table)
        native = native.option("user", config.clickhouse.user)
        native = native.mode("overwrite")
        # The password option is only set when one is configured.
        if config.clickhouse.password:
            native = native.option("password", config.clickhouse.password)
        native.save()
    except Exception as exc:
        logger.warning("ClickHouse connector failed (%s), using JDBC", exc)
        df.write.jdbc(
            config.clickhouse.jdbc_url,
            config.transform.staging_table,
            mode="overwrite",
            properties=config.clickhouse.jdbc_properties,
        )
    logger.info(
        "Staged data to ClickHouse %s.%s (%d rows)",
        config.clickhouse.database,
        config.transform.staging_table,
        row_count,
    )
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def stage_to_path(
    spark: SparkSession,
    df: DataFrame,
    config: BatchAnalyticsConfig,
) -> None:
    """Overwrite config.transform.staging_path with the data in staging_format.

    "parquet" and "delta" have dedicated branches; any other format name is
    handed to the generic Spark writer as-is.
    """
    target = config.transform.staging_path
    staging_format = config.transform.staging_format

    if staging_format == "parquet":
        df.write.mode("overwrite").parquet(target)
        logger.info("Staged data to %s (parquet)", target)
        return

    if staging_format == "delta":
        df.write.format("delta").mode("overwrite").save(target)
        logger.info("Staged data to %s (delta)", target)
        return

    df.write.format(staging_format).mode("overwrite").save(target)
    logger.info("Staged data to %s (%s)", target, staging_format)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def transform_and_stage(
    spark: SparkSession,
    df: DataFrame,
    config: BatchAnalyticsConfig,
) -> DataFrame:
    """
    Backward-compatible wrapper: transform, then stage to ClickHouse.

    New code should call transform() and stage_to_clickhouse() as separate
    steps. Returns the transformed DataFrame.
    """
    result = transform(df, config)
    stage_to_clickhouse(spark, result, config)
    return result
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def load_staged(
    spark: SparkSession,
    config: BatchAnalyticsConfig,
) -> DataFrame:
    """
    Load previously staged data (e.g. when running only analytics modules).

    The source is chosen by config.transform.staging_format:

    - "parquet" / "delta": read from config.transform.staging_path.
    - "clickhouse": read the staging table through the "clickhouse" Spark
      data source, falling back to JDBC (with a logged warning) if that raises.
    - anything else: passed to the generic Spark reader with staging_path.

    Returns:
        The staged DataFrame.
    """
    staging_path = config.transform.staging_path
    fmt = config.transform.staging_format

    if fmt == "parquet":
        return spark.read.parquet(staging_path)
    if fmt == "delta":
        return spark.read.format("delta").load(staging_path)
    if fmt == "clickhouse":
        try:
            # Same connection options as stage_to_clickhouse, so a server that
            # requires credentials can be read through the native connector.
            reader = (
                spark.read.format("clickhouse")
                .option("host", config.clickhouse.host)
                .option("database", config.clickhouse.database)
                .option("table", config.transform.staging_table)
                .option("user", config.clickhouse.user)
            )
            if config.clickhouse.password:
                reader = reader.option("password", config.clickhouse.password)
            return reader.load()
        except Exception as e:
            # Log the reason instead of hiding it, as the writer does.
            logger.warning("ClickHouse connector failed (%s), using JDBC", e)
            return spark.read.jdbc(
                config.clickhouse.jdbc_url,
                config.transform.staging_table,
                properties=config.clickhouse.jdbc_properties,
            )
    return spark.read.format(fmt).load(staging_path)
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: batch-analytics
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
|
|
5
|
+
Author: Analytics Team
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.9
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: pyspark<3.6,>=3.4
|
|
10
|
+
Provides-Extra: dev
|
|
11
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
12
|
+
Provides-Extra: s3
|
|
13
|
+
Requires-Dist: boto3>=1.28; extra == "s3"
|
|
14
|
+
Provides-Extra: clickhouse
|
|
15
|
+
Requires-Dist: clickhouse-connect>=0.7; extra == "clickhouse"
|
|
16
|
+
Provides-Extra: output
|
|
17
|
+
Requires-Dist: boto3>=1.28; extra == "output"
|
|
18
|
+
Requires-Dist: clickhouse-connect>=0.7; extra == "output"
|
|
19
|
+
|
|
20
|
+
# Batch Analytics
|
|
21
|
+
|
|
22
|
+
PySpark-based analytics pipeline for ClickHouse data: **Extract** → **Transform** → **Stage** → **Analytics**. Designed to run as the main application inside a Spark driver container (invoked by `analytics_runner` via the SparkApplication CRD).
|
|
23
|
+
|
|
24
|
+
## Bundle contents
|
|
25
|
+
|
|
26
|
+
Only the files required for the batch analytics job runner:
|
|
27
|
+
|
|
28
|
+
```
|
|
29
|
+
analytics/
|
|
30
|
+
├── pyproject.toml
|
|
31
|
+
├── requirements-batch.txt
|
|
32
|
+
├── README.md
|
|
33
|
+
└── src/
|
|
34
|
+
└── batch_analytics/
|
|
35
|
+
├── __init__.py
|
|
36
|
+
├── __main__.py # python -m batch_analytics
|
|
37
|
+
├── job_runner.py # Entry point
|
|
38
|
+
├── config.py
|
|
39
|
+
├── extract.py
|
|
40
|
+
├── transform.py
|
|
41
|
+
├── log.py
|
|
42
|
+
├── README.md
|
|
43
|
+
└── analytics/
|
|
44
|
+
├── __init__.py
|
|
45
|
+
├── linear_regression.py
|
|
46
|
+
├── correlation.py
|
|
47
|
+
├── pca_clustering.py
|
|
48
|
+
└── t_test.py
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Install
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pip install -e .
|
|
55
|
+
# or: pip install -r requirements-batch.txt && pip install -e .
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Run
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
# Via module
|
|
62
|
+
python -m batch_analytics
|
|
63
|
+
|
|
64
|
+
# Via CLI (after pip install -e .)
|
|
65
|
+
batch-analytics
|
|
66
|
+
|
|
67
|
+
# Full pipeline
|
|
68
|
+
batch-analytics
|
|
69
|
+
|
|
70
|
+
# Analytics only (from staged ClickHouse table)
|
|
71
|
+
batch-analytics --from-stage --modules lr corr pca ttest
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## Configuration
|
|
75
|
+
|
|
76
|
+
See `src/batch_analytics/README.md` for environment variables and usage.
|
|
77
|
+
|
|
78
|
+
## Docker image
|
|
79
|
+
|
|
80
|
+
For Spark on Kubernetes, build an image that includes this package and exposes `job_runner.py` at the path used by `mainApplicationFile` (e.g. `local:///opt/analytics/job_runner.py`).
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
batch_analytics/__init__.py,sha256=5tdHpnDS80B_cgQN1aj1DVj28z0n-PAQyNbw_rJpFOk,1214
|
|
2
|
+
batch_analytics/__main__.py,sha256=lnQrJI_tQWnm1X0KjKwfwm5O9sIcg_SZNECbDanAFXM,119
|
|
3
|
+
batch_analytics/config.py,sha256=PW2NNGifaeOFyEkJ-dV4OXHb4fn32LUskN1MVvnUh5g,6803
|
|
4
|
+
batch_analytics/extract.py,sha256=qZKLebkJ9bn14wT6K04XgvNKwhjfGnDJ77MwQJckCoo,3555
|
|
5
|
+
batch_analytics/job_runner.py,sha256=xznaYVTwkUUs_upAaO_lT9vsiI2vjTv7sac6tFm3oIw,10673
|
|
6
|
+
batch_analytics/log.py,sha256=Hq3jFCdnMZzrHsFzO16lGZfKHxPYXkZOiZJpOgWxJmM,2743
|
|
7
|
+
batch_analytics/modules.py,sha256=ND2fZRtwwlI-HvH4xFg3Cj5YGd31qUEohHmjuejJY0o,677
|
|
8
|
+
batch_analytics/transform.py,sha256=KVlLUIpM2qnB877-QCm42qxPJMyhOcg7Rkd802jCsxs,5844
|
|
9
|
+
batch_analytics/analytics/__init__.py,sha256=wyyAXs3Owu92mhixlViK3yWfqH4KWXmopmswEqrLP70,515
|
|
10
|
+
batch_analytics/analytics/correlation.py,sha256=WmmZll8yfcB2rSSpoCOeTzYn4PDThupAOJKAkxsXzoo,3238
|
|
11
|
+
batch_analytics/analytics/linear_regression.py,sha256=wpdsjyzl29umPD4hHBCcHaBwb1cx4qWfKBNfTxCAX8I,4439
|
|
12
|
+
batch_analytics/analytics/pca_clustering.py,sha256=jdWwtU_G0Mkn50Lc_zOJv14clLE2a5-Gy0o-ynmLFvs,4375
|
|
13
|
+
batch_analytics/analytics/t_test.py,sha256=aYoRTEXSxaxsIYgievHSkRZPogxCK5tObGNIHf5RV04,5497
|
|
14
|
+
batch_analytics/output/__init__.py,sha256=79e5QJ9IJAHCk1e9HZi5g6WXUvnq5MKI9I0WhEaIbD4,665
|
|
15
|
+
batch_analytics/output/base.py,sha256=-HVj5HA4jQ7Lvk0B9WR2kyk_1wu6-wvVXGV10WzxgPg,2852
|
|
16
|
+
batch_analytics/output/clickhouse.py,sha256=gUI0LQqA6Lg4ZObDe3_9QJp_cguM9AbRCxdUeRgAfhY,2531
|
|
17
|
+
batch_analytics/output/local.py,sha256=aDGa-riSEaZStSV4qE_zLfb4Af70O1G4XvHUiXbPMGk,1072
|
|
18
|
+
batch_analytics/output/s3.py,sha256=oSyd8mL5nL-ryNddclpszFgfYChrogzgaNCJwdYZZYU,2695
|
|
19
|
+
batch_analytics-0.1.0.dist-info/METADATA,sha256=HdahIx8c2NiwtPXLQPEisMeXBIrfoBFmrKruYcrNBPM,2308
|
|
20
|
+
batch_analytics-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
21
|
+
batch_analytics-0.1.0.dist-info/entry_points.txt,sha256=v1Yx6LOlDBC0DmcdaaDM7KBROGe8GmBO7d6-Q2_z4dg,68
|
|
22
|
+
batch_analytics-0.1.0.dist-info/top_level.txt,sha256=wpRlC_JZ_uyGxzP1P3HlpNAjL8qS8kjXCE03DE2Dlyk,16
|
|
23
|
+
batch_analytics-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
batch_analytics
|