batch-analytics 0.3.7__tar.gz → 0.3.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {batch_analytics-0.3.7 → batch_analytics-0.3.14}/PKG-INFO +1 -1
- {batch_analytics-0.3.7 → batch_analytics-0.3.14}/pyproject.toml +1 -1
- {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics/__init__.py +9 -1
- {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics/config.py +22 -6
- {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics/extract.py +94 -1
- {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics/job_runner.py +2 -2
- {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics/transform.py +137 -23
- {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics.egg-info/PKG-INFO +1 -1
- {batch_analytics-0.3.7 → batch_analytics-0.3.14}/README.md +0 -0
- {batch_analytics-0.3.7 → batch_analytics-0.3.14}/setup.cfg +0 -0
- {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics/__main__.py +0 -0
- {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics/analytics/__init__.py +0 -0
- {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics/analytics/correlation.py +0 -0
- {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics/analytics/linear_regression.py +0 -0
- {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics/analytics/pca_clustering.py +0 -0
- {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics/analytics/t_test.py +0 -0
- {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics/log.py +0 -0
- {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics/modules.py +0 -0
- {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics/output/__init__.py +0 -0
- {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics/output/base.py +0 -0
- {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics/output/clickhouse.py +0 -0
- {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics/output/local.py +0 -0
- {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics/output/s3.py +0 -0
- {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics.egg-info/SOURCES.txt +0 -0
- {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
- {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics.egg-info/entry_points.txt +0 -0
- {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics.egg-info/requires.txt +0 -0
- {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics.egg-info/top_level.txt +0 -0
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "batch-analytics"
|
|
7
|
-
version = "0.3.
|
|
7
|
+
version = "0.3.14"
|
|
8
8
|
description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.8"
|
|
@@ -13,8 +13,14 @@ Analytics modules:
|
|
|
13
13
|
"""
|
|
14
14
|
|
|
15
15
|
from .config import BatchAnalyticsConfig, SparkK8sConfig
|
|
16
|
-
from .extract import
|
|
16
|
+
from .extract import (
|
|
17
|
+
extract_all,
|
|
18
|
+
extract_table,
|
|
19
|
+
extract_unified,
|
|
20
|
+
parse_extract_filter_values,
|
|
21
|
+
)
|
|
17
22
|
from .transform import (
|
|
23
|
+
expand_kv_blob_column,
|
|
18
24
|
extract_anchor_id,
|
|
19
25
|
load_staged,
|
|
20
26
|
remove_duplicates,
|
|
@@ -28,10 +34,12 @@ from .job_runner import run_pipeline, create_spark_session
|
|
|
28
34
|
__all__ = [
|
|
29
35
|
"BatchAnalyticsConfig",
|
|
30
36
|
"SparkK8sConfig",
|
|
37
|
+
"expand_kv_blob_column",
|
|
31
38
|
"extract_anchor_id",
|
|
32
39
|
"extract_all",
|
|
33
40
|
"extract_table",
|
|
34
41
|
"extract_unified",
|
|
42
|
+
"parse_extract_filter_values",
|
|
35
43
|
"remove_duplicates",
|
|
36
44
|
"stage_to_clickhouse",
|
|
37
45
|
"transform",
|
|
@@ -29,16 +29,23 @@ class ClickHouseConfig:
|
|
|
29
29
|
|
|
30
30
|
@property
|
|
31
31
|
def jdbc_properties(self) -> dict:
|
|
32
|
+
"""JDBC connection properties for Spark.
|
|
33
|
+
|
|
34
|
+
clickhouse-jdbc (clickhouse-java v0.6+) rejects legacy keys such as
|
|
35
|
+
``compress_algorithm`` (ClientMisconfigurationException). Prefer JDBC URL
|
|
36
|
+
query parameters for compression behavior. To force the old property for
|
|
37
|
+
legacy stacks, set CLICKHOUSE_JDBC_LEGACY_COMPRESS_ALGORITHM (e.g. ``none``).
|
|
38
|
+
"""
|
|
32
39
|
props = {
|
|
33
40
|
"user": self.user,
|
|
34
41
|
"driver": "com.clickhouse.jdbc.ClickHouseDriver",
|
|
35
|
-
# Match Spark read codec default: avoids JDBC LZ4/gzip mismatches with server HTTP compression
|
|
36
|
-
"compress_algorithm": os.environ.get(
|
|
37
|
-
"CLICKHOUSE_JDBC_COMPRESS_ALGORITHM", "none"
|
|
38
|
-
),
|
|
39
42
|
}
|
|
40
43
|
if self.password:
|
|
41
44
|
props["password"] = self.password
|
|
45
|
+
# Opt-in legacy property for older shaded JDBC stacks only.
|
|
46
|
+
legacy = os.environ.get("CLICKHOUSE_JDBC_LEGACY_COMPRESS_ALGORITHM", "").strip()
|
|
47
|
+
if legacy:
|
|
48
|
+
props["compress_algorithm"] = legacy
|
|
42
49
|
return props
|
|
43
50
|
|
|
44
51
|
|
|
@@ -55,13 +62,18 @@ class ExtractConfig:
|
|
|
55
62
|
use_native_connector: bool = os.environ.get(
|
|
56
63
|
"BATCH_USE_NATIVE_CONNECTOR", "false"
|
|
57
64
|
).lower() == "true"
|
|
65
|
+
# Optional WHERE col IN (...) after read. Empty filter_column = no filter (full table).
|
|
66
|
+
# filter_values: comma-separated list, or JSON array e.g. ["a","b"] for values containing commas.
|
|
67
|
+
filter_column: str = os.environ.get("BATCH_EXTRACT_FILTER_COLUMN", "").strip()
|
|
68
|
+
filter_values: str = os.environ.get("BATCH_EXTRACT_FILTER_VALUES", "").strip()
|
|
58
69
|
|
|
59
70
|
|
|
60
71
|
@dataclass
|
|
61
72
|
class TransformConfig:
|
|
62
73
|
"""Transform stage configuration."""
|
|
63
74
|
|
|
64
|
-
#
|
|
75
|
+
# Order: extract anchor_id from add_dimension(s) column, then dedupe by these keys.
|
|
76
|
+
# Deduplication keys (comma-separated). Empty = dropDuplicates() on full row (all columns).
|
|
65
77
|
dedup_columns: str = os.environ.get("BATCH_DEDUP_COLUMNS", "")
|
|
66
78
|
# Staging output path (local or S3)
|
|
67
79
|
staging_path: str = os.environ.get(
|
|
@@ -73,8 +85,12 @@ class TransformConfig:
|
|
|
73
85
|
staging_format: str = os.environ.get("BATCH_STAGING_FORMAT", "clickhouse")
|
|
74
86
|
# Staging table name in ClickHouse (when format=clickhouse)
|
|
75
87
|
staging_table: str = os.environ.get("BATCH_STAGING_TABLE", "analytics_staging")
|
|
76
|
-
#
|
|
88
|
+
# Spark save mode for ClickHouse staging (and path staging): overwrite | append
|
|
89
|
+
staging_write_mode: str = os.environ.get("BATCH_STAGING_WRITE_MODE", "overwrite")
|
|
90
|
+
# Source column holding a JSON object or Python dict string; every top-level key becomes a new String column
|
|
91
|
+
# (see transform.expand_kv_blob_column). Example: add_dimensions {'anchor_id':'...','lot':'A1'}
|
|
77
92
|
add_dimension_column: str = os.environ.get("BATCH_ADD_DIMENSION_COLUMN", "add_dimension")
|
|
93
|
+
# Legacy: no longer used; output column names match JSON keys (e.g. anchor_id). Kept for env compatibility.
|
|
78
94
|
anchor_id_column: str = os.environ.get("BATCH_ANCHOR_ID_COLUMN", "anchor_id")
|
|
79
95
|
|
|
80
96
|
|
|
@@ -2,17 +2,106 @@
|
|
|
2
2
|
Extract stage: Load data from ClickHouse using Spark ClickHouse connector or JDBC.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
+
import json
|
|
5
6
|
import logging
|
|
6
7
|
import os
|
|
7
|
-
from typing import Dict, List, Optional
|
|
8
|
+
from typing import Dict, List, Optional, Tuple
|
|
8
9
|
|
|
9
10
|
from pyspark.sql import DataFrame, SparkSession
|
|
11
|
+
from pyspark.sql.functions import col
|
|
10
12
|
|
|
11
13
|
from .config import BatchAnalyticsConfig
|
|
12
14
|
|
|
13
15
|
logger = logging.getLogger(__name__)
|
|
14
16
|
|
|
15
17
|
|
|
18
|
+
def parse_extract_filter_values(raw: str) -> List[str]:
    """
    Parse BATCH_EXTRACT_FILTER_VALUES into a list of non-empty, stripped strings.

    Two input shapes are accepted:

    * comma-separated tokens:  ``a,b,c``            -> ``["a", "b", "c"]``
    * a JSON array string:     ``["GP/A","GP/B"]``  -> ``["GP/A", "GP/B"]``
      (use this form when a value itself contains a comma)

    Args:
        raw: Raw setting value. ``None`` or a blank string yields ``[]``.

    Returns:
        Parsed values in input order. Blank entries are dropped, and so are
        JSON ``null`` elements. Text that starts with ``[`` but is not valid
        JSON is logged and then handled by the comma split.
    """
    text = (raw or "").strip()
    if not text:
        return []
    if text.startswith("["):
        try:
            data = json.loads(text)
        except json.JSONDecodeError:
            logger.warning("BATCH_EXTRACT_FILTER_VALUES looks like JSON but failed to parse; using comma split")
        else:
            if isinstance(data, list):
                # Skip JSON null: str(None) would otherwise turn it into the
                # literal filter value "None".
                stripped = (str(item).strip() for item in data if item is not None)
                return [value for value in stripped if value]
    return [part.strip() for part in text.split(",") if part.strip()]
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _apply_extract_filter(df: DataFrame, config: BatchAnalyticsConfig) -> DataFrame:
    """
    Apply ``col IN (values)`` when a filter column is configured.

    Reads ``config.extract.filter_column`` and ``config.extract.filter_values``.
    The input DataFrame is returned unchanged (with a warning where noted) when:

    * no filter column is configured (no warning);
    * the configured column is not among ``df.columns`` (warning);
    * the configured value list parses to nothing (warning).

    Args:
        df: Extracted DataFrame.
        config: Pipeline configuration holding the extract filter settings.

    Returns:
        ``df`` restricted to rows whose filter column is in the parsed values,
        or ``df`` itself when the filter is skipped.
    """
    col_name = (config.extract.filter_column or "").strip()
    if not col_name:
        return df

    if col_name not in df.columns:
        logger.warning(
            "BATCH_EXTRACT_FILTER_COLUMN=%r not in extracted columns %s; skipping filter",
            col_name,
            df.columns,
        )
        return df

    values = parse_extract_filter_values(config.extract.filter_values)
    if not values:
        logger.warning(
            "BATCH_EXTRACT_FILTER_COLUMN=%r set but BATCH_EXTRACT_FILTER_VALUES is empty; skipping IN filter",
            col_name,
        )
        return df

    return df.filter(col(col_name).isin(values))
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _parse_ch_database_table(table: str, default_database: str) -> Tuple[str, str]:
    """
    Resolve a ``table`` reference to ``(database, table_name)``.

    ``batch_metric_facts``            -> (default_database, batch_metric_facts)
    ``analytics.batch_metric_facts``  -> (analytics, batch_metric_facts)

    Only the first ``.`` is treated as the separator. A reference that starts
    with ``(`` is never split (presumably a parenthesised subquery -- confirm
    with callers), and neither is one whose database or table part is blank.
    Those are returned whole, paired with ``default_database``.

    Args:
        table: Table reference; ``None`` or blank is treated as ``""``.
        default_database: Database to use when ``table`` carries none.

    Returns:
        A ``(database, table_name)`` tuple.
    """
    ref = (table or "").strip()
    if "." in ref and not ref.startswith("("):
        db, _, tbl = ref.partition(".")
        db, tbl = db.strip(), tbl.strip()
        if db and tbl:
            return db, tbl
    # Unqualified, empty, subquery-like, or malformed ("x." / ".x") references.
    return default_database, ref
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _read_via_catalog(spark: SparkSession, cfg: BatchAnalyticsConfig, table: str) -> Optional[DataFrame]:
    """
    Read via Spark SQL catalog (ClickHouseCatalog), registered in job_runner.create_spark_session.

    clickhouse-spark-runtime does **not** register legacy short name ``format("clickhouse")`` /
    ``clickhouse.DefaultSource``; catalog + ``spark.table(catalog.db.table)`` is the supported path.

    The catalog name comes from env ``BATCH_CLICKHOUSE_CATALOG`` (default ``batch_ch``).

    Args:
        spark: Active Spark session.
        cfg: Pipeline configuration; ``cfg.clickhouse.database`` is the default database.
        table: Table reference, optionally qualified as ``database.table``.

    Returns:
        The table as a DataFrame, or ``None`` when the catalog name is blank or
        the read fails, so the caller can try another reader.
    """
    cat = os.environ.get("BATCH_CLICKHOUSE_CATALOG", "batch_ch").strip()
    if not cat:
        return None

    db, tbl = _parse_ch_database_table(table, cfg.clickhouse.database)
    ident = f"{cat}.{db}.{tbl}"
    try:
        return spark.table(ident)
    except Exception as e:
        # Deliberately broad: any catalog failure is logged and reported as
        # None so the caller can fall back to another reader.
        logger.warning(
            "Catalog read failed for %s (%s): %s. Trying other readers.",
            ident,
            table,
            e,
        )
        return None
|
|
103
|
+
|
|
104
|
+
|
|
16
105
|
def _read_via_format(spark: SparkSession, cfg: BatchAnalyticsConfig, table: str) -> Optional[DataFrame]:
|
|
17
106
|
"""
|
|
18
107
|
Read from ClickHouse using the native format API (clickhouse-spark-runtime).
|
|
@@ -60,12 +149,16 @@ def extract_table(
|
|
|
60
149
|
Uses native connector if configured, otherwise JDBC.
|
|
61
150
|
"""
|
|
62
151
|
if config.extract.use_native_connector:
|
|
152
|
+
# Prefer catalog (matches clickhouse-spark-runtime); avoid legacy DefaultSource path.
|
|
63
153
|
df = _read_via_catalog(spark, config, table)
|
|
154
|
+
if df is None:
|
|
155
|
+
df = _read_via_format(spark, config, table)
|
|
64
156
|
if df is None:
|
|
65
157
|
df = _read_via_jdbc(spark, config, table)
|
|
66
158
|
else:
|
|
67
159
|
df = _read_via_jdbc(spark, config, table)
|
|
68
160
|
|
|
161
|
+
df = _apply_extract_filter(df, config)
|
|
69
162
|
logger.info("Extracted table %s: %d rows", table, df.count())
|
|
70
163
|
return df
|
|
71
164
|
|
|
@@ -270,9 +270,9 @@ def run_pipeline(
|
|
|
270
270
|
else:
|
|
271
271
|
df_transformed = df_raw # df_raw already loaded from staged when not run_extract
|
|
272
272
|
if run_extract and run_analytics:
|
|
273
|
-
from .transform import
|
|
273
|
+
from .transform import expand_kv_blob_column, remove_duplicates
|
|
274
274
|
|
|
275
|
-
df_transformed =
|
|
275
|
+
df_transformed = expand_kv_blob_column(df_raw, config)
|
|
276
276
|
dedup_cols = (
|
|
277
277
|
[c.strip() for c in config.transform.dedup_columns.split(",") if c.strip()]
|
|
278
278
|
if config.transform.dedup_columns
|
|
@@ -1,42 +1,143 @@
|
|
|
1
1
|
"""
|
|
2
|
-
Transform stage: Clean data (remove duplicates),
|
|
2
|
+
Transform stage: Clean data (remove duplicates), expand JSON/KV blob column, and stage.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
+
import ast
|
|
6
|
+
import json
|
|
5
7
|
import logging
|
|
6
8
|
import os
|
|
7
|
-
|
|
9
|
+
import re
|
|
10
|
+
from typing import Any, Dict, List, Optional, Sequence, Set
|
|
8
11
|
|
|
9
12
|
from pyspark.sql import DataFrame, SparkSession
|
|
10
|
-
from pyspark.sql.functions import
|
|
13
|
+
from pyspark.sql.functions import col, explode, map_keys, udf
|
|
14
|
+
from pyspark.sql.types import MapType, StringType
|
|
11
15
|
|
|
12
16
|
from .config import BatchAnalyticsConfig
|
|
13
17
|
|
|
14
18
|
logger = logging.getLogger(__name__)
|
|
15
19
|
|
|
16
20
|
|
|
17
|
-
def
|
|
21
|
+
def _stringify_leaf(v: Any) -> str:
    """
    Render one top-level blob value as a string.

    Args:
        v: Value taken from the parsed blob.

    Returns:
        ``""`` for ``None``; compact JSON (no spaces after separators) for a
        dict or list; ``str(v)`` for everything else. A dict or list that JSON
        cannot encode also falls back to ``str(v)``.
    """
    if v is None:
        return ""
    if isinstance(v, (dict, list)):
        try:
            return json.dumps(v, separators=(",", ":"))
        except (TypeError, ValueError):
            # Blobs parsed with ast.literal_eval can nest bytes, sets, or
            # tuple keys that json cannot encode. Fall back to the repr
            # rather than raising.
            return str(v)
    return str(v)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def parse_blob_to_strmap(s: Any) -> Dict[str, str]:
    """
    Parse a cell value into a flat string map (top-level keys only).

    Accepts standard JSON objects or Python repr dicts (e.g. single-quoted).
    Non-dict / unparsable input yields an empty map.

    Args:
        s: Cell value; it is converted with ``str()`` and stripped before parsing.

    Returns:
        A mapping from each stripped, non-empty top-level key to its value
        rendered by ``_stringify_leaf``. Empty for ``None``, blank text,
        anything that does not parse, and anything that parses to a non-dict.
    """
    if s is None:
        return {}
    text = str(s).strip()
    if not text:
        return {}

    obj: Any = None
    try:
        obj = json.loads(text)
    except (json.JSONDecodeError, RecursionError):
        pass

    if obj is None:
        # Not JSON (or JSON null): try a Python literal such as {'k': 'v'}.
        try:
            obj = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # TypeError: unhashable dict key or set element, e.g. "{[1]: 2}".
            # RecursionError: pathologically deep nesting.
            return {}

    if not isinstance(obj, dict):
        return {}

    out: Dict[str, str] = {}
    for k, v in obj.items():
        key = str(k).strip()
        if not key:
            continue
        out[key] = _stringify_leaf(v)
    return out
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _spark_safe_base_name(key: str) -> str:
    """
    Sanitize JSON key to a usable Spark column name.

    Every character outside ``[0-9a-zA-Z_]`` becomes ``_``, runs of ``_`` are
    collapsed to one, and leading/trailing ``_`` are removed.

    Args:
        key: Raw key from the blob.

    Returns:
        The sanitized name; ``"kv_key"`` when nothing remains after
        sanitizing; prefixed with ``"c_"`` when it would start with a digit.
    """
    name = re.sub(r"[^0-9a-zA-Z_]", "_", key.strip())
    name = re.sub(r"_+", "_", name).strip("_")
    if not name:
        return "kv_key"
    if name[0].isdigit():
        name = "c_" + name
    return name
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _unique_column_name(base: str, used: Set[str]) -> str:
    """
    Pick a column name derived from ``base`` that does not clash with ``used``.

    Candidates are tried in the order ``base``, ``base_2``, ``base_3``, ...
    The chosen name is added to ``used`` (the set is mutated in place), so
    later calls will not return it again.

    Names are compared case-insensitively because Spark resolves column names
    case-insensitively unless ``spark.sql.caseSensitive`` is enabled. A name
    differing only in case from an existing column could otherwise replace
    that column in ``withColumn``.

    Args:
        base: Preferred column name.
        used: Names already taken; updated with the returned name.

    Returns:
        ``base`` itself when free, otherwise ``base`` with a numeric suffix.
    """
    taken = {existing.lower() for existing in used}
    name = base
    n = 1
    while name.lower() in taken:
        n += 1
        name = f"{base}_{n}"
    used.add(name)
    return name
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def expand_kv_blob_column(
|
|
18
84
|
df: DataFrame,
|
|
19
85
|
config: BatchAnalyticsConfig,
|
|
20
86
|
) -> DataFrame:
|
|
21
87
|
"""
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
88
|
+
Parse the configured blob column into top-level key/value pairs and add one String column per key.
|
|
89
|
+
|
|
90
|
+
No per-key user configuration: every distinct key observed in the column (across the dataset)
|
|
91
|
+
becomes a column; values are strings (nested dict/list serialized as JSON). Empty / null cells
|
|
92
|
+
yield nulls in those columns.
|
|
93
|
+
|
|
94
|
+
Source column: ``config.transform.add_dimension_column`` (env ``BATCH_ADD_DIMENSION_COLUMN``).
|
|
25
95
|
"""
|
|
26
96
|
col_name = config.transform.add_dimension_column
|
|
27
|
-
out_col = config.transform.anchor_id_column
|
|
28
|
-
|
|
29
97
|
if col_name not in df.columns:
|
|
30
|
-
logger.debug("
|
|
98
|
+
logger.debug("KV blob column %r not found, skipping expansion", col_name)
|
|
31
99
|
return df
|
|
32
100
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
101
|
+
parse_udf = udf(parse_blob_to_strmap, MapType(StringType(), StringType()))
|
|
102
|
+
with_map = df.withColumn("_kv_blob_map", parse_udf(col(col_name)))
|
|
103
|
+
|
|
104
|
+
key_rows = (
|
|
105
|
+
with_map.select(explode(map_keys(col("_kv_blob_map"))).alias("_k"))
|
|
106
|
+
.where(col("_k").isNotNull())
|
|
107
|
+
.distinct()
|
|
108
|
+
.collect()
|
|
109
|
+
)
|
|
110
|
+
all_keys: List[str] = sorted({str(r._k).strip() for r in key_rows if r._k and str(r._k).strip()})
|
|
111
|
+
|
|
112
|
+
if not all_keys:
|
|
113
|
+
logger.info("No keys found in KV blob column %r; dropping temporary map only", col_name)
|
|
114
|
+
return with_map.drop("_kv_blob_map")
|
|
37
115
|
|
|
38
|
-
|
|
39
|
-
|
|
116
|
+
used: Set[str] = set(with_map.columns)
|
|
117
|
+
out = with_map
|
|
118
|
+
added: List[str] = []
|
|
119
|
+
for k in all_keys:
|
|
120
|
+
base = _spark_safe_base_name(k)
|
|
121
|
+
target = _unique_column_name(base, used)
|
|
122
|
+
added.append(target)
|
|
123
|
+
out = out.withColumn(target, col("_kv_blob_map").getItem(k))
|
|
124
|
+
|
|
125
|
+
out = out.drop("_kv_blob_map")
|
|
126
|
+
logger.info(
|
|
127
|
+
"Expanded KV blob column %r into %d columns: %s",
|
|
128
|
+
col_name,
|
|
129
|
+
len(added),
|
|
130
|
+
", ".join(added),
|
|
131
|
+
)
|
|
132
|
+
return out
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def extract_anchor_id(
    df: DataFrame,
    config: BatchAnalyticsConfig,
) -> DataFrame:
    """
    Backward-compatible name: expands all keys from the blob column (not only ``anchor_id``).

    Delegates to ``expand_kv_blob_column`` with the same arguments and returns
    its result unchanged.
    """
    return expand_kv_blob_column(df, config)
|
|
40
141
|
|
|
41
142
|
|
|
42
143
|
def remove_duplicates(
|
|
@@ -69,10 +170,11 @@ def transform(
|
|
|
69
170
|
config: BatchAnalyticsConfig,
|
|
70
171
|
) -> DataFrame:
|
|
71
172
|
"""
|
|
72
|
-
Apply transformation only:
|
|
173
|
+
Apply transformation only: (1) expand JSON/KV blob column into one column per top-level key,
|
|
174
|
+
(2) deduplicate by BATCH_DEDUP_COLUMNS if set, else by full row.
|
|
73
175
|
Does not write anywhere. Use stage_to_clickhouse() separately to persist.
|
|
74
176
|
"""
|
|
75
|
-
transformed =
|
|
177
|
+
transformed = expand_kv_blob_column(df, config)
|
|
76
178
|
dedup_cols = (
|
|
77
179
|
[c.strip() for c in config.transform.dedup_columns.split(",") if c.strip()]
|
|
78
180
|
if config.transform.dedup_columns
|
|
@@ -81,6 +183,15 @@ def transform(
|
|
|
81
183
|
return remove_duplicates(transformed, key_columns=dedup_cols)
|
|
82
184
|
|
|
83
185
|
|
|
186
|
+
def _normalize_staging_write_mode(raw: str) -> str:
    """
    Spark DataFrameWriter mode: overwrite (replace table contents) or append.

    Args:
        raw: Configured mode; matched after stripping and lower-casing.
            ``None`` or an empty string means ``"overwrite"``.

    Returns:
        ``"overwrite"`` or ``"append"``. Any other value is logged as invalid
        and replaced by ``"overwrite"``.
    """
    mode = (raw or "overwrite").strip().lower()
    if mode in ("overwrite", "append"):
        return mode
    logger.warning("Invalid BATCH_STAGING_WRITE_MODE=%r; using overwrite", raw)
    return "overwrite"
|
|
193
|
+
|
|
194
|
+
|
|
84
195
|
def stage_to_clickhouse(
|
|
85
196
|
spark: SparkSession,
|
|
86
197
|
df: DataFrame,
|
|
@@ -90,8 +201,10 @@ def stage_to_clickhouse(
|
|
|
90
201
|
Write transformed data to ClickHouse staging table.
|
|
91
202
|
Separate job from transform; must complete before analytics can run.
|
|
92
203
|
Uses native connector if available, else JDBC.
|
|
204
|
+
Write mode from BATCH_STAGING_WRITE_MODE (default overwrite = full replace).
|
|
93
205
|
"""
|
|
94
206
|
n = df.count()
|
|
207
|
+
mode = _normalize_staging_write_mode(config.transform.staging_write_mode)
|
|
95
208
|
try:
|
|
96
209
|
ch = config.clickhouse
|
|
97
210
|
writer = (
|
|
@@ -102,7 +215,7 @@ def stage_to_clickhouse(
|
|
|
102
215
|
.option("database", ch.database)
|
|
103
216
|
.option("table", config.transform.staging_table)
|
|
104
217
|
.option("user", ch.user)
|
|
105
|
-
.mode(
|
|
218
|
+
.mode(mode)
|
|
106
219
|
)
|
|
107
220
|
if ch.password:
|
|
108
221
|
writer = writer.option("password", ch.password)
|
|
@@ -112,7 +225,7 @@ def stage_to_clickhouse(
|
|
|
112
225
|
df.write.jdbc(
|
|
113
226
|
config.clickhouse.jdbc_url,
|
|
114
227
|
config.transform.staging_table,
|
|
115
|
-
mode=
|
|
228
|
+
mode=mode,
|
|
116
229
|
properties=config.clickhouse.jdbc_properties,
|
|
117
230
|
)
|
|
118
231
|
logger.info(
|
|
@@ -131,14 +244,15 @@ def stage_to_path(
|
|
|
131
244
|
"""Write transformed data to parquet/delta (for local dev or intermediate storage)."""
|
|
132
245
|
path = config.transform.staging_path
|
|
133
246
|
fmt = config.transform.staging_format
|
|
247
|
+
mode = _normalize_staging_write_mode(config.transform.staging_write_mode)
|
|
134
248
|
if fmt == "parquet":
|
|
135
|
-
df.write.mode(
|
|
249
|
+
df.write.mode(mode).parquet(path)
|
|
136
250
|
logger.info("Staged data to %s (parquet)", path)
|
|
137
251
|
elif fmt == "delta":
|
|
138
|
-
df.write.format("delta").mode(
|
|
252
|
+
df.write.format("delta").mode(mode).save(path)
|
|
139
253
|
logger.info("Staged data to %s (delta)", path)
|
|
140
254
|
else:
|
|
141
|
-
df.write.format(fmt).mode(
|
|
255
|
+
df.write.format(fmt).mode(mode).save(path)
|
|
142
256
|
logger.info("Staged data to %s (%s)", path, fmt)
|
|
143
257
|
|
|
144
258
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics/analytics/correlation.py
RENAMED
|
File without changes
|
{batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics/analytics/linear_regression.py
RENAMED
|
File without changes
|
{batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics/analytics/pca_clustering.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics.egg-info/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|