batch-analytics 0.3.7__tar.gz → 0.3.14__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. {batch_analytics-0.3.7 → batch_analytics-0.3.14}/PKG-INFO +1 -1
  2. {batch_analytics-0.3.7 → batch_analytics-0.3.14}/pyproject.toml +1 -1
  3. {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics/__init__.py +9 -1
  4. {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics/config.py +22 -6
  5. {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics/extract.py +94 -1
  6. {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics/job_runner.py +2 -2
  7. {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics/transform.py +137 -23
  8. {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics.egg-info/PKG-INFO +1 -1
  9. {batch_analytics-0.3.7 → batch_analytics-0.3.14}/README.md +0 -0
  10. {batch_analytics-0.3.7 → batch_analytics-0.3.14}/setup.cfg +0 -0
  11. {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics/__main__.py +0 -0
  12. {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics/analytics/__init__.py +0 -0
  13. {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics/analytics/correlation.py +0 -0
  14. {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics/analytics/linear_regression.py +0 -0
  15. {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics/analytics/pca_clustering.py +0 -0
  16. {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics/analytics/t_test.py +0 -0
  17. {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics/log.py +0 -0
  18. {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics/modules.py +0 -0
  19. {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics/output/__init__.py +0 -0
  20. {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics/output/base.py +0 -0
  21. {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics/output/clickhouse.py +0 -0
  22. {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics/output/local.py +0 -0
  23. {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics/output/s3.py +0 -0
  24. {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics.egg-info/SOURCES.txt +0 -0
  25. {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
  26. {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics.egg-info/entry_points.txt +0 -0
  27. {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics.egg-info/requires.txt +0 -0
  28. {batch_analytics-0.3.7 → batch_analytics-0.3.14}/src/batch_analytics.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.3.7
3
+ Version: 0.3.14
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "batch-analytics"
7
- version = "0.3.7"
7
+ version = "0.3.14"
8
8
  description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -13,8 +13,14 @@ Analytics modules:
13
13
  """
14
14
 
15
15
  from .config import BatchAnalyticsConfig, SparkK8sConfig
16
- from .extract import extract_all, extract_table, extract_unified
16
+ from .extract import (
17
+ extract_all,
18
+ extract_table,
19
+ extract_unified,
20
+ parse_extract_filter_values,
21
+ )
17
22
  from .transform import (
23
+ expand_kv_blob_column,
18
24
  extract_anchor_id,
19
25
  load_staged,
20
26
  remove_duplicates,
@@ -28,10 +34,12 @@ from .job_runner import run_pipeline, create_spark_session
28
34
  __all__ = [
29
35
  "BatchAnalyticsConfig",
30
36
  "SparkK8sConfig",
37
+ "expand_kv_blob_column",
31
38
  "extract_anchor_id",
32
39
  "extract_all",
33
40
  "extract_table",
34
41
  "extract_unified",
42
+ "parse_extract_filter_values",
35
43
  "remove_duplicates",
36
44
  "stage_to_clickhouse",
37
45
  "transform",
@@ -29,16 +29,23 @@ class ClickHouseConfig:
29
29
 
30
30
  @property
31
31
  def jdbc_properties(self) -> dict:
32
+ """JDBC connection properties for Spark.
33
+
34
+ clickhouse-jdbc (clickhouse-java v0.6+) rejects legacy keys such as
35
+ ``compress_algorithm`` (ClientMisconfigurationException). Prefer JDBC URL
36
+ query parameters for compression behavior. To force the old property for
37
+ legacy stacks, set CLICKHOUSE_JDBC_LEGACY_COMPRESS_ALGORITHM (e.g. ``none``).
38
+ """
32
39
  props = {
33
40
  "user": self.user,
34
41
  "driver": "com.clickhouse.jdbc.ClickHouseDriver",
35
- # Match Spark read codec default: avoids JDBC LZ4/gzip mismatches with server HTTP compression
36
- "compress_algorithm": os.environ.get(
37
- "CLICKHOUSE_JDBC_COMPRESS_ALGORITHM", "none"
38
- ),
39
42
  }
40
43
  if self.password:
41
44
  props["password"] = self.password
45
+ # Opt-in legacy property for older shaded JDBC stacks only.
46
+ legacy = os.environ.get("CLICKHOUSE_JDBC_LEGACY_COMPRESS_ALGORITHM", "").strip()
47
+ if legacy:
48
+ props["compress_algorithm"] = legacy
42
49
  return props
43
50
 
44
51
 
@@ -55,13 +62,18 @@ class ExtractConfig:
55
62
  use_native_connector: bool = os.environ.get(
56
63
  "BATCH_USE_NATIVE_CONNECTOR", "false"
57
64
  ).lower() == "true"
65
+ # Optional WHERE col IN (...) after read. Empty filter_column = no filter (full table).
66
+ # filter_values: comma-separated list, or JSON array e.g. ["a","b"] for values containing commas.
67
+ filter_column: str = os.environ.get("BATCH_EXTRACT_FILTER_COLUMN", "").strip()
68
+ filter_values: str = os.environ.get("BATCH_EXTRACT_FILTER_VALUES", "").strip()
58
69
 
59
70
 
60
71
  @dataclass
61
72
  class TransformConfig:
62
73
  """Transform stage configuration."""
63
74
 
64
- # Columns to use for deduplication (comma-separated); empty = use all columns
75
+ # Order: extract anchor_id from add_dimension(s) column, then dedupe by these keys.
76
+ # Deduplication keys (comma-separated). Empty = dropDuplicates() on full row (all columns).
65
77
  dedup_columns: str = os.environ.get("BATCH_DEDUP_COLUMNS", "")
66
78
  # Staging output path (local or S3)
67
79
  staging_path: str = os.environ.get(
@@ -73,8 +85,12 @@ class TransformConfig:
73
85
  staging_format: str = os.environ.get("BATCH_STAGING_FORMAT", "clickhouse")
74
86
  # Staging table name in ClickHouse (when format=clickhouse)
75
87
  staging_table: str = os.environ.get("BATCH_STAGING_TABLE", "analytics_staging")
76
- # Extract anchor_id from add_dimension column (e.g. {'anchor_id':'GP/GPH(D)/II(W)/250019'})
88
+ # Spark save mode for ClickHouse staging (and path staging): overwrite | append
89
+ staging_write_mode: str = os.environ.get("BATCH_STAGING_WRITE_MODE", "overwrite")
90
+ # Source column holding a JSON object or Python dict string; every top-level key becomes a new String column
91
+ # (see transform.expand_kv_blob_column). Example: add_dimensions {'anchor_id':'...','lot':'A1'}
77
92
  add_dimension_column: str = os.environ.get("BATCH_ADD_DIMENSION_COLUMN", "add_dimension")
93
+ # Legacy: no longer used; output column names match JSON keys (e.g. anchor_id). Kept for env compatibility.
78
94
  anchor_id_column: str = os.environ.get("BATCH_ANCHOR_ID_COLUMN", "anchor_id")
79
95
 
80
96
 
@@ -2,17 +2,106 @@
2
2
  Extract stage: Load data from ClickHouse using Spark ClickHouse connector or JDBC.
3
3
  """
4
4
 
5
+ import json
5
6
  import logging
6
7
  import os
7
- from typing import Dict, List, Optional
8
+ from typing import Dict, List, Optional, Tuple
8
9
 
9
10
  from pyspark.sql import DataFrame, SparkSession
11
+ from pyspark.sql.functions import col
10
12
 
11
13
  from .config import BatchAnalyticsConfig
12
14
 
13
15
  logger = logging.getLogger(__name__)
14
16
 
15
17
 
18
+ def parse_extract_filter_values(raw: str) -> List[str]:
19
+ """
20
+ Parse BATCH_EXTRACT_FILTER_VALUES: comma-separated tokens, or JSON array string.
21
+
22
+ Examples:
23
+ a,b,c -> ["a","b","c"]
24
+ ["GP/A","GP/B"] -> JSON list (values may contain commas)
25
+ """
26
+ text = (raw or "").strip()
27
+ if not text:
28
+ return []
29
+ if text.startswith("["):
30
+ try:
31
+ data = json.loads(text)
32
+ if isinstance(data, list):
33
+ out = [str(x).strip() for x in data if str(x).strip()]
34
+ return out
35
+ except json.JSONDecodeError:
36
+ logger.warning("BATCH_EXTRACT_FILTER_VALUES looks like JSON but failed to parse; using comma split")
37
+ return [p.strip() for p in text.split(",") if p.strip()]
38
+
39
+
40
+ def _apply_extract_filter(df: DataFrame, config: BatchAnalyticsConfig) -> DataFrame:
41
+ """Apply col IN (values) when filter_column is set; empty column = no filter."""
42
+ col_name = (config.extract.filter_column or "").strip()
43
+ if not col_name:
44
+ return df
45
+ if col_name not in df.columns:
46
+ logger.warning(
47
+ "BATCH_EXTRACT_FILTER_COLUMN=%r not in extracted columns %s; skipping filter",
48
+ col_name,
49
+ df.columns,
50
+ )
51
+ return df
52
+ values = parse_extract_filter_values(config.extract.filter_values)
53
+ if not values:
54
+ logger.warning(
55
+ "BATCH_EXTRACT_FILTER_COLUMN=%r set but BATCH_EXTRACT_FILTER_VALUES is empty; skipping IN filter",
56
+ col_name,
57
+ )
58
+ return df
59
+ filtered = df.filter(col(col_name).isin(values))
60
+ return filtered
61
+
62
+
63
+ def _parse_ch_database_table(table: str, default_database: str) -> Tuple[str, str]:
64
+ """
65
+ Resolve ``table`` reference to (database, table_name).
66
+
67
+ ``batch_metric_facts`` → (default_database, batch_metric_facts)
68
+ ``analytics.batch_metric_facts`` → (analytics, batch_metric_facts)
69
+ """
70
+ t = (table or "").strip()
71
+ if not t:
72
+ return default_database, t
73
+ if "." in t and not t.startswith("("):
74
+ db, tbl = t.split(".", 1)
75
+ db, tbl = db.strip(), tbl.strip()
76
+ if db and tbl:
77
+ return db, tbl
78
+ return default_database, t
79
+
80
+
81
+ def _read_via_catalog(spark: SparkSession, cfg: BatchAnalyticsConfig, table: str) -> Optional[DataFrame]:
82
+ """
83
+ Read via Spark SQL catalog (ClickHouseCatalog), registered in job_runner.create_spark_session.
84
+
85
+ clickhouse-spark-runtime does **not** register legacy short name ``format(\"clickhouse\")`` /
86
+ ``clickhouse.DefaultSource``; catalog + ``spark.table(catalog.db.table)`` is the supported path.
87
+ """
88
+ cat = os.environ.get("BATCH_CLICKHOUSE_CATALOG", "batch_ch").strip()
89
+ if not cat:
90
+ return None
91
+ db, tbl = _parse_ch_database_table(table, cfg.clickhouse.database)
92
+ ident = f"{cat}.{db}.{tbl}"
93
+ try:
94
+ return spark.table(ident)
95
+ except Exception as e:
96
+ logger.warning(
97
+ "Catalog read failed for %s (%s): %s. Trying other readers.",
98
+ ident,
99
+ table,
100
+ e,
101
+ )
102
+ return None
103
+
104
+
16
105
  def _read_via_format(spark: SparkSession, cfg: BatchAnalyticsConfig, table: str) -> Optional[DataFrame]:
17
106
  """
18
107
  Read from ClickHouse using the native format API (clickhouse-spark-runtime).
@@ -60,12 +149,16 @@ def extract_table(
60
149
  Uses native connector if configured, otherwise JDBC.
61
150
  """
62
151
  if config.extract.use_native_connector:
152
+ # Prefer catalog (matches clickhouse-spark-runtime); avoid legacy DefaultSource path.
63
153
  df = _read_via_catalog(spark, config, table)
154
+ if df is None:
155
+ df = _read_via_format(spark, config, table)
64
156
  if df is None:
65
157
  df = _read_via_jdbc(spark, config, table)
66
158
  else:
67
159
  df = _read_via_jdbc(spark, config, table)
68
160
 
161
+ df = _apply_extract_filter(df, config)
69
162
  logger.info("Extracted table %s: %d rows", table, df.count())
70
163
  return df
71
164
 
@@ -270,9 +270,9 @@ def run_pipeline(
270
270
  else:
271
271
  df_transformed = df_raw # df_raw already loaded from staged when not run_extract
272
272
  if run_extract and run_analytics:
273
- from .transform import extract_anchor_id, remove_duplicates
273
+ from .transform import expand_kv_blob_column, remove_duplicates
274
274
 
275
- df_transformed = extract_anchor_id(df_raw, config)
275
+ df_transformed = expand_kv_blob_column(df_raw, config)
276
276
  dedup_cols = (
277
277
  [c.strip() for c in config.transform.dedup_columns.split(",") if c.strip()]
278
278
  if config.transform.dedup_columns
@@ -1,42 +1,143 @@
1
1
  """
2
- Transform stage: Clean data (remove duplicates), extract add_dimension, and stage.
2
+ Transform stage: Clean data (remove duplicates), expand JSON/KV blob column, and stage.
3
3
  """
4
4
 
5
+ import ast
6
+ import json
5
7
  import logging
6
8
  import os
7
- from typing import Optional, Sequence
9
+ import re
10
+ from typing import Any, Dict, List, Optional, Sequence, Set
8
11
 
9
12
  from pyspark.sql import DataFrame, SparkSession
10
- from pyspark.sql.functions import coalesce, col, get_json_object, regexp_extract
13
+ from pyspark.sql.functions import col, explode, map_keys, udf
14
+ from pyspark.sql.types import MapType, StringType
11
15
 
12
16
  from .config import BatchAnalyticsConfig
13
17
 
14
18
  logger = logging.getLogger(__name__)
15
19
 
16
20
 
17
- def extract_anchor_id(
21
+ def _stringify_leaf(v: Any) -> str:
22
+ if v is None:
23
+ return ""
24
+ if isinstance(v, (dict, list)):
25
+ return json.dumps(v, separators=(",", ":"))
26
+ return str(v)
27
+
28
+
29
+ def parse_blob_to_strmap(s: Any) -> Dict[str, str]:
30
+ """
31
+ Parse a cell value into a flat string map (top-level keys only).
32
+
33
+ Accepts standard JSON objects or Python repr dicts (e.g. single-quoted).
34
+ Non-dict / unparsable input yields an empty map.
35
+ """
36
+ if s is None:
37
+ return {}
38
+ text = str(s).strip()
39
+ if not text:
40
+ return {}
41
+ obj: Any = None
42
+ try:
43
+ obj = json.loads(text)
44
+ except json.JSONDecodeError:
45
+ pass
46
+ if obj is None:
47
+ try:
48
+ obj = ast.literal_eval(text)
49
+ except (ValueError, SyntaxError, MemoryError):
50
+ return {}
51
+ if not isinstance(obj, dict):
52
+ return {}
53
+ out: Dict[str, str] = {}
54
+ for k, v in obj.items():
55
+ key = str(k).strip()
56
+ if not key:
57
+ continue
58
+ out[key] = _stringify_leaf(v)
59
+ return out
60
+
61
+
62
+ def _spark_safe_base_name(key: str) -> str:
63
+ """Sanitize JSON key to a usable Spark column name."""
64
+ s = re.sub(r"[^0-9a-zA-Z_]", "_", key.strip())
65
+ s = re.sub(r"_+", "_", s).strip("_")
66
+ if not s:
67
+ return "kv_key"
68
+ if s[0].isdigit():
69
+ s = "c_" + s
70
+ return s
71
+
72
+
73
+ def _unique_column_name(base: str, used: Set[str]) -> str:
74
+ name = base
75
+ n = 1
76
+ while name in used:
77
+ n += 1
78
+ name = f"{base}_{n}"
79
+ used.add(name)
80
+ return name
81
+
82
+
83
+ def expand_kv_blob_column(
18
84
  df: DataFrame,
19
85
  config: BatchAnalyticsConfig,
20
86
  ) -> DataFrame:
21
87
  """
22
- Extract anchor_id from add_dimension column.
23
- Supports JSON format {"anchor_id":"value"} or Python-dict {"anchor_id":"value"}.
24
- Creates a new column (anchor_id by default) with the extracted value.
88
+ Parse the configured blob column into top-level key/value pairs and add one String column per key.
89
+
90
+ No per-key user configuration: every distinct key observed in the column (across the dataset)
91
+ becomes a column; values are strings (nested dict/list serialized as JSON). Empty / null cells
92
+ yield nulls in those columns.
93
+
94
+ Source column: ``config.transform.add_dimension_column`` (env ``BATCH_ADD_DIMENSION_COLUMN``).
25
95
  """
26
96
  col_name = config.transform.add_dimension_column
27
- out_col = config.transform.anchor_id_column
28
-
29
97
  if col_name not in df.columns:
30
- logger.debug("Column %s not found, skipping anchor_id extraction", col_name)
98
+ logger.debug("KV blob column %r not found, skipping expansion", col_name)
31
99
  return df
32
100
 
33
- # Valid JSON: {"anchor_id":"GP/GPH(D)/II(W)/250019"}
34
- json_extract = get_json_object(col(col_name), "$.anchor_id")
35
- # Python-dict style: {'anchor_id':'GP/GPH(D)/II(W)/250019'}
36
- regex_extract = regexp_extract(col(col_name), r"'anchor_id'\s*:\s*'([^']*)'", 1)
101
+ parse_udf = udf(parse_blob_to_strmap, MapType(StringType(), StringType()))
102
+ with_map = df.withColumn("_kv_blob_map", parse_udf(col(col_name)))
103
+
104
+ key_rows = (
105
+ with_map.select(explode(map_keys(col("_kv_blob_map"))).alias("_k"))
106
+ .where(col("_k").isNotNull())
107
+ .distinct()
108
+ .collect()
109
+ )
110
+ all_keys: List[str] = sorted({str(r._k).strip() for r in key_rows if r._k and str(r._k).strip()})
111
+
112
+ if not all_keys:
113
+ logger.info("No keys found in KV blob column %r; dropping temporary map only", col_name)
114
+ return with_map.drop("_kv_blob_map")
37
115
 
38
- extracted = coalesce(json_extract, regex_extract)
39
- return df.withColumn(out_col, extracted)
116
+ used: Set[str] = set(with_map.columns)
117
+ out = with_map
118
+ added: List[str] = []
119
+ for k in all_keys:
120
+ base = _spark_safe_base_name(k)
121
+ target = _unique_column_name(base, used)
122
+ added.append(target)
123
+ out = out.withColumn(target, col("_kv_blob_map").getItem(k))
124
+
125
+ out = out.drop("_kv_blob_map")
126
+ logger.info(
127
+ "Expanded KV blob column %r into %d columns: %s",
128
+ col_name,
129
+ len(added),
130
+ ", ".join(added),
131
+ )
132
+ return out
133
+
134
+
135
+ def extract_anchor_id(
136
+ df: DataFrame,
137
+ config: BatchAnalyticsConfig,
138
+ ) -> DataFrame:
139
+ """Backward-compatible name: expands all keys from the blob column (not only ``anchor_id``)."""
140
+ return expand_kv_blob_column(df, config)
40
141
 
41
142
 
42
143
  def remove_duplicates(
@@ -69,10 +170,11 @@ def transform(
69
170
  config: BatchAnalyticsConfig,
70
171
  ) -> DataFrame:
71
172
  """
72
- Apply transformation only: extract anchor_id, remove duplicates.
173
+ Apply transformation only: (1) expand JSON/KV blob column into one column per top-level key,
174
+ (2) deduplicate by BATCH_DEDUP_COLUMNS if set, else by full row.
73
175
  Does not write anywhere. Use stage_to_clickhouse() separately to persist.
74
176
  """
75
- transformed = extract_anchor_id(df, config)
177
+ transformed = expand_kv_blob_column(df, config)
76
178
  dedup_cols = (
77
179
  [c.strip() for c in config.transform.dedup_columns.split(",") if c.strip()]
78
180
  if config.transform.dedup_columns
@@ -81,6 +183,15 @@ def transform(
81
183
  return remove_duplicates(transformed, key_columns=dedup_cols)
82
184
 
83
185
 
186
+ def _normalize_staging_write_mode(raw: str) -> str:
187
+ """Spark DataFrameWriter mode: overwrite (replace table contents) or append."""
188
+ m = (raw or "overwrite").strip().lower()
189
+ if m in ("overwrite", "append"):
190
+ return m
191
+ logger.warning("Invalid BATCH_STAGING_WRITE_MODE=%r; using overwrite", raw)
192
+ return "overwrite"
193
+
194
+
84
195
  def stage_to_clickhouse(
85
196
  spark: SparkSession,
86
197
  df: DataFrame,
@@ -90,8 +201,10 @@ def stage_to_clickhouse(
90
201
  Write transformed data to ClickHouse staging table.
91
202
  Separate job from transform; must complete before analytics can run.
92
203
  Uses native connector if available, else JDBC.
204
+ Write mode from BATCH_STAGING_WRITE_MODE (default overwrite = full replace).
93
205
  """
94
206
  n = df.count()
207
+ mode = _normalize_staging_write_mode(config.transform.staging_write_mode)
95
208
  try:
96
209
  ch = config.clickhouse
97
210
  writer = (
@@ -102,7 +215,7 @@ def stage_to_clickhouse(
102
215
  .option("database", ch.database)
103
216
  .option("table", config.transform.staging_table)
104
217
  .option("user", ch.user)
105
- .mode("overwrite")
218
+ .mode(mode)
106
219
  )
107
220
  if ch.password:
108
221
  writer = writer.option("password", ch.password)
@@ -112,7 +225,7 @@ def stage_to_clickhouse(
112
225
  df.write.jdbc(
113
226
  config.clickhouse.jdbc_url,
114
227
  config.transform.staging_table,
115
- mode="overwrite",
228
+ mode=mode,
116
229
  properties=config.clickhouse.jdbc_properties,
117
230
  )
118
231
  logger.info(
@@ -131,14 +244,15 @@ def stage_to_path(
131
244
  """Write transformed data to parquet/delta (for local dev or intermediate storage)."""
132
245
  path = config.transform.staging_path
133
246
  fmt = config.transform.staging_format
247
+ mode = _normalize_staging_write_mode(config.transform.staging_write_mode)
134
248
  if fmt == "parquet":
135
- df.write.mode("overwrite").parquet(path)
249
+ df.write.mode(mode).parquet(path)
136
250
  logger.info("Staged data to %s (parquet)", path)
137
251
  elif fmt == "delta":
138
- df.write.format("delta").mode("overwrite").save(path)
252
+ df.write.format("delta").mode(mode).save(path)
139
253
  logger.info("Staged data to %s (delta)", path)
140
254
  else:
141
- df.write.format(fmt).mode("overwrite").save(path)
255
+ df.write.format(fmt).mode(mode).save(path)
142
256
  logger.info("Staged data to %s (%s)", path, fmt)
143
257
 
144
258
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.3.7
3
+ Version: 0.3.14
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT