batch-analytics 0.3.13__tar.gz → 0.3.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. {batch_analytics-0.3.13 → batch_analytics-0.3.14}/PKG-INFO +1 -1
  2. {batch_analytics-0.3.13 → batch_analytics-0.3.14}/pyproject.toml +1 -1
  3. {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics/config.py +13 -4
  4. {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics/extract.py +48 -2
  5. {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics/transform.py +17 -5
  6. {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics.egg-info/PKG-INFO +1 -1
  7. {batch_analytics-0.3.13 → batch_analytics-0.3.14}/README.md +0 -0
  8. {batch_analytics-0.3.13 → batch_analytics-0.3.14}/setup.cfg +0 -0
  9. {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics/__init__.py +0 -0
  10. {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics/__main__.py +0 -0
  11. {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics/analytics/__init__.py +0 -0
  12. {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics/analytics/correlation.py +0 -0
  13. {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics/analytics/linear_regression.py +0 -0
  14. {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics/analytics/pca_clustering.py +0 -0
  15. {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics/analytics/t_test.py +0 -0
  16. {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics/job_runner.py +0 -0
  17. {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics/log.py +0 -0
  18. {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics/modules.py +0 -0
  19. {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics/output/__init__.py +0 -0
  20. {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics/output/base.py +0 -0
  21. {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics/output/clickhouse.py +0 -0
  22. {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics/output/local.py +0 -0
  23. {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics/output/s3.py +0 -0
  24. {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics.egg-info/SOURCES.txt +0 -0
  25. {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
  26. {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics.egg-info/entry_points.txt +0 -0
  27. {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics.egg-info/requires.txt +0 -0
  28. {batch_analytics-0.3.13 → batch_analytics-0.3.14}/src/batch_analytics.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.3.13
3
+ Version: 0.3.14
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "batch-analytics"
7
- version = "0.3.13"
7
+ version = "0.3.14"
8
8
  description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -29,16 +29,23 @@ class ClickHouseConfig:
29
29
 
30
30
  @property
31
31
  def jdbc_properties(self) -> dict:
32
+ """JDBC connection properties for Spark.
33
+
34
+ clickhouse-jdbc (clickhouse-java v0.6+) rejects legacy keys such as
35
+ ``compress_algorithm`` (ClientMisconfigurationException). Prefer JDBC URL
36
+ query parameters for compression behavior. To force the old property for
37
+ legacy stacks, set CLICKHOUSE_JDBC_LEGACY_COMPRESS_ALGORITHM (e.g. ``none``).
38
+ """
32
39
  props = {
33
40
  "user": self.user,
34
41
  "driver": "com.clickhouse.jdbc.ClickHouseDriver",
35
- # Match Spark read codec default: avoids JDBC LZ4/gzip mismatches with server HTTP compression
36
- "compress_algorithm": os.environ.get(
37
- "CLICKHOUSE_JDBC_COMPRESS_ALGORITHM", "none"
38
- ),
39
42
  }
40
43
  if self.password:
41
44
  props["password"] = self.password
45
+ # Opt-in legacy property for older shaded JDBC stacks only.
46
+ legacy = os.environ.get("CLICKHOUSE_JDBC_LEGACY_COMPRESS_ALGORITHM", "").strip()
47
+ if legacy:
48
+ props["compress_algorithm"] = legacy
42
49
  return props
43
50
 
44
51
 
@@ -78,6 +85,8 @@ class TransformConfig:
78
85
  staging_format: str = os.environ.get("BATCH_STAGING_FORMAT", "clickhouse")
79
86
  # Staging table name in ClickHouse (when format=clickhouse)
80
87
  staging_table: str = os.environ.get("BATCH_STAGING_TABLE", "analytics_staging")
88
+ # Spark save mode for ClickHouse staging (and path staging): overwrite | append
89
+ staging_write_mode: str = os.environ.get("BATCH_STAGING_WRITE_MODE", "overwrite")
81
90
  # Source column holding a JSON object or Python dict string; every top-level key becomes a new String column
82
91
  # (see transform.expand_kv_blob_column). Example: add_dimensions {'anchor_id':'...','lot':'A1'}
83
92
  add_dimension_column: str = os.environ.get("BATCH_ADD_DIMENSION_COLUMN", "add_dimension")
@@ -4,7 +4,8 @@ Extract stage: Load data from ClickHouse using Spark ClickHouse connector or JDB
4
4
 
5
5
  import json
6
6
  import logging
7
- from typing import Dict, List, Optional
7
+ import os
8
+ from typing import Dict, List, Optional, Tuple
8
9
 
9
10
  from pyspark.sql import DataFrame, SparkSession
10
11
  from pyspark.sql.functions import col
@@ -59,6 +60,48 @@ def _apply_extract_filter(df: DataFrame, config: BatchAnalyticsConfig) -> DataFr
59
60
  return filtered
60
61
 
61
62
 
63
+ def _parse_ch_database_table(table: str, default_database: str) -> Tuple[str, str]:
64
+ """
65
+ Resolve ``table`` reference to (database, table_name).
66
+
67
+ ``batch_metric_facts`` → (default_database, batch_metric_facts)
68
+ ``analytics.batch_metric_facts`` → (analytics, batch_metric_facts)
69
+ """
70
+ t = (table or "").strip()
71
+ if not t:
72
+ return default_database, t
73
+ if "." in t and not t.startswith("("):
74
+ db, tbl = t.split(".", 1)
75
+ db, tbl = db.strip(), tbl.strip()
76
+ if db and tbl:
77
+ return db, tbl
78
+ return default_database, t
79
+
80
+
81
+ def _read_via_catalog(spark: SparkSession, cfg: BatchAnalyticsConfig, table: str) -> Optional[DataFrame]:
82
+ """
83
+ Read via Spark SQL catalog (ClickHouseCatalog), registered in job_runner.create_spark_session.
84
+
85
+ clickhouse-spark-runtime does **not** register legacy short name ``format(\"clickhouse\")`` /
86
+ ``clickhouse.DefaultSource``; catalog + ``spark.table(catalog.db.table)`` is the supported path.
87
+ """
88
+ cat = os.environ.get("BATCH_CLICKHOUSE_CATALOG", "batch_ch").strip()
89
+ if not cat:
90
+ return None
91
+ db, tbl = _parse_ch_database_table(table, cfg.clickhouse.database)
92
+ ident = f"{cat}.{db}.{tbl}"
93
+ try:
94
+ return spark.table(ident)
95
+ except Exception as e:
96
+ logger.warning(
97
+ "Catalog read failed for %s (%s): %s. Trying other readers.",
98
+ ident,
99
+ table,
100
+ e,
101
+ )
102
+ return None
103
+
104
+
62
105
  def _read_via_format(spark: SparkSession, cfg: BatchAnalyticsConfig, table: str) -> Optional[DataFrame]:
63
106
  """
64
107
  Read from ClickHouse using the native format API (clickhouse-spark-runtime).
@@ -106,7 +149,10 @@ def extract_table(
106
149
  Uses native connector if configured, otherwise JDBC.
107
150
  """
108
151
  if config.extract.use_native_connector:
109
- df = _read_via_format(spark, config, table)
152
+ # Prefer catalog (matches clickhouse-spark-runtime); avoid legacy DefaultSource path.
153
+ df = _read_via_catalog(spark, config, table)
154
+ if df is None:
155
+ df = _read_via_format(spark, config, table)
110
156
  if df is None:
111
157
  df = _read_via_jdbc(spark, config, table)
112
158
  else:
@@ -183,6 +183,15 @@ def transform(
183
183
  return remove_duplicates(transformed, key_columns=dedup_cols)
184
184
 
185
185
 
186
+ def _normalize_staging_write_mode(raw: str) -> str:
187
+ """Spark DataFrameWriter mode: overwrite (replace table contents) or append."""
188
+ m = (raw or "overwrite").strip().lower()
189
+ if m in ("overwrite", "append"):
190
+ return m
191
+ logger.warning("Invalid BATCH_STAGING_WRITE_MODE=%r; using overwrite", raw)
192
+ return "overwrite"
193
+
194
+
186
195
  def stage_to_clickhouse(
187
196
  spark: SparkSession,
188
197
  df: DataFrame,
@@ -192,8 +201,10 @@ def stage_to_clickhouse(
192
201
  Write transformed data to ClickHouse staging table.
193
202
  Separate job from transform; must complete before analytics can run.
194
203
  Uses native connector if available, else JDBC.
204
+ Write mode from BATCH_STAGING_WRITE_MODE (default overwrite = full replace).
195
205
  """
196
206
  n = df.count()
207
+ mode = _normalize_staging_write_mode(config.transform.staging_write_mode)
197
208
  try:
198
209
  ch = config.clickhouse
199
210
  writer = (
@@ -204,7 +215,7 @@ def stage_to_clickhouse(
204
215
  .option("database", ch.database)
205
216
  .option("table", config.transform.staging_table)
206
217
  .option("user", ch.user)
207
- .mode("overwrite")
218
+ .mode(mode)
208
219
  )
209
220
  if ch.password:
210
221
  writer = writer.option("password", ch.password)
@@ -214,7 +225,7 @@ def stage_to_clickhouse(
214
225
  df.write.jdbc(
215
226
  config.clickhouse.jdbc_url,
216
227
  config.transform.staging_table,
217
- mode="overwrite",
228
+ mode=mode,
218
229
  properties=config.clickhouse.jdbc_properties,
219
230
  )
220
231
  logger.info(
@@ -233,14 +244,15 @@ def stage_to_path(
233
244
  """Write transformed data to parquet/delta (for local dev or intermediate storage)."""
234
245
  path = config.transform.staging_path
235
246
  fmt = config.transform.staging_format
247
+ mode = _normalize_staging_write_mode(config.transform.staging_write_mode)
236
248
  if fmt == "parquet":
237
- df.write.mode("overwrite").parquet(path)
249
+ df.write.mode(mode).parquet(path)
238
250
  logger.info("Staged data to %s (parquet)", path)
239
251
  elif fmt == "delta":
240
- df.write.format("delta").mode("overwrite").save(path)
252
+ df.write.format("delta").mode(mode).save(path)
241
253
  logger.info("Staged data to %s (delta)", path)
242
254
  else:
243
- df.write.format(fmt).mode("overwrite").save(path)
255
+ df.write.format(fmt).mode(mode).save(path)
244
256
  logger.info("Staged data to %s (%s)", path, fmt)
245
257
 
246
258
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.3.13
3
+ Version: 0.3.14
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT