batch-analytics 0.3.22__tar.gz → 0.3.23__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {batch_analytics-0.3.22 → batch_analytics-0.3.23}/PKG-INFO +1 -1
  2. {batch_analytics-0.3.22 → batch_analytics-0.3.23}/pyproject.toml +1 -1
  3. {batch_analytics-0.3.22 → batch_analytics-0.3.23}/src/batch_analytics/transform.py +83 -2
  4. {batch_analytics-0.3.22 → batch_analytics-0.3.23}/src/batch_analytics.egg-info/PKG-INFO +1 -1
  5. {batch_analytics-0.3.22 → batch_analytics-0.3.23}/README.md +0 -0
  6. {batch_analytics-0.3.22 → batch_analytics-0.3.23}/setup.cfg +0 -0
  7. {batch_analytics-0.3.22 → batch_analytics-0.3.23}/src/batch_analytics/__init__.py +0 -0
  8. {batch_analytics-0.3.22 → batch_analytics-0.3.23}/src/batch_analytics/__main__.py +0 -0
  9. {batch_analytics-0.3.22 → batch_analytics-0.3.23}/src/batch_analytics/analytics/__init__.py +0 -0
  10. {batch_analytics-0.3.22 → batch_analytics-0.3.23}/src/batch_analytics/analytics/correlation.py +0 -0
  11. {batch_analytics-0.3.22 → batch_analytics-0.3.23}/src/batch_analytics/analytics/gluon_autogluon_infer.py +0 -0
  12. {batch_analytics-0.3.22 → batch_analytics-0.3.23}/src/batch_analytics/analytics/gluon_autogluon_train.py +0 -0
  13. {batch_analytics-0.3.22 → batch_analytics-0.3.23}/src/batch_analytics/analytics/linear_regression.py +0 -0
  14. {batch_analytics-0.3.22 → batch_analytics-0.3.23}/src/batch_analytics/analytics/pca_clustering.py +0 -0
  15. {batch_analytics-0.3.22 → batch_analytics-0.3.23}/src/batch_analytics/analytics/t_test.py +0 -0
  16. {batch_analytics-0.3.22 → batch_analytics-0.3.23}/src/batch_analytics/config.py +0 -0
  17. {batch_analytics-0.3.22 → batch_analytics-0.3.23}/src/batch_analytics/extract.py +0 -0
  18. {batch_analytics-0.3.22 → batch_analytics-0.3.23}/src/batch_analytics/job_runner.py +0 -0
  19. {batch_analytics-0.3.22 → batch_analytics-0.3.23}/src/batch_analytics/log.py +0 -0
  20. {batch_analytics-0.3.22 → batch_analytics-0.3.23}/src/batch_analytics/modules.py +0 -0
  21. {batch_analytics-0.3.22 → batch_analytics-0.3.23}/src/batch_analytics/output/__init__.py +0 -0
  22. {batch_analytics-0.3.22 → batch_analytics-0.3.23}/src/batch_analytics/output/base.py +0 -0
  23. {batch_analytics-0.3.22 → batch_analytics-0.3.23}/src/batch_analytics/output/clickhouse.py +0 -0
  24. {batch_analytics-0.3.22 → batch_analytics-0.3.23}/src/batch_analytics/output/local.py +0 -0
  25. {batch_analytics-0.3.22 → batch_analytics-0.3.23}/src/batch_analytics/output/s3.py +0 -0
  26. {batch_analytics-0.3.22 → batch_analytics-0.3.23}/src/batch_analytics/utils/__init__.py +0 -0
  27. {batch_analytics-0.3.22 → batch_analytics-0.3.23}/src/batch_analytics/utils/gluon_autogluon_common.py +0 -0
  28. {batch_analytics-0.3.22 → batch_analytics-0.3.23}/src/batch_analytics.egg-info/SOURCES.txt +0 -0
  29. {batch_analytics-0.3.22 → batch_analytics-0.3.23}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
  30. {batch_analytics-0.3.22 → batch_analytics-0.3.23}/src/batch_analytics.egg-info/entry_points.txt +0 -0
  31. {batch_analytics-0.3.22 → batch_analytics-0.3.23}/src/batch_analytics.egg-info/requires.txt +0 -0
  32. {batch_analytics-0.3.22 → batch_analytics-0.3.23}/src/batch_analytics.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.3.22
3
+ Version: 0.3.23
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "batch-analytics"
7
- version = "0.3.22"
7
+ version = "0.3.23"
8
8
  description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -192,6 +192,69 @@ def _normalize_staging_write_mode(raw: str) -> str:
192
192
  return "overwrite"
193
193
 
194
194
 
195
+ def _ch_quoted_ident(name: str) -> str:
196
+ """ClickHouse identifier in ORDER BY (escape backticks)."""
197
+ return "`" + name.replace("`", "``") + "`"
198
+
199
+
200
+ def _ch_order_by_key_expr(col: str, df: DataFrame) -> str:
201
+ """
202
+ Build one ORDER BY key expression. Spark nullable columns map to Nullable in CH; CH 25
203
+ rejects nullable sort keys unless ``allow_nullable_key`` is applied — the Spark connector
204
+ often does not forward ``tableProperty("settings.allow_nullable_key")`` into DDL.
205
+
206
+ Using ``assumeNotNull(col)`` yields a non-nullable key expression so CREATE TABLE succeeds.
207
+ Only wrap when Spark marks the field nullable (``assumeNotNull`` requires Nullable input).
208
+ """
209
+ q = _ch_quoted_ident(col)
210
+ for f in df.schema.fields:
211
+ if f.name == col:
212
+ if f.nullable:
213
+ return f"assumeNotNull({q})"
214
+ return q
215
+ return q
216
+
217
+
218
+ def _strip_outer_tuple_parens(s: str) -> str:
219
+ """clickhouse-spark-runtime wraps ``order_by`` in parentheses; do not also wrap here."""
220
+ s = s.strip()
221
+ if len(s) >= 2 and s[0] == "(" and s[-1] == ")":
222
+ return s[1:-1].strip()
223
+ return s
224
+
225
+
226
+ def _merge_tree_order_by_for_staging(df: DataFrame, config: BatchAnalyticsConfig) -> str:
227
+ """
228
+ ClickHouse 25+ rejects MergeTree DDL without ORDER BY. The Spark ClickHouse catalog
229
+ passes this via DataFrameWriterV2.tableProperty("order_by", ...).
230
+
231
+ **Important:** The connector emits ``ORDER BY (<order_by>)``. Do **not** wrap the value
232
+ in an extra ``(...)`` or ClickHouse sees ``ORDER BY ((...))`` and raises a syntax error.
233
+
234
+ Resolution order:
235
+ 1. BATCH_CLICKHOUSE_STAGING_ORDER_BY — comma-separated key expressions (no outer parens);
236
+ one layer of surrounding ``(...)`` is stripped if present.
237
+ 2. BATCH_DEDUP_COLUMNS / transform.dedup_columns — comma-separated, ``assumeNotNull`` when nullable.
238
+ 3. First column of the staging DataFrame schema (same nullable rule).
239
+ """
240
+ explicit = os.environ.get("BATCH_CLICKHOUSE_STAGING_ORDER_BY", "").strip()
241
+ if explicit:
242
+ return _strip_outer_tuple_parens(explicit)
243
+ dedup = (config.transform.dedup_columns or "").strip()
244
+ if dedup:
245
+ cols = [c.strip() for c in dedup.split(",") if c.strip()]
246
+ if cols:
247
+ parts = [_ch_order_by_key_expr(c, df) for c in cols]
248
+ return ", ".join(parts)
249
+ names = [f.name for f in df.schema.fields]
250
+ if names:
251
+ return _ch_order_by_key_expr(names[0], df)
252
+ raise ValueError(
253
+ "Cannot derive MergeTree ORDER BY for staging: set BATCH_CLICKHOUSE_STAGING_ORDER_BY, "
254
+ "or BATCH_DEDUP_COLUMNS / dedup_columns, or ensure the DataFrame has columns."
255
+ )
256
+
257
+
195
258
  def stage_to_clickhouse(
196
259
  spark: SparkSession,
197
260
  df: DataFrame,
@@ -210,6 +273,15 @@ def stage_to_clickhouse(
210
273
  Fallback: legacy ``format("clickhouse")`` (older stacks), then JDBC (may fail on
211
274
  ClickHouse 25+ auto-DDL without ORDER BY).
212
275
 
276
+ **MergeTree ORDER BY** (required on ClickHouse 25+): set ``BATCH_CLICKHOUSE_STAGING_ORDER_BY``,
277
+ or rely on ``BATCH_DEDUP_COLUMNS`` / ``dedup_columns`` (comma-separated keys; no extra outer parens).
278
+
279
+ **Nullable sort keys**: (1) Dedup-derived ``ORDER BY`` uses ``assumeNotNull(`col`)`` when
280
+ Spark marks the field nullable. (2) The Spark connector often **does not** pass
281
+ ``tableProperty("settings.allow_nullable_key")`` into DDL, so we also set
282
+ ``engine`` to ``MergeTree() SETTINGS allow_nullable_key = 1``, which ClickHouse
283
+ applies to the created table.
284
+
213
285
  Write mode from BATCH_STAGING_WRITE_MODE (default overwrite = full replace).
214
286
  """
215
287
  n = df.count()
@@ -221,12 +293,21 @@ def stage_to_clickhouse(
221
293
  if cat:
222
294
  try:
223
295
  full_name = f"{cat}.{ch.database}.{tbl}"
296
+ order_by = _merge_tree_order_by_for_staging(df, config)
224
297
  logger.info(
225
- "Staging to ClickHouse via catalog %s (mode=%s)",
298
+ "Staging to ClickHouse via catalog %s (mode=%s, order_by=%s)",
226
299
  full_name,
227
300
  mode,
301
+ order_by,
302
+ )
303
+ # MergeTree SETTINGS in ENGINE: connector often ignores tableProperty("settings.*")
304
+ # for CREATE TABLE; CH 25 then reports allow_nullable_key disabled.
305
+ _mt_engine = "MergeTree() SETTINGS allow_nullable_key = 1"
306
+ w2 = (
307
+ df.writeTo(full_name)
308
+ .tableProperty("engine", _mt_engine)
309
+ .tableProperty("order_by", order_by)
228
310
  )
229
- w2 = df.writeTo(full_name)
230
311
  if mode == "overwrite":
231
312
  w2.createOrReplace()
232
313
  else:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.3.22
3
+ Version: 0.3.23
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT