batch-analytics 0.3.22__tar.gz → 0.3.24__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. {batch_analytics-0.3.22 → batch_analytics-0.3.24}/PKG-INFO +1 -1
  2. {batch_analytics-0.3.22 → batch_analytics-0.3.24}/pyproject.toml +1 -1
  3. {batch_analytics-0.3.22 → batch_analytics-0.3.24}/src/batch_analytics/job_runner.py +1 -1
  4. {batch_analytics-0.3.22 → batch_analytics-0.3.24}/src/batch_analytics/transform.py +95 -10
  5. {batch_analytics-0.3.22 → batch_analytics-0.3.24}/src/batch_analytics.egg-info/PKG-INFO +1 -1
  6. {batch_analytics-0.3.22 → batch_analytics-0.3.24}/README.md +0 -0
  7. {batch_analytics-0.3.22 → batch_analytics-0.3.24}/setup.cfg +0 -0
  8. {batch_analytics-0.3.22 → batch_analytics-0.3.24}/src/batch_analytics/__init__.py +0 -0
  9. {batch_analytics-0.3.22 → batch_analytics-0.3.24}/src/batch_analytics/__main__.py +0 -0
  10. {batch_analytics-0.3.22 → batch_analytics-0.3.24}/src/batch_analytics/analytics/__init__.py +0 -0
  11. {batch_analytics-0.3.22 → batch_analytics-0.3.24}/src/batch_analytics/analytics/correlation.py +0 -0
  12. {batch_analytics-0.3.22 → batch_analytics-0.3.24}/src/batch_analytics/analytics/gluon_autogluon_infer.py +0 -0
  13. {batch_analytics-0.3.22 → batch_analytics-0.3.24}/src/batch_analytics/analytics/gluon_autogluon_train.py +0 -0
  14. {batch_analytics-0.3.22 → batch_analytics-0.3.24}/src/batch_analytics/analytics/linear_regression.py +0 -0
  15. {batch_analytics-0.3.22 → batch_analytics-0.3.24}/src/batch_analytics/analytics/pca_clustering.py +0 -0
  16. {batch_analytics-0.3.22 → batch_analytics-0.3.24}/src/batch_analytics/analytics/t_test.py +0 -0
  17. {batch_analytics-0.3.22 → batch_analytics-0.3.24}/src/batch_analytics/config.py +0 -0
  18. {batch_analytics-0.3.22 → batch_analytics-0.3.24}/src/batch_analytics/extract.py +0 -0
  19. {batch_analytics-0.3.22 → batch_analytics-0.3.24}/src/batch_analytics/log.py +0 -0
  20. {batch_analytics-0.3.22 → batch_analytics-0.3.24}/src/batch_analytics/modules.py +0 -0
  21. {batch_analytics-0.3.22 → batch_analytics-0.3.24}/src/batch_analytics/output/__init__.py +0 -0
  22. {batch_analytics-0.3.22 → batch_analytics-0.3.24}/src/batch_analytics/output/base.py +0 -0
  23. {batch_analytics-0.3.22 → batch_analytics-0.3.24}/src/batch_analytics/output/clickhouse.py +0 -0
  24. {batch_analytics-0.3.22 → batch_analytics-0.3.24}/src/batch_analytics/output/local.py +0 -0
  25. {batch_analytics-0.3.22 → batch_analytics-0.3.24}/src/batch_analytics/output/s3.py +0 -0
  26. {batch_analytics-0.3.22 → batch_analytics-0.3.24}/src/batch_analytics/utils/__init__.py +0 -0
  27. {batch_analytics-0.3.22 → batch_analytics-0.3.24}/src/batch_analytics/utils/gluon_autogluon_common.py +0 -0
  28. {batch_analytics-0.3.22 → batch_analytics-0.3.24}/src/batch_analytics.egg-info/SOURCES.txt +0 -0
  29. {batch_analytics-0.3.22 → batch_analytics-0.3.24}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
  30. {batch_analytics-0.3.22 → batch_analytics-0.3.24}/src/batch_analytics.egg-info/entry_points.txt +0 -0
  31. {batch_analytics-0.3.22 → batch_analytics-0.3.24}/src/batch_analytics.egg-info/requires.txt +0 -0
  32. {batch_analytics-0.3.22 → batch_analytics-0.3.24}/src/batch_analytics.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.3.22
3
+ Version: 0.3.24
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "batch-analytics"
7
- version = "0.3.22"
7
+ version = "0.3.24"
8
8
  description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -235,7 +235,7 @@ def run_pipeline(
235
235
  # Empty/unset: rely on $SPARK_HOME/jars (analytics-runner image). Do not add spark.jars /
236
236
  # spark.jars.packages for ClickHouse here — that breaks K8s executors (./basename.jar).
237
237
  # For ad-hoc runs without the image, set e.g.
238
- # BATCH_SPARK_CLICKHOUSE_PACKAGES=com.clickhouse.spark:clickhouse-spark-runtime-3.5_2.12:0.8.0,https://repo1.maven.org/maven2/com/clickhouse/clickhouse-jdbc/0.9.8/clickhouse-jdbc-0.9.8-all.jar
238
+ # BATCH_SPARK_CLICKHOUSE_PACKAGES=com.clickhouse.spark:clickhouse-spark-runtime-3.5_2.12:0.10.0,https://repo1.maven.org/maven2/com/clickhouse/clickhouse-jdbc/0.9.8/clickhouse-jdbc-0.9.8-all.jar
239
239
  _raw_ch = os.environ.get("BATCH_SPARK_CLICKHOUSE_PACKAGES")
240
240
  if _raw_ch is None or not _raw_ch.strip():
241
241
  ch_pkgs = None
@@ -192,6 +192,70 @@ def _normalize_staging_write_mode(raw: str) -> str:
192
192
  return "overwrite"
193
193
 
194
194
 
195
+ def _ch_quoted_ident(name: str) -> str:
196
+ """ClickHouse identifier in ORDER BY (escape backticks)."""
197
+ return "`" + name.replace("`", "``") + "`"
198
+
199
+
200
def _ch_order_by_key_expr(col: str, df: DataFrame) -> str:
    """
    Render a single ORDER BY key expression for column ``col``.

    Spark nullable columns become ``Nullable`` in ClickHouse, and ClickHouse 25 refuses
    nullable sort keys unless ``allow_nullable_key`` is enabled (set through
    ``tableProperty("settings.allow_nullable_key", "1")`` on the catalog path) or the key
    is wrapped in ``assumeNotNull``.

    The quoted identifier is wrapped in ``assumeNotNull(...)`` only when the first schema
    field named ``col`` is marked nullable by Spark. A non-nullable field, or a name that
    is not present in the schema at all, yields the bare quoted identifier.
    """
    quoted = _ch_quoted_ident(col)
    # First schema field whose name matches; None when the column is absent.
    field = next((fld for fld in df.schema.fields if fld.name == col), None)
    if field is not None and field.nullable:
        return f"assumeNotNull({quoted})"
    return quoted
217
+
218
+
219
+ def _strip_outer_tuple_parens(s: str) -> str:
220
+ """clickhouse-spark-runtime wraps ``order_by`` in parentheses; do not also wrap here."""
221
+ s = s.strip()
222
+ if len(s) >= 2 and s[0] == "(" and s[-1] == ")":
223
+ return s[1:-1].strip()
224
+ return s
225
+
226
+
227
def _merge_tree_order_by_for_staging(df: DataFrame, config: BatchAnalyticsConfig) -> str:
    """
    Work out the MergeTree ORDER BY value for the staging table.

    ClickHouse 25+ rejects MergeTree DDL that has no ORDER BY. The Spark ClickHouse catalog
    receives the value through ``DataFrameWriterV2.tableProperty("order_by", ...)``.

    **Important:** the connector emits ``ORDER BY (<order_by>)`` on its own, so the value
    returned here must **not** carry an extra outer ``(...)``; otherwise ClickHouse sees
    ``ORDER BY ((...))`` and raises a syntax error.

    Sources, in priority order:
    1. ``BATCH_CLICKHOUSE_STAGING_ORDER_BY`` environment variable: comma-separated key
       expressions, passed through ``_strip_outer_tuple_parens``.
    2. ``config.transform.dedup_columns``: comma-separated column names, each rendered
       with ``_ch_order_by_key_expr`` (``assumeNotNull`` when nullable).
    3. The first column of the staging DataFrame schema (same nullable rule).

    Raises:
        ValueError: when none of the sources yields a key (the DataFrame has no columns).
    """
    env_value = os.environ.get("BATCH_CLICKHOUSE_STAGING_ORDER_BY", "").strip()
    if env_value:
        return _strip_outer_tuple_parens(env_value)

    raw_dedup = (config.transform.dedup_columns or "").strip()
    # Blank entries (e.g. from ", ,") are dropped; an empty result falls through.
    dedup_cols = [piece.strip() for piece in raw_dedup.split(",") if piece.strip()]
    if dedup_cols:
        return ", ".join(_ch_order_by_key_expr(name, df) for name in dedup_cols)

    column_names = [field.name for field in df.schema.fields]
    if not column_names:
        raise ValueError(
            "Cannot derive MergeTree ORDER BY for staging: set BATCH_CLICKHOUSE_STAGING_ORDER_BY, "
            "or BATCH_DEDUP_COLUMNS / dedup_columns, or ensure the DataFrame has columns."
        )
    return _ch_order_by_key_expr(column_names[0], df)
257
+
258
+
195
259
  def stage_to_clickhouse(
196
260
  spark: SparkSession,
197
261
  df: DataFrame,
@@ -201,16 +265,28 @@ def stage_to_clickhouse(
201
265
  Write transformed data to ClickHouse staging table.
202
266
  Separate job from transform; must complete before analytics can run.
203
267
 
204
- Preferred path: Spark SQL **catalog** API (``DataFrame.writeTo``), matching
205
- ``job_runner.create_spark_session`` registration of ``ClickHouseCatalog``
206
- (``BATCH_CLICKHOUSE_CATALOG``, default ``batch_ch``). The clickhouse-spark-runtime
207
- 0.8.x connector does **not** register the legacy short name ``format("clickhouse")``
208
- / ``clickhouse.DefaultSource``.
268
+ **Only Spark’s native ClickHouse integration** (``clickhouse-spark-runtime`` DataSourceV2):
269
+ no separate Python DDL client for table creation.
209
270
 
210
- Fallback: legacy ``format("clickhouse")`` (older stacks), then JDBC (may fail on
211
- ClickHouse 25+ auto-DDL without ORDER BY).
271
+ 1. If ``BATCH_CLICKHOUSE_CATALOG`` is set (default ``batch_ch`` when the job registers
272
+ ``ClickHouseCatalog`` in ``job_runner.create_spark_session``): ``DataFrame.writeTo`` with
273
+ ``tableProperty("engine", "MergeTree()")``, ``order_by``, and
274
+ ``tableProperty("settings.allow_nullable_key", "1")``, then ``createOrReplace()`` or
275
+ ``append()``.
276
+ 2. Otherwise (or on catalog failure): ``format("clickhouse")`` with the same connection
277
+ options, then JDBC as last resort.
212
278
 
213
- Write mode from BATCH_STAGING_WRITE_MODE (default overwrite = full replace).
279
+ Pin ``clickhouse-spark-runtime-3.5_2.12`` **0.10.0+** on the Spark classpath (see
280
+ ``analytics_runner`` Dockerfile ``CLICKHOUSE_SPARK_RUNTIME_VERSION``) for ClickHouse **25.x**
281
+ servers.
282
+
283
+ **MergeTree ORDER BY**: ``BATCH_CLICKHOUSE_STAGING_ORDER_BY``, or dedup columns, or first
284
+ column. Do not add an extra outer ``(...)`` around ``order_by`` (the connector wraps it).
285
+
286
+ **Nullable keys**: ``assumeNotNull(`col`)`` for nullable Spark columns in the sort key, plus
287
+ ``settings.allow_nullable_key`` when needed.
288
+
289
+ Write mode from ``BATCH_STAGING_WRITE_MODE`` (default overwrite).
214
290
  """
215
291
  n = df.count()
216
292
  mode = _normalize_staging_write_mode(config.transform.staging_write_mode)
@@ -221,12 +297,21 @@ def stage_to_clickhouse(
221
297
  if cat:
222
298
  try:
223
299
  full_name = f"{cat}.{ch.database}.{tbl}"
300
+ order_by = _merge_tree_order_by_for_staging(df, config)
224
301
  logger.info(
225
- "Staging to ClickHouse via catalog %s (mode=%s)",
302
+ "Staging to ClickHouse via catalog %s (mode=%s, order_by=%s)",
226
303
  full_name,
227
304
  mode,
305
+ order_by,
306
+ )
307
+ # Plain MergeTree() only — SETTINGS belong in tableProperty("settings.*", ...) so the
308
+ # connector emits them after ORDER BY; inline SETTINGS in ENGINE breaks CH 25.5 parsing.
309
+ w2 = (
310
+ df.writeTo(full_name)
311
+ .tableProperty("engine", "MergeTree()")
312
+ .tableProperty("order_by", order_by)
313
+ .tableProperty("settings.allow_nullable_key", "1")
228
314
  )
229
- w2 = df.writeTo(full_name)
230
315
  if mode == "overwrite":
231
316
  w2.createOrReplace()
232
317
  else:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.3.22
3
+ Version: 0.3.24
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT