batch-analytics 0.3.23__tar.gz → 0.3.24__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {batch_analytics-0.3.23 → batch_analytics-0.3.24}/PKG-INFO +1 -1
  2. {batch_analytics-0.3.23 → batch_analytics-0.3.24}/pyproject.toml +1 -1
  3. {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/job_runner.py +1 -1
  4. {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/transform.py +25 -21
  5. {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics.egg-info/PKG-INFO +1 -1
  6. {batch_analytics-0.3.23 → batch_analytics-0.3.24}/README.md +0 -0
  7. {batch_analytics-0.3.23 → batch_analytics-0.3.24}/setup.cfg +0 -0
  8. {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/__init__.py +0 -0
  9. {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/__main__.py +0 -0
  10. {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/analytics/__init__.py +0 -0
  11. {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/analytics/correlation.py +0 -0
  12. {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/analytics/gluon_autogluon_infer.py +0 -0
  13. {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/analytics/gluon_autogluon_train.py +0 -0
  14. {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/analytics/linear_regression.py +0 -0
  15. {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/analytics/pca_clustering.py +0 -0
  16. {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/analytics/t_test.py +0 -0
  17. {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/config.py +0 -0
  18. {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/extract.py +0 -0
  19. {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/log.py +0 -0
  20. {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/modules.py +0 -0
  21. {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/output/__init__.py +0 -0
  22. {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/output/base.py +0 -0
  23. {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/output/clickhouse.py +0 -0
  24. {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/output/local.py +0 -0
  25. {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/output/s3.py +0 -0
  26. {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/utils/__init__.py +0 -0
  27. {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics/utils/gluon_autogluon_common.py +0 -0
  28. {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics.egg-info/SOURCES.txt +0 -0
  29. {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
  30. {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics.egg-info/entry_points.txt +0 -0
  31. {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics.egg-info/requires.txt +0 -0
  32. {batch_analytics-0.3.23 → batch_analytics-0.3.24}/src/batch_analytics.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.3.23
3
+ Version: 0.3.24
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "batch-analytics"
7
- version = "0.3.23"
7
+ version = "0.3.24"
8
8
  description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -235,7 +235,7 @@ def run_pipeline(
235
235
  # Empty/unset: rely on $SPARK_HOME/jars (analytics-runner image). Do not add spark.jars /
236
236
  # spark.jars.packages for ClickHouse here — that breaks K8s executors (./basename.jar).
237
237
  # For ad-hoc runs without the image, set e.g.
238
- # BATCH_SPARK_CLICKHOUSE_PACKAGES=com.clickhouse.spark:clickhouse-spark-runtime-3.5_2.12:0.8.0,https://repo1.maven.org/maven2/com/clickhouse/clickhouse-jdbc/0.9.8/clickhouse-jdbc-0.9.8-all.jar
238
+ # BATCH_SPARK_CLICKHOUSE_PACKAGES=com.clickhouse.spark:clickhouse-spark-runtime-3.5_2.12:0.10.0,https://repo1.maven.org/maven2/com/clickhouse/clickhouse-jdbc/0.9.8/clickhouse-jdbc-0.9.8-all.jar
239
239
  _raw_ch = os.environ.get("BATCH_SPARK_CLICKHOUSE_PACKAGES")
240
240
  if _raw_ch is None or not _raw_ch.strip():
241
241
  ch_pkgs = None
@@ -200,8 +200,9 @@ def _ch_quoted_ident(name: str) -> str:
200
200
  def _ch_order_by_key_expr(col: str, df: DataFrame) -> str:
201
201
  """
202
202
  Build one ORDER BY key expression. Spark nullable columns map to Nullable in CH; CH 25
203
- rejects nullable sort keys unless ``allow_nullable_key`` is applied — the Spark connector
204
- often does not forward ``tableProperty("settings.allow_nullable_key")`` into DDL.
203
+ rejects nullable sort keys unless ``allow_nullable_key`` is set (via
204
+ ``tableProperty("settings.allow_nullable_key", "1")`` in the catalog path) or keys are
205
+ wrapped with ``assumeNotNull``.
205
206
 
206
207
  Using ``assumeNotNull(col)`` yields a non-nullable key expression so CREATE TABLE succeeds.
207
208
  Only wrap when Spark marks the field nullable (``assumeNotNull`` requires Nullable input).
@@ -264,25 +265,28 @@ def stage_to_clickhouse(
264
265
  Write transformed data to ClickHouse staging table.
265
266
  Separate job from transform; must complete before analytics can run.
266
267
 
267
- Preferred path: Spark SQL **catalog** API (``DataFrame.writeTo``), matching
268
- ``job_runner.create_spark_session`` registration of ``ClickHouseCatalog``
269
- (``BATCH_CLICKHOUSE_CATALOG``, default ``batch_ch``). The clickhouse-spark-runtime
270
- 0.8.x connector does **not** register the legacy short name ``format("clickhouse")``
271
- / ``clickhouse.DefaultSource``.
268
+ **Only Spark’s native ClickHouse integration** (``clickhouse-spark-runtime`` DataSourceV2):
269
+ no separate Python DDL client for table creation.
272
270
 
273
- Fallback: legacy ``format("clickhouse")`` (older stacks), then JDBC (may fail on
274
- ClickHouse 25+ auto-DDL without ORDER BY).
271
+ 1. If ``BATCH_CLICKHOUSE_CATALOG`` is set (default ``batch_ch`` when the job registers
272
+ ``ClickHouseCatalog`` in ``job_runner.create_spark_session``): ``DataFrame.writeTo`` with
273
+ ``tableProperty("engine", "MergeTree()")``, ``order_by``, and
274
+ ``tableProperty("settings.allow_nullable_key", "1")``, then ``createOrReplace()`` or
275
+ ``append()``.
276
+ 2. Otherwise (or on catalog failure): ``format("clickhouse")`` with the same connection
277
+ options, then JDBC as last resort.
275
278
 
276
- **MergeTree ORDER BY** (required on ClickHouse 25+): set ``BATCH_CLICKHOUSE_STAGING_ORDER_BY``,
277
- or rely on ``BATCH_DEDUP_COLUMNS`` / ``dedup_columns`` (comma-separated keys; no extra outer parens).
279
+ Pin ``clickhouse-spark-runtime-3.5_2.12`` **0.10.0+** on the Spark classpath (see
280
+ ``analytics_runner`` Dockerfile ``CLICKHOUSE_SPARK_RUNTIME_VERSION``) for ClickHouse **25.x**
281
+ servers.
278
282
 
279
- **Nullable sort keys**: (1) Dedup-derived ``ORDER BY`` uses ``assumeNotNull(`col`)`` when
280
- Spark marks the field nullable. (2) The Spark connector often **does not** pass
281
- ``tableProperty("settings.allow_nullable_key")`` into DDL, so we also set
282
- ``engine`` to ``MergeTree() SETTINGS allow_nullable_key = 1``, which ClickHouse
283
- applies to the created table.
283
+ **MergeTree ORDER BY**: ``BATCH_CLICKHOUSE_STAGING_ORDER_BY``, or dedup columns, or first
284
+ column. Do not add an extra outer ``(...)`` around ``order_by`` (the connector wraps it).
284
285
 
285
- Write mode from BATCH_STAGING_WRITE_MODE (default overwrite = full replace).
286
+ **Nullable keys**: ``assumeNotNull(`col`)`` for nullable Spark columns in the sort key, plus
287
+ ``settings.allow_nullable_key`` when needed.
288
+
289
+ Write mode from ``BATCH_STAGING_WRITE_MODE`` (default overwrite).
286
290
  """
287
291
  n = df.count()
288
292
  mode = _normalize_staging_write_mode(config.transform.staging_write_mode)
@@ -300,13 +304,13 @@ def stage_to_clickhouse(
300
304
  mode,
301
305
  order_by,
302
306
  )
303
- # MergeTree SETTINGS in ENGINE: connector often ignores tableProperty("settings.*")
304
- # for CREATE TABLE; CH 25 then reports allow_nullable_key disabled.
305
- _mt_engine = "MergeTree() SETTINGS allow_nullable_key = 1"
307
+ # Plain MergeTree() only; SETTINGS belong in tableProperty("settings.*", ...) so the
308
+ # connector emits them after ORDER BY; inline SETTINGS in ENGINE breaks CH 25.5 parsing.
306
309
  w2 = (
307
310
  df.writeTo(full_name)
308
- .tableProperty("engine", _mt_engine)
311
+ .tableProperty("engine", "MergeTree()")
309
312
  .tableProperty("order_by", order_by)
313
+ .tableProperty("settings.allow_nullable_key", "1")
310
314
  )
311
315
  if mode == "overwrite":
312
316
  w2.createOrReplace()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.3.23
3
+ Version: 0.3.24
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT