batch-analytics 0.3.20__tar.gz → 0.3.22__tar.gz

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {batch_analytics-0.3.20 → batch_analytics-0.3.22}/PKG-INFO +5 -3
  2. {batch_analytics-0.3.20 → batch_analytics-0.3.22}/pyproject.toml +6 -3
  3. {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/analytics/gluon_autogluon_train.py +30 -5
  4. {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/transform.py +45 -7
  5. {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics.egg-info/PKG-INFO +5 -3
  6. {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics.egg-info/requires.txt +4 -2
  7. {batch_analytics-0.3.20 → batch_analytics-0.3.22}/README.md +0 -0
  8. {batch_analytics-0.3.20 → batch_analytics-0.3.22}/setup.cfg +0 -0
  9. {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/__init__.py +0 -0
  10. {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/__main__.py +0 -0
  11. {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/analytics/__init__.py +0 -0
  12. {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/analytics/correlation.py +0 -0
  13. {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/analytics/gluon_autogluon_infer.py +0 -0
  14. {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/analytics/linear_regression.py +0 -0
  15. {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/analytics/pca_clustering.py +0 -0
  16. {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/analytics/t_test.py +0 -0
  17. {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/config.py +0 -0
  18. {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/extract.py +0 -0
  19. {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/job_runner.py +0 -0
  20. {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/log.py +0 -0
  21. {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/modules.py +0 -0
  22. {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/output/__init__.py +0 -0
  23. {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/output/base.py +0 -0
  24. {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/output/clickhouse.py +0 -0
  25. {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/output/local.py +0 -0
  26. {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/output/s3.py +0 -0
  27. {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/utils/__init__.py +0 -0
  28. {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/utils/gluon_autogluon_common.py +0 -0
  29. {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics.egg-info/SOURCES.txt +0 -0
  30. {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
  31. {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics.egg-info/entry_points.txt +0 -0
  32. {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.3.20
3
+ Version: 0.3.22
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT
@@ -24,7 +24,8 @@ Requires-Dist: boto3>=1.28; extra == "output"
24
24
  Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "output"
25
25
  Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "output"
26
26
  Provides-Extra: autogluon
27
- Requires-Dist: autogluon<2.0,>=1.0; extra == "autogluon"
27
+ Requires-Dist: autogluon-tabular[lightgbm]<2.0,>=1.0; extra == "autogluon"
28
+ Requires-Dist: typing-extensions>=4.8.0; extra == "autogluon"
28
29
  Requires-Dist: pandas>=1.3.0; extra == "autogluon"
29
30
  Requires-Dist: boto3>=1.28; extra == "autogluon"
30
31
  Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "autogluon"
@@ -36,7 +37,8 @@ Requires-Dist: scipy>=1.5.0; extra == "full"
36
37
  Requires-Dist: boto3>=1.28; extra == "full"
37
38
  Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "full"
38
39
  Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "full"
39
- Requires-Dist: autogluon<2.0,>=1.0; extra == "full"
40
+ Requires-Dist: autogluon-tabular[lightgbm]<2.0,>=1.0; extra == "full"
41
+ Requires-Dist: typing-extensions>=4.8.0; extra == "full"
40
42
  Requires-Dist: pyarrow>=10.0.0; python_version >= "3.8" and extra == "full"
41
43
 
42
44
  # Batch Analytics
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "batch-analytics"
7
- version = "0.3.20"
7
+ version = "0.3.22"
8
8
  description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -31,8 +31,10 @@ output = [
31
31
  "clickhouse-connect>=0.7,<0.9; python_version < '3.9'",
32
32
  "clickhouse-connect>=0.7; python_version >= '3.9'",
33
33
  ]
34
+ # autogluon-tabular[lightgbm]: tabular AutoML with LightGBM only (no PyTorch from full autogluon metapackage)
34
35
  autogluon = [
35
- "autogluon>=1.0,<2.0",
36
+ "autogluon-tabular[lightgbm]>=1.0,<2.0",
37
+ "typing-extensions>=4.8.0",
36
38
  "pandas>=1.3.0",
37
39
  "boto3>=1.28",
38
40
  "clickhouse-connect>=0.7,<0.9; python_version < '3.9'",
@@ -45,7 +47,8 @@ full = [
45
47
  "boto3>=1.28",
46
48
  "clickhouse-connect>=0.7,<0.9; python_version < '3.9'",
47
49
  "clickhouse-connect>=0.7; python_version >= '3.9'",
48
- "autogluon>=1.0,<2.0",
50
+ "autogluon-tabular[lightgbm]>=1.0,<2.0",
51
+ "typing-extensions>=4.8.0",
49
52
  "pyarrow>=10.0.0; python_version >= '3.8'",
50
53
  ]
51
54
 
@@ -4,11 +4,13 @@ Train AutoGluon TabularPredictor from ClickHouse staging data; upload artifacts
4
4
  Env (injected by analytics_runner Gluon job):
5
5
  CLICKHOUSE_*, BATCH_STAGING_TABLE, MODEL_S3_PREFIX, TASK_ID,
6
6
  AUTOGLUON_LABEL, AUTOGLUON_FEATURES, AUTOGLUON_PROBLEM_TYPE, AUTOGLUON_TIME_LIMIT,
7
- optional AUTOGLUON_MAX_ROWS
7
+ optional AUTOGLUON_MAX_ROWS,
8
+ optional AUTOGLUON_HYPERPARAMETERS — JSON object of AutoGluon model hyperparameters; default {"GBM": {}} (LightGBM only, no PyTorch)
8
9
  """
9
10
 
10
11
  from __future__ import annotations
11
12
 
13
+ import json
12
14
  import logging
13
15
  import os
14
16
  import shutil
@@ -44,6 +46,22 @@ def _require(name: str) -> str:
44
46
  return v
45
47
 
46
48
 
49
+ def _hyperparameters_from_env() -> dict:
50
+ """Default: GBM (LightGBM) only — matches autogluon-tabular[lightgbm] and avoids NN/torch models."""
51
+ raw = os.environ.get("AUTOGLUON_HYPERPARAMETERS", "").strip()
52
+ if not raw:
53
+ return {"GBM": {}}
54
+ try:
55
+ parsed = json.loads(raw)
56
+ except json.JSONDecodeError as e:
57
+ logger.error("AUTOGLUON_HYPERPARAMETERS must be valid JSON: %s", e)
58
+ sys.exit(2)
59
+ if not isinstance(parsed, dict):
60
+ logger.error("AUTOGLUON_HYPERPARAMETERS must be a JSON object, e.g. {\"GBM\": {}}")
61
+ sys.exit(2)
62
+ return parsed
63
+
64
+
47
65
  def main() -> None:
48
66
  model_prefix = _require("MODEL_S3_PREFIX")
49
67
  label = _require("AUTOGLUON_LABEL")
@@ -79,8 +97,13 @@ def main() -> None:
79
97
 
80
98
  try:
81
99
  from autogluon.tabular import TabularPredictor
82
- except ImportError:
83
- logger.exception("autogluon is not installed; use pip install 'batch-analytics[autogluon]'")
100
+ except ImportError as e:
101
+ if isinstance(e, ModuleNotFoundError) and e.name == "typing_extensions":
102
+ logger.exception(
103
+ "Install typing-extensions (required by AutoGluon): pip install 'typing-extensions>=4.8'"
104
+ )
105
+ else:
106
+ logger.exception("autogluon is not installed; use pip install 'batch-analytics[autogluon]'")
84
107
  sys.exit(4)
85
108
 
86
109
  problem_type = os.environ.get("AUTOGLUON_PROBLEM_TYPE", "binary").strip() or "binary"
@@ -92,10 +115,12 @@ def main() -> None:
92
115
  os.makedirs(local_dir, exist_ok=True)
93
116
 
94
117
  train_df = df[feature_list + [label]]
118
+ hyper = _hyperparameters_from_env()
95
119
  logger.info(
96
- "Fitting TabularPredictor problem_type=%s time_limit=%ss rows=%s",
120
+ "Fitting TabularPredictor problem_type=%s time_limit=%ss hyperparameters=%s rows=%s",
97
121
  problem_type,
98
122
  time_limit,
123
+ hyper,
99
124
  len(train_df),
100
125
  )
101
126
  predictor = TabularPredictor(
@@ -103,7 +128,7 @@ def main() -> None:
103
128
  problem_type=problem_type,
104
129
  path=local_dir,
105
130
  )
106
- predictor.fit(train_df, time_limit=time_limit)
131
+ predictor.fit(train_df, time_limit=time_limit, hyperparameters=hyper)
107
132
 
108
133
  logger.info("Uploading model artifacts to %s", model_prefix)
109
134
  upload_directory_to_s3(local_dir, model_prefix)
@@ -200,20 +200,58 @@ def stage_to_clickhouse(
200
200
  """
201
201
  Write transformed data to ClickHouse staging table.
202
202
  Separate job from transform; must complete before analytics can run.
203
- Uses native connector if available, else JDBC.
203
+
204
+ Preferred path: Spark SQL **catalog** API (``DataFrame.writeTo``), matching
205
+ ``job_runner.create_spark_session`` registration of ``ClickHouseCatalog``
206
+ (``BATCH_CLICKHOUSE_CATALOG``, default ``batch_ch``). The clickhouse-spark-runtime
207
+ 0.8.x connector does **not** register the legacy short name ``format("clickhouse")``
208
+ / ``clickhouse.DefaultSource``.
209
+
210
+ Fallback: legacy ``format("clickhouse")`` (older stacks), then JDBC (may fail on
211
+ ClickHouse 25+ auto-DDL without ORDER BY).
212
+
204
213
  Write mode from BATCH_STAGING_WRITE_MODE (default overwrite = full replace).
205
214
  """
206
215
  n = df.count()
207
216
  mode = _normalize_staging_write_mode(config.transform.staging_write_mode)
217
+ ch = config.clickhouse
218
+ tbl = config.transform.staging_table
219
+ cat = os.environ.get("BATCH_CLICKHOUSE_CATALOG", "batch_ch").strip()
220
+
221
+ if cat:
222
+ try:
223
+ full_name = f"{cat}.{ch.database}.{tbl}"
224
+ logger.info(
225
+ "Staging to ClickHouse via catalog %s (mode=%s)",
226
+ full_name,
227
+ mode,
228
+ )
229
+ w2 = df.writeTo(full_name)
230
+ if mode == "overwrite":
231
+ w2.createOrReplace()
232
+ else:
233
+ w2.append()
234
+ logger.info(
235
+ "Staged data to ClickHouse %s.%s (%d rows)",
236
+ ch.database,
237
+ tbl,
238
+ n,
239
+ )
240
+ return
241
+ except Exception as e:
242
+ logger.warning(
243
+ "ClickHouse catalog write failed (%s), trying legacy format/jdbc",
244
+ e,
245
+ )
246
+
208
247
  try:
209
- ch = config.clickhouse
210
248
  writer = (
211
249
  df.write.format("clickhouse")
212
250
  .option("host", ch.host)
213
251
  .option("protocol", ch.protocol)
214
252
  .option("http_port", str(ch.port))
215
253
  .option("database", ch.database)
216
- .option("table", config.transform.staging_table)
254
+ .option("table", tbl)
217
255
  .option("user", ch.user)
218
256
  .mode(mode)
219
257
  )
@@ -221,17 +259,17 @@ def stage_to_clickhouse(
221
259
  writer = writer.option("password", ch.password)
222
260
  writer.save()
223
261
  except Exception as e:
224
- logger.warning("ClickHouse connector failed (%s), using JDBC", e)
262
+ logger.warning("ClickHouse legacy format failed (%s), using JDBC", e)
225
263
  df.write.jdbc(
226
264
  config.clickhouse.jdbc_url,
227
- config.transform.staging_table,
265
+ tbl,
228
266
  mode=mode,
229
267
  properties=config.clickhouse.jdbc_properties,
230
268
  )
231
269
  logger.info(
232
270
  "Staged data to ClickHouse %s.%s (%d rows)",
233
- config.clickhouse.database,
234
- config.transform.staging_table,
271
+ ch.database,
272
+ tbl,
235
273
  n,
236
274
  )
237
275
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.3.20
3
+ Version: 0.3.22
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT
@@ -24,7 +24,8 @@ Requires-Dist: boto3>=1.28; extra == "output"
24
24
  Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "output"
25
25
  Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "output"
26
26
  Provides-Extra: autogluon
27
- Requires-Dist: autogluon<2.0,>=1.0; extra == "autogluon"
27
+ Requires-Dist: autogluon-tabular[lightgbm]<2.0,>=1.0; extra == "autogluon"
28
+ Requires-Dist: typing-extensions>=4.8.0; extra == "autogluon"
28
29
  Requires-Dist: pandas>=1.3.0; extra == "autogluon"
29
30
  Requires-Dist: boto3>=1.28; extra == "autogluon"
30
31
  Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "autogluon"
@@ -36,7 +37,8 @@ Requires-Dist: scipy>=1.5.0; extra == "full"
36
37
  Requires-Dist: boto3>=1.28; extra == "full"
37
38
  Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "full"
38
39
  Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "full"
39
- Requires-Dist: autogluon<2.0,>=1.0; extra == "full"
40
+ Requires-Dist: autogluon-tabular[lightgbm]<2.0,>=1.0; extra == "full"
41
+ Requires-Dist: typing-extensions>=4.8.0; extra == "full"
40
42
  Requires-Dist: pyarrow>=10.0.0; python_version >= "3.8" and extra == "full"
41
43
 
42
44
  # Batch Analytics
@@ -1,7 +1,8 @@
1
1
  numpy>=1.19.0
2
2
 
3
3
  [autogluon]
4
- autogluon<2.0,>=1.0
4
+ autogluon-tabular[lightgbm]<2.0,>=1.0
5
+ typing-extensions>=4.8.0
5
6
  pandas>=1.3.0
6
7
  boto3>=1.28
7
8
 
@@ -29,7 +30,8 @@ pytest>=7.0
29
30
  pyspark<3.6,>=3.4
30
31
  scipy>=1.5.0
31
32
  boto3>=1.28
32
- autogluon<2.0,>=1.0
33
+ autogluon-tabular[lightgbm]<2.0,>=1.0
34
+ typing-extensions>=4.8.0
33
35
 
34
36
  [full:python_version < "3.9"]
35
37
  clickhouse-connect<0.9,>=0.7