batch-analytics 0.3.20__tar.gz → 0.3.22__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {batch_analytics-0.3.20 → batch_analytics-0.3.22}/PKG-INFO +5 -3
- {batch_analytics-0.3.20 → batch_analytics-0.3.22}/pyproject.toml +6 -3
- {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/analytics/gluon_autogluon_train.py +30 -5
- {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/transform.py +45 -7
- {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics.egg-info/PKG-INFO +5 -3
- {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics.egg-info/requires.txt +4 -2
- {batch_analytics-0.3.20 → batch_analytics-0.3.22}/README.md +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.22}/setup.cfg +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/__init__.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/__main__.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/analytics/__init__.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/analytics/correlation.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/analytics/gluon_autogluon_infer.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/analytics/linear_regression.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/analytics/pca_clustering.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/analytics/t_test.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/config.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/extract.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/job_runner.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/log.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/modules.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/output/__init__.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/output/base.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/output/clickhouse.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/output/local.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/output/s3.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/utils/__init__.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/utils/gluon_autogluon_common.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics.egg-info/SOURCES.txt +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics.egg-info/entry_points.txt +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: batch-analytics
|
|
3
|
-
Version: 0.3.20
|
|
3
|
+
Version: 0.3.22
|
|
4
4
|
Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
|
|
5
5
|
Author: Litewave Analytics Team
|
|
6
6
|
License: MIT
|
|
@@ -24,7 +24,8 @@ Requires-Dist: boto3>=1.28; extra == "output"
|
|
|
24
24
|
Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "output"
|
|
25
25
|
Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "output"
|
|
26
26
|
Provides-Extra: autogluon
|
|
27
|
-
Requires-Dist: autogluon<2.0,>=1.0; extra == "autogluon"
|
|
27
|
+
Requires-Dist: autogluon-tabular[lightgbm]<2.0,>=1.0; extra == "autogluon"
|
|
28
|
+
Requires-Dist: typing-extensions>=4.8.0; extra == "autogluon"
|
|
28
29
|
Requires-Dist: pandas>=1.3.0; extra == "autogluon"
|
|
29
30
|
Requires-Dist: boto3>=1.28; extra == "autogluon"
|
|
30
31
|
Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "autogluon"
|
|
@@ -36,7 +37,8 @@ Requires-Dist: scipy>=1.5.0; extra == "full"
|
|
|
36
37
|
Requires-Dist: boto3>=1.28; extra == "full"
|
|
37
38
|
Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "full"
|
|
38
39
|
Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "full"
|
|
39
|
-
Requires-Dist: autogluon<2.0,>=1.0; extra == "full"
|
|
40
|
+
Requires-Dist: autogluon-tabular[lightgbm]<2.0,>=1.0; extra == "full"
|
|
41
|
+
Requires-Dist: typing-extensions>=4.8.0; extra == "full"
|
|
40
42
|
Requires-Dist: pyarrow>=10.0.0; python_version >= "3.8" and extra == "full"
|
|
41
43
|
|
|
42
44
|
# Batch Analytics
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "batch-analytics"
|
|
7
|
-
version = "0.3.20"
|
|
7
|
+
version = "0.3.22"
|
|
8
8
|
description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.8"
|
|
@@ -31,8 +31,10 @@ output = [
|
|
|
31
31
|
"clickhouse-connect>=0.7,<0.9; python_version < '3.9'",
|
|
32
32
|
"clickhouse-connect>=0.7; python_version >= '3.9'",
|
|
33
33
|
]
|
|
34
|
+
# autogluon-tabular[lightgbm]: tabular AutoML with LightGBM only (no PyTorch from full autogluon metapackage)
|
|
34
35
|
autogluon = [
|
|
35
|
-
"autogluon>=1.0,<2.0",
|
|
36
|
+
"autogluon-tabular[lightgbm]>=1.0,<2.0",
|
|
37
|
+
"typing-extensions>=4.8.0",
|
|
36
38
|
"pandas>=1.3.0",
|
|
37
39
|
"boto3>=1.28",
|
|
38
40
|
"clickhouse-connect>=0.7,<0.9; python_version < '3.9'",
|
|
@@ -45,7 +47,8 @@ full = [
|
|
|
45
47
|
"boto3>=1.28",
|
|
46
48
|
"clickhouse-connect>=0.7,<0.9; python_version < '3.9'",
|
|
47
49
|
"clickhouse-connect>=0.7; python_version >= '3.9'",
|
|
48
|
-
"autogluon>=1.0,<2.0",
|
|
50
|
+
"autogluon-tabular[lightgbm]>=1.0,<2.0",
|
|
51
|
+
"typing-extensions>=4.8.0",
|
|
49
52
|
"pyarrow>=10.0.0; python_version >= '3.8'",
|
|
50
53
|
]
|
|
51
54
|
|
|
@@ -4,11 +4,13 @@ Train AutoGluon TabularPredictor from ClickHouse staging data; upload artifacts
|
|
|
4
4
|
Env (injected by analytics_runner Gluon job):
|
|
5
5
|
CLICKHOUSE_*, BATCH_STAGING_TABLE, MODEL_S3_PREFIX, TASK_ID,
|
|
6
6
|
AUTOGLUON_LABEL, AUTOGLUON_FEATURES, AUTOGLUON_PROBLEM_TYPE, AUTOGLUON_TIME_LIMIT,
|
|
7
|
-
optional AUTOGLUON_MAX_ROWS
|
|
7
|
+
optional AUTOGLUON_MAX_ROWS,
|
|
8
|
+
optional AUTOGLUON_HYPERPARAMETERS — JSON object of AutoGluon model hyperparameters; default {"GBM": {}} (LightGBM only, no PyTorch)
|
|
8
9
|
"""
|
|
9
10
|
|
|
10
11
|
from __future__ import annotations
|
|
11
12
|
|
|
13
|
+
import json
|
|
12
14
|
import logging
|
|
13
15
|
import os
|
|
14
16
|
import shutil
|
|
@@ -44,6 +46,22 @@ def _require(name: str) -> str:
|
|
|
44
46
|
return v
|
|
45
47
|
|
|
46
48
|
|
|
49
|
+
def _hyperparameters_from_env() -> dict:
|
|
50
|
+
"""Default: GBM (LightGBM) only — matches autogluon-tabular[lightgbm] and avoids NN/torch models."""
|
|
51
|
+
raw = os.environ.get("AUTOGLUON_HYPERPARAMETERS", "").strip()
|
|
52
|
+
if not raw:
|
|
53
|
+
return {"GBM": {}}
|
|
54
|
+
try:
|
|
55
|
+
parsed = json.loads(raw)
|
|
56
|
+
except json.JSONDecodeError as e:
|
|
57
|
+
logger.error("AUTOGLUON_HYPERPARAMETERS must be valid JSON: %s", e)
|
|
58
|
+
sys.exit(2)
|
|
59
|
+
if not isinstance(parsed, dict):
|
|
60
|
+
logger.error("AUTOGLUON_HYPERPARAMETERS must be a JSON object, e.g. {\"GBM\": {}}")
|
|
61
|
+
sys.exit(2)
|
|
62
|
+
return parsed
|
|
63
|
+
|
|
64
|
+
|
|
47
65
|
def main() -> None:
|
|
48
66
|
model_prefix = _require("MODEL_S3_PREFIX")
|
|
49
67
|
label = _require("AUTOGLUON_LABEL")
|
|
@@ -79,8 +97,13 @@ def main() -> None:
|
|
|
79
97
|
|
|
80
98
|
try:
|
|
81
99
|
from autogluon.tabular import TabularPredictor
|
|
82
|
-
except ImportError:
|
|
83
|
-
|
|
100
|
+
except ImportError as e:
|
|
101
|
+
if isinstance(e, ModuleNotFoundError) and e.name == "typing_extensions":
|
|
102
|
+
logger.exception(
|
|
103
|
+
"Install typing-extensions (required by AutoGluon): pip install 'typing-extensions>=4.8'"
|
|
104
|
+
)
|
|
105
|
+
else:
|
|
106
|
+
logger.exception("autogluon is not installed; use pip install 'batch-analytics[autogluon]'")
|
|
84
107
|
sys.exit(4)
|
|
85
108
|
|
|
86
109
|
problem_type = os.environ.get("AUTOGLUON_PROBLEM_TYPE", "binary").strip() or "binary"
|
|
@@ -92,10 +115,12 @@ def main() -> None:
|
|
|
92
115
|
os.makedirs(local_dir, exist_ok=True)
|
|
93
116
|
|
|
94
117
|
train_df = df[feature_list + [label]]
|
|
118
|
+
hyper = _hyperparameters_from_env()
|
|
95
119
|
logger.info(
|
|
96
|
-
"Fitting TabularPredictor problem_type=%s time_limit=%ss rows=%s",
|
|
120
|
+
"Fitting TabularPredictor problem_type=%s time_limit=%ss hyperparameters=%s rows=%s",
|
|
97
121
|
problem_type,
|
|
98
122
|
time_limit,
|
|
123
|
+
hyper,
|
|
99
124
|
len(train_df),
|
|
100
125
|
)
|
|
101
126
|
predictor = TabularPredictor(
|
|
@@ -103,7 +128,7 @@ def main() -> None:
|
|
|
103
128
|
problem_type=problem_type,
|
|
104
129
|
path=local_dir,
|
|
105
130
|
)
|
|
106
|
-
predictor.fit(train_df, time_limit=time_limit)
|
|
131
|
+
predictor.fit(train_df, time_limit=time_limit, hyperparameters=hyper)
|
|
107
132
|
|
|
108
133
|
logger.info("Uploading model artifacts to %s", model_prefix)
|
|
109
134
|
upload_directory_to_s3(local_dir, model_prefix)
|
|
@@ -200,20 +200,58 @@ def stage_to_clickhouse(
|
|
|
200
200
|
"""
|
|
201
201
|
Write transformed data to ClickHouse staging table.
|
|
202
202
|
Separate job from transform; must complete before analytics can run.
|
|
203
|
-
|
|
203
|
+
|
|
204
|
+
Preferred path: Spark SQL **catalog** API (``DataFrame.writeTo``), matching
|
|
205
|
+
``job_runner.create_spark_session`` registration of ``ClickHouseCatalog``
|
|
206
|
+
(``BATCH_CLICKHOUSE_CATALOG``, default ``batch_ch``). The clickhouse-spark-runtime
|
|
207
|
+
0.8.x connector does **not** register the legacy short name ``format("clickhouse")``
|
|
208
|
+
/ ``clickhouse.DefaultSource``.
|
|
209
|
+
|
|
210
|
+
Fallback: legacy ``format("clickhouse")`` (older stacks), then JDBC (may fail on
|
|
211
|
+
ClickHouse 25+ auto-DDL without ORDER BY).
|
|
212
|
+
|
|
204
213
|
Write mode from BATCH_STAGING_WRITE_MODE (default overwrite = full replace).
|
|
205
214
|
"""
|
|
206
215
|
n = df.count()
|
|
207
216
|
mode = _normalize_staging_write_mode(config.transform.staging_write_mode)
|
|
217
|
+
ch = config.clickhouse
|
|
218
|
+
tbl = config.transform.staging_table
|
|
219
|
+
cat = os.environ.get("BATCH_CLICKHOUSE_CATALOG", "batch_ch").strip()
|
|
220
|
+
|
|
221
|
+
if cat:
|
|
222
|
+
try:
|
|
223
|
+
full_name = f"{cat}.{ch.database}.{tbl}"
|
|
224
|
+
logger.info(
|
|
225
|
+
"Staging to ClickHouse via catalog %s (mode=%s)",
|
|
226
|
+
full_name,
|
|
227
|
+
mode,
|
|
228
|
+
)
|
|
229
|
+
w2 = df.writeTo(full_name)
|
|
230
|
+
if mode == "overwrite":
|
|
231
|
+
w2.createOrReplace()
|
|
232
|
+
else:
|
|
233
|
+
w2.append()
|
|
234
|
+
logger.info(
|
|
235
|
+
"Staged data to ClickHouse %s.%s (%d rows)",
|
|
236
|
+
ch.database,
|
|
237
|
+
tbl,
|
|
238
|
+
n,
|
|
239
|
+
)
|
|
240
|
+
return
|
|
241
|
+
except Exception as e:
|
|
242
|
+
logger.warning(
|
|
243
|
+
"ClickHouse catalog write failed (%s), trying legacy format/jdbc",
|
|
244
|
+
e,
|
|
245
|
+
)
|
|
246
|
+
|
|
208
247
|
try:
|
|
209
|
-
ch = config.clickhouse
|
|
210
248
|
writer = (
|
|
211
249
|
df.write.format("clickhouse")
|
|
212
250
|
.option("host", ch.host)
|
|
213
251
|
.option("protocol", ch.protocol)
|
|
214
252
|
.option("http_port", str(ch.port))
|
|
215
253
|
.option("database", ch.database)
|
|
216
|
-
.option("table", config.transform.staging_table)
|
|
254
|
+
.option("table", tbl)
|
|
217
255
|
.option("user", ch.user)
|
|
218
256
|
.mode(mode)
|
|
219
257
|
)
|
|
@@ -221,17 +259,17 @@ def stage_to_clickhouse(
|
|
|
221
259
|
writer = writer.option("password", ch.password)
|
|
222
260
|
writer.save()
|
|
223
261
|
except Exception as e:
|
|
224
|
-
logger.warning("ClickHouse
|
|
262
|
+
logger.warning("ClickHouse legacy format failed (%s), using JDBC", e)
|
|
225
263
|
df.write.jdbc(
|
|
226
264
|
config.clickhouse.jdbc_url,
|
|
227
|
-
|
|
265
|
+
tbl,
|
|
228
266
|
mode=mode,
|
|
229
267
|
properties=config.clickhouse.jdbc_properties,
|
|
230
268
|
)
|
|
231
269
|
logger.info(
|
|
232
270
|
"Staged data to ClickHouse %s.%s (%d rows)",
|
|
233
|
-
|
|
234
|
-
|
|
271
|
+
ch.database,
|
|
272
|
+
tbl,
|
|
235
273
|
n,
|
|
236
274
|
)
|
|
237
275
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: batch-analytics
|
|
3
|
-
Version: 0.3.20
|
|
3
|
+
Version: 0.3.22
|
|
4
4
|
Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
|
|
5
5
|
Author: Litewave Analytics Team
|
|
6
6
|
License: MIT
|
|
@@ -24,7 +24,8 @@ Requires-Dist: boto3>=1.28; extra == "output"
|
|
|
24
24
|
Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "output"
|
|
25
25
|
Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "output"
|
|
26
26
|
Provides-Extra: autogluon
|
|
27
|
-
Requires-Dist: autogluon<2.0,>=1.0; extra == "autogluon"
|
|
27
|
+
Requires-Dist: autogluon-tabular[lightgbm]<2.0,>=1.0; extra == "autogluon"
|
|
28
|
+
Requires-Dist: typing-extensions>=4.8.0; extra == "autogluon"
|
|
28
29
|
Requires-Dist: pandas>=1.3.0; extra == "autogluon"
|
|
29
30
|
Requires-Dist: boto3>=1.28; extra == "autogluon"
|
|
30
31
|
Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "autogluon"
|
|
@@ -36,7 +37,8 @@ Requires-Dist: scipy>=1.5.0; extra == "full"
|
|
|
36
37
|
Requires-Dist: boto3>=1.28; extra == "full"
|
|
37
38
|
Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "full"
|
|
38
39
|
Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "full"
|
|
39
|
-
Requires-Dist: autogluon<2.0,>=1.0; extra == "full"
|
|
40
|
+
Requires-Dist: autogluon-tabular[lightgbm]<2.0,>=1.0; extra == "full"
|
|
41
|
+
Requires-Dist: typing-extensions>=4.8.0; extra == "full"
|
|
40
42
|
Requires-Dist: pyarrow>=10.0.0; python_version >= "3.8" and extra == "full"
|
|
41
43
|
|
|
42
44
|
# Batch Analytics
|
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
numpy>=1.19.0
|
|
2
2
|
|
|
3
3
|
[autogluon]
|
|
4
|
-
autogluon<2.0,>=1.0
|
|
4
|
+
autogluon-tabular[lightgbm]<2.0,>=1.0
|
|
5
|
+
typing-extensions>=4.8.0
|
|
5
6
|
pandas>=1.3.0
|
|
6
7
|
boto3>=1.28
|
|
7
8
|
|
|
@@ -29,7 +30,8 @@ pytest>=7.0
|
|
|
29
30
|
pyspark<3.6,>=3.4
|
|
30
31
|
scipy>=1.5.0
|
|
31
32
|
boto3>=1.28
|
|
32
|
-
autogluon<2.0,>=1.0
|
|
33
|
+
autogluon-tabular[lightgbm]<2.0,>=1.0
|
|
34
|
+
typing-extensions>=4.8.0
|
|
33
35
|
|
|
34
36
|
[full:python_version < "3.9"]
|
|
35
37
|
clickhouse-connect<0.9,>=0.7
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/analytics/correlation.py
RENAMED
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/analytics/linear_regression.py
RENAMED
|
File without changes
|
{batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics/analytics/pca_clustering.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics.egg-info/entry_points.txt
RENAMED
|
File without changes
|
{batch_analytics-0.3.20 → batch_analytics-0.3.22}/src/batch_analytics.egg-info/top_level.txt
RENAMED
|
File without changes
|