batch-analytics 0.3.20__tar.gz → 0.3.21__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {batch_analytics-0.3.20 → batch_analytics-0.3.21}/PKG-INFO +3 -3
- {batch_analytics-0.3.20 → batch_analytics-0.3.21}/pyproject.toml +4 -3
- {batch_analytics-0.3.20 → batch_analytics-0.3.21}/src/batch_analytics/analytics/gluon_autogluon_train.py +23 -3
- {batch_analytics-0.3.20 → batch_analytics-0.3.21}/src/batch_analytics.egg-info/PKG-INFO +3 -3
- {batch_analytics-0.3.20 → batch_analytics-0.3.21}/src/batch_analytics.egg-info/requires.txt +2 -2
- {batch_analytics-0.3.20 → batch_analytics-0.3.21}/README.md +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.21}/setup.cfg +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.21}/src/batch_analytics/__init__.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.21}/src/batch_analytics/__main__.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.21}/src/batch_analytics/analytics/__init__.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.21}/src/batch_analytics/analytics/correlation.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.21}/src/batch_analytics/analytics/gluon_autogluon_infer.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.21}/src/batch_analytics/analytics/linear_regression.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.21}/src/batch_analytics/analytics/pca_clustering.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.21}/src/batch_analytics/analytics/t_test.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.21}/src/batch_analytics/config.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.21}/src/batch_analytics/extract.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.21}/src/batch_analytics/job_runner.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.21}/src/batch_analytics/log.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.21}/src/batch_analytics/modules.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.21}/src/batch_analytics/output/__init__.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.21}/src/batch_analytics/output/base.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.21}/src/batch_analytics/output/clickhouse.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.21}/src/batch_analytics/output/local.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.21}/src/batch_analytics/output/s3.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.21}/src/batch_analytics/transform.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.21}/src/batch_analytics/utils/__init__.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.21}/src/batch_analytics/utils/gluon_autogluon_common.py +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.21}/src/batch_analytics.egg-info/SOURCES.txt +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.21}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.21}/src/batch_analytics.egg-info/entry_points.txt +0 -0
- {batch_analytics-0.3.20 → batch_analytics-0.3.21}/src/batch_analytics.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: batch-analytics
|
|
3
|
-
Version: 0.3.20
|
|
3
|
+
Version: 0.3.21
|
|
4
4
|
Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
|
|
5
5
|
Author: Litewave Analytics Team
|
|
6
6
|
License: MIT
|
|
@@ -24,7 +24,7 @@ Requires-Dist: boto3>=1.28; extra == "output"
|
|
|
24
24
|
Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "output"
|
|
25
25
|
Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "output"
|
|
26
26
|
Provides-Extra: autogluon
|
|
27
|
-
Requires-Dist: autogluon<2.0,>=1.0; extra == "autogluon"
|
|
27
|
+
Requires-Dist: autogluon-tabular[lightgbm]<2.0,>=1.0; extra == "autogluon"
|
|
28
28
|
Requires-Dist: pandas>=1.3.0; extra == "autogluon"
|
|
29
29
|
Requires-Dist: boto3>=1.28; extra == "autogluon"
|
|
30
30
|
Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "autogluon"
|
|
@@ -36,7 +36,7 @@ Requires-Dist: scipy>=1.5.0; extra == "full"
|
|
|
36
36
|
Requires-Dist: boto3>=1.28; extra == "full"
|
|
37
37
|
Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "full"
|
|
38
38
|
Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "full"
|
|
39
|
-
Requires-Dist: autogluon<2.0,>=1.0; extra == "full"
|
|
39
|
+
Requires-Dist: autogluon-tabular[lightgbm]<2.0,>=1.0; extra == "full"
|
|
40
40
|
Requires-Dist: pyarrow>=10.0.0; python_version >= "3.8" and extra == "full"
|
|
41
41
|
|
|
42
42
|
# Batch Analytics
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "batch-analytics"
|
|
7
|
-
version = "0.3.20"
|
|
7
|
+
version = "0.3.21"
|
|
8
8
|
description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.8"
|
|
@@ -31,8 +31,9 @@ output = [
|
|
|
31
31
|
"clickhouse-connect>=0.7,<0.9; python_version < '3.9'",
|
|
32
32
|
"clickhouse-connect>=0.7; python_version >= '3.9'",
|
|
33
33
|
]
|
|
34
|
+
# autogluon-tabular[lightgbm]: tabular AutoML with LightGBM only (no PyTorch from full autogluon metapackage)
|
|
34
35
|
autogluon = [
|
|
35
|
-
"autogluon>=1.0,<2.0",
|
|
36
|
+
"autogluon-tabular[lightgbm]>=1.0,<2.0",
|
|
36
37
|
"pandas>=1.3.0",
|
|
37
38
|
"boto3>=1.28",
|
|
38
39
|
"clickhouse-connect>=0.7,<0.9; python_version < '3.9'",
|
|
@@ -45,7 +46,7 @@ full = [
|
|
|
45
46
|
"boto3>=1.28",
|
|
46
47
|
"clickhouse-connect>=0.7,<0.9; python_version < '3.9'",
|
|
47
48
|
"clickhouse-connect>=0.7; python_version >= '3.9'",
|
|
48
|
-
"autogluon>=1.0,<2.0",
|
|
49
|
+
"autogluon-tabular[lightgbm]>=1.0,<2.0",
|
|
49
50
|
"pyarrow>=10.0.0; python_version >= '3.8'",
|
|
50
51
|
]
|
|
51
52
|
|
|
@@ -4,11 +4,13 @@ Train AutoGluon TabularPredictor from ClickHouse staging data; upload artifacts
|
|
|
4
4
|
Env (injected by analytics_runner Gluon job):
|
|
5
5
|
CLICKHOUSE_*, BATCH_STAGING_TABLE, MODEL_S3_PREFIX, TASK_ID,
|
|
6
6
|
AUTOGLUON_LABEL, AUTOGLUON_FEATURES, AUTOGLUON_PROBLEM_TYPE, AUTOGLUON_TIME_LIMIT,
|
|
7
|
-
optional AUTOGLUON_MAX_ROWS
|
|
7
|
+
optional AUTOGLUON_MAX_ROWS,
|
|
8
|
+
optional AUTOGLUON_HYPERPARAMETERS — JSON object of AutoGluon model hyperparameters; default {"GBM": {}} (LightGBM only, no PyTorch)
|
|
8
9
|
"""
|
|
9
10
|
|
|
10
11
|
from __future__ import annotations
|
|
11
12
|
|
|
13
|
+
import json
|
|
12
14
|
import logging
|
|
13
15
|
import os
|
|
14
16
|
import shutil
|
|
@@ -44,6 +46,22 @@ def _require(name: str) -> str:
|
|
|
44
46
|
return v
|
|
45
47
|
|
|
46
48
|
|
|
49
|
+
def _hyperparameters_from_env() -> dict:
|
|
50
|
+
"""Default: GBM (LightGBM) only — matches autogluon-tabular[lightgbm] and avoids NN/torch models."""
|
|
51
|
+
raw = os.environ.get("AUTOGLUON_HYPERPARAMETERS", "").strip()
|
|
52
|
+
if not raw:
|
|
53
|
+
return {"GBM": {}}
|
|
54
|
+
try:
|
|
55
|
+
parsed = json.loads(raw)
|
|
56
|
+
except json.JSONDecodeError as e:
|
|
57
|
+
logger.error("AUTOGLUON_HYPERPARAMETERS must be valid JSON: %s", e)
|
|
58
|
+
sys.exit(2)
|
|
59
|
+
if not isinstance(parsed, dict):
|
|
60
|
+
logger.error("AUTOGLUON_HYPERPARAMETERS must be a JSON object, e.g. {\"GBM\": {}}")
|
|
61
|
+
sys.exit(2)
|
|
62
|
+
return parsed
|
|
63
|
+
|
|
64
|
+
|
|
47
65
|
def main() -> None:
|
|
48
66
|
model_prefix = _require("MODEL_S3_PREFIX")
|
|
49
67
|
label = _require("AUTOGLUON_LABEL")
|
|
@@ -92,10 +110,12 @@ def main() -> None:
|
|
|
92
110
|
os.makedirs(local_dir, exist_ok=True)
|
|
93
111
|
|
|
94
112
|
train_df = df[feature_list + [label]]
|
|
113
|
+
hyper = _hyperparameters_from_env()
|
|
95
114
|
logger.info(
|
|
96
|
-
"Fitting TabularPredictor problem_type=%s time_limit=%ss rows=%s",
|
|
115
|
+
"Fitting TabularPredictor problem_type=%s time_limit=%ss hyperparameters=%s rows=%s",
|
|
97
116
|
problem_type,
|
|
98
117
|
time_limit,
|
|
118
|
+
hyper,
|
|
99
119
|
len(train_df),
|
|
100
120
|
)
|
|
101
121
|
predictor = TabularPredictor(
|
|
@@ -103,7 +123,7 @@ def main() -> None:
|
|
|
103
123
|
problem_type=problem_type,
|
|
104
124
|
path=local_dir,
|
|
105
125
|
)
|
|
106
|
-
predictor.fit(train_df, time_limit=time_limit)
|
|
126
|
+
predictor.fit(train_df, time_limit=time_limit, hyperparameters=hyper)
|
|
107
127
|
|
|
108
128
|
logger.info("Uploading model artifacts to %s", model_prefix)
|
|
109
129
|
upload_directory_to_s3(local_dir, model_prefix)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: batch-analytics
|
|
3
|
-
Version: 0.3.20
|
|
3
|
+
Version: 0.3.21
|
|
4
4
|
Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
|
|
5
5
|
Author: Litewave Analytics Team
|
|
6
6
|
License: MIT
|
|
@@ -24,7 +24,7 @@ Requires-Dist: boto3>=1.28; extra == "output"
|
|
|
24
24
|
Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "output"
|
|
25
25
|
Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "output"
|
|
26
26
|
Provides-Extra: autogluon
|
|
27
|
-
Requires-Dist: autogluon<2.0,>=1.0; extra == "autogluon"
|
|
27
|
+
Requires-Dist: autogluon-tabular[lightgbm]<2.0,>=1.0; extra == "autogluon"
|
|
28
28
|
Requires-Dist: pandas>=1.3.0; extra == "autogluon"
|
|
29
29
|
Requires-Dist: boto3>=1.28; extra == "autogluon"
|
|
30
30
|
Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "autogluon"
|
|
@@ -36,7 +36,7 @@ Requires-Dist: scipy>=1.5.0; extra == "full"
|
|
|
36
36
|
Requires-Dist: boto3>=1.28; extra == "full"
|
|
37
37
|
Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "full"
|
|
38
38
|
Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "full"
|
|
39
|
-
Requires-Dist: autogluon<2.0,>=1.0; extra == "full"
|
|
39
|
+
Requires-Dist: autogluon-tabular[lightgbm]<2.0,>=1.0; extra == "full"
|
|
40
40
|
Requires-Dist: pyarrow>=10.0.0; python_version >= "3.8" and extra == "full"
|
|
41
41
|
|
|
42
42
|
# Batch Analytics
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
numpy>=1.19.0
|
|
2
2
|
|
|
3
3
|
[autogluon]
|
|
4
|
-
autogluon<2.0,>=1.0
|
|
4
|
+
autogluon-tabular[lightgbm]<2.0,>=1.0
|
|
5
5
|
pandas>=1.3.0
|
|
6
6
|
boto3>=1.28
|
|
7
7
|
|
|
@@ -29,7 +29,7 @@ pytest>=7.0
|
|
|
29
29
|
pyspark<3.6,>=3.4
|
|
30
30
|
scipy>=1.5.0
|
|
31
31
|
boto3>=1.28
|
|
32
|
-
autogluon<2.0,>=1.0
|
|
32
|
+
autogluon-tabular[lightgbm]<2.0,>=1.0
|
|
33
33
|
|
|
34
34
|
[full:python_version < "3.9"]
|
|
35
35
|
clickhouse-connect<0.9,>=0.7
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.20 → batch_analytics-0.3.21}/src/batch_analytics/analytics/correlation.py
RENAMED
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.20 → batch_analytics-0.3.21}/src/batch_analytics/analytics/linear_regression.py
RENAMED
|
File without changes
|
{batch_analytics-0.3.20 → batch_analytics-0.3.21}/src/batch_analytics/analytics/pca_clustering.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.20 → batch_analytics-0.3.21}/src/batch_analytics.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{batch_analytics-0.3.20 → batch_analytics-0.3.21}/src/batch_analytics.egg-info/entry_points.txt
RENAMED
|
File without changes
|
{batch_analytics-0.3.20 → batch_analytics-0.3.21}/src/batch_analytics.egg-info/top_level.txt
RENAMED
|
File without changes
|