batch-analytics 0.3.15__tar.gz → 0.3.16__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {batch_analytics-0.3.15 → batch_analytics-0.3.16}/PKG-INFO +1 -1
- {batch_analytics-0.3.15 → batch_analytics-0.3.16}/pyproject.toml +1 -1
- {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/analytics/gluon_autogluon_infer.py +52 -1
- {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics.egg-info/PKG-INFO +1 -1
- {batch_analytics-0.3.15 → batch_analytics-0.3.16}/README.md +0 -0
- {batch_analytics-0.3.15 → batch_analytics-0.3.16}/setup.cfg +0 -0
- {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/__init__.py +0 -0
- {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/__main__.py +0 -0
- {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/analytics/__init__.py +0 -0
- {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/analytics/correlation.py +0 -0
- {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/analytics/gluon_autogluon_train.py +0 -0
- {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/analytics/linear_regression.py +0 -0
- {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/analytics/pca_clustering.py +0 -0
- {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/analytics/t_test.py +0 -0
- {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/config.py +0 -0
- {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/extract.py +0 -0
- {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/job_runner.py +0 -0
- {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/log.py +0 -0
- {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/modules.py +0 -0
- {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/output/__init__.py +0 -0
- {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/output/base.py +0 -0
- {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/output/clickhouse.py +0 -0
- {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/output/local.py +0 -0
- {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/output/s3.py +0 -0
- {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/transform.py +0 -0
- {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/utils/__init__.py +0 -0
- {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/utils/gluon_autogluon_common.py +0 -0
- {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics.egg-info/SOURCES.txt +0 -0
- {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
- {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics.egg-info/entry_points.txt +0 -0
- {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics.egg-info/requires.txt +0 -0
- {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics.egg-info/top_level.txt +0 -0
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "batch-analytics"
|
|
7
|
-
version = "0.3.15"
|
|
7
|
+
version = "0.3.16"
|
|
8
8
|
description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.8"
|
|
@@ -2,7 +2,8 @@
|
|
|
2
2
|
Load AutoGluon TabularPredictor from S3; score ClickHouse rows; write predictions to output.
|
|
3
3
|
|
|
4
4
|
Env: CLICKHOUSE_*, BATCH_STAGING_TABLE (inference feature table), MODEL_S3_PREFIX,
|
|
5
|
-
OUTPUT_TYPE, OUTPUT_CLICKHOUSE_DATABASE, OUTPUT_CLICKHOUSE_TABLE (clickhouse),
|
|
5
|
+
OUTPUT_TYPE (from OutputConfig.type), OUTPUT_CLICKHOUSE_DATABASE, OUTPUT_CLICKHOUSE_TABLE (clickhouse),
|
|
6
|
+
OUTPUT_CLICKHOUSE_AUTO_CREATE (optional; default true when unset — CREATE TABLE IF NOT EXISTS for clickhouse),
|
|
6
7
|
or OUTPUT_S3_PATH (s3 parquet)
|
|
7
8
|
"""
|
|
8
9
|
|
|
@@ -37,6 +38,54 @@ logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
|
|
|
37
38
|
logger = logging.getLogger(__name__)
|
|
38
39
|
|
|
39
40
|
|
|
41
|
+
def _env_truthy(name: str, *, default: bool) -> bool:
    """Interpret the environment variable *name* as a boolean flag.

    Args:
        name: Environment variable to read.
        default: Value returned when the variable is unset, empty, or
            contains only whitespace.

    Returns:
        ``True`` when the trimmed, lower-cased value is one of ``"1"``,
        ``"true"``, ``"yes"`` or ``"on"``; ``False`` for any other non-empty
        value; *default* when there is no value at all.
    """
    # `or ""` folds both "unset" (None) and "set to empty" into one case.
    raw = (os.environ.get(name) or "").strip().lower()
    if not raw:
        return default
    return raw in ("1", "true", "yes", "on")
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _sql_ident(name: str) -> str:
    """Return *name* as a back-quoted SQL identifier.

    Args:
        name: Identifier text; it is passed through ``str()`` first, so
            non-string values are accepted.

    Returns:
        The identifier wrapped in backticks, with embedded backslashes and
        backticks doubled so they cannot terminate the quoted identifier.
    """
    # Backslashes are doubled first: ClickHouse applies backslash escapes
    # inside quoted identifiers, so a lone trailing "\" would otherwise
    # escape the closing backtick.
    escaped = str(name).replace("\\", "\\\\").replace("`", "``")
    return "`" + escaped + "`"
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _pandas_col_ch_type(series: pd.Series) -> str:
    """Map a pandas column to a ClickHouse column type.

    The mapping is driven by the dtype's one-letter ``kind`` code:

    * ``"b"`` -> ``Bool``
    * ``"i"`` / ``"u"`` -> ``Int8..Int64`` / ``UInt8..UInt64`` chosen by the
      dtype's item size (unknown sizes fall back to the 64-bit type)
    * ``"f"`` -> ``Float32`` when the item size is at most 4 bytes, else
      ``Float64``
    * ``"M"`` -> ``DateTime64(3)``
    * anything else, or a dtype without a ``kind`` -> ``String``

    Args:
        series: Column to inspect.

    Returns:
        The ClickHouse type name, wrapped in ``Nullable(...)`` when the
        column currently contains at least one null value.
    """
    try:
        kind = series.dtype.kind
    except AttributeError:
        # No usable dtype information: treat the column as text.
        kind = "O"

    if kind == "b":
        base = "Bool"
    elif kind in ("i", "u"):
        width = getattr(series.dtype, "itemsize", 8) or 8
        bits = {1: 8, 2: 16, 4: 32, 8: 64}.get(width, 64)
        base = f"{'Int' if kind == 'i' else 'UInt'}{bits}"
    elif kind == "f":
        width = getattr(series.dtype, "itemsize", 8) or 8
        base = "Float32" if width <= 4 else "Float64"
    elif kind == "M":
        base = "DateTime64(3)"
    else:
        base = "String"

    # Nullability reflects only the values present in this column right now.
    if series.isna().any():
        return f"Nullable({base})"
    return base
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _ensure_clickhouse_output_table(client, database: str, table: str, out_df: pd.DataFrame) -> None:
    """Create the ClickHouse output table for *out_df* if it does not exist.

    Builds one column definition per DataFrame column (name quoted with
    ``_sql_ident``, type chosen by ``_pandas_col_ch_type``) and issues
    ``CREATE TABLE IF NOT EXISTS ... ENGINE = MergeTree ORDER BY tuple()``
    through ``client.command``.

    Args:
        client: Object exposing ``command(sql)``.
            NOTE(review): presumably a clickhouse-connect client - confirm
            against the caller.
        database: Target database name.
        table: Target table name.
        out_df: DataFrame whose columns define the table schema.

    Raises:
        ValueError: If *out_df* has no columns; the resulting DDL would have
            an empty column list.
    """
    if len(out_df.columns) == 0:
        raise ValueError(
            f"Cannot create ClickHouse table {database}.{table}: output DataFrame has no columns"
        )

    # items() yields a (label, Series) pair per column, which stays a Series
    # even when labels repeat (out_df[label] would return a DataFrame then).
    # NOTE(review): nullability is inferred from this DataFrame only, and
    # IF NOT EXISTS leaves an already-existing table's schema untouched.
    col_defs = [
        f"  {_sql_ident(label)} {_pandas_col_ch_type(column)}"
        for label, column in out_df.items()
    ]
    body = ",\n".join(col_defs)
    fq = f"{_sql_ident(database)}.{_sql_ident(table)}"
    ddl = f"CREATE TABLE IF NOT EXISTS {fq} (\n{body}\n) ENGINE = MergeTree ORDER BY tuple()"
    logger.info("Ensuring ClickHouse output table exists: %s.%s", database, table)
    client.command(ddl)
|
|
87
|
+
|
|
88
|
+
|
|
40
89
|
def _require(name: str) -> str:
|
|
41
90
|
v = os.environ.get(name, "").strip()
|
|
42
91
|
if not v:
|
|
@@ -93,6 +142,8 @@ def main() -> None:
|
|
|
93
142
|
odb = _require("OUTPUT_CLICKHOUSE_DATABASE")
|
|
94
143
|
otbl = _require("OUTPUT_CLICKHOUSE_TABLE")
|
|
95
144
|
out_full = clickhouse_full_table(odb, otbl)
|
|
145
|
+
if _env_truthy("OUTPUT_CLICKHOUSE_AUTO_CREATE", default=True):
|
|
146
|
+
_ensure_clickhouse_output_table(client, odb, otbl, out_df)
|
|
96
147
|
logger.info("Inserting %s rows into %s", len(out_df), out_full)
|
|
97
148
|
client.insert_df(out_full, out_df)
|
|
98
149
|
else:
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/analytics/correlation.py
RENAMED
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/analytics/linear_regression.py
RENAMED
|
File without changes
|
{batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/analytics/pca_clustering.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics.egg-info/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics.egg-info/top_level.txt
RENAMED
|
File without changes
|