batch-analytics 0.3.15__tar.gz → 0.3.16__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {batch_analytics-0.3.15 → batch_analytics-0.3.16}/PKG-INFO +1 -1
  2. {batch_analytics-0.3.15 → batch_analytics-0.3.16}/pyproject.toml +1 -1
  3. {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/analytics/gluon_autogluon_infer.py +52 -1
  4. {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics.egg-info/PKG-INFO +1 -1
  5. {batch_analytics-0.3.15 → batch_analytics-0.3.16}/README.md +0 -0
  6. {batch_analytics-0.3.15 → batch_analytics-0.3.16}/setup.cfg +0 -0
  7. {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/__init__.py +0 -0
  8. {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/__main__.py +0 -0
  9. {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/analytics/__init__.py +0 -0
  10. {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/analytics/correlation.py +0 -0
  11. {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/analytics/gluon_autogluon_train.py +0 -0
  12. {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/analytics/linear_regression.py +0 -0
  13. {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/analytics/pca_clustering.py +0 -0
  14. {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/analytics/t_test.py +0 -0
  15. {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/config.py +0 -0
  16. {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/extract.py +0 -0
  17. {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/job_runner.py +0 -0
  18. {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/log.py +0 -0
  19. {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/modules.py +0 -0
  20. {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/output/__init__.py +0 -0
  21. {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/output/base.py +0 -0
  22. {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/output/clickhouse.py +0 -0
  23. {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/output/local.py +0 -0
  24. {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/output/s3.py +0 -0
  25. {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/transform.py +0 -0
  26. {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/utils/__init__.py +0 -0
  27. {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics/utils/gluon_autogluon_common.py +0 -0
  28. {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics.egg-info/SOURCES.txt +0 -0
  29. {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
  30. {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics.egg-info/entry_points.txt +0 -0
  31. {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics.egg-info/requires.txt +0 -0
  32. {batch_analytics-0.3.15 → batch_analytics-0.3.16}/src/batch_analytics.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.3.15
3
+ Version: 0.3.16
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "batch-analytics"
7
- version = "0.3.15"
7
+ version = "0.3.16"
8
8
  description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -2,7 +2,8 @@
2
2
  Load AutoGluon TabularPredictor from S3; score ClickHouse rows; write predictions to output.
3
3
 
4
4
  Env: CLICKHOUSE_*, BATCH_STAGING_TABLE (inference feature table), MODEL_S3_PREFIX,
5
- OUTPUT_TYPE, OUTPUT_CLICKHOUSE_DATABASE, OUTPUT_CLICKHOUSE_TABLE (clickhouse),
5
+ OUTPUT_TYPE (from OutputConfig.type), OUTPUT_CLICKHOUSE_DATABASE, OUTPUT_CLICKHOUSE_TABLE (clickhouse),
6
+ OUTPUT_CLICKHOUSE_AUTO_CREATE (optional; default true when unset — CREATE TABLE IF NOT EXISTS for clickhouse),
6
7
  or OUTPUT_S3_PATH (s3 parquet)
7
8
  """
8
9
 
@@ -37,6 +38,54 @@ logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
37
38
  logger = logging.getLogger(__name__)
38
39
 
39
40
 
41
+ def _env_truthy(name: str, *, default: bool) -> bool:
42
+ raw = (os.environ.get(name) or "").strip().lower()
43
+ if not raw:
44
+ return default
45
+ return raw in ("1", "true", "yes", "on")
46
+
47
+
48
+ def _sql_ident(name: str) -> str:
49
+ return "`" + str(name).replace("`", "``") + "`"
50
+
51
+
52
+ def _pandas_col_ch_type(series: pd.Series) -> str:
53
+ """Map a pandas column to ClickHouse; Nullable(...) when the column has nulls."""
54
+ try:
55
+ kind = series.dtype.kind
56
+ except AttributeError:
57
+ kind = "O"
58
+ if kind == "b":
59
+ base = "Bool"
60
+ elif kind == "i":
61
+ sz = getattr(series.dtype, "itemsize", 8) or 8
62
+ base = {1: "Int8", 2: "Int16", 4: "Int32", 8: "Int64"}.get(sz, "Int64")
63
+ elif kind == "u":
64
+ sz = getattr(series.dtype, "itemsize", 8) or 8
65
+ base = {1: "UInt8", 2: "UInt16", 4: "UInt32", 8: "UInt64"}.get(sz, "UInt64")
66
+ elif kind == "f":
67
+ sz = getattr(series.dtype, "itemsize", 8) or 8
68
+ base = "Float32" if sz <= 4 else "Float64"
69
+ elif kind == "M":
70
+ base = "DateTime64(3)"
71
+ else:
72
+ base = "String"
73
+ if series.isna().any():
74
+ return f"Nullable({base})"
75
+ return base
76
+
77
+
78
+ def _ensure_clickhouse_output_table(client, database: str, table: str, out_df: pd.DataFrame) -> None:
79
+ col_defs = []
80
+ for col in out_df.columns:
81
+ col_defs.append(f" {_sql_ident(col)} {_pandas_col_ch_type(out_df[col])}")
82
+ body = ",\n".join(col_defs)
83
+ fq = f"{_sql_ident(database)}.{_sql_ident(table)}"
84
+ ddl = f"CREATE TABLE IF NOT EXISTS {fq} (\n{body}\n) ENGINE = MergeTree ORDER BY tuple()"
85
+ logger.info("Ensuring ClickHouse output table exists: %s.%s", database, table)
86
+ client.command(ddl)
87
+
88
+
40
89
  def _require(name: str) -> str:
41
90
  v = os.environ.get(name, "").strip()
42
91
  if not v:
@@ -93,6 +142,8 @@ def main() -> None:
93
142
  odb = _require("OUTPUT_CLICKHOUSE_DATABASE")
94
143
  otbl = _require("OUTPUT_CLICKHOUSE_TABLE")
95
144
  out_full = clickhouse_full_table(odb, otbl)
145
+ if _env_truthy("OUTPUT_CLICKHOUSE_AUTO_CREATE", default=True):
146
+ _ensure_clickhouse_output_table(client, odb, otbl, out_df)
96
147
  logger.info("Inserting %s rows into %s", len(out_df), out_full)
97
148
  client.insert_df(out_full, out_df)
98
149
  else:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.3.15
3
+ Version: 0.3.16
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT