batch-analytics 0.3.15__tar.gz → 0.3.17__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. {batch_analytics-0.3.15 → batch_analytics-0.3.17}/PKG-INFO +9 -13
  2. {batch_analytics-0.3.15 → batch_analytics-0.3.17}/pyproject.toml +11 -15
  3. {batch_analytics-0.3.15 → batch_analytics-0.3.17}/src/batch_analytics/analytics/gluon_autogluon_infer.py +52 -1
  4. {batch_analytics-0.3.15 → batch_analytics-0.3.17}/src/batch_analytics.egg-info/PKG-INFO +9 -13
  5. batch_analytics-0.3.17/src/batch_analytics.egg-info/requires.txt +33 -0
  6. batch_analytics-0.3.15/src/batch_analytics.egg-info/requires.txt +0 -57
  7. {batch_analytics-0.3.15 → batch_analytics-0.3.17}/README.md +0 -0
  8. {batch_analytics-0.3.15 → batch_analytics-0.3.17}/setup.cfg +0 -0
  9. {batch_analytics-0.3.15 → batch_analytics-0.3.17}/src/batch_analytics/__init__.py +0 -0
  10. {batch_analytics-0.3.15 → batch_analytics-0.3.17}/src/batch_analytics/__main__.py +0 -0
  11. {batch_analytics-0.3.15 → batch_analytics-0.3.17}/src/batch_analytics/analytics/__init__.py +0 -0
  12. {batch_analytics-0.3.15 → batch_analytics-0.3.17}/src/batch_analytics/analytics/correlation.py +0 -0
  13. {batch_analytics-0.3.15 → batch_analytics-0.3.17}/src/batch_analytics/analytics/gluon_autogluon_train.py +0 -0
  14. {batch_analytics-0.3.15 → batch_analytics-0.3.17}/src/batch_analytics/analytics/linear_regression.py +0 -0
  15. {batch_analytics-0.3.15 → batch_analytics-0.3.17}/src/batch_analytics/analytics/pca_clustering.py +0 -0
  16. {batch_analytics-0.3.15 → batch_analytics-0.3.17}/src/batch_analytics/analytics/t_test.py +0 -0
  17. {batch_analytics-0.3.15 → batch_analytics-0.3.17}/src/batch_analytics/config.py +0 -0
  18. {batch_analytics-0.3.15 → batch_analytics-0.3.17}/src/batch_analytics/extract.py +0 -0
  19. {batch_analytics-0.3.15 → batch_analytics-0.3.17}/src/batch_analytics/job_runner.py +0 -0
  20. {batch_analytics-0.3.15 → batch_analytics-0.3.17}/src/batch_analytics/log.py +0 -0
  21. {batch_analytics-0.3.15 → batch_analytics-0.3.17}/src/batch_analytics/modules.py +0 -0
  22. {batch_analytics-0.3.15 → batch_analytics-0.3.17}/src/batch_analytics/output/__init__.py +0 -0
  23. {batch_analytics-0.3.15 → batch_analytics-0.3.17}/src/batch_analytics/output/base.py +0 -0
  24. {batch_analytics-0.3.15 → batch_analytics-0.3.17}/src/batch_analytics/output/clickhouse.py +0 -0
  25. {batch_analytics-0.3.15 → batch_analytics-0.3.17}/src/batch_analytics/output/local.py +0 -0
  26. {batch_analytics-0.3.15 → batch_analytics-0.3.17}/src/batch_analytics/output/s3.py +0 -0
  27. {batch_analytics-0.3.15 → batch_analytics-0.3.17}/src/batch_analytics/transform.py +0 -0
  28. {batch_analytics-0.3.15 → batch_analytics-0.3.17}/src/batch_analytics/utils/__init__.py +0 -0
  29. {batch_analytics-0.3.15 → batch_analytics-0.3.17}/src/batch_analytics/utils/gluon_autogluon_common.py +0 -0
  30. {batch_analytics-0.3.15 → batch_analytics-0.3.17}/src/batch_analytics.egg-info/SOURCES.txt +0 -0
  31. {batch_analytics-0.3.15 → batch_analytics-0.3.17}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
  32. {batch_analytics-0.3.15 → batch_analytics-0.3.17}/src/batch_analytics.egg-info/entry_points.txt +0 -0
  33. {batch_analytics-0.3.15 → batch_analytics-0.3.17}/src/batch_analytics.egg-info/top_level.txt +0 -0
@@ -1,13 +1,13 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.3.15
3
+ Version: 0.3.17
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT
7
- Requires-Python: >=3.8
7
+ Requires-Python: >=3.9
8
8
  Description-Content-Type: text/markdown
9
9
  Requires-Dist: pyspark<3.6,>=3.4
10
- Requires-Dist: numpy>=1.19.0
10
+ Requires-Dist: numpy>=1.22.0
11
11
  Requires-Dist: scipy>=1.5.0
12
12
  Provides-Extra: dev
13
13
  Requires-Dist: pytest>=7.0; extra == "dev"
@@ -16,26 +16,22 @@ Requires-Dist: scipy>=1.5.0; extra == "ttest"
16
16
  Provides-Extra: s3
17
17
  Requires-Dist: boto3>=1.28; extra == "s3"
18
18
  Provides-Extra: clickhouse
19
- Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "clickhouse"
20
- Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "clickhouse"
19
+ Requires-Dist: clickhouse-connect>=0.7; extra == "clickhouse"
21
20
  Provides-Extra: output
22
21
  Requires-Dist: boto3>=1.28; extra == "output"
23
- Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "output"
24
- Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "output"
22
+ Requires-Dist: clickhouse-connect>=0.7; extra == "output"
25
23
  Provides-Extra: autogluon
26
24
  Requires-Dist: autogluon<2.0,>=1.0; extra == "autogluon"
27
25
  Requires-Dist: pandas>=1.3.0; extra == "autogluon"
28
26
  Requires-Dist: boto3>=1.28; extra == "autogluon"
29
- Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "autogluon"
30
- Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "autogluon"
31
- Requires-Dist: pyarrow>=10.0.0; python_version >= "3.8" and extra == "autogluon"
27
+ Requires-Dist: clickhouse-connect>=0.7; extra == "autogluon"
28
+ Requires-Dist: pyarrow>=10.0.0; extra == "autogluon"
32
29
  Provides-Extra: full
33
30
  Requires-Dist: scipy>=1.5.0; extra == "full"
34
31
  Requires-Dist: boto3>=1.28; extra == "full"
35
- Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "full"
36
- Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "full"
32
+ Requires-Dist: clickhouse-connect>=0.7; extra == "full"
37
33
  Requires-Dist: autogluon<2.0,>=1.0; extra == "full"
38
- Requires-Dist: pyarrow>=10.0.0; python_version >= "3.8" and extra == "full"
34
+ Requires-Dist: pyarrow>=10.0.0; extra == "full"
39
35
 
40
36
  # Batch Analytics
41
37
 
@@ -4,13 +4,14 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "batch-analytics"
7
- version = "0.3.15"
7
+ version = "0.3.17"
8
8
  description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
9
9
  readme = "README.md"
10
- requires-python = ">=3.8"
10
+ # AutoGluon 1.x pulls thinc/blis which need NumPy 2.x; NumPy 2 does not support Python 3.8.
11
+ requires-python = ">=3.9"
11
12
  dependencies = [
12
13
  "pyspark>=3.4,<3.6",
13
- "numpy>=1.19.0",
14
+ "numpy>=1.22.0",
14
15
  # Welch t-test (t_test.py); keep on core deps so `pip install batch-analytics` works in minimal driver images
15
16
  "scipy>=1.5.0",
16
17
  ]
@@ -22,33 +23,28 @@ dev = ["pytest>=7.0"]
22
23
  # Legacy: scipy is a core dependency; kept so `pip install "batch-analytics[ttest]"` still resolves.
23
24
  ttest = ["scipy>=1.5.0"]
24
25
  s3 = ["boto3>=1.28"]
25
- # 0.9+ uses list[...] etc. and breaks on Python 3.8; 3.9+ can take current clickhouse-connect.
26
26
  clickhouse = [
27
- "clickhouse-connect>=0.7,<0.9; python_version < '3.9'",
28
- "clickhouse-connect>=0.7; python_version >= '3.9'",
27
+ "clickhouse-connect>=0.7",
29
28
  ]
30
29
  output = [
31
30
  "boto3>=1.28",
32
- "clickhouse-connect>=0.7,<0.9; python_version < '3.9'",
33
- "clickhouse-connect>=0.7; python_version >= '3.9'",
31
+ "clickhouse-connect>=0.7",
34
32
  ]
35
- # AutoGluon Tabular train/infer (POST /submit/train, /submit/inf on analytics_runner)
33
+ # AutoGluon Tabular train/infer (POST /submit/train, /submit/inf on analytics_runner). Requires Python 3.9+.
36
34
  autogluon = [
37
35
  "autogluon>=1.0,<2.0",
38
36
  "pandas>=1.3.0",
39
37
  "boto3>=1.28",
40
- "clickhouse-connect>=0.7,<0.9; python_version < '3.9'",
41
- "clickhouse-connect>=0.7; python_version >= '3.9'",
42
- "pyarrow>=10.0.0; python_version >= '3.8'",
38
+ "clickhouse-connect>=0.7",
39
+ "pyarrow>=10.0.0",
43
40
  ]
44
41
  # Install all optional runtime deps used anywhere in the package
45
42
  full = [
46
43
  "scipy>=1.5.0",
47
44
  "boto3>=1.28",
48
- "clickhouse-connect>=0.7,<0.9; python_version < '3.9'",
49
- "clickhouse-connect>=0.7; python_version >= '3.9'",
45
+ "clickhouse-connect>=0.7",
50
46
  "autogluon>=1.0,<2.0",
51
- "pyarrow>=10.0.0; python_version >= '3.8'",
47
+ "pyarrow>=10.0.0",
52
48
  ]
53
49
 
54
50
  [project.scripts]
@@ -2,7 +2,8 @@
2
2
  Load AutoGluon TabularPredictor from S3; score ClickHouse rows; write predictions to output.
3
3
 
4
4
  Env: CLICKHOUSE_*, BATCH_STAGING_TABLE (inference feature table), MODEL_S3_PREFIX,
5
- OUTPUT_TYPE, OUTPUT_CLICKHOUSE_DATABASE, OUTPUT_CLICKHOUSE_TABLE (clickhouse),
5
+ OUTPUT_TYPE (from OutputConfig.type), OUTPUT_CLICKHOUSE_DATABASE, OUTPUT_CLICKHOUSE_TABLE (clickhouse),
6
+ OUTPUT_CLICKHOUSE_AUTO_CREATE (optional; default true when unset — CREATE TABLE IF NOT EXISTS for clickhouse),
6
7
  or OUTPUT_S3_PATH (s3 parquet)
7
8
  """
8
9
 
@@ -37,6 +38,54 @@ logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
37
38
  logger = logging.getLogger(__name__)
38
39
 
39
40
 
41
+ def _env_truthy(name: str, *, default: bool) -> bool:
42
+ raw = (os.environ.get(name) or "").strip().lower()
43
+ if not raw:
44
+ return default
45
+ return raw in ("1", "true", "yes", "on")
46
+
47
+
48
+ def _sql_ident(name: str) -> str:
49
+ return "`" + str(name).replace("`", "``") + "`"
50
+
51
+
52
+ def _pandas_col_ch_type(series: pd.Series) -> str:
53
+ """Map a pandas column to ClickHouse; Nullable(...) when the column has nulls."""
54
+ try:
55
+ kind = series.dtype.kind
56
+ except AttributeError:
57
+ kind = "O"
58
+ if kind == "b":
59
+ base = "Bool"
60
+ elif kind == "i":
61
+ sz = getattr(series.dtype, "itemsize", 8) or 8
62
+ base = {1: "Int8", 2: "Int16", 4: "Int32", 8: "Int64"}.get(sz, "Int64")
63
+ elif kind == "u":
64
+ sz = getattr(series.dtype, "itemsize", 8) or 8
65
+ base = {1: "UInt8", 2: "UInt16", 4: "UInt32", 8: "UInt64"}.get(sz, "UInt64")
66
+ elif kind == "f":
67
+ sz = getattr(series.dtype, "itemsize", 8) or 8
68
+ base = "Float32" if sz <= 4 else "Float64"
69
+ elif kind == "M":
70
+ base = "DateTime64(3)"
71
+ else:
72
+ base = "String"
73
+ if series.isna().any():
74
+ return f"Nullable({base})"
75
+ return base
76
+
77
+
78
+ def _ensure_clickhouse_output_table(client, database: str, table: str, out_df: pd.DataFrame) -> None:
79
+ col_defs = []
80
+ for col in out_df.columns:
81
+ col_defs.append(f" {_sql_ident(col)} {_pandas_col_ch_type(out_df[col])}")
82
+ body = ",\n".join(col_defs)
83
+ fq = f"{_sql_ident(database)}.{_sql_ident(table)}"
84
+ ddl = f"CREATE TABLE IF NOT EXISTS {fq} (\n{body}\n) ENGINE = MergeTree ORDER BY tuple()"
85
+ logger.info("Ensuring ClickHouse output table exists: %s.%s", database, table)
86
+ client.command(ddl)
87
+
88
+
40
89
  def _require(name: str) -> str:
41
90
  v = os.environ.get(name, "").strip()
42
91
  if not v:
@@ -93,6 +142,8 @@ def main() -> None:
93
142
  odb = _require("OUTPUT_CLICKHOUSE_DATABASE")
94
143
  otbl = _require("OUTPUT_CLICKHOUSE_TABLE")
95
144
  out_full = clickhouse_full_table(odb, otbl)
145
+ if _env_truthy("OUTPUT_CLICKHOUSE_AUTO_CREATE", default=True):
146
+ _ensure_clickhouse_output_table(client, odb, otbl, out_df)
96
147
  logger.info("Inserting %s rows into %s", len(out_df), out_full)
97
148
  client.insert_df(out_full, out_df)
98
149
  else:
@@ -1,13 +1,13 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.3.15
3
+ Version: 0.3.17
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT
7
- Requires-Python: >=3.8
7
+ Requires-Python: >=3.9
8
8
  Description-Content-Type: text/markdown
9
9
  Requires-Dist: pyspark<3.6,>=3.4
10
- Requires-Dist: numpy>=1.19.0
10
+ Requires-Dist: numpy>=1.22.0
11
11
  Requires-Dist: scipy>=1.5.0
12
12
  Provides-Extra: dev
13
13
  Requires-Dist: pytest>=7.0; extra == "dev"
@@ -16,26 +16,22 @@ Requires-Dist: scipy>=1.5.0; extra == "ttest"
16
16
  Provides-Extra: s3
17
17
  Requires-Dist: boto3>=1.28; extra == "s3"
18
18
  Provides-Extra: clickhouse
19
- Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "clickhouse"
20
- Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "clickhouse"
19
+ Requires-Dist: clickhouse-connect>=0.7; extra == "clickhouse"
21
20
  Provides-Extra: output
22
21
  Requires-Dist: boto3>=1.28; extra == "output"
23
- Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "output"
24
- Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "output"
22
+ Requires-Dist: clickhouse-connect>=0.7; extra == "output"
25
23
  Provides-Extra: autogluon
26
24
  Requires-Dist: autogluon<2.0,>=1.0; extra == "autogluon"
27
25
  Requires-Dist: pandas>=1.3.0; extra == "autogluon"
28
26
  Requires-Dist: boto3>=1.28; extra == "autogluon"
29
- Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "autogluon"
30
- Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "autogluon"
31
- Requires-Dist: pyarrow>=10.0.0; python_version >= "3.8" and extra == "autogluon"
27
+ Requires-Dist: clickhouse-connect>=0.7; extra == "autogluon"
28
+ Requires-Dist: pyarrow>=10.0.0; extra == "autogluon"
32
29
  Provides-Extra: full
33
30
  Requires-Dist: scipy>=1.5.0; extra == "full"
34
31
  Requires-Dist: boto3>=1.28; extra == "full"
35
- Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "full"
36
- Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "full"
32
+ Requires-Dist: clickhouse-connect>=0.7; extra == "full"
37
33
  Requires-Dist: autogluon<2.0,>=1.0; extra == "full"
38
- Requires-Dist: pyarrow>=10.0.0; python_version >= "3.8" and extra == "full"
34
+ Requires-Dist: pyarrow>=10.0.0; extra == "full"
39
35
 
40
36
  # Batch Analytics
41
37
 
@@ -0,0 +1,33 @@
1
+ pyspark<3.6,>=3.4
2
+ numpy>=1.22.0
3
+ scipy>=1.5.0
4
+
5
+ [autogluon]
6
+ autogluon<2.0,>=1.0
7
+ pandas>=1.3.0
8
+ boto3>=1.28
9
+ clickhouse-connect>=0.7
10
+ pyarrow>=10.0.0
11
+
12
+ [clickhouse]
13
+ clickhouse-connect>=0.7
14
+
15
+ [dev]
16
+ pytest>=7.0
17
+
18
+ [full]
19
+ scipy>=1.5.0
20
+ boto3>=1.28
21
+ clickhouse-connect>=0.7
22
+ autogluon<2.0,>=1.0
23
+ pyarrow>=10.0.0
24
+
25
+ [output]
26
+ boto3>=1.28
27
+ clickhouse-connect>=0.7
28
+
29
+ [s3]
30
+ boto3>=1.28
31
+
32
+ [ttest]
33
+ scipy>=1.5.0
@@ -1,57 +0,0 @@
1
- pyspark<3.6,>=3.4
2
- numpy>=1.19.0
3
- scipy>=1.5.0
4
-
5
- [autogluon]
6
- autogluon<2.0,>=1.0
7
- pandas>=1.3.0
8
- boto3>=1.28
9
-
10
- [autogluon:python_version < "3.9"]
11
- clickhouse-connect<0.9,>=0.7
12
-
13
- [autogluon:python_version >= "3.8"]
14
- pyarrow>=10.0.0
15
-
16
- [autogluon:python_version >= "3.9"]
17
- clickhouse-connect>=0.7
18
-
19
- [clickhouse]
20
-
21
- [clickhouse:python_version < "3.9"]
22
- clickhouse-connect<0.9,>=0.7
23
-
24
- [clickhouse:python_version >= "3.9"]
25
- clickhouse-connect>=0.7
26
-
27
- [dev]
28
- pytest>=7.0
29
-
30
- [full]
31
- scipy>=1.5.0
32
- boto3>=1.28
33
- autogluon<2.0,>=1.0
34
-
35
- [full:python_version < "3.9"]
36
- clickhouse-connect<0.9,>=0.7
37
-
38
- [full:python_version >= "3.8"]
39
- pyarrow>=10.0.0
40
-
41
- [full:python_version >= "3.9"]
42
- clickhouse-connect>=0.7
43
-
44
- [output]
45
- boto3>=1.28
46
-
47
- [output:python_version < "3.9"]
48
- clickhouse-connect<0.9,>=0.7
49
-
50
- [output:python_version >= "3.9"]
51
- clickhouse-connect>=0.7
52
-
53
- [s3]
54
- boto3>=1.28
55
-
56
- [ttest]
57
- scipy>=1.5.0