batch-analytics 0.3.14__tar.gz → 0.3.16__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {batch_analytics-0.3.14 → batch_analytics-0.3.16}/PKG-INFO +10 -1
  2. {batch_analytics-0.3.14 → batch_analytics-0.3.16}/pyproject.toml +12 -1
  3. batch_analytics-0.3.16/src/batch_analytics/analytics/gluon_autogluon_infer.py +167 -0
  4. batch_analytics-0.3.16/src/batch_analytics/analytics/gluon_autogluon_train.py +114 -0
  5. batch_analytics-0.3.16/src/batch_analytics/utils/__init__.py +1 -0
  6. batch_analytics-0.3.16/src/batch_analytics/utils/gluon_autogluon_common.py +83 -0
  7. {batch_analytics-0.3.14 → batch_analytics-0.3.16}/src/batch_analytics.egg-info/PKG-INFO +10 -1
  8. {batch_analytics-0.3.14 → batch_analytics-0.3.16}/src/batch_analytics.egg-info/SOURCES.txt +5 -1
  9. {batch_analytics-0.3.14 → batch_analytics-0.3.16}/src/batch_analytics.egg-info/requires.txt +18 -0
  10. {batch_analytics-0.3.14 → batch_analytics-0.3.16}/README.md +0 -0
  11. {batch_analytics-0.3.14 → batch_analytics-0.3.16}/setup.cfg +0 -0
  12. {batch_analytics-0.3.14 → batch_analytics-0.3.16}/src/batch_analytics/__init__.py +0 -0
  13. {batch_analytics-0.3.14 → batch_analytics-0.3.16}/src/batch_analytics/__main__.py +0 -0
  14. {batch_analytics-0.3.14 → batch_analytics-0.3.16}/src/batch_analytics/analytics/__init__.py +0 -0
  15. {batch_analytics-0.3.14 → batch_analytics-0.3.16}/src/batch_analytics/analytics/correlation.py +0 -0
  16. {batch_analytics-0.3.14 → batch_analytics-0.3.16}/src/batch_analytics/analytics/linear_regression.py +0 -0
  17. {batch_analytics-0.3.14 → batch_analytics-0.3.16}/src/batch_analytics/analytics/pca_clustering.py +0 -0
  18. {batch_analytics-0.3.14 → batch_analytics-0.3.16}/src/batch_analytics/analytics/t_test.py +0 -0
  19. {batch_analytics-0.3.14 → batch_analytics-0.3.16}/src/batch_analytics/config.py +0 -0
  20. {batch_analytics-0.3.14 → batch_analytics-0.3.16}/src/batch_analytics/extract.py +0 -0
  21. {batch_analytics-0.3.14 → batch_analytics-0.3.16}/src/batch_analytics/job_runner.py +0 -0
  22. {batch_analytics-0.3.14 → batch_analytics-0.3.16}/src/batch_analytics/log.py +0 -0
  23. {batch_analytics-0.3.14 → batch_analytics-0.3.16}/src/batch_analytics/modules.py +0 -0
  24. {batch_analytics-0.3.14 → batch_analytics-0.3.16}/src/batch_analytics/output/__init__.py +0 -0
  25. {batch_analytics-0.3.14 → batch_analytics-0.3.16}/src/batch_analytics/output/base.py +0 -0
  26. {batch_analytics-0.3.14 → batch_analytics-0.3.16}/src/batch_analytics/output/clickhouse.py +0 -0
  27. {batch_analytics-0.3.14 → batch_analytics-0.3.16}/src/batch_analytics/output/local.py +0 -0
  28. {batch_analytics-0.3.14 → batch_analytics-0.3.16}/src/batch_analytics/output/s3.py +0 -0
  29. {batch_analytics-0.3.14 → batch_analytics-0.3.16}/src/batch_analytics/transform.py +0 -0
  30. {batch_analytics-0.3.14 → batch_analytics-0.3.16}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
  31. {batch_analytics-0.3.14 → batch_analytics-0.3.16}/src/batch_analytics.egg-info/entry_points.txt +0 -0
  32. {batch_analytics-0.3.14 → batch_analytics-0.3.16}/src/batch_analytics.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.3.14
3
+ Version: 0.3.16
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT
@@ -22,11 +22,20 @@ Provides-Extra: output
22
22
  Requires-Dist: boto3>=1.28; extra == "output"
23
23
  Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "output"
24
24
  Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "output"
25
+ Provides-Extra: autogluon
26
+ Requires-Dist: autogluon<2.0,>=1.0; extra == "autogluon"
27
+ Requires-Dist: pandas>=1.3.0; extra == "autogluon"
28
+ Requires-Dist: boto3>=1.28; extra == "autogluon"
29
+ Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "autogluon"
30
+ Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "autogluon"
31
+ Requires-Dist: pyarrow>=10.0.0; python_version >= "3.8" and extra == "autogluon"
25
32
  Provides-Extra: full
26
33
  Requires-Dist: scipy>=1.5.0; extra == "full"
27
34
  Requires-Dist: boto3>=1.28; extra == "full"
28
35
  Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "full"
29
36
  Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "full"
37
+ Requires-Dist: autogluon<2.0,>=1.0; extra == "full"
38
+ Requires-Dist: pyarrow>=10.0.0; python_version >= "3.8" and extra == "full"
30
39
 
31
40
  # Batch Analytics
32
41
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "batch-analytics"
7
- version = "0.3.14"
7
+ version = "0.3.16"
8
8
  description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -32,12 +32,23 @@ output = [
32
32
  "clickhouse-connect>=0.7,<0.9; python_version < '3.9'",
33
33
  "clickhouse-connect>=0.7; python_version >= '3.9'",
34
34
  ]
35
+ # AutoGluon Tabular train/infer (POST /submit/train, /submit/inf on analytics_runner)
36
+ autogluon = [
37
+ "autogluon>=1.0,<2.0",
38
+ "pandas>=1.3.0",
39
+ "boto3>=1.28",
40
+ "clickhouse-connect>=0.7,<0.9; python_version < '3.9'",
41
+ "clickhouse-connect>=0.7; python_version >= '3.9'",
42
+ "pyarrow>=10.0.0; python_version >= '3.8'",
43
+ ]
35
44
  # Install all optional runtime deps used anywhere in the package
36
45
  full = [
37
46
  "scipy>=1.5.0",
38
47
  "boto3>=1.28",
39
48
  "clickhouse-connect>=0.7,<0.9; python_version < '3.9'",
40
49
  "clickhouse-connect>=0.7; python_version >= '3.9'",
50
+ "autogluon>=1.0,<2.0",
51
+ "pyarrow>=10.0.0; python_version >= '3.8'",
41
52
  ]
42
53
 
43
54
  [project.scripts]
@@ -0,0 +1,167 @@
1
+ """
2
+ Load AutoGluon TabularPredictor from S3; score ClickHouse rows; write predictions to output.
3
+
4
+ Env: CLICKHOUSE_*, BATCH_STAGING_TABLE (inference feature table), MODEL_S3_PREFIX,
5
+ OUTPUT_TYPE (from OutputConfig.type), OUTPUT_CLICKHOUSE_DATABASE, OUTPUT_CLICKHOUSE_TABLE (clickhouse),
6
+ OUTPUT_CLICKHOUSE_AUTO_CREATE (optional; default true when unset — CREATE TABLE IF NOT EXISTS for clickhouse),
7
+ or OUTPUT_S3_PATH (s3 parquet)
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import logging
13
+ import os
14
+ import shutil
15
+ import sys
16
+ import tempfile
17
+
18
+ import clickhouse_connect
19
+ import pandas as pd
20
+
21
+ try:
22
+ from batch_analytics.utils.gluon_autogluon_common import (
23
+ clickhouse_full_table,
24
+ download_s3_prefix_to_dir,
25
+ parse_s3_uri,
26
+ )
27
+ except ImportError:
28
+ _pkg_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
29
+ if _pkg_root not in sys.path:
30
+ sys.path.insert(0, _pkg_root)
31
+ from utils.gluon_autogluon_common import ( # noqa: E402
32
+ clickhouse_full_table,
33
+ download_s3_prefix_to_dir,
34
+ parse_s3_uri,
35
+ )
36
+
37
+ logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
38
+ logger = logging.getLogger(__name__)
39
+
40
+
41
+ def _env_truthy(name: str, *, default: bool) -> bool:
42
+ raw = (os.environ.get(name) or "").strip().lower()
43
+ if not raw:
44
+ return default
45
+ return raw in ("1", "true", "yes", "on")
46
+
47
+
48
+ def _sql_ident(name: str) -> str:
49
+ return "`" + str(name).replace("`", "``") + "`"
50
+
51
+
52
+ def _pandas_col_ch_type(series: pd.Series) -> str:
53
+ """Map a pandas column to ClickHouse; Nullable(...) when the column has nulls."""
54
+ try:
55
+ kind = series.dtype.kind
56
+ except AttributeError:
57
+ kind = "O"
58
+ if kind == "b":
59
+ base = "Bool"
60
+ elif kind == "i":
61
+ sz = getattr(series.dtype, "itemsize", 8) or 8
62
+ base = {1: "Int8", 2: "Int16", 4: "Int32", 8: "Int64"}.get(sz, "Int64")
63
+ elif kind == "u":
64
+ sz = getattr(series.dtype, "itemsize", 8) or 8
65
+ base = {1: "UInt8", 2: "UInt16", 4: "UInt32", 8: "UInt64"}.get(sz, "UInt64")
66
+ elif kind == "f":
67
+ sz = getattr(series.dtype, "itemsize", 8) or 8
68
+ base = "Float32" if sz <= 4 else "Float64"
69
+ elif kind == "M":
70
+ base = "DateTime64(3)"
71
+ else:
72
+ base = "String"
73
+ if series.isna().any():
74
+ return f"Nullable({base})"
75
+ return base
76
+
77
+
78
+ def _ensure_clickhouse_output_table(client, database: str, table: str, out_df: pd.DataFrame) -> None:
79
+ col_defs = []
80
+ for col in out_df.columns:
81
+ col_defs.append(f" {_sql_ident(col)} {_pandas_col_ch_type(out_df[col])}")
82
+ body = ",\n".join(col_defs)
83
+ fq = f"{_sql_ident(database)}.{_sql_ident(table)}"
84
+ ddl = f"CREATE TABLE IF NOT EXISTS {fq} (\n{body}\n) ENGINE = MergeTree ORDER BY tuple()"
85
+ logger.info("Ensuring ClickHouse output table exists: %s.%s", database, table)
86
+ client.command(ddl)
87
+
88
+
89
+ def _require(name: str) -> str:
90
+ v = os.environ.get(name, "").strip()
91
+ if not v:
92
+ logger.error("Missing required env var: %s", name)
93
+ sys.exit(2)
94
+ return v
95
+
96
+
97
+ def main() -> None:
98
+ model_prefix = _require("MODEL_S3_PREFIX")
99
+ out_type = os.environ.get("OUTPUT_TYPE", "clickhouse").strip().lower()
100
+ if out_type not in ("clickhouse", "s3"):
101
+ logger.error("OUTPUT_TYPE must be clickhouse or s3, got %r", out_type)
102
+ sys.exit(2)
103
+
104
+ host = _require("CLICKHOUSE_HOST")
105
+ port = int(os.environ.get("CLICKHOUSE_HTTP_PORT", "8123"))
106
+ database = os.environ.get("CLICKHOUSE_DB", "default").strip() or "default"
107
+ user = os.environ.get("CLICKHOUSE_USER", "default")
108
+ password = os.environ.get("CLICKHOUSE_PASSWORD", "")
109
+ inference_table = _require("BATCH_STAGING_TABLE")
110
+ full_table = clickhouse_full_table(database, inference_table)
111
+
112
+ sql = f"SELECT * FROM {full_table}"
113
+ logger.info("Loading rows to score: %s", sql)
114
+ client = clickhouse_connect.get_client(
115
+ host=host,
116
+ port=port,
117
+ username=user,
118
+ password=password or None,
119
+ database=database,
120
+ )
121
+ df = client.query_df(sql)
122
+ if df.empty:
123
+ logger.warning("Inference input is empty; nothing to write")
124
+ return
125
+
126
+ try:
127
+ from autogluon.tabular import TabularPredictor
128
+ except ImportError:
129
+ logger.exception("autogluon is not installed; use pip install 'batch-analytics[autogluon]'")
130
+ sys.exit(4)
131
+
132
+ local_model = tempfile.mkdtemp(prefix="ag_infer_")
133
+ try:
134
+ logger.info("Downloading model from %s", model_prefix)
135
+ download_s3_prefix_to_dir(model_prefix, local_model)
136
+ predictor = TabularPredictor.load(local_model)
137
+ y_pred = predictor.predict(df)
138
+ out_df = df.copy()
139
+ out_df["prediction"] = pd.Series(y_pred, index=df.index)
140
+
141
+ if out_type == "clickhouse":
142
+ odb = _require("OUTPUT_CLICKHOUSE_DATABASE")
143
+ otbl = _require("OUTPUT_CLICKHOUSE_TABLE")
144
+ out_full = clickhouse_full_table(odb, otbl)
145
+ if _env_truthy("OUTPUT_CLICKHOUSE_AUTO_CREATE", default=True):
146
+ _ensure_clickhouse_output_table(client, odb, otbl, out_df)
147
+ logger.info("Inserting %s rows into %s", len(out_df), out_full)
148
+ client.insert_df(out_full, out_df)
149
+ else:
150
+ path = _require("OUTPUT_S3_PATH")
151
+ bucket, key = parse_s3_uri(path.rstrip("/") + "/")
152
+ key = key.rstrip("/")
153
+ if key:
154
+ key = key + "/"
155
+ parquet_key = key + "predictions.parquet"
156
+ tmp_parquet = os.path.join(local_model, "predictions.parquet")
157
+ out_df.to_parquet(tmp_parquet, index=False)
158
+ import boto3
159
+
160
+ boto3.client("s3").upload_file(tmp_parquet, bucket, parquet_key)
161
+ logger.info("Wrote s3://%s/%s", bucket, parquet_key)
162
+ finally:
163
+ shutil.rmtree(local_model, ignore_errors=True)
164
+
165
+
166
+ if __name__ == "__main__":
167
+ main()
@@ -0,0 +1,114 @@
1
+ """
2
+ Train AutoGluon TabularPredictor from ClickHouse staging data; upload artifacts to S3.
3
+
4
+ Env (injected by analytics_runner Gluon job):
5
+ CLICKHOUSE_*, BATCH_STAGING_TABLE, MODEL_S3_PREFIX, TASK_ID,
6
+ AUTOGLUON_LABEL, AUTOGLUON_FEATURES, AUTOGLUON_PROBLEM_TYPE, AUTOGLUON_TIME_LIMIT,
7
+ optional AUTOGLUON_MAX_ROWS
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import logging
13
+ import os
14
+ import shutil
15
+ import sys
16
+
17
+ import clickhouse_connect
18
+
19
+ try:
20
+ from batch_analytics.utils.gluon_autogluon_common import (
21
+ clickhouse_full_table,
22
+ local_training_dir,
23
+ upload_directory_to_s3,
24
+ )
25
+ except ImportError:
26
+ _pkg_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
27
+ if _pkg_root not in sys.path:
28
+ sys.path.insert(0, _pkg_root)
29
+ from utils.gluon_autogluon_common import ( # noqa: E402
30
+ clickhouse_full_table,
31
+ local_training_dir,
32
+ upload_directory_to_s3,
33
+ )
34
+
35
+ logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
36
+ logger = logging.getLogger(__name__)
37
+
38
+
39
+ def _require(name: str) -> str:
40
+ v = os.environ.get(name, "").strip()
41
+ if not v:
42
+ logger.error("Missing required env var: %s", name)
43
+ sys.exit(2)
44
+ return v
45
+
46
+
47
+ def main() -> None:
48
+ model_prefix = _require("MODEL_S3_PREFIX")
49
+ label = _require("AUTOGLUON_LABEL")
50
+ features_raw = _require("AUTOGLUON_FEATURES")
51
+ feature_list = [c.strip() for c in features_raw.split(",") if c.strip()]
52
+
53
+ host = _require("CLICKHOUSE_HOST")
54
+ port = int(os.environ.get("CLICKHOUSE_HTTP_PORT", "8123"))
55
+ database = os.environ.get("CLICKHOUSE_DB", "default").strip() or "default"
56
+ user = os.environ.get("CLICKHOUSE_USER", "default")
57
+ password = os.environ.get("CLICKHOUSE_PASSWORD", "")
58
+ staging_table = _require("BATCH_STAGING_TABLE")
59
+ full_table = clickhouse_full_table(database, staging_table)
60
+
61
+ max_rows = os.environ.get("AUTOGLUON_MAX_ROWS", "").strip()
62
+ limit_sql = f" LIMIT {int(max_rows)}" if max_rows else ""
63
+ sql = f"SELECT * FROM {full_table}{limit_sql}"
64
+
65
+ logger.info("Loading training data: %s", sql)
66
+ client = clickhouse_connect.get_client(
67
+ host=host,
68
+ port=port,
69
+ username=user,
70
+ password=password or None,
71
+ database=database,
72
+ )
73
+ df = client.query_df(sql)
74
+
75
+ missing = [c for c in feature_list + [label] if c not in df.columns]
76
+ if missing:
77
+ logger.error("Columns missing from training data: %s (have: %s)", missing, list(df.columns))
78
+ sys.exit(3)
79
+
80
+ try:
81
+ from autogluon.tabular import TabularPredictor
82
+ except ImportError:
83
+ logger.exception("autogluon is not installed; use pip install 'batch-analytics[autogluon]'")
84
+ sys.exit(4)
85
+
86
+ problem_type = os.environ.get("AUTOGLUON_PROBLEM_TYPE", "binary").strip() or "binary"
87
+ time_limit = int(os.environ.get("AUTOGLUON_TIME_LIMIT", "300"))
88
+
89
+ local_dir = local_training_dir()
90
+ if os.path.isdir(local_dir):
91
+ shutil.rmtree(local_dir)
92
+ os.makedirs(local_dir, exist_ok=True)
93
+
94
+ train_df = df[feature_list + [label]]
95
+ logger.info(
96
+ "Fitting TabularPredictor problem_type=%s time_limit=%ss rows=%s",
97
+ problem_type,
98
+ time_limit,
99
+ len(train_df),
100
+ )
101
+ predictor = TabularPredictor(
102
+ label=label,
103
+ problem_type=problem_type,
104
+ path=local_dir,
105
+ )
106
+ predictor.fit(train_df, time_limit=time_limit)
107
+
108
+ logger.info("Uploading model artifacts to %s", model_prefix)
109
+ upload_directory_to_s3(local_dir, model_prefix)
110
+ logger.info("Train finished; task_id=%s", os.environ.get("TASK_ID", ""))
111
+
112
+
113
+ if __name__ == "__main__":
114
+ main()
@@ -0,0 +1 @@
1
+ """Utilities shared across batch_analytics (e.g. Gluon / AutoGluon helpers)."""
@@ -0,0 +1,83 @@
1
+ """Shared helpers for AutoGluon train/infer Gluon jobs (S3 + table naming)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from pathlib import Path
7
+ from typing import Tuple
8
+
9
+
10
+ def parse_s3_uri(uri: str) -> Tuple[str, str]:
11
+ """
12
+ Split ``s3://bucket/key/prefix`` into bucket and key prefix (may be empty).
13
+
14
+ The key prefix does not include a leading slash; trailing slashes are preserved on the key side.
15
+ """
16
+ u = (uri or "").strip()
17
+ if not u.startswith("s3://"):
18
+ raise ValueError(f"Not an s3 URI: {uri!r}")
19
+ rest = u[5:]
20
+ if "/" not in rest:
21
+ return rest, ""
22
+ bucket, key = rest.split("/", 1)
23
+ return bucket, key
24
+
25
+
26
+ def clickhouse_full_table(database: str, table: str) -> str:
27
+ """Build ``db.table`` when ``table`` is unqualified."""
28
+ t = (table or "").strip()
29
+ if not t:
30
+ return t
31
+ if "." in t and "'" not in t:
32
+ return t
33
+ db = (database or "").strip() or "default"
34
+ return f"{db}.{t}"
35
+
36
+
37
+ def local_training_dir() -> str:
38
+ return os.environ.get("AUTOGLUON_LOCAL_MODEL_DIR", "/tmp/autogluon_model")
39
+
40
+
41
+ def boto3_client():
42
+ import boto3
43
+
44
+ return boto3.client("s3")
45
+
46
+
47
+ def upload_directory_to_s3(local_dir: str, s3_dir_uri: str) -> None:
48
+ """Upload every file under ``local_dir`` to ``s3_dir_uri`` (directory URI, trailing ``/`` optional)."""
49
+ bucket, prefix = parse_s3_uri(s3_dir_uri.rstrip("/") + "/")
50
+ prefix = prefix.rstrip("/")
51
+ if prefix:
52
+ prefix = prefix + "/"
53
+ cli = boto3_client()
54
+ root = Path(local_dir)
55
+ for path in root.rglob("*"):
56
+ if not path.is_file():
57
+ continue
58
+ rel = path.relative_to(root).as_posix()
59
+ key = prefix + rel
60
+ cli.upload_file(str(path), bucket, key)
61
+
62
+
63
+ def download_s3_prefix_to_dir(s3_dir_uri: str, local_dir: str) -> None:
64
+ """Download all objects under the S3 prefix implied by ``s3_dir_uri`` into ``local_dir``."""
65
+ bucket, pfx = parse_s3_uri(s3_dir_uri.rstrip("/") + "/")
66
+ pfx = pfx.rstrip("/")
67
+ if pfx:
68
+ pfx = pfx + "/"
69
+ cli = boto3_client()
70
+ os.makedirs(local_dir, exist_ok=True)
71
+ paginator = cli.get_paginator("list_objects_v2")
72
+ pages = paginator.paginate(Bucket=bucket, Prefix=pfx)
73
+ for page in pages:
74
+ for obj in page.get("Contents") or []:
75
+ key = obj["Key"]
76
+ if key.endswith("/"):
77
+ continue
78
+ rel = key[len(pfx) :] if key.startswith(pfx) else key
79
+ if not rel:
80
+ continue
81
+ dest = Path(local_dir) / rel
82
+ dest.parent.mkdir(parents=True, exist_ok=True)
83
+ cli.download_file(bucket, key, str(dest))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.3.14
3
+ Version: 0.3.16
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT
@@ -22,11 +22,20 @@ Provides-Extra: output
22
22
  Requires-Dist: boto3>=1.28; extra == "output"
23
23
  Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "output"
24
24
  Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "output"
25
+ Provides-Extra: autogluon
26
+ Requires-Dist: autogluon<2.0,>=1.0; extra == "autogluon"
27
+ Requires-Dist: pandas>=1.3.0; extra == "autogluon"
28
+ Requires-Dist: boto3>=1.28; extra == "autogluon"
29
+ Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "autogluon"
30
+ Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "autogluon"
31
+ Requires-Dist: pyarrow>=10.0.0; python_version >= "3.8" and extra == "autogluon"
25
32
  Provides-Extra: full
26
33
  Requires-Dist: scipy>=1.5.0; extra == "full"
27
34
  Requires-Dist: boto3>=1.28; extra == "full"
28
35
  Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "full"
29
36
  Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "full"
37
+ Requires-Dist: autogluon<2.0,>=1.0; extra == "full"
38
+ Requires-Dist: pyarrow>=10.0.0; python_version >= "3.8" and extra == "full"
30
39
 
31
40
  # Batch Analytics
32
41
 
@@ -16,6 +16,8 @@ src/batch_analytics.egg-info/requires.txt
16
16
  src/batch_analytics.egg-info/top_level.txt
17
17
  src/batch_analytics/analytics/__init__.py
18
18
  src/batch_analytics/analytics/correlation.py
19
+ src/batch_analytics/analytics/gluon_autogluon_infer.py
20
+ src/batch_analytics/analytics/gluon_autogluon_train.py
19
21
  src/batch_analytics/analytics/linear_regression.py
20
22
  src/batch_analytics/analytics/pca_clustering.py
21
23
  src/batch_analytics/analytics/t_test.py
@@ -23,4 +25,6 @@ src/batch_analytics/output/__init__.py
23
25
  src/batch_analytics/output/base.py
24
26
  src/batch_analytics/output/clickhouse.py
25
27
  src/batch_analytics/output/local.py
26
- src/batch_analytics/output/s3.py
28
+ src/batch_analytics/output/s3.py
29
+ src/batch_analytics/utils/__init__.py
30
+ src/batch_analytics/utils/gluon_autogluon_common.py
@@ -2,6 +2,20 @@ pyspark<3.6,>=3.4
2
2
  numpy>=1.19.0
3
3
  scipy>=1.5.0
4
4
 
5
+ [autogluon]
6
+ autogluon<2.0,>=1.0
7
+ pandas>=1.3.0
8
+ boto3>=1.28
9
+
10
+ [autogluon:python_version < "3.9"]
11
+ clickhouse-connect<0.9,>=0.7
12
+
13
+ [autogluon:python_version >= "3.8"]
14
+ pyarrow>=10.0.0
15
+
16
+ [autogluon:python_version >= "3.9"]
17
+ clickhouse-connect>=0.7
18
+
5
19
  [clickhouse]
6
20
 
7
21
  [clickhouse:python_version < "3.9"]
@@ -16,10 +30,14 @@ pytest>=7.0
16
30
  [full]
17
31
  scipy>=1.5.0
18
32
  boto3>=1.28
33
+ autogluon<2.0,>=1.0
19
34
 
20
35
  [full:python_version < "3.9"]
21
36
  clickhouse-connect<0.9,>=0.7
22
37
 
38
+ [full:python_version >= "3.8"]
39
+ pyarrow>=10.0.0
40
+
23
41
  [full:python_version >= "3.9"]
24
42
  clickhouse-connect>=0.7
25
43