batch-analytics 0.3.14__tar.gz → 0.3.15__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {batch_analytics-0.3.14 → batch_analytics-0.3.15}/PKG-INFO +10 -1
- {batch_analytics-0.3.14 → batch_analytics-0.3.15}/pyproject.toml +12 -1
- batch_analytics-0.3.15/src/batch_analytics/analytics/gluon_autogluon_infer.py +116 -0
- batch_analytics-0.3.15/src/batch_analytics/analytics/gluon_autogluon_train.py +114 -0
- batch_analytics-0.3.15/src/batch_analytics/utils/__init__.py +1 -0
- batch_analytics-0.3.15/src/batch_analytics/utils/gluon_autogluon_common.py +83 -0
- {batch_analytics-0.3.14 → batch_analytics-0.3.15}/src/batch_analytics.egg-info/PKG-INFO +10 -1
- {batch_analytics-0.3.14 → batch_analytics-0.3.15}/src/batch_analytics.egg-info/SOURCES.txt +5 -1
- {batch_analytics-0.3.14 → batch_analytics-0.3.15}/src/batch_analytics.egg-info/requires.txt +18 -0
- {batch_analytics-0.3.14 → batch_analytics-0.3.15}/README.md +0 -0
- {batch_analytics-0.3.14 → batch_analytics-0.3.15}/setup.cfg +0 -0
- {batch_analytics-0.3.14 → batch_analytics-0.3.15}/src/batch_analytics/__init__.py +0 -0
- {batch_analytics-0.3.14 → batch_analytics-0.3.15}/src/batch_analytics/__main__.py +0 -0
- {batch_analytics-0.3.14 → batch_analytics-0.3.15}/src/batch_analytics/analytics/__init__.py +0 -0
- {batch_analytics-0.3.14 → batch_analytics-0.3.15}/src/batch_analytics/analytics/correlation.py +0 -0
- {batch_analytics-0.3.14 → batch_analytics-0.3.15}/src/batch_analytics/analytics/linear_regression.py +0 -0
- {batch_analytics-0.3.14 → batch_analytics-0.3.15}/src/batch_analytics/analytics/pca_clustering.py +0 -0
- {batch_analytics-0.3.14 → batch_analytics-0.3.15}/src/batch_analytics/analytics/t_test.py +0 -0
- {batch_analytics-0.3.14 → batch_analytics-0.3.15}/src/batch_analytics/config.py +0 -0
- {batch_analytics-0.3.14 → batch_analytics-0.3.15}/src/batch_analytics/extract.py +0 -0
- {batch_analytics-0.3.14 → batch_analytics-0.3.15}/src/batch_analytics/job_runner.py +0 -0
- {batch_analytics-0.3.14 → batch_analytics-0.3.15}/src/batch_analytics/log.py +0 -0
- {batch_analytics-0.3.14 → batch_analytics-0.3.15}/src/batch_analytics/modules.py +0 -0
- {batch_analytics-0.3.14 → batch_analytics-0.3.15}/src/batch_analytics/output/__init__.py +0 -0
- {batch_analytics-0.3.14 → batch_analytics-0.3.15}/src/batch_analytics/output/base.py +0 -0
- {batch_analytics-0.3.14 → batch_analytics-0.3.15}/src/batch_analytics/output/clickhouse.py +0 -0
- {batch_analytics-0.3.14 → batch_analytics-0.3.15}/src/batch_analytics/output/local.py +0 -0
- {batch_analytics-0.3.14 → batch_analytics-0.3.15}/src/batch_analytics/output/s3.py +0 -0
- {batch_analytics-0.3.14 → batch_analytics-0.3.15}/src/batch_analytics/transform.py +0 -0
- {batch_analytics-0.3.14 → batch_analytics-0.3.15}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
- {batch_analytics-0.3.14 → batch_analytics-0.3.15}/src/batch_analytics.egg-info/entry_points.txt +0 -0
- {batch_analytics-0.3.14 → batch_analytics-0.3.15}/src/batch_analytics.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: batch-analytics
|
|
3
|
-
Version: 0.3.14
|
|
3
|
+
Version: 0.3.15
|
|
4
4
|
Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
|
|
5
5
|
Author: Litewave Analytics Team
|
|
6
6
|
License: MIT
|
|
@@ -22,11 +22,20 @@ Provides-Extra: output
|
|
|
22
22
|
Requires-Dist: boto3>=1.28; extra == "output"
|
|
23
23
|
Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "output"
|
|
24
24
|
Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "output"
|
|
25
|
+
Provides-Extra: autogluon
|
|
26
|
+
Requires-Dist: autogluon<2.0,>=1.0; extra == "autogluon"
|
|
27
|
+
Requires-Dist: pandas>=1.3.0; extra == "autogluon"
|
|
28
|
+
Requires-Dist: boto3>=1.28; extra == "autogluon"
|
|
29
|
+
Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "autogluon"
|
|
30
|
+
Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "autogluon"
|
|
31
|
+
Requires-Dist: pyarrow>=10.0.0; python_version >= "3.8" and extra == "autogluon"
|
|
25
32
|
Provides-Extra: full
|
|
26
33
|
Requires-Dist: scipy>=1.5.0; extra == "full"
|
|
27
34
|
Requires-Dist: boto3>=1.28; extra == "full"
|
|
28
35
|
Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "full"
|
|
29
36
|
Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "full"
|
|
37
|
+
Requires-Dist: autogluon<2.0,>=1.0; extra == "full"
|
|
38
|
+
Requires-Dist: pyarrow>=10.0.0; python_version >= "3.8" and extra == "full"
|
|
30
39
|
|
|
31
40
|
# Batch Analytics
|
|
32
41
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "batch-analytics"
|
|
7
|
-
version = "0.3.14"
|
|
7
|
+
version = "0.3.15"
|
|
8
8
|
description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.8"
|
|
@@ -32,12 +32,23 @@ output = [
|
|
|
32
32
|
"clickhouse-connect>=0.7,<0.9; python_version < '3.9'",
|
|
33
33
|
"clickhouse-connect>=0.7; python_version >= '3.9'",
|
|
34
34
|
]
|
|
35
|
+
# AutoGluon Tabular train/infer (POST /submit/train, /submit/inf on analytics_runner)
|
|
36
|
+
autogluon = [
|
|
37
|
+
"autogluon>=1.0,<2.0",
|
|
38
|
+
"pandas>=1.3.0",
|
|
39
|
+
"boto3>=1.28",
|
|
40
|
+
"clickhouse-connect>=0.7,<0.9; python_version < '3.9'",
|
|
41
|
+
"clickhouse-connect>=0.7; python_version >= '3.9'",
|
|
42
|
+
"pyarrow>=10.0.0; python_version >= '3.8'",
|
|
43
|
+
]
|
|
35
44
|
# Install all optional runtime deps used anywhere in the package
|
|
36
45
|
full = [
|
|
37
46
|
"scipy>=1.5.0",
|
|
38
47
|
"boto3>=1.28",
|
|
39
48
|
"clickhouse-connect>=0.7,<0.9; python_version < '3.9'",
|
|
40
49
|
"clickhouse-connect>=0.7; python_version >= '3.9'",
|
|
50
|
+
"autogluon>=1.0,<2.0",
|
|
51
|
+
"pyarrow>=10.0.0; python_version >= '3.8'",
|
|
41
52
|
]
|
|
42
53
|
|
|
43
54
|
[project.scripts]
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Load AutoGluon TabularPredictor from S3; score ClickHouse rows; write predictions to output.
|
|
3
|
+
|
|
4
|
+
Env: CLICKHOUSE_*, BATCH_STAGING_TABLE (inference feature table), MODEL_S3_PREFIX,
|
|
5
|
+
OUTPUT_TYPE, OUTPUT_CLICKHOUSE_DATABASE, OUTPUT_CLICKHOUSE_TABLE (clickhouse),
|
|
6
|
+
or OUTPUT_S3_PATH (s3 parquet)
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import logging
|
|
12
|
+
import os
|
|
13
|
+
import shutil
|
|
14
|
+
import sys
|
|
15
|
+
import tempfile
|
|
16
|
+
|
|
17
|
+
import clickhouse_connect
|
|
18
|
+
import pandas as pd
|
|
19
|
+
|
|
20
|
+
try:
|
|
21
|
+
from batch_analytics.utils.gluon_autogluon_common import (
|
|
22
|
+
clickhouse_full_table,
|
|
23
|
+
download_s3_prefix_to_dir,
|
|
24
|
+
parse_s3_uri,
|
|
25
|
+
)
|
|
26
|
+
except ImportError:
|
|
27
|
+
_pkg_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
|
28
|
+
if _pkg_root not in sys.path:
|
|
29
|
+
sys.path.insert(0, _pkg_root)
|
|
30
|
+
from utils.gluon_autogluon_common import ( # noqa: E402
|
|
31
|
+
clickhouse_full_table,
|
|
32
|
+
download_s3_prefix_to_dir,
|
|
33
|
+
parse_s3_uri,
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
|
|
37
|
+
logger = logging.getLogger(__name__)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _require(name: str) -> str:
    """Return the whitespace-stripped value of environment variable ``name``.

    When the variable is unset or blank, an error is logged and the process
    terminates with exit status 2.
    """
    raw = os.environ.get(name, "")
    value = raw.strip()
    if value:
        return value
    logger.error("Missing required env var: %s", name)
    sys.exit(2)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def main() -> None:
    """Score the inference table with a stored AutoGluon model and persist the result.

    Reads its configuration from the environment, loads every row of the
    inference table (``BATCH_STAGING_TABLE``) from ClickHouse, downloads the
    model artifacts found under ``MODEL_S3_PREFIX``, predicts, and writes the
    input rows plus a ``prediction`` column either into a ClickHouse table or
    to ``predictions.parquet`` under ``OUTPUT_S3_PATH``.

    Exit statuses: 2 for a missing or invalid environment variable, 4 when
    autogluon cannot be imported. An empty inference table is not an error;
    a warning is logged and nothing is written.

    Raises:
        ValueError: if ``OUTPUT_S3_PATH`` is not an ``s3://`` URI.
    """
    model_prefix = _require("MODEL_S3_PREFIX")
    out_type = os.environ.get("OUTPUT_TYPE", "clickhouse").strip().lower()
    if out_type not in ("clickhouse", "s3"):
        logger.error("OUTPUT_TYPE must be clickhouse or s3, got %r", out_type)
        sys.exit(2)

    host = _require("CLICKHOUSE_HOST")
    port = int(os.environ.get("CLICKHOUSE_HTTP_PORT", "8123"))
    database = os.environ.get("CLICKHOUSE_DB", "default").strip() or "default"
    user = os.environ.get("CLICKHOUSE_USER", "default")
    password = os.environ.get("CLICKHOUSE_PASSWORD", "")
    inference_table = _require("BATCH_STAGING_TABLE")
    full_table = clickhouse_full_table(database, inference_table)

    sql = f"SELECT * FROM {full_table}"
    logger.info("Loading rows to score: %s", sql)
    client = clickhouse_connect.get_client(
        host=host,
        port=port,
        username=user,
        password=password or None,
        database=database,
    )
    # The client is needed for the read and (optionally) the final insert, so
    # it is released in one outer ``finally`` that also covers sys.exit paths.
    try:
        df = client.query_df(sql)
        if df.empty:
            logger.warning("Inference input is empty; nothing to write")
            return

        try:
            from autogluon.tabular import TabularPredictor
        except ImportError:
            logger.exception("autogluon is not installed; use pip install 'batch-analytics[autogluon]'")
            sys.exit(4)

        # Resolve the output destination BEFORE downloading the model and
        # predicting, so a misconfigured job fails fast instead of after the
        # expensive work has already been done.
        out_full = None
        bucket = None
        parquet_key = None
        if out_type == "clickhouse":
            odb = _require("OUTPUT_CLICKHOUSE_DATABASE")
            otbl = _require("OUTPUT_CLICKHOUSE_TABLE")
            out_full = clickhouse_full_table(odb, otbl)
        else:
            path = _require("OUTPUT_S3_PATH")
            bucket, key = parse_s3_uri(path.rstrip("/") + "/")
            key = key.rstrip("/")
            if key:
                key = key + "/"
            parquet_key = key + "predictions.parquet"

        local_model = tempfile.mkdtemp(prefix="ag_infer_")
        try:
            logger.info("Downloading model from %s", model_prefix)
            download_s3_prefix_to_dir(model_prefix, local_model)
            predictor = TabularPredictor.load(local_model)
            y_pred = predictor.predict(df)
            out_df = df.copy()
            out_df["prediction"] = pd.Series(y_pred, index=df.index)

            if out_type == "clickhouse":
                logger.info("Inserting %s rows into %s", len(out_df), out_full)
                client.insert_df(out_full, out_df)
            else:
                # The parquet file is staged inside the temporary model
                # directory so the ``finally`` below removes it as well.
                tmp_parquet = os.path.join(local_model, "predictions.parquet")
                out_df.to_parquet(tmp_parquet, index=False)
                import boto3

                boto3.client("s3").upload_file(tmp_parquet, bucket, parquet_key)
                logger.info("Wrote s3://%s/%s", bucket, parquet_key)
        finally:
            shutil.rmtree(local_model, ignore_errors=True)
    finally:
        client.close()
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
if __name__ == "__main__":
|
|
116
|
+
main()
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Train AutoGluon TabularPredictor from ClickHouse staging data; upload artifacts to S3.
|
|
3
|
+
|
|
4
|
+
Env (injected by analytics_runner Gluon job):
|
|
5
|
+
CLICKHOUSE_*, BATCH_STAGING_TABLE, MODEL_S3_PREFIX, TASK_ID,
|
|
6
|
+
AUTOGLUON_LABEL, AUTOGLUON_FEATURES, AUTOGLUON_PROBLEM_TYPE, AUTOGLUON_TIME_LIMIT,
|
|
7
|
+
optional AUTOGLUON_MAX_ROWS
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import logging
|
|
13
|
+
import os
|
|
14
|
+
import shutil
|
|
15
|
+
import sys
|
|
16
|
+
|
|
17
|
+
import clickhouse_connect
|
|
18
|
+
|
|
19
|
+
try:
|
|
20
|
+
from batch_analytics.utils.gluon_autogluon_common import (
|
|
21
|
+
clickhouse_full_table,
|
|
22
|
+
local_training_dir,
|
|
23
|
+
upload_directory_to_s3,
|
|
24
|
+
)
|
|
25
|
+
except ImportError:
|
|
26
|
+
_pkg_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
|
27
|
+
if _pkg_root not in sys.path:
|
|
28
|
+
sys.path.insert(0, _pkg_root)
|
|
29
|
+
from utils.gluon_autogluon_common import ( # noqa: E402
|
|
30
|
+
clickhouse_full_table,
|
|
31
|
+
local_training_dir,
|
|
32
|
+
upload_directory_to_s3,
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
|
|
36
|
+
logger = logging.getLogger(__name__)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _require(name: str) -> str:
    """Fetch a mandatory environment variable, stripped of surrounding whitespace.

    A missing or blank variable is reported through the module logger and
    ends the process with exit status 2.
    """
    candidate = os.environ.get(name, "").strip()
    if candidate == "":
        logger.error("Missing required env var: %s", name)
        sys.exit(2)
    return candidate
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def main() -> None:
    """Train an AutoGluon ``TabularPredictor`` on ClickHouse data and upload it to S3.

    Reads its configuration from the environment, loads the staging table
    (``BATCH_STAGING_TABLE``, optionally capped by ``AUTOGLUON_MAX_ROWS``),
    fits a predictor on the ``AUTOGLUON_FEATURES`` columns with
    ``AUTOGLUON_LABEL`` as the target, and uploads the resulting model
    directory to ``MODEL_S3_PREFIX``.

    Exit statuses: 2 for a missing or invalid environment variable, 3 when
    the training data is empty or lacks a requested column, 4 when autogluon
    cannot be imported.
    """
    model_prefix = _require("MODEL_S3_PREFIX")
    label = _require("AUTOGLUON_LABEL")
    features_raw = _require("AUTOGLUON_FEATURES")
    # Drop blanks, the label itself and repeated names (order preserved):
    # selecting a column twice would hand AutoGluon duplicate column labels.
    feature_list = list(
        dict.fromkeys(
            name
            for name in (part.strip() for part in features_raw.split(","))
            if name and name != label
        )
    )
    if not feature_list:
        logger.error("AUTOGLUON_FEATURES contains no usable feature columns: %r", features_raw)
        sys.exit(2)

    host = _require("CLICKHOUSE_HOST")
    port = int(os.environ.get("CLICKHOUSE_HTTP_PORT", "8123"))
    database = os.environ.get("CLICKHOUSE_DB", "default").strip() or "default"
    user = os.environ.get("CLICKHOUSE_USER", "default")
    password = os.environ.get("CLICKHOUSE_PASSWORD", "")
    staging_table = _require("BATCH_STAGING_TABLE")
    full_table = clickhouse_full_table(database, staging_table)

    max_rows = os.environ.get("AUTOGLUON_MAX_ROWS", "").strip()
    limit_sql = ""
    if max_rows:
        # Validate here so a bad value is a clear configuration error rather
        # than a traceback or a SQL syntax error from the server.
        try:
            row_cap = int(max_rows)
        except ValueError:
            row_cap = 0
        if row_cap <= 0:
            logger.error("AUTOGLUON_MAX_ROWS must be a positive integer, got %r", max_rows)
            sys.exit(2)
        limit_sql = f" LIMIT {row_cap}"
    sql = f"SELECT * FROM {full_table}{limit_sql}"

    logger.info("Loading training data: %s", sql)
    client = clickhouse_connect.get_client(
        host=host,
        port=port,
        username=user,
        password=password or None,
        database=database,
    )
    # The connection is only needed for this one read; release it right away.
    try:
        df = client.query_df(sql)
    finally:
        client.close()

    if df.empty:
        logger.error("Training data is empty: %s", sql)
        sys.exit(3)

    missing = [c for c in feature_list + [label] if c not in df.columns]
    if missing:
        logger.error("Columns missing from training data: %s (have: %s)", missing, list(df.columns))
        sys.exit(3)

    try:
        from autogluon.tabular import TabularPredictor
    except ImportError:
        logger.exception("autogluon is not installed; use pip install 'batch-analytics[autogluon]'")
        sys.exit(4)

    problem_type = os.environ.get("AUTOGLUON_PROBLEM_TYPE", "binary").strip() or "binary"
    time_limit = int(os.environ.get("AUTOGLUON_TIME_LIMIT", "300"))

    # Start from an empty model directory so artifacts left by an earlier
    # run are not uploaded together with the new model.
    local_dir = local_training_dir()
    if os.path.isdir(local_dir):
        shutil.rmtree(local_dir)
    os.makedirs(local_dir, exist_ok=True)

    train_df = df[feature_list + [label]]
    logger.info(
        "Fitting TabularPredictor problem_type=%s time_limit=%ss rows=%s",
        problem_type,
        time_limit,
        len(train_df),
    )
    predictor = TabularPredictor(
        label=label,
        problem_type=problem_type,
        path=local_dir,
    )
    predictor.fit(train_df, time_limit=time_limit)

    logger.info("Uploading model artifacts to %s", model_prefix)
    upload_directory_to_s3(local_dir, model_prefix)
    logger.info("Train finished; task_id=%s", os.environ.get("TASK_ID", ""))
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
if __name__ == "__main__":
|
|
114
|
+
main()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Utilities shared across batch_analytics (e.g. Gluon / AutoGluon helpers)."""
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""Shared helpers for AutoGluon train/infer Gluon jobs (S3 + table naming)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Tuple
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def parse_s3_uri(uri: str) -> Tuple[str, str]:
    """Split ``s3://bucket/key/prefix`` into ``(bucket, key_prefix)``.

    Surrounding whitespace on ``uri`` is ignored. The key prefix has no
    leading slash, keeps any trailing slash, and is ``""`` when the URI names
    only a bucket.

    Raises:
        ValueError: if ``uri`` does not start with ``s3://`` or its bucket
            part is empty (for example ``s3://`` or ``s3:///key``).
    """
    u = (uri or "").strip()
    if not u.startswith("s3://"):
        raise ValueError(f"Not an s3 URI: {uri!r}")
    # partition() also covers the bucket-only form: the key is then "".
    bucket, _, key = u[len("s3://"):].partition("/")
    if not bucket:
        # An empty bucket would otherwise only fail later, inside boto3.
        raise ValueError(f"Not an s3 URI: {uri!r}")
    return bucket, key
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def clickhouse_full_table(database: str, table: str) -> str:
    """Return a ``db.table`` name, qualifying ``table`` with ``database`` when needed.

    A blank ``table`` is returned as an empty string. A name that contains a
    dot and no single quote is treated as already qualified and is returned
    stripped. Anything else is prefixed with ``database``, or with
    ``default`` when ``database`` is blank.
    """
    name = table.strip() if table else ""
    if name == "":
        return name
    already_qualified = "." in name and "'" not in name
    if already_qualified:
        return name
    schema = database.strip() if database else ""
    return ".".join((schema or "default", name))
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def local_training_dir() -> str:
    """Return the local directory where the trained model is written.

    The value comes from ``AUTOGLUON_LOCAL_MODEL_DIR``. An unset or blank
    variable falls back to ``/tmp/autogluon_model``, so a blank setting can
    no longer produce an empty path that breaks directory creation.
    """
    configured = os.environ.get("AUTOGLUON_LOCAL_MODEL_DIR", "").strip()
    return configured or "/tmp/autogluon_model"
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def boto3_client():
    """Create and return a boto3 client for the ``s3`` service."""
    # boto3 is imported on first use rather than at module import time.
    import boto3 as _boto3

    s3 = _boto3.client("s3")
    return s3
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def upload_directory_to_s3(local_dir: str, s3_dir_uri: str) -> None:
    """Copy every regular file below ``local_dir`` to the S3 location ``s3_dir_uri``.

    ``s3_dir_uri`` is treated as a directory whether or not it ends with
    ``/``. Each object key is the URI's key prefix followed by the file's
    POSIX-style path relative to ``local_dir``.
    """
    bucket, raw_prefix = parse_s3_uri(s3_dir_uri.rstrip("/") + "/")
    trimmed = raw_prefix.rstrip("/")
    # A bucket-only URI gives an empty prefix; otherwise end it with one "/".
    key_prefix = f"{trimmed}/" if trimmed else ""
    s3 = boto3_client()
    base = Path(local_dir)
    regular_files = (entry for entry in base.rglob("*") if entry.is_file())
    for file_path in regular_files:
        object_key = key_prefix + file_path.relative_to(base).as_posix()
        s3.upload_file(str(file_path), bucket, object_key)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def download_s3_prefix_to_dir(s3_dir_uri: str, local_dir: str) -> None:
    """Download all objects under the S3 prefix implied by ``s3_dir_uri`` into ``local_dir``.

    ``s3_dir_uri`` is treated as a directory whether or not it ends with
    ``/``. ``local_dir`` is created if needed, and each object is stored at
    its key's path relative to the prefix. Keys ending in ``/`` are skipped.

    Raises:
        ValueError: if an object key would resolve to a location outside
            ``local_dir`` (for example a key containing ``..`` or an empty
            path segment that makes the relative path absolute).
    """
    bucket, pfx = parse_s3_uri(s3_dir_uri.rstrip("/") + "/")
    pfx = pfx.rstrip("/")
    if pfx:
        pfx = pfx + "/"
    cli = boto3_client()
    os.makedirs(local_dir, exist_ok=True)
    root = Path(local_dir).resolve()
    paginator = cli.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=pfx):
        for obj in page.get("Contents") or []:
            key = obj["Key"]
            if key.endswith("/"):
                continue
            rel = key[len(pfx):] if key.startswith(pfx) else key
            if not rel:
                continue
            # Object keys are external input. Joining an absolute ``rel``
            # discards ``root`` and ``..`` climbs out of it, so the resolved
            # target must still lie inside the destination directory.
            dest = (root / rel).resolve()
            if root not in dest.parents:
                raise ValueError(
                    f"S3 key {key!r} resolves outside the target directory {local_dir!r}"
                )
            dest.parent.mkdir(parents=True, exist_ok=True)
            cli.download_file(bucket, key, str(dest))
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: batch-analytics
|
|
3
|
-
Version: 0.3.14
|
|
3
|
+
Version: 0.3.15
|
|
4
4
|
Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
|
|
5
5
|
Author: Litewave Analytics Team
|
|
6
6
|
License: MIT
|
|
@@ -22,11 +22,20 @@ Provides-Extra: output
|
|
|
22
22
|
Requires-Dist: boto3>=1.28; extra == "output"
|
|
23
23
|
Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "output"
|
|
24
24
|
Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "output"
|
|
25
|
+
Provides-Extra: autogluon
|
|
26
|
+
Requires-Dist: autogluon<2.0,>=1.0; extra == "autogluon"
|
|
27
|
+
Requires-Dist: pandas>=1.3.0; extra == "autogluon"
|
|
28
|
+
Requires-Dist: boto3>=1.28; extra == "autogluon"
|
|
29
|
+
Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "autogluon"
|
|
30
|
+
Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "autogluon"
|
|
31
|
+
Requires-Dist: pyarrow>=10.0.0; python_version >= "3.8" and extra == "autogluon"
|
|
25
32
|
Provides-Extra: full
|
|
26
33
|
Requires-Dist: scipy>=1.5.0; extra == "full"
|
|
27
34
|
Requires-Dist: boto3>=1.28; extra == "full"
|
|
28
35
|
Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "full"
|
|
29
36
|
Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "full"
|
|
37
|
+
Requires-Dist: autogluon<2.0,>=1.0; extra == "full"
|
|
38
|
+
Requires-Dist: pyarrow>=10.0.0; python_version >= "3.8" and extra == "full"
|
|
30
39
|
|
|
31
40
|
# Batch Analytics
|
|
32
41
|
|
|
@@ -16,6 +16,8 @@ src/batch_analytics.egg-info/requires.txt
|
|
|
16
16
|
src/batch_analytics.egg-info/top_level.txt
|
|
17
17
|
src/batch_analytics/analytics/__init__.py
|
|
18
18
|
src/batch_analytics/analytics/correlation.py
|
|
19
|
+
src/batch_analytics/analytics/gluon_autogluon_infer.py
|
|
20
|
+
src/batch_analytics/analytics/gluon_autogluon_train.py
|
|
19
21
|
src/batch_analytics/analytics/linear_regression.py
|
|
20
22
|
src/batch_analytics/analytics/pca_clustering.py
|
|
21
23
|
src/batch_analytics/analytics/t_test.py
|
|
@@ -23,4 +25,6 @@ src/batch_analytics/output/__init__.py
|
|
|
23
25
|
src/batch_analytics/output/base.py
|
|
24
26
|
src/batch_analytics/output/clickhouse.py
|
|
25
27
|
src/batch_analytics/output/local.py
|
|
26
|
-
src/batch_analytics/output/s3.py
|
|
28
|
+
src/batch_analytics/output/s3.py
|
|
29
|
+
src/batch_analytics/utils/__init__.py
|
|
30
|
+
src/batch_analytics/utils/gluon_autogluon_common.py
|
|
@@ -2,6 +2,20 @@ pyspark<3.6,>=3.4
|
|
|
2
2
|
numpy>=1.19.0
|
|
3
3
|
scipy>=1.5.0
|
|
4
4
|
|
|
5
|
+
[autogluon]
|
|
6
|
+
autogluon<2.0,>=1.0
|
|
7
|
+
pandas>=1.3.0
|
|
8
|
+
boto3>=1.28
|
|
9
|
+
|
|
10
|
+
[autogluon:python_version < "3.9"]
|
|
11
|
+
clickhouse-connect<0.9,>=0.7
|
|
12
|
+
|
|
13
|
+
[autogluon:python_version >= "3.8"]
|
|
14
|
+
pyarrow>=10.0.0
|
|
15
|
+
|
|
16
|
+
[autogluon:python_version >= "3.9"]
|
|
17
|
+
clickhouse-connect>=0.7
|
|
18
|
+
|
|
5
19
|
[clickhouse]
|
|
6
20
|
|
|
7
21
|
[clickhouse:python_version < "3.9"]
|
|
@@ -16,10 +30,14 @@ pytest>=7.0
|
|
|
16
30
|
[full]
|
|
17
31
|
scipy>=1.5.0
|
|
18
32
|
boto3>=1.28
|
|
33
|
+
autogluon<2.0,>=1.0
|
|
19
34
|
|
|
20
35
|
[full:python_version < "3.9"]
|
|
21
36
|
clickhouse-connect<0.9,>=0.7
|
|
22
37
|
|
|
38
|
+
[full:python_version >= "3.8"]
|
|
39
|
+
pyarrow>=10.0.0
|
|
40
|
+
|
|
23
41
|
[full:python_version >= "3.9"]
|
|
24
42
|
clickhouse-connect>=0.7
|
|
25
43
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.14 → batch_analytics-0.3.15}/src/batch_analytics/analytics/correlation.py
RENAMED
|
File without changes
|
{batch_analytics-0.3.14 → batch_analytics-0.3.15}/src/batch_analytics/analytics/linear_regression.py
RENAMED
|
File without changes
|
{batch_analytics-0.3.14 → batch_analytics-0.3.15}/src/batch_analytics/analytics/pca_clustering.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.14 → batch_analytics-0.3.15}/src/batch_analytics.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{batch_analytics-0.3.14 → batch_analytics-0.3.15}/src/batch_analytics.egg-info/entry_points.txt
RENAMED
|
File without changes
|
{batch_analytics-0.3.14 → batch_analytics-0.3.15}/src/batch_analytics.egg-info/top_level.txt
RENAMED
|
File without changes
|