batch-analytics 0.3.17__tar.gz → 0.3.20__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {batch_analytics-0.3.17 → batch_analytics-0.3.20}/PKG-INFO +23 -16
- {batch_analytics-0.3.17 → batch_analytics-0.3.20}/README.md +6 -5
- {batch_analytics-0.3.17 → batch_analytics-0.3.20}/pyproject.toml +18 -16
- batch_analytics-0.3.20/src/batch_analytics/__init__.py +63 -0
- {batch_analytics-0.3.17 → batch_analytics-0.3.20}/src/batch_analytics/__main__.py +1 -0
- {batch_analytics-0.3.17 → batch_analytics-0.3.20}/src/batch_analytics.egg-info/PKG-INFO +23 -16
- batch_analytics-0.3.20/src/batch_analytics.egg-info/requires.txt +60 -0
- batch_analytics-0.3.17/src/batch_analytics/__init__.py +0 -52
- batch_analytics-0.3.17/src/batch_analytics.egg-info/requires.txt +0 -33
- {batch_analytics-0.3.17 → batch_analytics-0.3.20}/setup.cfg +0 -0
- {batch_analytics-0.3.17 → batch_analytics-0.3.20}/src/batch_analytics/analytics/__init__.py +0 -0
- {batch_analytics-0.3.17 → batch_analytics-0.3.20}/src/batch_analytics/analytics/correlation.py +0 -0
- {batch_analytics-0.3.17 → batch_analytics-0.3.20}/src/batch_analytics/analytics/gluon_autogluon_infer.py +0 -0
- {batch_analytics-0.3.17 → batch_analytics-0.3.20}/src/batch_analytics/analytics/gluon_autogluon_train.py +0 -0
- {batch_analytics-0.3.17 → batch_analytics-0.3.20}/src/batch_analytics/analytics/linear_regression.py +0 -0
- {batch_analytics-0.3.17 → batch_analytics-0.3.20}/src/batch_analytics/analytics/pca_clustering.py +0 -0
- {batch_analytics-0.3.17 → batch_analytics-0.3.20}/src/batch_analytics/analytics/t_test.py +0 -0
- {batch_analytics-0.3.17 → batch_analytics-0.3.20}/src/batch_analytics/config.py +0 -0
- {batch_analytics-0.3.17 → batch_analytics-0.3.20}/src/batch_analytics/extract.py +0 -0
- {batch_analytics-0.3.17 → batch_analytics-0.3.20}/src/batch_analytics/job_runner.py +0 -0
- {batch_analytics-0.3.17 → batch_analytics-0.3.20}/src/batch_analytics/log.py +0 -0
- {batch_analytics-0.3.17 → batch_analytics-0.3.20}/src/batch_analytics/modules.py +0 -0
- {batch_analytics-0.3.17 → batch_analytics-0.3.20}/src/batch_analytics/output/__init__.py +0 -0
- {batch_analytics-0.3.17 → batch_analytics-0.3.20}/src/batch_analytics/output/base.py +0 -0
- {batch_analytics-0.3.17 → batch_analytics-0.3.20}/src/batch_analytics/output/clickhouse.py +0 -0
- {batch_analytics-0.3.17 → batch_analytics-0.3.20}/src/batch_analytics/output/local.py +0 -0
- {batch_analytics-0.3.17 → batch_analytics-0.3.20}/src/batch_analytics/output/s3.py +0 -0
- {batch_analytics-0.3.17 → batch_analytics-0.3.20}/src/batch_analytics/transform.py +0 -0
- {batch_analytics-0.3.17 → batch_analytics-0.3.20}/src/batch_analytics/utils/__init__.py +0 -0
- {batch_analytics-0.3.17 → batch_analytics-0.3.20}/src/batch_analytics/utils/gluon_autogluon_common.py +0 -0
- {batch_analytics-0.3.17 → batch_analytics-0.3.20}/src/batch_analytics.egg-info/SOURCES.txt +0 -0
- {batch_analytics-0.3.17 → batch_analytics-0.3.20}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
- {batch_analytics-0.3.17 → batch_analytics-0.3.20}/src/batch_analytics.egg-info/entry_points.txt +0 -0
- {batch_analytics-0.3.17 → batch_analytics-0.3.20}/src/batch_analytics.egg-info/top_level.txt +0 -0
|
@@ -1,37 +1,43 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: batch-analytics
|
|
3
|
-
Version: 0.3.17
|
|
3
|
+
Version: 0.3.20
|
|
4
4
|
Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
|
|
5
5
|
Author: Litewave Analytics Team
|
|
6
6
|
License: MIT
|
|
7
|
-
Requires-Python: >=3.9
|
|
7
|
+
Requires-Python: >=3.8
|
|
8
8
|
Description-Content-Type: text/markdown
|
|
9
|
-
Requires-Dist: pyspark<3.6,>=3.4
|
|
10
|
-
Requires-Dist: numpy>=1.22.0
|
|
11
|
-
Requires-Dist: scipy>=1.5.0
|
|
9
|
+
Requires-Dist: numpy>=1.19.0
|
|
12
10
|
Provides-Extra: dev
|
|
13
11
|
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
12
|
+
Provides-Extra: spark
|
|
13
|
+
Requires-Dist: pyspark<3.6,>=3.4; extra == "spark"
|
|
14
|
+
Requires-Dist: scipy>=1.5.0; extra == "spark"
|
|
14
15
|
Provides-Extra: ttest
|
|
15
16
|
Requires-Dist: scipy>=1.5.0; extra == "ttest"
|
|
16
17
|
Provides-Extra: s3
|
|
17
18
|
Requires-Dist: boto3>=1.28; extra == "s3"
|
|
18
19
|
Provides-Extra: clickhouse
|
|
19
|
-
Requires-Dist: clickhouse-connect>=0.7; extra == "clickhouse"
|
|
20
|
+
Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "clickhouse"
|
|
21
|
+
Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "clickhouse"
|
|
20
22
|
Provides-Extra: output
|
|
21
23
|
Requires-Dist: boto3>=1.28; extra == "output"
|
|
22
|
-
Requires-Dist: clickhouse-connect>=0.7; extra == "output"
|
|
24
|
+
Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "output"
|
|
25
|
+
Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "output"
|
|
23
26
|
Provides-Extra: autogluon
|
|
24
27
|
Requires-Dist: autogluon<2.0,>=1.0; extra == "autogluon"
|
|
25
28
|
Requires-Dist: pandas>=1.3.0; extra == "autogluon"
|
|
26
29
|
Requires-Dist: boto3>=1.28; extra == "autogluon"
|
|
27
|
-
Requires-Dist: clickhouse-connect>=0.7; extra == "autogluon"
|
|
28
|
-
Requires-Dist: pyarrow>=10.0.0; extra == "autogluon"
|
|
30
|
+
Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "autogluon"
|
|
31
|
+
Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "autogluon"
|
|
32
|
+
Requires-Dist: pyarrow>=10.0.0; python_version >= "3.8" and extra == "autogluon"
|
|
29
33
|
Provides-Extra: full
|
|
34
|
+
Requires-Dist: pyspark<3.6,>=3.4; extra == "full"
|
|
30
35
|
Requires-Dist: scipy>=1.5.0; extra == "full"
|
|
31
36
|
Requires-Dist: boto3>=1.28; extra == "full"
|
|
32
|
-
Requires-Dist: clickhouse-connect>=0.7; extra == "full"
|
|
37
|
+
Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "full"
|
|
38
|
+
Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "full"
|
|
33
39
|
Requires-Dist: autogluon<2.0,>=1.0; extra == "full"
|
|
34
|
-
Requires-Dist: pyarrow>=10.0.0; extra == "full"
|
|
40
|
+
Requires-Dist: pyarrow>=10.0.0; python_version >= "3.8" and extra == "full"
|
|
35
41
|
|
|
36
42
|
# Batch Analytics
|
|
37
43
|
|
|
@@ -67,12 +73,13 @@ analytics/
|
|
|
67
73
|
|
|
68
74
|
## Install
|
|
69
75
|
|
|
76
|
+
- `pip install batch-analytics` — core (numpy only)
|
|
77
|
+
- `pip install batch-analytics[spark]` — PySpark ETL + job runner + SciPy (t-test / ANOVA)
|
|
78
|
+
- `pip install batch-analytics[autogluon]` — AutoGluon + I/O (Gluon image; no PySpark)
|
|
79
|
+
- `pip install batch-analytics[full]` — PySpark + AutoGluon + I/O
|
|
80
|
+
|
|
70
81
|
```bash
|
|
71
|
-
pip install -e .
|
|
72
|
-
# or install every runtime dependency used anywhere in the package, then editable:
|
|
73
|
-
pip install -r requirements.txt && pip install -e .
|
|
74
|
-
# PyPI install includes numpy and scipy (t-test); extras: s3, clickhouse, output, full
|
|
75
|
-
pip install "batch-analytics[full]"
|
|
82
|
+
pip install -e ".[spark]" # dev
|
|
76
83
|
```
|
|
77
84
|
|
|
78
85
|
## Run
|
|
@@ -32,12 +32,13 @@ analytics/
|
|
|
32
32
|
|
|
33
33
|
## Install
|
|
34
34
|
|
|
35
|
+
- `pip install batch-analytics` — core (numpy only)
|
|
36
|
+
- `pip install batch-analytics[spark]` — PySpark ETL + job runner + SciPy (t-test / ANOVA)
|
|
37
|
+
- `pip install batch-analytics[autogluon]` — AutoGluon + I/O (Gluon image; no PySpark)
|
|
38
|
+
- `pip install batch-analytics[full]` — PySpark + AutoGluon + I/O
|
|
39
|
+
|
|
35
40
|
```bash
|
|
36
|
-
pip install -e .
|
|
37
|
-
# or install every runtime dependency used anywhere in the package, then editable:
|
|
38
|
-
pip install -r requirements.txt && pip install -e .
|
|
39
|
-
# PyPI install includes numpy and scipy (t-test); extras: s3, clickhouse, output, full
|
|
40
|
-
pip install "batch-analytics[full]"
|
|
41
|
+
pip install -e ".[spark]" # dev
|
|
41
42
|
```
|
|
42
43
|
|
|
43
44
|
## Run
|
|
@@ -4,47 +4,49 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "batch-analytics"
|
|
7
|
-
version = "0.3.17"
|
|
7
|
+
version = "0.3.20"
|
|
8
8
|
description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
|
|
9
9
|
readme = "README.md"
|
|
10
|
-
|
|
11
|
-
requires-python = ">=3.9"
|
|
10
|
+
requires-python = ">=3.8"
|
|
12
11
|
dependencies = [
|
|
13
|
-
"pyspark>=3.4,<3.6",
|
|
14
|
-
"numpy>=1.22.0",
|
|
15
|
-
# Welch t-test (t_test.py); keep on core deps so `pip install batch-analytics` works in minimal driver images
|
|
16
|
-
"scipy>=1.5.0",
|
|
12
|
+
"numpy>=1.19.0",
|
|
17
13
|
]
|
|
18
14
|
authors = [{ name = "Litewave Analytics Team" }]
|
|
19
15
|
license = { text = "MIT" }
|
|
20
16
|
|
|
21
17
|
[project.optional-dependencies]
|
|
22
18
|
dev = ["pytest>=7.0"]
|
|
23
|
-
|
|
19
|
+
spark = [
|
|
20
|
+
"pyspark>=3.4,<3.6",
|
|
21
|
+
"scipy>=1.5.0",
|
|
22
|
+
]
|
|
24
23
|
ttest = ["scipy>=1.5.0"]
|
|
25
24
|
s3 = ["boto3>=1.28"]
|
|
26
25
|
clickhouse = [
|
|
27
|
-
"clickhouse-connect>=0.7",
|
|
26
|
+
"clickhouse-connect>=0.7,<0.9; python_version < '3.9'",
|
|
27
|
+
"clickhouse-connect>=0.7; python_version >= '3.9'",
|
|
28
28
|
]
|
|
29
29
|
output = [
|
|
30
30
|
"boto3>=1.28",
|
|
31
|
-
"clickhouse-connect>=0.7",
|
|
31
|
+
"clickhouse-connect>=0.7,<0.9; python_version < '3.9'",
|
|
32
|
+
"clickhouse-connect>=0.7; python_version >= '3.9'",
|
|
32
33
|
]
|
|
33
|
-
# AutoGluon Tabular train/infer (POST /submit/train, /submit/inf on analytics_runner). Requires Python 3.9+.
|
|
34
34
|
autogluon = [
|
|
35
35
|
"autogluon>=1.0,<2.0",
|
|
36
36
|
"pandas>=1.3.0",
|
|
37
37
|
"boto3>=1.28",
|
|
38
|
-
"clickhouse-connect>=0.7",
|
|
39
|
-
"pyarrow>=10.0.0",
|
|
38
|
+
"clickhouse-connect>=0.7,<0.9; python_version < '3.9'",
|
|
39
|
+
"clickhouse-connect>=0.7; python_version >= '3.9'",
|
|
40
|
+
"pyarrow>=10.0.0; python_version >= '3.8'",
|
|
40
41
|
]
|
|
41
|
-
# Install all optional runtime deps used anywhere in the package
|
|
42
42
|
full = [
|
|
43
|
+
"pyspark>=3.4,<3.6",
|
|
43
44
|
"scipy>=1.5.0",
|
|
44
45
|
"boto3>=1.28",
|
|
45
|
-
"clickhouse-connect>=0.7",
|
|
46
|
+
"clickhouse-connect>=0.7,<0.9; python_version < '3.9'",
|
|
47
|
+
"clickhouse-connect>=0.7; python_version >= '3.9'",
|
|
46
48
|
"autogluon>=1.0,<2.0",
|
|
47
|
-
"pyarrow>=10.0.0",
|
|
49
|
+
"pyarrow>=10.0.0; python_version >= '3.8'",
|
|
48
50
|
]
|
|
49
51
|
|
|
50
52
|
[project.scripts]
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Batch analytics pipeline: Extract, Transform, Log stages + analytics modules.
|
|
3
|
+
|
|
4
|
+
PySpark is optional: install ``batch-analytics[spark]``. Gluon image: ``[autogluon]`` only.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import importlib
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
from .config import BatchAnalyticsConfig, SparkK8sConfig
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"BatchAnalyticsConfig",
|
|
16
|
+
"SparkK8sConfig",
|
|
17
|
+
"expand_kv_blob_column",
|
|
18
|
+
"extract_anchor_id",
|
|
19
|
+
"extract_all",
|
|
20
|
+
"extract_table",
|
|
21
|
+
"extract_unified",
|
|
22
|
+
"parse_extract_filter_values",
|
|
23
|
+
"remove_duplicates",
|
|
24
|
+
"stage_to_clickhouse",
|
|
25
|
+
"transform",
|
|
26
|
+
"transform_and_stage",
|
|
27
|
+
"load_staged",
|
|
28
|
+
"log_run",
|
|
29
|
+
"log_analytics_artifacts",
|
|
30
|
+
"run_pipeline",
|
|
31
|
+
"create_spark_session",
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
# Lazy imports so ``import batch_analytics`` works without PySpark (Gluon image).
|
|
35
|
+
_LAZY = {
|
|
36
|
+
"expand_kv_blob_column": ("transform", "expand_kv_blob_column"),
|
|
37
|
+
"extract_anchor_id": ("transform", "extract_anchor_id"),
|
|
38
|
+
"extract_all": ("extract", "extract_all"),
|
|
39
|
+
"extract_table": ("extract", "extract_table"),
|
|
40
|
+
"extract_unified": ("extract", "extract_unified"),
|
|
41
|
+
"parse_extract_filter_values": ("extract", "parse_extract_filter_values"),
|
|
42
|
+
"remove_duplicates": ("transform", "remove_duplicates"),
|
|
43
|
+
"stage_to_clickhouse": ("transform", "stage_to_clickhouse"),
|
|
44
|
+
"transform": ("transform", "transform"),
|
|
45
|
+
"transform_and_stage": ("transform", "transform_and_stage"),
|
|
46
|
+
"load_staged": ("transform", "load_staged"),
|
|
47
|
+
"log_run": ("log", "log_run"),
|
|
48
|
+
"log_analytics_artifacts": ("log", "log_analytics_artifacts"),
|
|
49
|
+
"run_pipeline": ("job_runner", "run_pipeline"),
|
|
50
|
+
"create_spark_session": ("job_runner", "create_spark_session"),
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def __getattr__(name: str) -> Any:
|
|
55
|
+
if name in _LAZY:
|
|
56
|
+
mod_name, attr = _LAZY[name]
|
|
57
|
+
mod = importlib.import_module(f".{mod_name}", __name__)
|
|
58
|
+
return getattr(mod, attr)
|
|
59
|
+
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def __dir__() -> list[str]:
|
|
63
|
+
return sorted({*__all__, *globals().keys()})
|
|
@@ -1,37 +1,43 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: batch-analytics
|
|
3
|
-
Version: 0.3.17
|
|
3
|
+
Version: 0.3.20
|
|
4
4
|
Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
|
|
5
5
|
Author: Litewave Analytics Team
|
|
6
6
|
License: MIT
|
|
7
|
-
Requires-Python: >=3.9
|
|
7
|
+
Requires-Python: >=3.8
|
|
8
8
|
Description-Content-Type: text/markdown
|
|
9
|
-
Requires-Dist: pyspark<3.6,>=3.4
|
|
10
|
-
Requires-Dist: numpy>=1.22.0
|
|
11
|
-
Requires-Dist: scipy>=1.5.0
|
|
9
|
+
Requires-Dist: numpy>=1.19.0
|
|
12
10
|
Provides-Extra: dev
|
|
13
11
|
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
12
|
+
Provides-Extra: spark
|
|
13
|
+
Requires-Dist: pyspark<3.6,>=3.4; extra == "spark"
|
|
14
|
+
Requires-Dist: scipy>=1.5.0; extra == "spark"
|
|
14
15
|
Provides-Extra: ttest
|
|
15
16
|
Requires-Dist: scipy>=1.5.0; extra == "ttest"
|
|
16
17
|
Provides-Extra: s3
|
|
17
18
|
Requires-Dist: boto3>=1.28; extra == "s3"
|
|
18
19
|
Provides-Extra: clickhouse
|
|
19
|
-
Requires-Dist: clickhouse-connect>=0.7; extra == "clickhouse"
|
|
20
|
+
Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "clickhouse"
|
|
21
|
+
Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "clickhouse"
|
|
20
22
|
Provides-Extra: output
|
|
21
23
|
Requires-Dist: boto3>=1.28; extra == "output"
|
|
22
|
-
Requires-Dist: clickhouse-connect>=0.7; extra == "output"
|
|
24
|
+
Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "output"
|
|
25
|
+
Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "output"
|
|
23
26
|
Provides-Extra: autogluon
|
|
24
27
|
Requires-Dist: autogluon<2.0,>=1.0; extra == "autogluon"
|
|
25
28
|
Requires-Dist: pandas>=1.3.0; extra == "autogluon"
|
|
26
29
|
Requires-Dist: boto3>=1.28; extra == "autogluon"
|
|
27
|
-
Requires-Dist: clickhouse-connect>=0.7; extra == "autogluon"
|
|
28
|
-
Requires-Dist: pyarrow>=10.0.0; extra == "autogluon"
|
|
30
|
+
Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "autogluon"
|
|
31
|
+
Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "autogluon"
|
|
32
|
+
Requires-Dist: pyarrow>=10.0.0; python_version >= "3.8" and extra == "autogluon"
|
|
29
33
|
Provides-Extra: full
|
|
34
|
+
Requires-Dist: pyspark<3.6,>=3.4; extra == "full"
|
|
30
35
|
Requires-Dist: scipy>=1.5.0; extra == "full"
|
|
31
36
|
Requires-Dist: boto3>=1.28; extra == "full"
|
|
32
|
-
Requires-Dist: clickhouse-connect>=0.7; extra == "full"
|
|
37
|
+
Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "full"
|
|
38
|
+
Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "full"
|
|
33
39
|
Requires-Dist: autogluon<2.0,>=1.0; extra == "full"
|
|
34
|
-
Requires-Dist: pyarrow>=10.0.0; extra == "full"
|
|
40
|
+
Requires-Dist: pyarrow>=10.0.0; python_version >= "3.8" and extra == "full"
|
|
35
41
|
|
|
36
42
|
# Batch Analytics
|
|
37
43
|
|
|
@@ -67,12 +73,13 @@ analytics/
|
|
|
67
73
|
|
|
68
74
|
## Install
|
|
69
75
|
|
|
76
|
+
- `pip install batch-analytics` — core (numpy only)
|
|
77
|
+
- `pip install batch-analytics[spark]` — PySpark ETL + job runner + SciPy (t-test / ANOVA)
|
|
78
|
+
- `pip install batch-analytics[autogluon]` — AutoGluon + I/O (Gluon image; no PySpark)
|
|
79
|
+
- `pip install batch-analytics[full]` — PySpark + AutoGluon + I/O
|
|
80
|
+
|
|
70
81
|
```bash
|
|
71
|
-
pip install -e .
|
|
72
|
-
# or install every runtime dependency used anywhere in the package, then editable:
|
|
73
|
-
pip install -r requirements.txt && pip install -e .
|
|
74
|
-
# PyPI install includes numpy and scipy (t-test); extras: s3, clickhouse, output, full
|
|
75
|
-
pip install "batch-analytics[full]"
|
|
82
|
+
pip install -e ".[spark]" # dev
|
|
76
83
|
```
|
|
77
84
|
|
|
78
85
|
## Run
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
numpy>=1.19.0
|
|
2
|
+
|
|
3
|
+
[autogluon]
|
|
4
|
+
autogluon<2.0,>=1.0
|
|
5
|
+
pandas>=1.3.0
|
|
6
|
+
boto3>=1.28
|
|
7
|
+
|
|
8
|
+
[autogluon:python_version < "3.9"]
|
|
9
|
+
clickhouse-connect<0.9,>=0.7
|
|
10
|
+
|
|
11
|
+
[autogluon:python_version >= "3.8"]
|
|
12
|
+
pyarrow>=10.0.0
|
|
13
|
+
|
|
14
|
+
[autogluon:python_version >= "3.9"]
|
|
15
|
+
clickhouse-connect>=0.7
|
|
16
|
+
|
|
17
|
+
[clickhouse]
|
|
18
|
+
|
|
19
|
+
[clickhouse:python_version < "3.9"]
|
|
20
|
+
clickhouse-connect<0.9,>=0.7
|
|
21
|
+
|
|
22
|
+
[clickhouse:python_version >= "3.9"]
|
|
23
|
+
clickhouse-connect>=0.7
|
|
24
|
+
|
|
25
|
+
[dev]
|
|
26
|
+
pytest>=7.0
|
|
27
|
+
|
|
28
|
+
[full]
|
|
29
|
+
pyspark<3.6,>=3.4
|
|
30
|
+
scipy>=1.5.0
|
|
31
|
+
boto3>=1.28
|
|
32
|
+
autogluon<2.0,>=1.0
|
|
33
|
+
|
|
34
|
+
[full:python_version < "3.9"]
|
|
35
|
+
clickhouse-connect<0.9,>=0.7
|
|
36
|
+
|
|
37
|
+
[full:python_version >= "3.8"]
|
|
38
|
+
pyarrow>=10.0.0
|
|
39
|
+
|
|
40
|
+
[full:python_version >= "3.9"]
|
|
41
|
+
clickhouse-connect>=0.7
|
|
42
|
+
|
|
43
|
+
[output]
|
|
44
|
+
boto3>=1.28
|
|
45
|
+
|
|
46
|
+
[output:python_version < "3.9"]
|
|
47
|
+
clickhouse-connect<0.9,>=0.7
|
|
48
|
+
|
|
49
|
+
[output:python_version >= "3.9"]
|
|
50
|
+
clickhouse-connect>=0.7
|
|
51
|
+
|
|
52
|
+
[s3]
|
|
53
|
+
boto3>=1.28
|
|
54
|
+
|
|
55
|
+
[spark]
|
|
56
|
+
pyspark<3.6,>=3.4
|
|
57
|
+
scipy>=1.5.0
|
|
58
|
+
|
|
59
|
+
[ttest]
|
|
60
|
+
scipy>=1.5.0
|
|
@@ -1,52 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Batch analytics pipeline: Extract, Transform, Log stages + analytics modules.
|
|
3
|
-
|
|
4
|
-
Stages:
|
|
5
|
-
- Extract: Load data from ClickHouse via Spark ClickHouse connector or JDBC
|
|
6
|
-
- Transform: Deduplicate and stage data (parquet/delta/clickhouse)
|
|
7
|
-
- Log: Persist run metadata and analytics results
|
|
8
|
-
|
|
9
|
-
Analytics modules:
|
|
10
|
-
- Module 1: Linear regression (XY) with slope comparison across groups
|
|
11
|
-
- Module 2: Multi-feature correlation
|
|
12
|
-
- Module 3: PCA + KMeans clustering
|
|
13
|
-
"""
|
|
14
|
-
|
|
15
|
-
from .config import BatchAnalyticsConfig, SparkK8sConfig
|
|
16
|
-
from .extract import (
|
|
17
|
-
extract_all,
|
|
18
|
-
extract_table,
|
|
19
|
-
extract_unified,
|
|
20
|
-
parse_extract_filter_values,
|
|
21
|
-
)
|
|
22
|
-
from .transform import (
|
|
23
|
-
expand_kv_blob_column,
|
|
24
|
-
extract_anchor_id,
|
|
25
|
-
load_staged,
|
|
26
|
-
remove_duplicates,
|
|
27
|
-
stage_to_clickhouse,
|
|
28
|
-
transform,
|
|
29
|
-
transform_and_stage,
|
|
30
|
-
)
|
|
31
|
-
from .log import log_analytics_artifacts, log_run
|
|
32
|
-
from .job_runner import run_pipeline, create_spark_session
|
|
33
|
-
|
|
34
|
-
__all__ = [
|
|
35
|
-
"BatchAnalyticsConfig",
|
|
36
|
-
"SparkK8sConfig",
|
|
37
|
-
"expand_kv_blob_column",
|
|
38
|
-
"extract_anchor_id",
|
|
39
|
-
"extract_all",
|
|
40
|
-
"extract_table",
|
|
41
|
-
"extract_unified",
|
|
42
|
-
"parse_extract_filter_values",
|
|
43
|
-
"remove_duplicates",
|
|
44
|
-
"stage_to_clickhouse",
|
|
45
|
-
"transform",
|
|
46
|
-
"transform_and_stage",
|
|
47
|
-
"load_staged",
|
|
48
|
-
"log_run",
|
|
49
|
-
"log_analytics_artifacts",
|
|
50
|
-
"run_pipeline",
|
|
51
|
-
"create_spark_session",
|
|
52
|
-
]
|
|
@@ -1,33 +0,0 @@
|
|
|
1
|
-
pyspark<3.6,>=3.4
|
|
2
|
-
numpy>=1.22.0
|
|
3
|
-
scipy>=1.5.0
|
|
4
|
-
|
|
5
|
-
[autogluon]
|
|
6
|
-
autogluon<2.0,>=1.0
|
|
7
|
-
pandas>=1.3.0
|
|
8
|
-
boto3>=1.28
|
|
9
|
-
clickhouse-connect>=0.7
|
|
10
|
-
pyarrow>=10.0.0
|
|
11
|
-
|
|
12
|
-
[clickhouse]
|
|
13
|
-
clickhouse-connect>=0.7
|
|
14
|
-
|
|
15
|
-
[dev]
|
|
16
|
-
pytest>=7.0
|
|
17
|
-
|
|
18
|
-
[full]
|
|
19
|
-
scipy>=1.5.0
|
|
20
|
-
boto3>=1.28
|
|
21
|
-
clickhouse-connect>=0.7
|
|
22
|
-
autogluon<2.0,>=1.0
|
|
23
|
-
pyarrow>=10.0.0
|
|
24
|
-
|
|
25
|
-
[output]
|
|
26
|
-
boto3>=1.28
|
|
27
|
-
clickhouse-connect>=0.7
|
|
28
|
-
|
|
29
|
-
[s3]
|
|
30
|
-
boto3>=1.28
|
|
31
|
-
|
|
32
|
-
[ttest]
|
|
33
|
-
scipy>=1.5.0
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.17 → batch_analytics-0.3.20}/src/batch_analytics/analytics/correlation.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.17 → batch_analytics-0.3.20}/src/batch_analytics/analytics/linear_regression.py
RENAMED
|
File without changes
|
{batch_analytics-0.3.17 → batch_analytics-0.3.20}/src/batch_analytics/analytics/pca_clustering.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.3.17 → batch_analytics-0.3.20}/src/batch_analytics.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{batch_analytics-0.3.17 → batch_analytics-0.3.20}/src/batch_analytics.egg-info/entry_points.txt
RENAMED
|
File without changes
|
{batch_analytics-0.3.17 → batch_analytics-0.3.20}/src/batch_analytics.egg-info/top_level.txt
RENAMED
|
File without changes
|