batch-analytics 0.2.1__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {batch_analytics-0.2.1 → batch_analytics-0.2.3}/PKG-INFO +14 -3
- {batch_analytics-0.2.1 → batch_analytics-0.2.3}/README.md +6 -2
- {batch_analytics-0.2.1 → batch_analytics-0.2.3}/pyproject.toml +6 -1
- {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/extract.py +4 -2
- {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/job_runner.py +35 -5
- {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/transform.py +18 -9
- {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics.egg-info/PKG-INFO +14 -3
- {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics.egg-info/requires.txt +9 -0
- {batch_analytics-0.2.1 → batch_analytics-0.2.3}/setup.cfg +0 -0
- {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/__init__.py +0 -0
- {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/__main__.py +0 -0
- {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/analytics/__init__.py +0 -0
- {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/analytics/correlation.py +0 -0
- {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/analytics/linear_regression.py +0 -0
- {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/analytics/pca_clustering.py +0 -0
- {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/analytics/t_test.py +0 -0
- {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/config.py +0 -0
- {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/log.py +0 -0
- {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/modules.py +0 -0
- {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/output/__init__.py +0 -0
- {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/output/base.py +0 -0
- {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/output/clickhouse.py +0 -0
- {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/output/local.py +0 -0
- {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/output/s3.py +0 -0
- {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics.egg-info/SOURCES.txt +0 -0
- {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
- {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics.egg-info/entry_points.txt +0 -0
- {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics.egg-info/top_level.txt +0 -0
|
@@ -1,14 +1,17 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: batch-analytics
|
|
3
|
-
Version: 0.2.1
|
|
3
|
+
Version: 0.2.3
|
|
4
4
|
Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
|
|
5
5
|
Author: Litewave Analytics Team
|
|
6
6
|
License: MIT
|
|
7
7
|
Requires-Python: >=3.8
|
|
8
8
|
Description-Content-Type: text/markdown
|
|
9
9
|
Requires-Dist: pyspark<3.6,>=3.4
|
|
10
|
+
Requires-Dist: numpy>=1.19.0
|
|
10
11
|
Provides-Extra: dev
|
|
11
12
|
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
13
|
+
Provides-Extra: ttest
|
|
14
|
+
Requires-Dist: scipy>=1.5.0; extra == "ttest"
|
|
12
15
|
Provides-Extra: s3
|
|
13
16
|
Requires-Dist: boto3>=1.28; extra == "s3"
|
|
14
17
|
Provides-Extra: clickhouse
|
|
@@ -16,6 +19,10 @@ Requires-Dist: clickhouse-connect>=0.7; extra == "clickhouse"
|
|
|
16
19
|
Provides-Extra: output
|
|
17
20
|
Requires-Dist: boto3>=1.28; extra == "output"
|
|
18
21
|
Requires-Dist: clickhouse-connect>=0.7; extra == "output"
|
|
22
|
+
Provides-Extra: full
|
|
23
|
+
Requires-Dist: scipy>=1.5.0; extra == "full"
|
|
24
|
+
Requires-Dist: boto3>=1.28; extra == "full"
|
|
25
|
+
Requires-Dist: clickhouse-connect>=0.7; extra == "full"
|
|
19
26
|
|
|
20
27
|
# Batch Analytics
|
|
21
28
|
|
|
@@ -28,7 +35,8 @@ Only the files required for the batch analytics job runner:
|
|
|
28
35
|
```
|
|
29
36
|
analytics/
|
|
30
37
|
├── pyproject.toml
|
|
31
|
-
├── requirements
|
|
38
|
+
├── requirements.txt # core + scipy + boto3 + clickhouse-connect (single-file install)
|
|
39
|
+
├── requirements-batch.txt # includes requirements.txt
|
|
32
40
|
├── README.md
|
|
33
41
|
└── src/
|
|
34
42
|
└── batch_analytics/
|
|
@@ -52,7 +60,10 @@ analytics/
|
|
|
52
60
|
|
|
53
61
|
```bash
|
|
54
62
|
pip install -e .
|
|
55
|
-
# or
|
|
63
|
+
# or install every runtime dependency used anywhere in the package, then editable:
|
|
64
|
+
pip install -r requirements.txt && pip install -e .
|
|
65
|
+
# PyPI wheel pulls numpy automatically (pyspark.ml); extras: ttest, s3, clickhouse, output, full
|
|
66
|
+
pip install "batch-analytics[full]"
|
|
56
67
|
```
|
|
57
68
|
|
|
58
69
|
## Run
|
|
@@ -9,7 +9,8 @@ Only the files required for the batch analytics job runner:
|
|
|
9
9
|
```
|
|
10
10
|
analytics/
|
|
11
11
|
├── pyproject.toml
|
|
12
|
-
├── requirements
|
|
12
|
+
├── requirements.txt # core + scipy + boto3 + clickhouse-connect (single-file install)
|
|
13
|
+
├── requirements-batch.txt # includes requirements.txt
|
|
13
14
|
├── README.md
|
|
14
15
|
└── src/
|
|
15
16
|
└── batch_analytics/
|
|
@@ -33,7 +34,10 @@ analytics/
|
|
|
33
34
|
|
|
34
35
|
```bash
|
|
35
36
|
pip install -e .
|
|
36
|
-
# or
|
|
37
|
+
# or install every runtime dependency used anywhere in the package, then editable:
|
|
38
|
+
pip install -r requirements.txt && pip install -e .
|
|
39
|
+
# PyPI wheel pulls numpy automatically (pyspark.ml); extras: ttest, s3, clickhouse, output, full
|
|
40
|
+
pip install "batch-analytics[full]"
|
|
37
41
|
```
|
|
38
42
|
|
|
39
43
|
## Run
|
|
@@ -4,21 +4,26 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "batch-analytics"
|
|
7
|
-
version = "0.2.1"
|
|
7
|
+
version = "0.2.3"
|
|
8
8
|
description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.8"
|
|
11
11
|
dependencies = [
|
|
12
12
|
"pyspark>=3.4,<3.6",
|
|
13
|
+
"numpy>=1.19.0",
|
|
13
14
|
]
|
|
14
15
|
authors = [{ name = "Litewave Analytics Team" }]
|
|
15
16
|
license = { text = "MIT" }
|
|
16
17
|
|
|
17
18
|
[project.optional-dependencies]
|
|
18
19
|
dev = ["pytest>=7.0"]
|
|
20
|
+
# Welch t-test module (batch_analytics.analytics.t_test)
|
|
21
|
+
ttest = ["scipy>=1.5.0"]
|
|
19
22
|
s3 = ["boto3>=1.28"]
|
|
20
23
|
clickhouse = ["clickhouse-connect>=0.7"]
|
|
21
24
|
output = ["boto3>=1.28", "clickhouse-connect>=0.7"]
|
|
25
|
+
# Install all optional runtime deps used anywhere in the package
|
|
26
|
+
full = ["scipy>=1.5.0", "boto3>=1.28", "clickhouse-connect>=0.7"]
|
|
22
27
|
|
|
23
28
|
[project.scripts]
|
|
24
29
|
batch-analytics = "batch_analytics.job_runner:main"
|
|
@@ -18,7 +18,7 @@ def _read_via_format(spark: SparkSession, cfg: BatchAnalyticsConfig, table: str)
|
|
|
18
18
|
Requires: com.clickhouse.spark:clickhouse-spark-runtime in spark.jars.packages
|
|
19
19
|
"""
|
|
20
20
|
try:
|
|
21
|
-
|
|
21
|
+
rd = (
|
|
22
22
|
spark.read.format("clickhouse")
|
|
23
23
|
.option("host", cfg.clickhouse.host)
|
|
24
24
|
.option("protocol", cfg.clickhouse.protocol)
|
|
@@ -26,8 +26,10 @@ def _read_via_format(spark: SparkSession, cfg: BatchAnalyticsConfig, table: str)
|
|
|
26
26
|
.option("database", cfg.clickhouse.database)
|
|
27
27
|
.option("table", table)
|
|
28
28
|
.option("user", cfg.clickhouse.user)
|
|
29
|
-
.load()
|
|
30
29
|
)
|
|
30
|
+
if cfg.clickhouse.password:
|
|
31
|
+
rd = rd.option("password", cfg.clickhouse.password)
|
|
32
|
+
df = rd.load()
|
|
31
33
|
return df
|
|
32
34
|
except Exception as e:
|
|
33
35
|
logger.warning(
|
|
@@ -44,12 +44,31 @@ def create_spark_session(
|
|
|
44
44
|
.config("spark.sql.adaptive.coalescePartitions.enabled", "true")
|
|
45
45
|
)
|
|
46
46
|
|
|
47
|
-
# JARs:
|
|
48
|
-
|
|
47
|
+
# JARs: spark.jars = local/https/file paths; spark.jars.packages = Maven coordinates
|
|
48
|
+
# SPARK_JARS can be comma-separated URLs (e.g. Maven Central https://repo1.maven.org/...jar)
|
|
49
|
+
jar_list: List[str] = []
|
|
50
|
+
sp_jars = os.environ.get("SPARK_JARS", "").strip()
|
|
51
|
+
if sp_jars:
|
|
52
|
+
jar_list.extend(p.strip() for p in sp_jars.split(",") if p.strip())
|
|
53
|
+
|
|
54
|
+
packages: List[str] = []
|
|
49
55
|
if clickhouse_jars:
|
|
50
|
-
|
|
56
|
+
for part in clickhouse_jars.split(","):
|
|
57
|
+
part = part.strip()
|
|
58
|
+
if not part:
|
|
59
|
+
continue
|
|
60
|
+
if part.startswith(("http://", "https://", "file:")):
|
|
61
|
+
jar_list.append(part)
|
|
62
|
+
elif part.endswith(".jar") and ":" not in part:
|
|
63
|
+
jar_list.append(part)
|
|
64
|
+
else:
|
|
65
|
+
packages.append(part)
|
|
66
|
+
|
|
51
67
|
if cfg.master.startswith("k8s://"):
|
|
52
68
|
packages.append("org.apache.hadoop:hadoop-aws:3.3.4")
|
|
69
|
+
|
|
70
|
+
if jar_list:
|
|
71
|
+
builder = builder.config("spark.jars", ",".join(jar_list))
|
|
53
72
|
if packages:
|
|
54
73
|
builder = builder.config("spark.jars.packages", ",".join(packages))
|
|
55
74
|
|
|
@@ -121,10 +140,21 @@ def run_pipeline(
|
|
|
121
140
|
run_id = str(uuid.uuid4())[:8]
|
|
122
141
|
|
|
123
142
|
if spark is None:
|
|
124
|
-
|
|
143
|
+
# Native format("clickhouse") needs clickhouse-spark-runtime; JDBC fallback needs clickhouse-jdbc.
|
|
144
|
+
# Override: BATCH_SPARK_CLICKHOUSE_PACKAGES=maven coords (comma-sep) or "" to use only SPARK_JARS.
|
|
145
|
+
_raw_ch = os.environ.get("BATCH_SPARK_CLICKHOUSE_PACKAGES")
|
|
146
|
+
if _raw_ch is None:
|
|
147
|
+
ch_pkgs = (
|
|
148
|
+
"com.clickhouse.spark:clickhouse-spark-runtime-3.5_2.12:0.8.0,"
|
|
149
|
+
"com.clickhouse:clickhouse-jdbc:0.6.2"
|
|
150
|
+
)
|
|
151
|
+
elif not _raw_ch.strip():
|
|
152
|
+
ch_pkgs = None
|
|
153
|
+
else:
|
|
154
|
+
ch_pkgs = _raw_ch.strip()
|
|
125
155
|
spark = create_spark_session(
|
|
126
156
|
app_name="BatchAnalytics",
|
|
127
|
-
clickhouse_jars=
|
|
157
|
+
clickhouse_jars=ch_pkgs,
|
|
128
158
|
config=config,
|
|
129
159
|
)
|
|
130
160
|
|
|
@@ -92,16 +92,19 @@ def stage_to_clickhouse(
|
|
|
92
92
|
"""
|
|
93
93
|
n = df.count()
|
|
94
94
|
try:
|
|
95
|
+
ch = config.clickhouse
|
|
95
96
|
writer = (
|
|
96
97
|
df.write.format("clickhouse")
|
|
97
|
-
.option("host",
|
|
98
|
-
.option("
|
|
98
|
+
.option("host", ch.host)
|
|
99
|
+
.option("protocol", ch.protocol)
|
|
100
|
+
.option("http_port", str(ch.port))
|
|
101
|
+
.option("database", ch.database)
|
|
99
102
|
.option("table", config.transform.staging_table)
|
|
100
|
-
.option("user",
|
|
103
|
+
.option("user", ch.user)
|
|
101
104
|
.mode("overwrite")
|
|
102
105
|
)
|
|
103
|
-
if
|
|
104
|
-
writer = writer.option("password",
|
|
106
|
+
if ch.password:
|
|
107
|
+
writer = writer.option("password", ch.password)
|
|
105
108
|
writer.save()
|
|
106
109
|
except Exception as e:
|
|
107
110
|
logger.warning("ClickHouse connector failed (%s), using JDBC", e)
|
|
@@ -168,13 +171,19 @@ def load_staged(
|
|
|
168
171
|
return spark.read.format("delta").load(staging_path)
|
|
169
172
|
if fmt == "clickhouse":
|
|
170
173
|
try:
|
|
171
|
-
|
|
174
|
+
ch = config.clickhouse
|
|
175
|
+
rd = (
|
|
172
176
|
spark.read.format("clickhouse")
|
|
173
|
-
.option("host",
|
|
174
|
-
.option("
|
|
177
|
+
.option("host", ch.host)
|
|
178
|
+
.option("protocol", ch.protocol)
|
|
179
|
+
.option("http_port", str(ch.port))
|
|
180
|
+
.option("database", ch.database)
|
|
175
181
|
.option("table", config.transform.staging_table)
|
|
176
|
-
.
|
|
182
|
+
.option("user", ch.user)
|
|
177
183
|
)
|
|
184
|
+
if ch.password:
|
|
185
|
+
rd = rd.option("password", ch.password)
|
|
186
|
+
return rd.load()
|
|
178
187
|
except Exception:
|
|
179
188
|
return spark.read.jdbc(
|
|
180
189
|
config.clickhouse.jdbc_url,
|
|
@@ -1,14 +1,17 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: batch-analytics
|
|
3
|
-
Version: 0.2.1
|
|
3
|
+
Version: 0.2.3
|
|
4
4
|
Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
|
|
5
5
|
Author: Litewave Analytics Team
|
|
6
6
|
License: MIT
|
|
7
7
|
Requires-Python: >=3.8
|
|
8
8
|
Description-Content-Type: text/markdown
|
|
9
9
|
Requires-Dist: pyspark<3.6,>=3.4
|
|
10
|
+
Requires-Dist: numpy>=1.19.0
|
|
10
11
|
Provides-Extra: dev
|
|
11
12
|
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
13
|
+
Provides-Extra: ttest
|
|
14
|
+
Requires-Dist: scipy>=1.5.0; extra == "ttest"
|
|
12
15
|
Provides-Extra: s3
|
|
13
16
|
Requires-Dist: boto3>=1.28; extra == "s3"
|
|
14
17
|
Provides-Extra: clickhouse
|
|
@@ -16,6 +19,10 @@ Requires-Dist: clickhouse-connect>=0.7; extra == "clickhouse"
|
|
|
16
19
|
Provides-Extra: output
|
|
17
20
|
Requires-Dist: boto3>=1.28; extra == "output"
|
|
18
21
|
Requires-Dist: clickhouse-connect>=0.7; extra == "output"
|
|
22
|
+
Provides-Extra: full
|
|
23
|
+
Requires-Dist: scipy>=1.5.0; extra == "full"
|
|
24
|
+
Requires-Dist: boto3>=1.28; extra == "full"
|
|
25
|
+
Requires-Dist: clickhouse-connect>=0.7; extra == "full"
|
|
19
26
|
|
|
20
27
|
# Batch Analytics
|
|
21
28
|
|
|
@@ -28,7 +35,8 @@ Only the files required for the batch analytics job runner:
|
|
|
28
35
|
```
|
|
29
36
|
analytics/
|
|
30
37
|
├── pyproject.toml
|
|
31
|
-
├── requirements
|
|
38
|
+
├── requirements.txt # core + scipy + boto3 + clickhouse-connect (single-file install)
|
|
39
|
+
├── requirements-batch.txt # includes requirements.txt
|
|
32
40
|
├── README.md
|
|
33
41
|
└── src/
|
|
34
42
|
└── batch_analytics/
|
|
@@ -52,7 +60,10 @@ analytics/
|
|
|
52
60
|
|
|
53
61
|
```bash
|
|
54
62
|
pip install -e .
|
|
55
|
-
# or
|
|
63
|
+
# or install every runtime dependency used anywhere in the package, then editable:
|
|
64
|
+
pip install -r requirements.txt && pip install -e .
|
|
65
|
+
# PyPI wheel pulls numpy automatically (pyspark.ml); extras: ttest, s3, clickhouse, output, full
|
|
66
|
+
pip install "batch-analytics[full]"
|
|
56
67
|
```
|
|
57
68
|
|
|
58
69
|
## Run
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
pyspark<3.6,>=3.4
|
|
2
|
+
numpy>=1.19.0
|
|
2
3
|
|
|
3
4
|
[clickhouse]
|
|
4
5
|
clickhouse-connect>=0.7
|
|
@@ -6,9 +7,17 @@ clickhouse-connect>=0.7
|
|
|
6
7
|
[dev]
|
|
7
8
|
pytest>=7.0
|
|
8
9
|
|
|
10
|
+
[full]
|
|
11
|
+
scipy>=1.5.0
|
|
12
|
+
boto3>=1.28
|
|
13
|
+
clickhouse-connect>=0.7
|
|
14
|
+
|
|
9
15
|
[output]
|
|
10
16
|
boto3>=1.28
|
|
11
17
|
clickhouse-connect>=0.7
|
|
12
18
|
|
|
13
19
|
[s3]
|
|
14
20
|
boto3>=1.28
|
|
21
|
+
|
|
22
|
+
[ttest]
|
|
23
|
+
scipy>=1.5.0
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/analytics/correlation.py
RENAMED
|
File without changes
|
{batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/analytics/linear_regression.py
RENAMED
|
File without changes
|
{batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/analytics/pca_clustering.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics.egg-info/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|