batch-analytics 0.2.8__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {batch_analytics-0.2.8 → batch_analytics-0.3.0}/PKG-INFO +7 -4
- {batch_analytics-0.2.8 → batch_analytics-0.3.0}/pyproject.toml +17 -4
- {batch_analytics-0.2.8 → batch_analytics-0.3.0}/src/batch_analytics/job_runner.py +22 -3
- {batch_analytics-0.2.8 → batch_analytics-0.3.0}/src/batch_analytics.egg-info/PKG-INFO +7 -4
- batch_analytics-0.3.0/src/batch_analytics.egg-info/requires.txt +38 -0
- batch_analytics-0.2.8/src/batch_analytics.egg-info/requires.txt +0 -23
- {batch_analytics-0.2.8 → batch_analytics-0.3.0}/README.md +0 -0
- {batch_analytics-0.2.8 → batch_analytics-0.3.0}/setup.cfg +0 -0
- {batch_analytics-0.2.8 → batch_analytics-0.3.0}/src/batch_analytics/__init__.py +0 -0
- {batch_analytics-0.2.8 → batch_analytics-0.3.0}/src/batch_analytics/__main__.py +0 -0
- {batch_analytics-0.2.8 → batch_analytics-0.3.0}/src/batch_analytics/analytics/__init__.py +0 -0
- {batch_analytics-0.2.8 → batch_analytics-0.3.0}/src/batch_analytics/analytics/correlation.py +0 -0
- {batch_analytics-0.2.8 → batch_analytics-0.3.0}/src/batch_analytics/analytics/linear_regression.py +0 -0
- {batch_analytics-0.2.8 → batch_analytics-0.3.0}/src/batch_analytics/analytics/pca_clustering.py +0 -0
- {batch_analytics-0.2.8 → batch_analytics-0.3.0}/src/batch_analytics/analytics/t_test.py +0 -0
- {batch_analytics-0.2.8 → batch_analytics-0.3.0}/src/batch_analytics/config.py +0 -0
- {batch_analytics-0.2.8 → batch_analytics-0.3.0}/src/batch_analytics/extract.py +0 -0
- {batch_analytics-0.2.8 → batch_analytics-0.3.0}/src/batch_analytics/log.py +0 -0
- {batch_analytics-0.2.8 → batch_analytics-0.3.0}/src/batch_analytics/modules.py +0 -0
- {batch_analytics-0.2.8 → batch_analytics-0.3.0}/src/batch_analytics/output/__init__.py +0 -0
- {batch_analytics-0.2.8 → batch_analytics-0.3.0}/src/batch_analytics/output/base.py +0 -0
- {batch_analytics-0.2.8 → batch_analytics-0.3.0}/src/batch_analytics/output/clickhouse.py +0 -0
- {batch_analytics-0.2.8 → batch_analytics-0.3.0}/src/batch_analytics/output/local.py +0 -0
- {batch_analytics-0.2.8 → batch_analytics-0.3.0}/src/batch_analytics/output/s3.py +0 -0
- {batch_analytics-0.2.8 → batch_analytics-0.3.0}/src/batch_analytics/transform.py +0 -0
- {batch_analytics-0.2.8 → batch_analytics-0.3.0}/src/batch_analytics.egg-info/SOURCES.txt +0 -0
- {batch_analytics-0.2.8 → batch_analytics-0.3.0}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
- {batch_analytics-0.2.8 → batch_analytics-0.3.0}/src/batch_analytics.egg-info/entry_points.txt +0 -0
- {batch_analytics-0.2.8 → batch_analytics-0.3.0}/src/batch_analytics.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: batch-analytics
|
|
3
|
-
Version: 0.2.8
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
|
|
5
5
|
Author: Litewave Analytics Team
|
|
6
6
|
License: MIT
|
|
@@ -15,14 +15,17 @@ Requires-Dist: scipy>=1.5.0; extra == "ttest"
|
|
|
15
15
|
Provides-Extra: s3
|
|
16
16
|
Requires-Dist: boto3>=1.28; extra == "s3"
|
|
17
17
|
Provides-Extra: clickhouse
|
|
18
|
-
Requires-Dist: clickhouse-connect>=0.7; extra == "clickhouse"
|
|
18
|
+
Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "clickhouse"
|
|
19
|
+
Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "clickhouse"
|
|
19
20
|
Provides-Extra: output
|
|
20
21
|
Requires-Dist: boto3>=1.28; extra == "output"
|
|
21
|
-
Requires-Dist: clickhouse-connect>=0.7; extra == "output"
|
|
22
|
+
Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "output"
|
|
23
|
+
Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "output"
|
|
22
24
|
Provides-Extra: full
|
|
23
25
|
Requires-Dist: scipy>=1.5.0; extra == "full"
|
|
24
26
|
Requires-Dist: boto3>=1.28; extra == "full"
|
|
25
|
-
Requires-Dist: clickhouse-connect>=0.7; extra == "full"
|
|
27
|
+
Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "full"
|
|
28
|
+
Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "full"
|
|
26
29
|
|
|
27
30
|
# Batch Analytics
|
|
28
31
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "batch-analytics"
|
|
7
|
-
version = "0.2.8"
|
|
7
|
+
version = "0.3.0"
|
|
8
8
|
description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.8"
|
|
@@ -20,10 +20,23 @@ dev = ["pytest>=7.0"]
|
|
|
20
20
|
# Welch t-test module (batch_analytics.analytics.t_test)
|
|
21
21
|
ttest = ["scipy>=1.5.0"]
|
|
22
22
|
s3 = ["boto3>=1.28"]
|
|
23
|
-
|
|
24
|
-
|
|
23
|
+
# 0.9+ uses list[...] etc. and breaks on Python 3.8; 3.9+ can take current clickhouse-connect.
|
|
24
|
+
clickhouse = [
|
|
25
|
+
"clickhouse-connect>=0.7,<0.9; python_version < '3.9'",
|
|
26
|
+
"clickhouse-connect>=0.7; python_version >= '3.9'",
|
|
27
|
+
]
|
|
28
|
+
output = [
|
|
29
|
+
"boto3>=1.28",
|
|
30
|
+
"clickhouse-connect>=0.7,<0.9; python_version < '3.9'",
|
|
31
|
+
"clickhouse-connect>=0.7; python_version >= '3.9'",
|
|
32
|
+
]
|
|
25
33
|
# Install all optional runtime deps used anywhere in the package
|
|
26
|
-
full = ["scipy>=1.5.0", "boto3>=1.28", "clickhouse-connect>=0.7"]
|
|
34
|
+
full = [
|
|
35
|
+
"scipy>=1.5.0",
|
|
36
|
+
"boto3>=1.28",
|
|
37
|
+
"clickhouse-connect>=0.7,<0.9; python_version < '3.9'",
|
|
38
|
+
"clickhouse-connect>=0.7; python_version >= '3.9'",
|
|
39
|
+
]
|
|
27
40
|
|
|
28
41
|
[project.scripts]
|
|
29
42
|
batch-analytics = "batch_analytics.job_runner:main"
|
|
@@ -13,7 +13,7 @@ from typing import Dict, List, Optional
|
|
|
13
13
|
|
|
14
14
|
from pyspark.sql import SparkSession
|
|
15
15
|
|
|
16
|
-
from .config import BatchAnalyticsConfig
|
|
16
|
+
from .config import BatchAnalyticsConfig, SparkK8sConfig
|
|
17
17
|
from .extract import extract_unified
|
|
18
18
|
from .log import log_dataframe_summary, log_run
|
|
19
19
|
from .modules import DEFAULT_MODULES, MODULE_REGISTRY, VALID_MODULES
|
|
@@ -27,6 +27,21 @@ logging.basicConfig(
|
|
|
27
27
|
logger = logging.getLogger(__name__)
|
|
28
28
|
|
|
29
29
|
|
|
30
|
+
def _spark_on_kubernetes_for_classpath(cfg: SparkK8sConfig) -> bool:
|
|
31
|
+
"""
|
|
32
|
+
True when Spark executors run on Kubernetes.
|
|
33
|
+
|
|
34
|
+
Spark Operator cluster mode passes --master k8s://... to spark-submit; SPARK_MASTER is often
|
|
35
|
+
unset, so SparkK8sConfig.master defaults to local[*] while the JVM still uses Kubernetes.
|
|
36
|
+
Image-baked JARs must go on extraClassPath in that case (spark.jars breaks executors).
|
|
37
|
+
"""
|
|
38
|
+
if cfg.master.startswith("k8s://"):
|
|
39
|
+
return True
|
|
40
|
+
if os.environ.get("SPARK_MASTER", "").strip().startswith("k8s://"):
|
|
41
|
+
return True
|
|
42
|
+
return bool(os.environ.get("KUBERNETES_SERVICE_HOST"))
|
|
43
|
+
|
|
44
|
+
|
|
30
45
|
def create_spark_session(
|
|
31
46
|
app_name: str = "BatchAnalytics",
|
|
32
47
|
clickhouse_jars: Optional[str] = None,
|
|
@@ -64,13 +79,13 @@ def create_spark_session(
|
|
|
64
79
|
else:
|
|
65
80
|
packages.append(part)
|
|
66
81
|
|
|
67
|
-
if cfg
|
|
82
|
+
if _spark_on_kubernetes_for_classpath(cfg):
|
|
68
83
|
packages.append("org.apache.hadoop:hadoop-aws:3.3.4")
|
|
69
84
|
|
|
70
85
|
# Spark-on-K8s: absolute paths in spark.jars are re-sent to executors as ./basename.jar and
|
|
71
86
|
# fail there ("Unable to create executor due to ./clickhouse-spark-runtime-..."). JARs baked
|
|
72
87
|
# into the driver/executor image belong on the JVM classpath instead.
|
|
73
|
-
if cfg
|
|
88
|
+
if _spark_on_kubernetes_for_classpath(cfg) and jar_list:
|
|
74
89
|
local_cp: List[str] = []
|
|
75
90
|
remote_jars: List[str] = []
|
|
76
91
|
for p in jar_list:
|
|
@@ -83,6 +98,10 @@ def create_spark_session(
|
|
|
83
98
|
remote_jars.append(p)
|
|
84
99
|
if local_cp:
|
|
85
100
|
joined = ":".join(local_cp)
|
|
101
|
+
logger.info(
|
|
102
|
+
"Spark on Kubernetes: image JARs on driver/executor extraClassPath (not spark.jars): %s",
|
|
103
|
+
joined,
|
|
104
|
+
)
|
|
86
105
|
builder = (
|
|
87
106
|
builder.config("spark.driver.extraClassPath", joined)
|
|
88
107
|
.config("spark.executor.extraClassPath", joined)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: batch-analytics
|
|
3
|
-
Version: 0.2.8
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
|
|
5
5
|
Author: Litewave Analytics Team
|
|
6
6
|
License: MIT
|
|
@@ -15,14 +15,17 @@ Requires-Dist: scipy>=1.5.0; extra == "ttest"
|
|
|
15
15
|
Provides-Extra: s3
|
|
16
16
|
Requires-Dist: boto3>=1.28; extra == "s3"
|
|
17
17
|
Provides-Extra: clickhouse
|
|
18
|
-
Requires-Dist: clickhouse-connect>=0.7; extra == "clickhouse"
|
|
18
|
+
Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "clickhouse"
|
|
19
|
+
Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "clickhouse"
|
|
19
20
|
Provides-Extra: output
|
|
20
21
|
Requires-Dist: boto3>=1.28; extra == "output"
|
|
21
|
-
Requires-Dist: clickhouse-connect>=0.7; extra == "output"
|
|
22
|
+
Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "output"
|
|
23
|
+
Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "output"
|
|
22
24
|
Provides-Extra: full
|
|
23
25
|
Requires-Dist: scipy>=1.5.0; extra == "full"
|
|
24
26
|
Requires-Dist: boto3>=1.28; extra == "full"
|
|
25
|
-
Requires-Dist: clickhouse-connect>=0.7; extra == "full"
|
|
27
|
+
Requires-Dist: clickhouse-connect<0.9,>=0.7; python_version < "3.9" and extra == "full"
|
|
28
|
+
Requires-Dist: clickhouse-connect>=0.7; python_version >= "3.9" and extra == "full"
|
|
26
29
|
|
|
27
30
|
# Batch Analytics
|
|
28
31
|
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
pyspark<3.6,>=3.4
|
|
2
|
+
numpy>=1.19.0
|
|
3
|
+
|
|
4
|
+
[clickhouse]
|
|
5
|
+
|
|
6
|
+
[clickhouse:python_version < "3.9"]
|
|
7
|
+
clickhouse-connect<0.9,>=0.7
|
|
8
|
+
|
|
9
|
+
[clickhouse:python_version >= "3.9"]
|
|
10
|
+
clickhouse-connect>=0.7
|
|
11
|
+
|
|
12
|
+
[dev]
|
|
13
|
+
pytest>=7.0
|
|
14
|
+
|
|
15
|
+
[full]
|
|
16
|
+
scipy>=1.5.0
|
|
17
|
+
boto3>=1.28
|
|
18
|
+
|
|
19
|
+
[full:python_version < "3.9"]
|
|
20
|
+
clickhouse-connect<0.9,>=0.7
|
|
21
|
+
|
|
22
|
+
[full:python_version >= "3.9"]
|
|
23
|
+
clickhouse-connect>=0.7
|
|
24
|
+
|
|
25
|
+
[output]
|
|
26
|
+
boto3>=1.28
|
|
27
|
+
|
|
28
|
+
[output:python_version < "3.9"]
|
|
29
|
+
clickhouse-connect<0.9,>=0.7
|
|
30
|
+
|
|
31
|
+
[output:python_version >= "3.9"]
|
|
32
|
+
clickhouse-connect>=0.7
|
|
33
|
+
|
|
34
|
+
[s3]
|
|
35
|
+
boto3>=1.28
|
|
36
|
+
|
|
37
|
+
[ttest]
|
|
38
|
+
scipy>=1.5.0
|
|
@@ -1,23 +0,0 @@
|
|
|
1
|
-
pyspark<3.6,>=3.4
|
|
2
|
-
numpy>=1.19.0
|
|
3
|
-
|
|
4
|
-
[clickhouse]
|
|
5
|
-
clickhouse-connect>=0.7
|
|
6
|
-
|
|
7
|
-
[dev]
|
|
8
|
-
pytest>=7.0
|
|
9
|
-
|
|
10
|
-
[full]
|
|
11
|
-
scipy>=1.5.0
|
|
12
|
-
boto3>=1.28
|
|
13
|
-
clickhouse-connect>=0.7
|
|
14
|
-
|
|
15
|
-
[output]
|
|
16
|
-
boto3>=1.28
|
|
17
|
-
clickhouse-connect>=0.7
|
|
18
|
-
|
|
19
|
-
[s3]
|
|
20
|
-
boto3>=1.28
|
|
21
|
-
|
|
22
|
-
[ttest]
|
|
23
|
-
scipy>=1.5.0
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.2.8 → batch_analytics-0.3.0}/src/batch_analytics/analytics/correlation.py
RENAMED
|
File without changes
|
{batch_analytics-0.2.8 → batch_analytics-0.3.0}/src/batch_analytics/analytics/linear_regression.py
RENAMED
|
File without changes
|
{batch_analytics-0.2.8 → batch_analytics-0.3.0}/src/batch_analytics/analytics/pca_clustering.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.2.8 → batch_analytics-0.3.0}/src/batch_analytics.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{batch_analytics-0.2.8 → batch_analytics-0.3.0}/src/batch_analytics.egg-info/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|