batch-analytics 0.1.0__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {batch_analytics-0.1.0 → batch_analytics-0.2.1}/PKG-INFO +3 -3
- {batch_analytics-0.1.0 → batch_analytics-0.2.1}/pyproject.toml +3 -3
- {batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics/analytics/correlation.py +3 -3
- {batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics/analytics/linear_regression.py +4 -4
- {batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics/analytics/pca_clustering.py +3 -3
- {batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics/analytics/t_test.py +5 -5
- {batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics/extract.py +6 -6
- {batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics/job_runner.py +7 -6
- {batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics/log.py +6 -5
- {batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics/output/base.py +9 -5
- {batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics/output/clickhouse.py +5 -5
- {batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics/output/local.py +5 -5
- {batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics/output/s3.py +8 -8
- {batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics/transform.py +2 -2
- {batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics.egg-info/PKG-INFO +3 -3
- {batch_analytics-0.1.0 → batch_analytics-0.2.1}/README.md +0 -0
- {batch_analytics-0.1.0 → batch_analytics-0.2.1}/setup.cfg +0 -0
- {batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics/__init__.py +0 -0
- {batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics/__main__.py +0 -0
- {batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics/analytics/__init__.py +0 -0
- {batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics/config.py +0 -0
- {batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics/modules.py +0 -0
- {batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics/output/__init__.py +0 -0
- {batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics.egg-info/SOURCES.txt +0 -0
- {batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
- {batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics.egg-info/entry_points.txt +0 -0
- {batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics.egg-info/requires.txt +0 -0
- {batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics.egg-info/top_level.txt +0 -0
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: batch-analytics
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.2.1
|
|
4
4
|
Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
|
|
5
|
-
Author: Analytics Team
|
|
5
|
+
Author: Litewave Analytics Team
|
|
6
6
|
License: MIT
|
|
7
|
-
Requires-Python: >=3.
|
|
7
|
+
Requires-Python: >=3.8
|
|
8
8
|
Description-Content-Type: text/markdown
|
|
9
9
|
Requires-Dist: pyspark<3.6,>=3.4
|
|
10
10
|
Provides-Extra: dev
|
|
@@ -4,14 +4,14 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "batch-analytics"
|
|
7
|
-
version = "0.1.0"
|
|
7
|
+
version = "0.2.1"
|
|
8
8
|
description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
|
|
9
9
|
readme = "README.md"
|
|
10
|
-
requires-python = ">=3.
|
|
10
|
+
requires-python = ">=3.8"
|
|
11
11
|
dependencies = [
|
|
12
12
|
"pyspark>=3.4,<3.6",
|
|
13
13
|
]
|
|
14
|
-
authors = [{ name = "Analytics Team" }]
|
|
14
|
+
authors = [{ name = "Litewave Analytics Team" }]
|
|
15
15
|
license = { text = "MIT" }
|
|
16
16
|
|
|
17
17
|
[project.optional-dependencies]
|
{batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics/analytics/correlation.py
RENAMED
|
@@ -3,7 +3,7 @@ Module 2: Multi-feature correlation analysis.
|
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
|
-
from typing import Any
|
|
6
|
+
from typing import Any, Dict, List
|
|
7
7
|
|
|
8
8
|
from pyspark.ml.feature import VectorAssembler
|
|
9
9
|
from pyspark.ml.stat import Correlation
|
|
@@ -18,7 +18,7 @@ def run_correlation(
|
|
|
18
18
|
spark: SparkSession,
|
|
19
19
|
df: DataFrame,
|
|
20
20
|
config: BatchAnalyticsConfig,
|
|
21
|
-
) ->
|
|
21
|
+
) -> Dict[str, Any]:
|
|
22
22
|
"""
|
|
23
23
|
Compute correlation matrix over multiple numeric features.
|
|
24
24
|
Optionally identify pairs above a threshold (collinearity).
|
|
@@ -93,7 +93,7 @@ def run_correlation(
|
|
|
93
93
|
matrix = np.asarray(arr)
|
|
94
94
|
|
|
95
95
|
threshold = config.analytics.corr_threshold
|
|
96
|
-
high_pairs:
|
|
96
|
+
high_pairs: List[dict] = []
|
|
97
97
|
n = len(feature_cols)
|
|
98
98
|
for i in range(n):
|
|
99
99
|
for j in range(i + 1, n):
|
{batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics/analytics/linear_regression.py
RENAMED
|
@@ -3,7 +3,7 @@ Module 1: Simple linear regression on XY data with slope comparison across group
|
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
|
-
from typing import Any
|
|
6
|
+
from typing import Any, Dict
|
|
7
7
|
|
|
8
8
|
from pyspark.ml.feature import VectorAssembler
|
|
9
9
|
from pyspark.ml.regression import LinearRegression
|
|
@@ -29,7 +29,7 @@ def run_linear_regression(
|
|
|
29
29
|
spark: SparkSession,
|
|
30
30
|
df: DataFrame,
|
|
31
31
|
config: BatchAnalyticsConfig,
|
|
32
|
-
) ->
|
|
32
|
+
) -> Dict[str, Any]:
|
|
33
33
|
"""
|
|
34
34
|
Run simple linear regression: Y ~ X.
|
|
35
35
|
If group columns are configured, fit separate models per group and compare slopes.
|
|
@@ -76,7 +76,7 @@ def run_linear_regression(
|
|
|
76
76
|
global_intercept = float(global_model.intercept)
|
|
77
77
|
global_r2 = float(global_model.summary.r2)
|
|
78
78
|
|
|
79
|
-
result:
|
|
79
|
+
result: Dict[str, Any] = {
|
|
80
80
|
"global": {
|
|
81
81
|
"slope": global_slope,
|
|
82
82
|
"intercept": global_intercept,
|
|
@@ -97,7 +97,7 @@ def run_linear_regression(
|
|
|
97
97
|
)
|
|
98
98
|
groups = df_grouped.select(group_key).distinct().collect()
|
|
99
99
|
|
|
100
|
-
slopes_by_group:
|
|
100
|
+
slopes_by_group: Dict[str, dict] = {}
|
|
101
101
|
for row in groups:
|
|
102
102
|
key_str = row[group_key]
|
|
103
103
|
if key_str is None:
|
{batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics/analytics/pca_clustering.py
RENAMED
|
@@ -3,7 +3,7 @@ Module 3: PCA for key feature identification and clustering on staged data.
|
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
|
-
from typing import Any
|
|
6
|
+
from typing import Any, Dict, List
|
|
7
7
|
|
|
8
8
|
from pyspark.ml.clustering import KMeans
|
|
9
9
|
from pyspark.ml.feature import PCA, StandardScaler, VectorAssembler
|
|
@@ -18,7 +18,7 @@ def run_pca_clustering(
|
|
|
18
18
|
spark: SparkSession,
|
|
19
19
|
df: DataFrame,
|
|
20
20
|
config: BatchAnalyticsConfig,
|
|
21
|
-
) ->
|
|
21
|
+
) -> Dict[str, Any]:
|
|
22
22
|
"""
|
|
23
23
|
Run PCA to identify key features (principal components) and KMeans clustering.
|
|
24
24
|
Uses scaled features; PCA components capture variance; clustering groups similar rows.
|
|
@@ -94,7 +94,7 @@ def run_pca_clustering(
|
|
|
94
94
|
|
|
95
95
|
# Feature loadings per PC (which original features contribute most)
|
|
96
96
|
components = pca_model.pc.toArray()
|
|
97
|
-
loadings:
|
|
97
|
+
loadings: List[dict] = []
|
|
98
98
|
for i, comp in enumerate(components):
|
|
99
99
|
ranked = sorted(
|
|
100
100
|
zip(feature_cols, comp.tolist()),
|
|
@@ -3,7 +3,7 @@ Module 4: T-test to compare means of two sets of data.
|
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
|
-
from typing import Any
|
|
6
|
+
from typing import Any, Dict
|
|
7
7
|
|
|
8
8
|
from pyspark.sql import DataFrame, SparkSession
|
|
9
9
|
from pyspark.sql.functions import col, avg, stddev, count
|
|
@@ -18,7 +18,7 @@ def run_t_test(
|
|
|
18
18
|
spark: SparkSession,
|
|
19
19
|
df: DataFrame,
|
|
20
20
|
config: BatchAnalyticsConfig,
|
|
21
|
-
) ->
|
|
21
|
+
) -> Dict[str, Any]:
|
|
22
22
|
"""
|
|
23
23
|
Perform an independent samples t-test to compare the means of two groups.
|
|
24
24
|
|
|
@@ -75,7 +75,7 @@ def _run_t_test_by_group(
|
|
|
75
75
|
df: DataFrame,
|
|
76
76
|
value_col: str,
|
|
77
77
|
group_col: str,
|
|
78
|
-
) ->
|
|
78
|
+
) -> Dict[str, Any]:
|
|
79
79
|
"""T-test: compare mean of value_col across two levels of group_col."""
|
|
80
80
|
df_num = df.select(
|
|
81
81
|
col(value_col).cast(DoubleType()).alias("_val"),
|
|
@@ -110,7 +110,7 @@ def _run_t_test_by_group(
|
|
|
110
110
|
)
|
|
111
111
|
|
|
112
112
|
|
|
113
|
-
def _run_t_test_two_columns(df: DataFrame, col_a: str, col_b: str) ->
|
|
113
|
+
def _run_t_test_two_columns(df: DataFrame, col_a: str, col_b: str) -> Dict[str, Any]:
|
|
114
114
|
"""T-test: compare means of two numeric columns."""
|
|
115
115
|
df_num = df.select(
|
|
116
116
|
col(col_a).cast(DoubleType()).alias("_a"),
|
|
@@ -147,7 +147,7 @@ def _compute_t_test_result(
|
|
|
147
147
|
mean_b: float,
|
|
148
148
|
std_b: float,
|
|
149
149
|
n_b: int,
|
|
150
|
-
) ->
|
|
150
|
+
) -> Dict[str, Any]:
|
|
151
151
|
"""Compute Welch's t-test from summary statistics."""
|
|
152
152
|
try:
|
|
153
153
|
from scipy import stats
|
|
@@ -3,7 +3,7 @@ Extract stage: Load data from ClickHouse using Spark ClickHouse connector or JDB
|
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
|
-
from typing import
|
|
6
|
+
from typing import Dict, List, Optional
|
|
7
7
|
|
|
8
8
|
from pyspark.sql import DataFrame, SparkSession
|
|
9
9
|
|
|
@@ -12,7 +12,7 @@ from .config import BatchAnalyticsConfig
|
|
|
12
12
|
logger = logging.getLogger(__name__)
|
|
13
13
|
|
|
14
14
|
|
|
15
|
-
def _read_via_format(spark: SparkSession, cfg: BatchAnalyticsConfig, table: str) -> DataFrame
|
|
15
|
+
def _read_via_format(spark: SparkSession, cfg: BatchAnalyticsConfig, table: str) -> Optional[DataFrame]:
|
|
16
16
|
"""
|
|
17
17
|
Read from ClickHouse using the native format API (clickhouse-spark-runtime).
|
|
18
18
|
Requires: com.clickhouse.spark:clickhouse-spark-runtime in spark.jars.packages
|
|
@@ -70,7 +70,7 @@ def extract_table(
|
|
|
70
70
|
def extract_all(
|
|
71
71
|
spark: SparkSession,
|
|
72
72
|
config: BatchAnalyticsConfig,
|
|
73
|
-
) ->
|
|
73
|
+
) -> Dict[str, DataFrame]:
|
|
74
74
|
"""
|
|
75
75
|
Extract all configured source tables from ClickHouse.
|
|
76
76
|
Returns a dict mapping table name to DataFrame.
|
|
@@ -79,7 +79,7 @@ def extract_all(
|
|
|
79
79
|
if not tables:
|
|
80
80
|
raise ValueError("No source tables configured in BATCH_SOURCE_TABLES")
|
|
81
81
|
|
|
82
|
-
result:
|
|
82
|
+
result: Dict[str, DataFrame] = {}
|
|
83
83
|
for table in tables:
|
|
84
84
|
df = extract_table(spark, table, config)
|
|
85
85
|
result[table] = df
|
|
@@ -90,8 +90,8 @@ def extract_all(
|
|
|
90
90
|
def extract_unified(
|
|
91
91
|
spark: SparkSession,
|
|
92
92
|
config: BatchAnalyticsConfig,
|
|
93
|
-
join_keys:
|
|
94
|
-
primary_table: str
|
|
93
|
+
join_keys: Optional[List[str]] = None,
|
|
94
|
+
primary_table: Optional[str] = None,
|
|
95
95
|
) -> DataFrame:
|
|
96
96
|
"""
|
|
97
97
|
Extract and unify source tables into one DataFrame.
|
|
@@ -9,6 +9,7 @@ import socket
|
|
|
9
9
|
import sys
|
|
10
10
|
import uuid
|
|
11
11
|
from pathlib import Path
|
|
12
|
+
from typing import Dict, List, Optional
|
|
12
13
|
|
|
13
14
|
from pyspark.sql import SparkSession
|
|
14
15
|
|
|
@@ -28,8 +29,8 @@ logger = logging.getLogger(__name__)
|
|
|
28
29
|
|
|
29
30
|
def create_spark_session(
|
|
30
31
|
app_name: str = "BatchAnalytics",
|
|
31
|
-
clickhouse_jars: str
|
|
32
|
-
config: BatchAnalyticsConfig
|
|
32
|
+
clickhouse_jars: Optional[str] = None,
|
|
33
|
+
config: Optional[BatchAnalyticsConfig] = None,
|
|
33
34
|
) -> SparkSession:
|
|
34
35
|
"""
|
|
35
36
|
Create SparkSession. Uses Kubernetes config when SPARK_MASTER starts with k8s://.
|
|
@@ -93,14 +94,14 @@ def create_spark_session(
|
|
|
93
94
|
|
|
94
95
|
|
|
95
96
|
def run_pipeline(
|
|
96
|
-
config: BatchAnalyticsConfig
|
|
97
|
-
spark: SparkSession
|
|
97
|
+
config: Optional[BatchAnalyticsConfig] = None,
|
|
98
|
+
spark: Optional[SparkSession] = None,
|
|
98
99
|
run_extract: bool = True,
|
|
99
100
|
run_transform: bool = True,
|
|
100
101
|
run_stage: bool = True,
|
|
101
102
|
run_analytics: bool = True,
|
|
102
|
-
modules:
|
|
103
|
-
) ->
|
|
103
|
+
modules: Optional[List[str]] = None,
|
|
104
|
+
) -> Dict:
|
|
104
105
|
"""
|
|
105
106
|
Run the full pipeline or selected stages.
|
|
106
107
|
|
|
@@ -6,6 +6,7 @@ import json
|
|
|
6
6
|
import logging
|
|
7
7
|
from datetime import datetime
|
|
8
8
|
from pathlib import Path
|
|
9
|
+
from typing import List, Optional, Union
|
|
9
10
|
|
|
10
11
|
from pyspark.sql import SparkSession
|
|
11
12
|
|
|
@@ -16,8 +17,8 @@ def log_run(
|
|
|
16
17
|
run_id: str,
|
|
17
18
|
stage: str,
|
|
18
19
|
metrics: dict,
|
|
19
|
-
output_dir: str
|
|
20
|
-
extra: dict
|
|
20
|
+
output_dir: Optional[Union[str, Path]] = None,
|
|
21
|
+
extra: Optional[dict] = None,
|
|
21
22
|
) -> Path:
|
|
22
23
|
"""
|
|
23
24
|
Write run log (metadata + metrics) to a JSON file.
|
|
@@ -63,8 +64,8 @@ def log_dataframe_summary(
|
|
|
63
64
|
def log_analytics_artifacts(
|
|
64
65
|
run_id: str,
|
|
65
66
|
artifacts: dict,
|
|
66
|
-
output_dir: str
|
|
67
|
-
) ->
|
|
67
|
+
output_dir: Union[str, Path],
|
|
68
|
+
) -> List[Path]:
|
|
68
69
|
"""
|
|
69
70
|
Write analytics module outputs (e.g. slopes, correlation matrix, PCA loadings)
|
|
70
71
|
to JSON files in the log directory.
|
|
@@ -72,7 +73,7 @@ def log_analytics_artifacts(
|
|
|
72
73
|
output_dir = Path(output_dir)
|
|
73
74
|
output_dir.mkdir(parents=True, exist_ok=True)
|
|
74
75
|
|
|
75
|
-
paths:
|
|
76
|
+
paths: List[Path] = []
|
|
76
77
|
for name, data in artifacts.items():
|
|
77
78
|
path = output_dir / f"{run_id}_analytics_{name}.json"
|
|
78
79
|
with open(path, "w") as f:
|
|
@@ -5,7 +5,7 @@ Base output driver interface and write orchestration.
|
|
|
5
5
|
import json
|
|
6
6
|
import logging
|
|
7
7
|
from abc import ABC, abstractmethod
|
|
8
|
-
from typing import Any
|
|
8
|
+
from typing import Any, Dict, List
|
|
9
9
|
|
|
10
10
|
logger = logging.getLogger(__name__)
|
|
11
11
|
|
|
@@ -37,8 +37,8 @@ class OutputDriver(ABC):
|
|
|
37
37
|
self,
|
|
38
38
|
run_id: str,
|
|
39
39
|
task_id: str,
|
|
40
|
-
artifacts:
|
|
41
|
-
) ->
|
|
40
|
+
artifacts: Dict[str, Any],
|
|
41
|
+
) -> List[str]:
|
|
42
42
|
"""
|
|
43
43
|
Write analytics artifacts to the destination.
|
|
44
44
|
|
|
@@ -56,10 +56,10 @@ class OutputDriver(ABC):
|
|
|
56
56
|
def write_analytics_output(
|
|
57
57
|
run_id: str,
|
|
58
58
|
task_id: str,
|
|
59
|
-
artifacts:
|
|
59
|
+
artifacts: Dict[str, Any],
|
|
60
60
|
output_type: str,
|
|
61
61
|
**driver_kwargs: Any,
|
|
62
|
-
) ->
|
|
62
|
+
) -> List[str]:
|
|
63
63
|
"""
|
|
64
64
|
Write analytics results using the configured output driver.
|
|
65
65
|
|
|
@@ -77,6 +77,10 @@ def write_analytics_output(
|
|
|
77
77
|
logger.debug("No analytics artifacts to write")
|
|
78
78
|
return []
|
|
79
79
|
|
|
80
|
+
from .clickhouse import ClickHouseOutputDriver
|
|
81
|
+
from .local import LocalOutputDriver
|
|
82
|
+
from .s3 import S3OutputDriver
|
|
83
|
+
|
|
80
84
|
output_type = (output_type or "local").lower().strip()
|
|
81
85
|
if output_type == "local":
|
|
82
86
|
driver = LocalOutputDriver(**driver_kwargs)
|
|
@@ -4,7 +4,7 @@ ClickHouse output driver: inserts analytics results into a ClickHouse table.
|
|
|
4
4
|
|
|
5
5
|
import json
|
|
6
6
|
import logging
|
|
7
|
-
from typing import Any
|
|
7
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
8
8
|
|
|
9
9
|
from .base import OutputDriver, _serialize_for_json
|
|
10
10
|
|
|
@@ -32,7 +32,7 @@ class ClickHouseOutputDriver(OutputDriver):
|
|
|
32
32
|
self,
|
|
33
33
|
database: str,
|
|
34
34
|
table: str,
|
|
35
|
-
host: str
|
|
35
|
+
host: Optional[str] = None,
|
|
36
36
|
port: int = 8123,
|
|
37
37
|
user: str = "default",
|
|
38
38
|
password: str = "",
|
|
@@ -66,14 +66,14 @@ class ClickHouseOutputDriver(OutputDriver):
|
|
|
66
66
|
self,
|
|
67
67
|
run_id: str,
|
|
68
68
|
task_id: str,
|
|
69
|
-
artifacts:
|
|
70
|
-
) ->
|
|
69
|
+
artifacts: Dict[str, Any],
|
|
70
|
+
) -> List[str]:
|
|
71
71
|
client = self._client()
|
|
72
72
|
|
|
73
73
|
# Ensure table exists
|
|
74
74
|
client.command(_create_table_sql(self.database, self.table).strip())
|
|
75
75
|
|
|
76
|
-
rows:
|
|
76
|
+
rows: List[Tuple[str, str, str, str]] = []
|
|
77
77
|
for module, data in artifacts.items():
|
|
78
78
|
result_json = json.dumps(_serialize_for_json(data))
|
|
79
79
|
rows.append((task_id, run_id, module, result_json))
|
|
@@ -5,7 +5,7 @@ Local output driver: writes analytics results to a local directory (BATCH_LOG_PA
|
|
|
5
5
|
import json
|
|
6
6
|
import logging
|
|
7
7
|
from pathlib import Path
|
|
8
|
-
from typing import Any
|
|
8
|
+
from typing import Any, Dict, List, Union
|
|
9
9
|
|
|
10
10
|
from .base import OutputDriver, _serialize_for_json
|
|
11
11
|
|
|
@@ -15,7 +15,7 @@ logger = logging.getLogger(__name__)
|
|
|
15
15
|
class LocalOutputDriver(OutputDriver):
|
|
16
16
|
"""Write analytics artifacts to local filesystem."""
|
|
17
17
|
|
|
18
|
-
def __init__(self, path: str
|
|
18
|
+
def __init__(self, path: Union[str, Path] = "/tmp/analytics_logs", **kwargs: Any) -> None:
|
|
19
19
|
self.path = Path(path)
|
|
20
20
|
self.path.mkdir(parents=True, exist_ok=True)
|
|
21
21
|
|
|
@@ -23,9 +23,9 @@ class LocalOutputDriver(OutputDriver):
|
|
|
23
23
|
self,
|
|
24
24
|
run_id: str,
|
|
25
25
|
task_id: str,
|
|
26
|
-
artifacts:
|
|
27
|
-
) ->
|
|
28
|
-
locations:
|
|
26
|
+
artifacts: Dict[str, Any],
|
|
27
|
+
) -> List[str]:
|
|
28
|
+
locations: List[str] = []
|
|
29
29
|
for name, data in artifacts.items():
|
|
30
30
|
filepath = self.path / f"{run_id}_analytics_{name}.json"
|
|
31
31
|
with open(filepath, "w") as f:
|
|
@@ -4,7 +4,7 @@ S3 output driver: uploads analytics results as JSON to S3.
|
|
|
4
4
|
|
|
5
5
|
import json
|
|
6
6
|
import logging
|
|
7
|
-
from typing import Any
|
|
7
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
8
8
|
|
|
9
9
|
from .base import OutputDriver, _serialize_for_json
|
|
10
10
|
|
|
@@ -17,8 +17,8 @@ class S3OutputDriver(OutputDriver):
|
|
|
17
17
|
def __init__(
|
|
18
18
|
self,
|
|
19
19
|
path: str,
|
|
20
|
-
region: str
|
|
21
|
-
endpoint: str
|
|
20
|
+
region: Optional[str] = None,
|
|
21
|
+
endpoint: Optional[str] = None,
|
|
22
22
|
**kwargs: Any,
|
|
23
23
|
) -> None:
|
|
24
24
|
"""
|
|
@@ -42,7 +42,7 @@ class S3OutputDriver(OutputDriver):
|
|
|
42
42
|
) from e
|
|
43
43
|
|
|
44
44
|
config = Config(signature_version="s3v4")
|
|
45
|
-
client_kwargs:
|
|
45
|
+
client_kwargs: Dict[str, Any] = {}
|
|
46
46
|
if self.region:
|
|
47
47
|
client_kwargs["region_name"] = self.region
|
|
48
48
|
if self.endpoint:
|
|
@@ -50,7 +50,7 @@ class S3OutputDriver(OutputDriver):
|
|
|
50
50
|
|
|
51
51
|
return boto3.client("s3", config=config, **client_kwargs)
|
|
52
52
|
|
|
53
|
-
def _parse_s3_path(self) ->
|
|
53
|
+
def _parse_s3_path(self) -> Tuple[str, str]:
|
|
54
54
|
"""Parse s3://bucket/prefix into bucket and prefix."""
|
|
55
55
|
if not self.path.startswith("s3://"):
|
|
56
56
|
raise ValueError(f"Invalid S3 path: {self.path}")
|
|
@@ -63,15 +63,15 @@ class S3OutputDriver(OutputDriver):
|
|
|
63
63
|
self,
|
|
64
64
|
run_id: str,
|
|
65
65
|
task_id: str,
|
|
66
|
-
artifacts:
|
|
67
|
-
) ->
|
|
66
|
+
artifacts: Dict[str, Any],
|
|
67
|
+
) -> List[str]:
|
|
68
68
|
bucket, prefix = self._parse_s3_path()
|
|
69
69
|
client = self._client()
|
|
70
70
|
|
|
71
71
|
# Use task_id for path: prefix/task_id/run_id_analytics_module.json
|
|
72
72
|
key_prefix = f"{prefix}{task_id}/"
|
|
73
73
|
|
|
74
|
-
locations:
|
|
74
|
+
locations: List[str] = []
|
|
75
75
|
for name, data in artifacts.items():
|
|
76
76
|
key = f"{key_prefix}{run_id}_analytics_{name}.json"
|
|
77
77
|
body = json.dumps(_serialize_for_json(data), indent=2)
|
|
@@ -3,7 +3,7 @@ Transform stage: Clean data (remove duplicates), extract add_dimension, and stag
|
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
|
-
from typing import Sequence
|
|
6
|
+
from typing import Optional, Sequence
|
|
7
7
|
|
|
8
8
|
from pyspark.sql import DataFrame, SparkSession
|
|
9
9
|
from pyspark.sql.functions import coalesce, col, get_json_object, regexp_extract
|
|
@@ -40,7 +40,7 @@ def extract_anchor_id(
|
|
|
40
40
|
|
|
41
41
|
def remove_duplicates(
|
|
42
42
|
df: DataFrame,
|
|
43
|
-
key_columns: Sequence[str]
|
|
43
|
+
key_columns: Optional[Sequence[str]] = None,
|
|
44
44
|
) -> DataFrame:
|
|
45
45
|
"""
|
|
46
46
|
Remove duplicate rows.
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: batch-analytics
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.2.1
|
|
4
4
|
Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
|
|
5
|
-
Author: Analytics Team
|
|
5
|
+
Author: Litewave Analytics Team
|
|
6
6
|
License: MIT
|
|
7
|
-
Requires-Python: >=3.
|
|
7
|
+
Requires-Python: >=3.8
|
|
8
8
|
Description-Content-Type: text/markdown
|
|
9
9
|
Requires-Dist: pyspark<3.6,>=3.4
|
|
10
10
|
Provides-Extra: dev
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics.egg-info/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|