PyPI - batch-analytics - Versions diffs - 0.1.0__tar.gz → 0.2.1__tar.gz - Mend

batch-analytics 0.1.0 (tar.gz) → 0.2.1 (tar.gz)

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

{batch_analytics-0.1.0 → batch_analytics-0.2.1}/PKG-INFO RENAMED Viewed

@@ -1,10 +1,10 @@
 Metadata-Version: 2.4
 Name: batch-analytics
-Version: 0.1.0
+Version: 0.2.1
 Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
-Author: Analytics Team
+Author: Litewave Analytics Team
 License: MIT
-Requires-Python: >=3.9
+Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 Requires-Dist: pyspark<3.6,>=3.4
 Provides-Extra: dev

{batch_analytics-0.1.0 → batch_analytics-0.2.1}/pyproject.toml RENAMED Viewed

@@ -4,14 +4,14 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "batch-analytics"
-version = "0.1.0"
+version = "0.2.1"
 description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
 readme = "README.md"
-requires-python = ">=3.9"
+requires-python = ">=3.8"
 dependencies = [
     "pyspark>=3.4,<3.6",
 ]
-authors = [{ name = "Analytics Team" }]
+authors = [{ name = "Litewave Analytics Team" }]
 license = { text = "MIT" }
 [project.optional-dependencies]

{batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics/analytics/correlation.py RENAMED Viewed

@@ -3,7 +3,7 @@ Module 2: Multi-feature correlation analysis.
 """
 import logging
-from typing import Any
+from typing import Any, Dict, List
 from pyspark.ml.feature import VectorAssembler
 from pyspark.ml.stat import Correlation
@@ -18,7 +18,7 @@ def run_correlation(
     spark: SparkSession,
     df: DataFrame,
     config: BatchAnalyticsConfig,
-) -> dict[str, Any]:
+) -> Dict[str, Any]:
     """
     Compute correlation matrix over multiple numeric features.
     Optionally identify pairs above a threshold (collinearity).
@@ -93,7 +93,7 @@ def run_correlation(
     matrix = np.asarray(arr)
     threshold = config.analytics.corr_threshold
-    high_pairs: list[dict] = []
+    high_pairs: List[dict] = []
     n = len(feature_cols)
     for i in range(n):
         for j in range(i + 1, n):

{batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics/analytics/linear_regression.py RENAMED Viewed

@@ -3,7 +3,7 @@ Module 1: Simple linear regression on XY data with slope comparison across group
 """
 import logging
-from typing import Any
+from typing import Any, Dict
 from pyspark.ml.feature import VectorAssembler
 from pyspark.ml.regression import LinearRegression
@@ -29,7 +29,7 @@ def run_linear_regression(
     spark: SparkSession,
     df: DataFrame,
     config: BatchAnalyticsConfig,
-) -> dict[str, Any]:
+) -> Dict[str, Any]:
     """
     Run simple linear regression: Y ~ X.
     If group columns are configured, fit separate models per group and compare slopes.
@@ -76,7 +76,7 @@ def run_linear_regression(
     global_intercept = float(global_model.intercept)
     global_r2 = float(global_model.summary.r2)
-    result: dict[str, Any] = {
+    result: Dict[str, Any] = {
         "global": {
             "slope": global_slope,
             "intercept": global_intercept,
@@ -97,7 +97,7 @@ def run_linear_regression(
     )
     groups = df_grouped.select(group_key).distinct().collect()
-    slopes_by_group: dict[str, dict] = {}
+    slopes_by_group: Dict[str, dict] = {}
     for row in groups:
         key_str = row[group_key]
         if key_str is None:

{batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics/analytics/pca_clustering.py RENAMED Viewed

@@ -3,7 +3,7 @@ Module 3: PCA for key feature identification and clustering on staged data.
 """
 import logging
-from typing import Any
+from typing import Any, Dict, List
 from pyspark.ml.clustering import KMeans
 from pyspark.ml.feature import PCA, StandardScaler, VectorAssembler
@@ -18,7 +18,7 @@ def run_pca_clustering(
     spark: SparkSession,
     df: DataFrame,
     config: BatchAnalyticsConfig,
-) -> dict[str, Any]:
+) -> Dict[str, Any]:
     """
     Run PCA to identify key features (principal components) and KMeans clustering.
     Uses scaled features; PCA components capture variance; clustering groups similar rows.
@@ -94,7 +94,7 @@ def run_pca_clustering(
     # Feature loadings per PC (which original features contribute most)
     components = pca_model.pc.toArray()
-    loadings: list[dict] = []
+    loadings: List[dict] = []
     for i, comp in enumerate(components):
         ranked = sorted(
             zip(feature_cols, comp.tolist()),

{batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics/analytics/t_test.py RENAMED Viewed

@@ -3,7 +3,7 @@ Module 4: T-test to compare means of two sets of data.
 """
 import logging
-from typing import Any
+from typing import Any, Dict
 from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.functions import col, avg, stddev, count
@@ -18,7 +18,7 @@ def run_t_test(
     spark: SparkSession,
     df: DataFrame,
     config: BatchAnalyticsConfig,
-) -> dict[str, Any]:
+) -> Dict[str, Any]:
     """
     Perform an independent samples t-test to compare the means of two groups.
@@ -75,7 +75,7 @@ def _run_t_test_by_group(
     df: DataFrame,
     value_col: str,
     group_col: str,
-) -> dict[str, Any]:
+) -> Dict[str, Any]:
     """T-test: compare mean of value_col across two levels of group_col."""
     df_num = df.select(
         col(value_col).cast(DoubleType()).alias("_val"),
@@ -110,7 +110,7 @@ def _run_t_test_by_group(
     )
-def _run_t_test_two_columns(df: DataFrame, col_a: str, col_b: str) -> dict[str, Any]:
+def _run_t_test_two_columns(df: DataFrame, col_a: str, col_b: str) -> Dict[str, Any]:
     """T-test: compare means of two numeric columns."""
     df_num = df.select(
         col(col_a).cast(DoubleType()).alias("_a"),
@@ -147,7 +147,7 @@ def _compute_t_test_result(
     mean_b: float,
     std_b: float,
     n_b: int,
-) -> dict[str, Any]:
+) -> Dict[str, Any]:
     """Compute Welch's t-test from summary statistics."""
     try:
         from scipy import stats

{batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics/extract.py RENAMED Viewed

@@ -3,7 +3,7 @@ Extract stage: Load data from ClickHouse using Spark ClickHouse connector or JDB
 """
 import logging
-from typing import Any
+from typing import Dict, List, Optional
 from pyspark.sql import DataFrame, SparkSession
@@ -12,7 +12,7 @@ from .config import BatchAnalyticsConfig
 logger = logging.getLogger(__name__)
-def _read_via_format(spark: SparkSession, cfg: BatchAnalyticsConfig, table: str) -> DataFrame | None:
+def _read_via_format(spark: SparkSession, cfg: BatchAnalyticsConfig, table: str) -> Optional[DataFrame]:
     """
     Read from ClickHouse using the native format API (clickhouse-spark-runtime).
     Requires: com.clickhouse.spark:clickhouse-spark-runtime in spark.jars.packages
@@ -70,7 +70,7 @@ def extract_table(
 def extract_all(
     spark: SparkSession,
     config: BatchAnalyticsConfig,
-) -> dict[str, DataFrame]:
+) -> Dict[str, DataFrame]:
     """
     Extract all configured source tables from ClickHouse.
     Returns a dict mapping table name to DataFrame.
@@ -79,7 +79,7 @@ def extract_all(
     if not tables:
         raise ValueError("No source tables configured in BATCH_SOURCE_TABLES")
-    result: dict[str, DataFrame] = {}
+    result: Dict[str, DataFrame] = {}
     for table in tables:
         df = extract_table(spark, table, config)
         result[table] = df
@@ -90,8 +90,8 @@ def extract_all(
 def extract_unified(
     spark: SparkSession,
     config: BatchAnalyticsConfig,
-    join_keys: list[str] | None = None,
-    primary_table: str | None = None,
+    join_keys: Optional[List[str]] = None,
+    primary_table: Optional[str] = None,
 ) -> DataFrame:
     """
     Extract and unify source tables into one DataFrame.

{batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics/job_runner.py RENAMED Viewed

@@ -9,6 +9,7 @@ import socket
 import sys
 import uuid
 from pathlib import Path
+from typing import Dict, List, Optional
 from pyspark.sql import SparkSession
@@ -28,8 +29,8 @@ logger = logging.getLogger(__name__)
 def create_spark_session(
     app_name: str = "BatchAnalytics",
-    clickhouse_jars: str | None = None,
-    config: BatchAnalyticsConfig | None = None,
+    clickhouse_jars: Optional[str] = None,
+    config: Optional[BatchAnalyticsConfig] = None,
 ) -> SparkSession:
     """
     Create SparkSession. Uses Kubernetes config when SPARK_MASTER starts with k8s://.
@@ -93,14 +94,14 @@ def create_spark_session(
 def run_pipeline(
-    config: BatchAnalyticsConfig | None = None,
-    spark: SparkSession | None = None,
+    config: Optional[BatchAnalyticsConfig] = None,
+    spark: Optional[SparkSession] = None,
     run_extract: bool = True,
     run_transform: bool = True,
     run_stage: bool = True,
     run_analytics: bool = True,
-    modules: list[str] | None = None,
-) -> dict:
+    modules: Optional[List[str]] = None,
+) -> Dict:
     """
     Run the full pipeline or selected stages.

{batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics/log.py RENAMED Viewed

@@ -6,6 +6,7 @@ import json
 import logging
 from datetime import datetime
 from pathlib import Path
+from typing import List, Optional, Union
 from pyspark.sql import SparkSession
@@ -16,8 +17,8 @@ def log_run(
     run_id: str,
     stage: str,
     metrics: dict,
-    output_dir: str | Path | None = None,
-    extra: dict | None = None,
+    output_dir: Optional[Union[str, Path]] = None,
+    extra: Optional[dict] = None,
 ) -> Path:
     """
     Write run log (metadata + metrics) to a JSON file.
@@ -63,8 +64,8 @@ def log_dataframe_summary(
 def log_analytics_artifacts(
     run_id: str,
     artifacts: dict,
-    output_dir: str | Path,
-) -> list[Path]:
+    output_dir: Union[str, Path],
+) -> List[Path]:
     """
     Write analytics module outputs (e.g. slopes, correlation matrix, PCA loadings)
     to JSON files in the log directory.
@@ -72,7 +73,7 @@ def log_analytics_artifacts(
     output_dir = Path(output_dir)
     output_dir.mkdir(parents=True, exist_ok=True)
-    paths: list[Path] = []
+    paths: List[Path] = []
     for name, data in artifacts.items():
         path = output_dir / f"{run_id}_analytics_{name}.json"
         with open(path, "w") as f:

{batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics/output/base.py RENAMED Viewed

@@ -5,7 +5,7 @@ Base output driver interface and write orchestration.
 import json
 import logging
 from abc import ABC, abstractmethod
-from typing import Any
+from typing import Any, Dict, List
 logger = logging.getLogger(__name__)
@@ -37,8 +37,8 @@ class OutputDriver(ABC):
         self,
         run_id: str,
         task_id: str,
-        artifacts: dict[str, Any],
-    ) -> list[str]:
+        artifacts: Dict[str, Any],
+    ) -> List[str]:
         """
         Write analytics artifacts to the destination.
@@ -56,10 +56,10 @@ class OutputDriver(ABC):
 def write_analytics_output(
     run_id: str,
     task_id: str,
-    artifacts: dict[str, Any],
+    artifacts: Dict[str, Any],
     output_type: str,
     **driver_kwargs: Any,
-) -> list[str]:
+) -> List[str]:
     """
     Write analytics results using the configured output driver.
@@ -77,6 +77,10 @@ def write_analytics_output(
         logger.debug("No analytics artifacts to write")
         return []
+    from .clickhouse import ClickHouseOutputDriver
+    from .local import LocalOutputDriver
+    from .s3 import S3OutputDriver
     output_type = (output_type or "local").lower().strip()
     if output_type == "local":
         driver = LocalOutputDriver(**driver_kwargs)

{batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics/output/clickhouse.py RENAMED Viewed

@@ -4,7 +4,7 @@ ClickHouse output driver: inserts analytics results into a ClickHouse table.
 import json
 import logging
-from typing import Any
+from typing import Any, Dict, List, Optional, Tuple
 from .base import OutputDriver, _serialize_for_json
@@ -32,7 +32,7 @@ class ClickHouseOutputDriver(OutputDriver):
         self,
         database: str,
         table: str,
-        host: str | None = None,
+        host: Optional[str] = None,
         port: int = 8123,
         user: str = "default",
         password: str = "",
@@ -66,14 +66,14 @@ class ClickHouseOutputDriver(OutputDriver):
         self,
         run_id: str,
         task_id: str,
-        artifacts: dict[str, Any],
-    ) -> list[str]:
+        artifacts: Dict[str, Any],
+    ) -> List[str]:
         client = self._client()
         # Ensure table exists
         client.command(_create_table_sql(self.database, self.table).strip())
-        rows: list[tuple[str, str, str, str]] = []
+        rows: List[Tuple[str, str, str, str]] = []
         for module, data in artifacts.items():
             result_json = json.dumps(_serialize_for_json(data))
             rows.append((task_id, run_id, module, result_json))

{batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics/output/local.py RENAMED Viewed

@@ -5,7 +5,7 @@ Local output driver: writes analytics results to a local directory (BATCH_LOG_PA
 import json
 import logging
 from pathlib import Path
-from typing import Any
+from typing import Any, Dict, List, Union
 from .base import OutputDriver, _serialize_for_json
@@ -15,7 +15,7 @@ logger = logging.getLogger(__name__)
 class LocalOutputDriver(OutputDriver):
     """Write analytics artifacts to local filesystem."""
-    def __init__(self, path: str | Path = "/tmp/analytics_logs", **kwargs: Any) -> None:
+    def __init__(self, path: Union[str, Path] = "/tmp/analytics_logs", **kwargs: Any) -> None:
         self.path = Path(path)
         self.path.mkdir(parents=True, exist_ok=True)
@@ -23,9 +23,9 @@ class LocalOutputDriver(OutputDriver):
         self,
         run_id: str,
         task_id: str,
-        artifacts: dict[str, Any],
-    ) -> list[str]:
-        locations: list[str] = []
+        artifacts: Dict[str, Any],
+    ) -> List[str]:
+        locations: List[str] = []
         for name, data in artifacts.items():
             filepath = self.path / f"{run_id}_analytics_{name}.json"
             with open(filepath, "w") as f:

{batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics/output/s3.py RENAMED Viewed

@@ -4,7 +4,7 @@ S3 output driver: uploads analytics results as JSON to S3.
 import json
 import logging
-from typing import Any
+from typing import Any, Dict, List, Optional, Tuple
 from .base import OutputDriver, _serialize_for_json
@@ -17,8 +17,8 @@ class S3OutputDriver(OutputDriver):
     def __init__(
         self,
         path: str,
-        region: str | None = None,
-        endpoint: str | None = None,
+        region: Optional[str] = None,
+        endpoint: Optional[str] = None,
         **kwargs: Any,
     ) -> None:
         """
@@ -42,7 +42,7 @@ class S3OutputDriver(OutputDriver):
             ) from e
         config = Config(signature_version="s3v4")
-        client_kwargs: dict[str, Any] = {}
+        client_kwargs: Dict[str, Any] = {}
         if self.region:
             client_kwargs["region_name"] = self.region
         if self.endpoint:
@@ -50,7 +50,7 @@ class S3OutputDriver(OutputDriver):
         return boto3.client("s3", config=config, **client_kwargs)
-    def _parse_s3_path(self) -> tuple[str, str]:
+    def _parse_s3_path(self) -> Tuple[str, str]:
         """Parse s3://bucket/prefix into bucket and prefix."""
         if not self.path.startswith("s3://"):
             raise ValueError(f"Invalid S3 path: {self.path}")
@@ -63,15 +63,15 @@ class S3OutputDriver(OutputDriver):
         self,
         run_id: str,
         task_id: str,
-        artifacts: dict[str, Any],
-    ) -> list[str]:
+        artifacts: Dict[str, Any],
+    ) -> List[str]:
         bucket, prefix = self._parse_s3_path()
         client = self._client()
         # Use task_id for path: prefix/task_id/run_id_analytics_module.json
         key_prefix = f"{prefix}{task_id}/"
-        locations: list[str] = []
+        locations: List[str] = []
         for name, data in artifacts.items():
             key = f"{key_prefix}{run_id}_analytics_{name}.json"
             body = json.dumps(_serialize_for_json(data), indent=2)

{batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics/transform.py RENAMED Viewed

@@ -3,7 +3,7 @@ Transform stage: Clean data (remove duplicates), extract add_dimension, and stag
 """
 import logging
-from typing import Sequence
+from typing import Optional, Sequence
 from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.functions import coalesce, col, get_json_object, regexp_extract
@@ -40,7 +40,7 @@ def extract_anchor_id(
 def remove_duplicates(
     df: DataFrame,
-    key_columns: Sequence[str] | None = None,
+    key_columns: Optional[Sequence[str]] = None,
 ) -> DataFrame:
     """
     Remove duplicate rows.

{batch_analytics-0.1.0 → batch_analytics-0.2.1}/src/batch_analytics.egg-info/PKG-INFO RENAMED Viewed

@@ -1,10 +1,10 @@
 Metadata-Version: 2.4
 Name: batch-analytics
-Version: 0.1.0
+Version: 0.2.1
 Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
-Author: Analytics Team
+Author: Litewave Analytics Team
 License: MIT
-Requires-Python: >=3.9
+Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 Requires-Dist: pyspark<3.6,>=3.4
 Provides-Extra: dev