PyPI - batch-analytics - Versions diffs - 0.2.1__tar.gz → 0.2.3__tar.gz - Mend

batch-analytics 0.2.1 (tar.gz) → 0.2.3 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28)

{batch_analytics-0.2.1 → batch_analytics-0.2.3}/PKG-INFO RENAMED Viewed

@@ -1,14 +1,17 @@
 Metadata-Version: 2.4
 Name: batch-analytics
-Version: 0.2.1
+Version: 0.2.3
 Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
 Author: Litewave Analytics Team
 License: MIT
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 Requires-Dist: pyspark<3.6,>=3.4
+Requires-Dist: numpy>=1.19.0
 Provides-Extra: dev
 Requires-Dist: pytest>=7.0; extra == "dev"
+Provides-Extra: ttest
+Requires-Dist: scipy>=1.5.0; extra == "ttest"
 Provides-Extra: s3
 Requires-Dist: boto3>=1.28; extra == "s3"
 Provides-Extra: clickhouse
@@ -16,6 +19,10 @@ Requires-Dist: clickhouse-connect>=0.7; extra == "clickhouse"
 Provides-Extra: output
 Requires-Dist: boto3>=1.28; extra == "output"
 Requires-Dist: clickhouse-connect>=0.7; extra == "output"
+Provides-Extra: full
+Requires-Dist: scipy>=1.5.0; extra == "full"
+Requires-Dist: boto3>=1.28; extra == "full"
+Requires-Dist: clickhouse-connect>=0.7; extra == "full"
 # Batch Analytics
@@ -28,7 +35,8 @@ Only the files required for the batch analytics job runner:
 ```
 analytics/
 ├── pyproject.toml
-├── requirements-batch.txt
+├── requirements.txt          # core + scipy + boto3 + clickhouse-connect (single-file install)
+├── requirements-batch.txt  # includes requirements.txt
 ├── README.md
 └── src/
     └── batch_analytics/
@@ -52,7 +60,10 @@ analytics/
 ```bash
 pip install -e .
-# or: pip install -r requirements-batch.txt && pip install -e .
+# or install every runtime dependency used anywhere in the package, then editable:
+pip install -r requirements.txt && pip install -e .
+# PyPI wheel pulls numpy automatically (pyspark.ml); extras: ttest, s3, clickhouse, output, full
+pip install "batch-analytics[full]"
 ```
 ## Run

{batch_analytics-0.2.1 → batch_analytics-0.2.3}/README.md RENAMED Viewed

@@ -9,7 +9,8 @@ Only the files required for the batch analytics job runner:
 ```
 analytics/
 ├── pyproject.toml
-├── requirements-batch.txt
+├── requirements.txt          # core + scipy + boto3 + clickhouse-connect (single-file install)
+├── requirements-batch.txt  # includes requirements.txt
 ├── README.md
 └── src/
     └── batch_analytics/
@@ -33,7 +34,10 @@ analytics/
 ```bash
 pip install -e .
-# or: pip install -r requirements-batch.txt && pip install -e .
+# or install every runtime dependency used anywhere in the package, then editable:
+pip install -r requirements.txt && pip install -e .
+# PyPI wheel pulls numpy automatically (pyspark.ml); extras: ttest, s3, clickhouse, output, full
+pip install "batch-analytics[full]"
 ```
 ## Run

{batch_analytics-0.2.1 → batch_analytics-0.2.3}/pyproject.toml RENAMED Viewed

@@ -4,21 +4,26 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "batch-analytics"
-version = "0.2.1"
+version = "0.2.3"
 description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
 readme = "README.md"
 requires-python = ">=3.8"
 dependencies = [
     "pyspark>=3.4,<3.6",
+    "numpy>=1.19.0",
 ]
 authors = [{ name = "Litewave Analytics Team" }]
 license = { text = "MIT" }
 [project.optional-dependencies]
 dev = ["pytest>=7.0"]
+# Welch t-test module (batch_analytics.analytics.t_test)
+ttest = ["scipy>=1.5.0"]
 s3 = ["boto3>=1.28"]
 clickhouse = ["clickhouse-connect>=0.7"]
 output = ["boto3>=1.28", "clickhouse-connect>=0.7"]
+# Install all optional runtime deps used anywhere in the package
+full = ["scipy>=1.5.0", "boto3>=1.28", "clickhouse-connect>=0.7"]
 [project.scripts]
 batch-analytics = "batch_analytics.job_runner:main"

{batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/extract.py RENAMED Viewed

@@ -18,7 +18,7 @@ def _read_via_format(spark: SparkSession, cfg: BatchAnalyticsConfig, table: str)
     Requires: com.clickhouse.spark:clickhouse-spark-runtime in spark.jars.packages
     """
     try:
-        df = (
+        rd = (
             spark.read.format("clickhouse")
             .option("host", cfg.clickhouse.host)
             .option("protocol", cfg.clickhouse.protocol)
@@ -26,8 +26,10 @@ def _read_via_format(spark: SparkSession, cfg: BatchAnalyticsConfig, table: str)
             .option("database", cfg.clickhouse.database)
             .option("table", table)
             .option("user", cfg.clickhouse.user)
-            .load()
         )
+        if cfg.clickhouse.password:
+            rd = rd.option("password", cfg.clickhouse.password)
+        df = rd.load()
         return df
     except Exception as e:
         logger.warning(

{batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/job_runner.py RENAMED Viewed

@@ -44,12 +44,31 @@ def create_spark_session(
         .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
     )
-    # JARs: ClickHouse JDBC + hadoop-aws (for S3 when on K8s)
-    packages = []
+    # JARs: spark.jars = local/https/file paths; spark.jars.packages = Maven coordinates
+    # SPARK_JARS can be comma-separated URLs (e.g. Maven Central https://repo1.maven.org/...jar)
+    jar_list: List[str] = []
+    sp_jars = os.environ.get("SPARK_JARS", "").strip()
+    if sp_jars:
+        jar_list.extend(p.strip() for p in sp_jars.split(",") if p.strip())
+    packages: List[str] = []
     if clickhouse_jars:
-        packages.append(clickhouse_jars)
+        for part in clickhouse_jars.split(","):
+            part = part.strip()
+            if not part:
+                continue
+            if part.startswith(("http://", "https://", "file:")):
+                jar_list.append(part)
+            elif part.endswith(".jar") and ":" not in part:
+                jar_list.append(part)
+            else:
+                packages.append(part)
     if cfg.master.startswith("k8s://"):
         packages.append("org.apache.hadoop:hadoop-aws:3.3.4")
+    if jar_list:
+        builder = builder.config("spark.jars", ",".join(jar_list))
     if packages:
         builder = builder.config("spark.jars.packages", ",".join(packages))
@@ -121,10 +140,21 @@ def run_pipeline(
     run_id = str(uuid.uuid4())[:8]
     if spark is None:
-        jars = "com.clickhouse:clickhouse-jdbc:0.4.6:all"
+        # Native format("clickhouse") needs clickhouse-spark-runtime; JDBC fallback needs clickhouse-jdbc.
+        # Override: BATCH_SPARK_CLICKHOUSE_PACKAGES=maven coords (comma-sep) or "" to use only SPARK_JARS.
+        _raw_ch = os.environ.get("BATCH_SPARK_CLICKHOUSE_PACKAGES")
+        if _raw_ch is None:
+            ch_pkgs = (
+                "com.clickhouse.spark:clickhouse-spark-runtime-3.5_2.12:0.8.0,"
+                "com.clickhouse:clickhouse-jdbc:0.6.2"
+            )
+        elif not _raw_ch.strip():
+            ch_pkgs = None
+        else:
+            ch_pkgs = _raw_ch.strip()
         spark = create_spark_session(
             app_name="BatchAnalytics",
-            clickhouse_jars=jars,
+            clickhouse_jars=ch_pkgs,
             config=config,
         )

{batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/transform.py RENAMED Viewed

@@ -92,16 +92,19 @@ def stage_to_clickhouse(
     """
     n = df.count()
     try:
+        ch = config.clickhouse
         writer = (
             df.write.format("clickhouse")
-            .option("host", config.clickhouse.host)
-            .option("database", config.clickhouse.database)
+            .option("host", ch.host)
+            .option("protocol", ch.protocol)
+            .option("http_port", str(ch.port))
+            .option("database", ch.database)
             .option("table", config.transform.staging_table)
-            .option("user", config.clickhouse.user)
+            .option("user", ch.user)
             .mode("overwrite")
         )
-        if config.clickhouse.password:
-            writer = writer.option("password", config.clickhouse.password)
+        if ch.password:
+            writer = writer.option("password", ch.password)
         writer.save()
     except Exception as e:
         logger.warning("ClickHouse connector failed (%s), using JDBC", e)
@@ -168,13 +171,19 @@ def load_staged(
         return spark.read.format("delta").load(staging_path)
     if fmt == "clickhouse":
         try:
-            return (
+            ch = config.clickhouse
+            rd = (
                 spark.read.format("clickhouse")
-                .option("host", config.clickhouse.host)
-                .option("database", config.clickhouse.database)
+                .option("host", ch.host)
+                .option("protocol", ch.protocol)
+                .option("http_port", str(ch.port))
+                .option("database", ch.database)
                 .option("table", config.transform.staging_table)
-                .load()
+                .option("user", ch.user)
             )
+            if ch.password:
+                rd = rd.option("password", ch.password)
+            return rd.load()
         except Exception:
             return spark.read.jdbc(
                 config.clickhouse.jdbc_url,

{batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics.egg-info/PKG-INFO RENAMED Viewed

@@ -1,14 +1,17 @@
 Metadata-Version: 2.4
 Name: batch-analytics
-Version: 0.2.1
+Version: 0.2.3
 Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
 Author: Litewave Analytics Team
 License: MIT
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 Requires-Dist: pyspark<3.6,>=3.4
+Requires-Dist: numpy>=1.19.0
 Provides-Extra: dev
 Requires-Dist: pytest>=7.0; extra == "dev"
+Provides-Extra: ttest
+Requires-Dist: scipy>=1.5.0; extra == "ttest"
 Provides-Extra: s3
 Requires-Dist: boto3>=1.28; extra == "s3"
 Provides-Extra: clickhouse
@@ -16,6 +19,10 @@ Requires-Dist: clickhouse-connect>=0.7; extra == "clickhouse"
 Provides-Extra: output
 Requires-Dist: boto3>=1.28; extra == "output"
 Requires-Dist: clickhouse-connect>=0.7; extra == "output"
+Provides-Extra: full
+Requires-Dist: scipy>=1.5.0; extra == "full"
+Requires-Dist: boto3>=1.28; extra == "full"
+Requires-Dist: clickhouse-connect>=0.7; extra == "full"
 # Batch Analytics
@@ -28,7 +35,8 @@ Only the files required for the batch analytics job runner:
 ```
 analytics/
 ├── pyproject.toml
-├── requirements-batch.txt
+├── requirements.txt          # core + scipy + boto3 + clickhouse-connect (single-file install)
+├── requirements-batch.txt  # includes requirements.txt
 ├── README.md
 └── src/
     └── batch_analytics/
@@ -52,7 +60,10 @@ analytics/
 ```bash
 pip install -e .
-# or: pip install -r requirements-batch.txt && pip install -e .
+# or install every runtime dependency used anywhere in the package, then editable:
+pip install -r requirements.txt && pip install -e .
+# PyPI wheel pulls numpy automatically (pyspark.ml); extras: ttest, s3, clickhouse, output, full
+pip install "batch-analytics[full]"
 ```
 ## Run

{batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics.egg-info/requires.txt RENAMED Viewed

@@ -1,4 +1,5 @@
 pyspark<3.6,>=3.4
+numpy>=1.19.0
 [clickhouse]
 clickhouse-connect>=0.7
@@ -6,9 +7,17 @@ clickhouse-connect>=0.7
 [dev]
 pytest>=7.0
+[full]
+scipy>=1.5.0
+boto3>=1.28
+clickhouse-connect>=0.7
 [output]
 boto3>=1.28
 clickhouse-connect>=0.7
 [s3]
 boto3>=1.28
+[ttest]
+scipy>=1.5.0