batch-analytics 0.2.4__tar.gz → 0.2.5__tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. {batch_analytics-0.2.4 → batch_analytics-0.2.5}/PKG-INFO +1 -1
  2. {batch_analytics-0.2.4 → batch_analytics-0.2.5}/pyproject.toml +1 -1
  3. {batch_analytics-0.2.4 → batch_analytics-0.2.5}/src/batch_analytics/extract.py +2 -1
  4. {batch_analytics-0.2.4 → batch_analytics-0.2.5}/src/batch_analytics/job_runner.py +21 -0
  5. {batch_analytics-0.2.4 → batch_analytics-0.2.5}/src/batch_analytics/transform.py +21 -20
  6. {batch_analytics-0.2.4 → batch_analytics-0.2.5}/src/batch_analytics.egg-info/PKG-INFO +1 -1
  7. {batch_analytics-0.2.4 → batch_analytics-0.2.5}/README.md +0 -0
  8. {batch_analytics-0.2.4 → batch_analytics-0.2.5}/setup.cfg +0 -0
  9. {batch_analytics-0.2.4 → batch_analytics-0.2.5}/src/batch_analytics/__init__.py +0 -0
  10. {batch_analytics-0.2.4 → batch_analytics-0.2.5}/src/batch_analytics/__main__.py +0 -0
  11. {batch_analytics-0.2.4 → batch_analytics-0.2.5}/src/batch_analytics/analytics/__init__.py +0 -0
  12. {batch_analytics-0.2.4 → batch_analytics-0.2.5}/src/batch_analytics/analytics/correlation.py +0 -0
  13. {batch_analytics-0.2.4 → batch_analytics-0.2.5}/src/batch_analytics/analytics/linear_regression.py +0 -0
  14. {batch_analytics-0.2.4 → batch_analytics-0.2.5}/src/batch_analytics/analytics/pca_clustering.py +0 -0
  15. {batch_analytics-0.2.4 → batch_analytics-0.2.5}/src/batch_analytics/analytics/t_test.py +0 -0
  16. {batch_analytics-0.2.4 → batch_analytics-0.2.5}/src/batch_analytics/config.py +0 -0
  17. {batch_analytics-0.2.4 → batch_analytics-0.2.5}/src/batch_analytics/log.py +0 -0
  18. {batch_analytics-0.2.4 → batch_analytics-0.2.5}/src/batch_analytics/modules.py +0 -0
  19. {batch_analytics-0.2.4 → batch_analytics-0.2.5}/src/batch_analytics/output/__init__.py +0 -0
  20. {batch_analytics-0.2.4 → batch_analytics-0.2.5}/src/batch_analytics/output/base.py +0 -0
  21. {batch_analytics-0.2.4 → batch_analytics-0.2.5}/src/batch_analytics/output/clickhouse.py +0 -0
  22. {batch_analytics-0.2.4 → batch_analytics-0.2.5}/src/batch_analytics/output/local.py +0 -0
  23. {batch_analytics-0.2.4 → batch_analytics-0.2.5}/src/batch_analytics/output/s3.py +0 -0
  24. {batch_analytics-0.2.4 → batch_analytics-0.2.5}/src/batch_analytics.egg-info/SOURCES.txt +0 -0
  25. {batch_analytics-0.2.4 → batch_analytics-0.2.5}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
  26. {batch_analytics-0.2.4 → batch_analytics-0.2.5}/src/batch_analytics.egg-info/entry_points.txt +0 -0
  27. {batch_analytics-0.2.4 → batch_analytics-0.2.5}/src/batch_analytics.egg-info/requires.txt +0 -0
  28. {batch_analytics-0.2.4 → batch_analytics-0.2.5}/src/batch_analytics.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.2.4
3
+ Version: 0.2.5
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "batch-analytics"
7
- version = "0.2.4"
7
+ version = "0.2.5"
8
8
  description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -3,6 +3,7 @@ Extract stage: Load data from ClickHouse using Spark ClickHouse connector or JDB
3
3
  """
4
4
 
5
5
  import logging
6
+ import os
6
7
  from typing import Dict, List, Optional
7
8
 
8
9
  from pyspark.sql import DataFrame, SparkSession
@@ -59,7 +60,7 @@ def extract_table(
59
60
  Uses native connector if configured, otherwise JDBC.
60
61
  """
61
62
  if config.extract.use_native_connector:
62
- df = _read_via_format(spark, config, table)
63
+ df = _read_via_catalog(spark, config, table)
63
64
  if df is None:
64
65
  df = _read_via_jdbc(spark, config, table)
65
66
  else:
@@ -72,6 +72,27 @@ def create_spark_session(
72
72
  if packages:
73
73
  builder = builder.config("spark.jars.packages", ",".join(packages))
74
74
 
75
+ # clickhouse-spark-runtime does not register legacy clickhouse.DefaultSource; the connector
76
+ # expects a Spark catalog (see ClickHouse docs). Enables spark.table("catalog.db.table").
77
+ ch_cat = os.environ.get("BATCH_CLICKHOUSE_CATALOG", "batch_ch").strip()
78
+ if ch_cat:
79
+ ch = config.clickhouse
80
+ builder = (
81
+ builder.config(
82
+ f"spark.sql.catalog.{ch_cat}",
83
+ "com.clickhouse.spark.ClickHouseCatalog",
84
+ )
85
+ .config(f"spark.sql.catalog.{ch_cat}.host", ch.host)
86
+ .config(f"spark.sql.catalog.{ch_cat}.protocol", ch.protocol)
87
+ .config(f"spark.sql.catalog.{ch_cat}.http_port", str(ch.port))
88
+ .config(f"spark.sql.catalog.{ch_cat}.user", ch.user)
89
+ .config(f"spark.sql.catalog.{ch_cat}.database", ch.database)
90
+ )
91
+ if ch.password:
92
+ builder = builder.config(f"spark.sql.catalog.{ch_cat}.password", ch.password)
93
+ if ch.protocol.lower() == "https":
94
+ builder = builder.config(f"spark.sql.catalog.{ch_cat}.option.ssl", "true")
95
+
75
96
  if cfg.master.startswith("k8s://"):
76
97
  driver_host = socket.gethostbyname(socket.gethostname())
77
98
  builder = (
@@ -3,6 +3,7 @@ Transform stage: Clean data (remove duplicates), extract add_dimension, and stag
3
3
  """
4
4
 
5
5
  import logging
6
+ import os
6
7
  from typing import Optional, Sequence
7
8
 
8
9
  from pyspark.sql import DataFrame, SparkSession
@@ -170,24 +171,24 @@ def load_staged(
170
171
  if fmt == "delta":
171
172
  return spark.read.format("delta").load(staging_path)
172
173
  if fmt == "clickhouse":
173
- try:
174
- ch = config.clickhouse
175
- rd = (
176
- spark.read.format("clickhouse")
177
- .option("host", ch.host)
178
- .option("protocol", ch.protocol)
179
- .option("http_port", str(ch.port))
180
- .option("database", ch.database)
181
- .option("table", config.transform.staging_table)
182
- .option("user", ch.user)
183
- )
184
- if ch.password:
185
- rd = rd.option("password", ch.password)
186
- return rd.load()
187
- except Exception:
188
- return spark.read.jdbc(
189
- config.clickhouse.jdbc_url,
190
- config.transform.staging_table,
191
- properties=config.clickhouse.jdbc_properties,
192
- )
174
+ ch = config.clickhouse
175
+ tbl = config.transform.staging_table
176
+ cat = os.environ.get("BATCH_CLICKHOUSE_CATALOG", "batch_ch").strip()
177
+ if cat:
178
+ try:
179
+ return spark.table(f"{cat}.{ch.database}.{tbl}")
180
+ except Exception as e:
181
+ logger.warning(
182
+ "load_staged: catalog table %s.%s.%s failed (%s), using JDBC",
183
+ cat,
184
+ ch.database,
185
+ tbl,
186
+ e,
187
+ )
188
+ dbtable = f"(SELECT * FROM `{ch.database}`.`{tbl}`) AS _stg"
189
+ return spark.read.jdbc(
190
+ ch.jdbc_url,
191
+ dbtable,
192
+ properties=ch.jdbc_properties,
193
+ )
193
194
  return spark.read.format(fmt).load(staging_path)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.2.4
3
+ Version: 0.2.5
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT