batch-analytics 0.2.5__tar.gz → 0.2.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. {batch_analytics-0.2.5 → batch_analytics-0.2.6}/PKG-INFO +1 -1
  2. {batch_analytics-0.2.5 → batch_analytics-0.2.6}/pyproject.toml +1 -1
  3. {batch_analytics-0.2.5 → batch_analytics-0.2.6}/src/batch_analytics/config.py +8 -1
  4. {batch_analytics-0.2.5 → batch_analytics-0.2.6}/src/batch_analytics/job_runner.py +10 -0
  5. {batch_analytics-0.2.5 → batch_analytics-0.2.6}/src/batch_analytics.egg-info/PKG-INFO +1 -1
  6. {batch_analytics-0.2.5 → batch_analytics-0.2.6}/README.md +0 -0
  7. {batch_analytics-0.2.5 → batch_analytics-0.2.6}/setup.cfg +0 -0
  8. {batch_analytics-0.2.5 → batch_analytics-0.2.6}/src/batch_analytics/__init__.py +0 -0
  9. {batch_analytics-0.2.5 → batch_analytics-0.2.6}/src/batch_analytics/__main__.py +0 -0
  10. {batch_analytics-0.2.5 → batch_analytics-0.2.6}/src/batch_analytics/analytics/__init__.py +0 -0
  11. {batch_analytics-0.2.5 → batch_analytics-0.2.6}/src/batch_analytics/analytics/correlation.py +0 -0
  12. {batch_analytics-0.2.5 → batch_analytics-0.2.6}/src/batch_analytics/analytics/linear_regression.py +0 -0
  13. {batch_analytics-0.2.5 → batch_analytics-0.2.6}/src/batch_analytics/analytics/pca_clustering.py +0 -0
  14. {batch_analytics-0.2.5 → batch_analytics-0.2.6}/src/batch_analytics/analytics/t_test.py +0 -0
  15. {batch_analytics-0.2.5 → batch_analytics-0.2.6}/src/batch_analytics/extract.py +0 -0
  16. {batch_analytics-0.2.5 → batch_analytics-0.2.6}/src/batch_analytics/log.py +0 -0
  17. {batch_analytics-0.2.5 → batch_analytics-0.2.6}/src/batch_analytics/modules.py +0 -0
  18. {batch_analytics-0.2.5 → batch_analytics-0.2.6}/src/batch_analytics/output/__init__.py +0 -0
  19. {batch_analytics-0.2.5 → batch_analytics-0.2.6}/src/batch_analytics/output/base.py +0 -0
  20. {batch_analytics-0.2.5 → batch_analytics-0.2.6}/src/batch_analytics/output/clickhouse.py +0 -0
  21. {batch_analytics-0.2.5 → batch_analytics-0.2.6}/src/batch_analytics/output/local.py +0 -0
  22. {batch_analytics-0.2.5 → batch_analytics-0.2.6}/src/batch_analytics/output/s3.py +0 -0
  23. {batch_analytics-0.2.5 → batch_analytics-0.2.6}/src/batch_analytics/transform.py +0 -0
  24. {batch_analytics-0.2.5 → batch_analytics-0.2.6}/src/batch_analytics.egg-info/SOURCES.txt +0 -0
  25. {batch_analytics-0.2.5 → batch_analytics-0.2.6}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
  26. {batch_analytics-0.2.5 → batch_analytics-0.2.6}/src/batch_analytics.egg-info/entry_points.txt +0 -0
  27. {batch_analytics-0.2.5 → batch_analytics-0.2.6}/src/batch_analytics.egg-info/requires.txt +0 -0
  28. {batch_analytics-0.2.5 → batch_analytics-0.2.6}/src/batch_analytics.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.2.5
3
+ Version: 0.2.6
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "batch-analytics"
7
- version = "0.2.5"
7
+ version = "0.2.6"
8
8
  description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -29,7 +29,14 @@ class ClickHouseConfig:
29
29
 
30
30
  @property
31
31
  def jdbc_properties(self) -> dict:
32
- props = {"user": self.user, "driver": "com.clickhouse.jdbc.ClickHouseDriver"}
32
+ props = {
33
+ "user": self.user,
34
+ "driver": "com.clickhouse.jdbc.ClickHouseDriver",
35
+ # Match Spark read codec default: avoids JDBC LZ4/gzip mismatches with server HTTP compression
36
+ "compress_algorithm": os.environ.get(
37
+ "CLICKHOUSE_JDBC_COMPRESS_ALGORITHM", "none"
38
+ ),
39
+ }
33
40
  if self.password:
34
41
  props["password"] = self.password
35
42
  return props
@@ -92,6 +92,16 @@ def create_spark_session(
92
92
  builder = builder.config(f"spark.sql.catalog.{ch_cat}.password", ch.password)
93
93
  if ch.protocol.lower() == "https":
94
94
  builder = builder.config(f"spark.sql.catalog.{ch_cat}.option.ssl", "true")
95
+ # Avoid Lz4InputStream "Magic is not correct" when server HTTP compression != client expectation
96
+ # (see clickhouse-java#1449 / server enable_http_compression user defaults).
97
+ read_codec = os.environ.get(
98
+ "SPARK_CLICKHOUSE_READ_COMPRESSION_CODEC", "none"
99
+ ).strip()
100
+ if read_codec:
101
+ builder = builder.config(
102
+ "spark.clickhouse.read.compression.codec",
103
+ read_codec,
104
+ )
95
105
 
96
106
  if cfg.master.startswith("k8s://"):
97
107
  driver_host = socket.gethostbyname(socket.gethostname())
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.2.5
3
+ Version: 0.2.6
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT