batch-analytics 0.2.1__tar.gz → 0.2.3__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. {batch_analytics-0.2.1 → batch_analytics-0.2.3}/PKG-INFO +14 -3
  2. {batch_analytics-0.2.1 → batch_analytics-0.2.3}/README.md +6 -2
  3. {batch_analytics-0.2.1 → batch_analytics-0.2.3}/pyproject.toml +6 -1
  4. {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/extract.py +4 -2
  5. {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/job_runner.py +35 -5
  6. {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/transform.py +18 -9
  7. {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics.egg-info/PKG-INFO +14 -3
  8. {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics.egg-info/requires.txt +9 -0
  9. {batch_analytics-0.2.1 → batch_analytics-0.2.3}/setup.cfg +0 -0
  10. {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/__init__.py +0 -0
  11. {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/__main__.py +0 -0
  12. {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/analytics/__init__.py +0 -0
  13. {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/analytics/correlation.py +0 -0
  14. {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/analytics/linear_regression.py +0 -0
  15. {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/analytics/pca_clustering.py +0 -0
  16. {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/analytics/t_test.py +0 -0
  17. {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/config.py +0 -0
  18. {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/log.py +0 -0
  19. {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/modules.py +0 -0
  20. {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/output/__init__.py +0 -0
  21. {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/output/base.py +0 -0
  22. {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/output/clickhouse.py +0 -0
  23. {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/output/local.py +0 -0
  24. {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics/output/s3.py +0 -0
  25. {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics.egg-info/SOURCES.txt +0 -0
  26. {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics.egg-info/dependency_links.txt +0 -0
  27. {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics.egg-info/entry_points.txt +0 -0
  28. {batch_analytics-0.2.1 → batch_analytics-0.2.3}/src/batch_analytics.egg-info/top_level.txt +0 -0
@@ -1,14 +1,17 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.2.1
3
+ Version: 0.2.3
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT
7
7
  Requires-Python: >=3.8
8
8
  Description-Content-Type: text/markdown
9
9
  Requires-Dist: pyspark<3.6,>=3.4
10
+ Requires-Dist: numpy>=1.19.0
10
11
  Provides-Extra: dev
11
12
  Requires-Dist: pytest>=7.0; extra == "dev"
13
+ Provides-Extra: ttest
14
+ Requires-Dist: scipy>=1.5.0; extra == "ttest"
12
15
  Provides-Extra: s3
13
16
  Requires-Dist: boto3>=1.28; extra == "s3"
14
17
  Provides-Extra: clickhouse
@@ -16,6 +19,10 @@ Requires-Dist: clickhouse-connect>=0.7; extra == "clickhouse"
16
19
  Provides-Extra: output
17
20
  Requires-Dist: boto3>=1.28; extra == "output"
18
21
  Requires-Dist: clickhouse-connect>=0.7; extra == "output"
22
+ Provides-Extra: full
23
+ Requires-Dist: scipy>=1.5.0; extra == "full"
24
+ Requires-Dist: boto3>=1.28; extra == "full"
25
+ Requires-Dist: clickhouse-connect>=0.7; extra == "full"
19
26
 
20
27
  # Batch Analytics
21
28
 
@@ -28,7 +35,8 @@ Only the files required for the batch analytics job runner:
28
35
  ```
29
36
  analytics/
30
37
  ├── pyproject.toml
31
- ├── requirements-batch.txt
38
+ ├── requirements.txt # core + scipy + boto3 + clickhouse-connect (single-file install)
39
+ ├── requirements-batch.txt # includes requirements.txt
32
40
  ├── README.md
33
41
  └── src/
34
42
  └── batch_analytics/
@@ -52,7 +60,10 @@ analytics/
52
60
 
53
61
  ```bash
54
62
  pip install -e .
55
- # or: pip install -r requirements-batch.txt && pip install -e .
63
+ # or install every runtime dependency used anywhere in the package, then editable:
64
+ pip install -r requirements.txt && pip install -e .
65
+ # PyPI wheel pulls numpy automatically (pyspark.ml); extras: ttest, s3, clickhouse, output, full
66
+ pip install "batch-analytics[full]"
56
67
  ```
57
68
 
58
69
  ## Run
@@ -9,7 +9,8 @@ Only the files required for the batch analytics job runner:
9
9
  ```
10
10
  analytics/
11
11
  ├── pyproject.toml
12
- ├── requirements-batch.txt
12
+ ├── requirements.txt # core + scipy + boto3 + clickhouse-connect (single-file install)
13
+ ├── requirements-batch.txt # includes requirements.txt
13
14
  ├── README.md
14
15
  └── src/
15
16
  └── batch_analytics/
@@ -33,7 +34,10 @@ analytics/
33
34
 
34
35
  ```bash
35
36
  pip install -e .
36
- # or: pip install -r requirements-batch.txt && pip install -e .
37
+ # or install every runtime dependency used anywhere in the package, then editable:
38
+ pip install -r requirements.txt && pip install -e .
39
+ # PyPI wheel pulls numpy automatically (pyspark.ml); extras: ttest, s3, clickhouse, output, full
40
+ pip install "batch-analytics[full]"
37
41
  ```
38
42
 
39
43
  ## Run
@@ -4,21 +4,26 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "batch-analytics"
7
- version = "0.2.1"
7
+ version = "0.2.3"
8
8
  description = "PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test)."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
11
11
  dependencies = [
12
12
  "pyspark>=3.4,<3.6",
13
+ "numpy>=1.19.0",
13
14
  ]
14
15
  authors = [{ name = "Litewave Analytics Team" }]
15
16
  license = { text = "MIT" }
16
17
 
17
18
  [project.optional-dependencies]
18
19
  dev = ["pytest>=7.0"]
20
+ # Welch t-test module (batch_analytics.analytics.t_test)
21
+ ttest = ["scipy>=1.5.0"]
19
22
  s3 = ["boto3>=1.28"]
20
23
  clickhouse = ["clickhouse-connect>=0.7"]
21
24
  output = ["boto3>=1.28", "clickhouse-connect>=0.7"]
25
+ # Install all optional runtime deps used anywhere in the package
26
+ full = ["scipy>=1.5.0", "boto3>=1.28", "clickhouse-connect>=0.7"]
22
27
 
23
28
  [project.scripts]
24
29
  batch-analytics = "batch_analytics.job_runner:main"
@@ -18,7 +18,7 @@ def _read_via_format(spark: SparkSession, cfg: BatchAnalyticsConfig, table: str)
18
18
  Requires: com.clickhouse.spark:clickhouse-spark-runtime in spark.jars.packages
19
19
  """
20
20
  try:
21
- df = (
21
+ rd = (
22
22
  spark.read.format("clickhouse")
23
23
  .option("host", cfg.clickhouse.host)
24
24
  .option("protocol", cfg.clickhouse.protocol)
@@ -26,8 +26,10 @@ def _read_via_format(spark: SparkSession, cfg: BatchAnalyticsConfig, table: str)
26
26
  .option("database", cfg.clickhouse.database)
27
27
  .option("table", table)
28
28
  .option("user", cfg.clickhouse.user)
29
- .load()
30
29
  )
30
+ if cfg.clickhouse.password:
31
+ rd = rd.option("password", cfg.clickhouse.password)
32
+ df = rd.load()
31
33
  return df
32
34
  except Exception as e:
33
35
  logger.warning(
@@ -44,12 +44,31 @@ def create_spark_session(
44
44
  .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
45
45
  )
46
46
 
47
- # JARs: ClickHouse JDBC + hadoop-aws (for S3 when on K8s)
48
- packages = []
47
+ # JARs: spark.jars = local/https/file paths; spark.jars.packages = Maven coordinates
48
+ # SPARK_JARS can be comma-separated URLs (e.g. Maven Central https://repo1.maven.org/...jar)
49
+ jar_list: List[str] = []
50
+ sp_jars = os.environ.get("SPARK_JARS", "").strip()
51
+ if sp_jars:
52
+ jar_list.extend(p.strip() for p in sp_jars.split(",") if p.strip())
53
+
54
+ packages: List[str] = []
49
55
  if clickhouse_jars:
50
- packages.append(clickhouse_jars)
56
+ for part in clickhouse_jars.split(","):
57
+ part = part.strip()
58
+ if not part:
59
+ continue
60
+ if part.startswith(("http://", "https://", "file:")):
61
+ jar_list.append(part)
62
+ elif part.endswith(".jar") and ":" not in part:
63
+ jar_list.append(part)
64
+ else:
65
+ packages.append(part)
66
+
51
67
  if cfg.master.startswith("k8s://"):
52
68
  packages.append("org.apache.hadoop:hadoop-aws:3.3.4")
69
+
70
+ if jar_list:
71
+ builder = builder.config("spark.jars", ",".join(jar_list))
53
72
  if packages:
54
73
  builder = builder.config("spark.jars.packages", ",".join(packages))
55
74
 
@@ -121,10 +140,21 @@ def run_pipeline(
121
140
  run_id = str(uuid.uuid4())[:8]
122
141
 
123
142
  if spark is None:
124
- jars = "com.clickhouse:clickhouse-jdbc:0.4.6:all"
143
+ # Native format("clickhouse") needs clickhouse-spark-runtime; JDBC fallback needs clickhouse-jdbc.
144
+ # Override: BATCH_SPARK_CLICKHOUSE_PACKAGES=maven coords (comma-sep) or "" to use only SPARK_JARS.
145
+ _raw_ch = os.environ.get("BATCH_SPARK_CLICKHOUSE_PACKAGES")
146
+ if _raw_ch is None:
147
+ ch_pkgs = (
148
+ "com.clickhouse.spark:clickhouse-spark-runtime-3.5_2.12:0.8.0,"
149
+ "com.clickhouse:clickhouse-jdbc:0.6.2"
150
+ )
151
+ elif not _raw_ch.strip():
152
+ ch_pkgs = None
153
+ else:
154
+ ch_pkgs = _raw_ch.strip()
125
155
  spark = create_spark_session(
126
156
  app_name="BatchAnalytics",
127
- clickhouse_jars=jars,
157
+ clickhouse_jars=ch_pkgs,
128
158
  config=config,
129
159
  )
130
160
 
@@ -92,16 +92,19 @@ def stage_to_clickhouse(
92
92
  """
93
93
  n = df.count()
94
94
  try:
95
+ ch = config.clickhouse
95
96
  writer = (
96
97
  df.write.format("clickhouse")
97
- .option("host", config.clickhouse.host)
98
- .option("database", config.clickhouse.database)
98
+ .option("host", ch.host)
99
+ .option("protocol", ch.protocol)
100
+ .option("http_port", str(ch.port))
101
+ .option("database", ch.database)
99
102
  .option("table", config.transform.staging_table)
100
- .option("user", config.clickhouse.user)
103
+ .option("user", ch.user)
101
104
  .mode("overwrite")
102
105
  )
103
- if config.clickhouse.password:
104
- writer = writer.option("password", config.clickhouse.password)
106
+ if ch.password:
107
+ writer = writer.option("password", ch.password)
105
108
  writer.save()
106
109
  except Exception as e:
107
110
  logger.warning("ClickHouse connector failed (%s), using JDBC", e)
@@ -168,13 +171,19 @@ def load_staged(
168
171
  return spark.read.format("delta").load(staging_path)
169
172
  if fmt == "clickhouse":
170
173
  try:
171
- return (
174
+ ch = config.clickhouse
175
+ rd = (
172
176
  spark.read.format("clickhouse")
173
- .option("host", config.clickhouse.host)
174
- .option("database", config.clickhouse.database)
177
+ .option("host", ch.host)
178
+ .option("protocol", ch.protocol)
179
+ .option("http_port", str(ch.port))
180
+ .option("database", ch.database)
175
181
  .option("table", config.transform.staging_table)
176
- .load()
182
+ .option("user", ch.user)
177
183
  )
184
+ if ch.password:
185
+ rd = rd.option("password", ch.password)
186
+ return rd.load()
178
187
  except Exception:
179
188
  return spark.read.jdbc(
180
189
  config.clickhouse.jdbc_url,
@@ -1,14 +1,17 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batch-analytics
3
- Version: 0.2.1
3
+ Version: 0.2.3
4
4
  Summary: PySpark batch analytics: Extract, Transform, Stage, and analytical modules (linear regression, correlation, PCA, t-test).
5
5
  Author: Litewave Analytics Team
6
6
  License: MIT
7
7
  Requires-Python: >=3.8
8
8
  Description-Content-Type: text/markdown
9
9
  Requires-Dist: pyspark<3.6,>=3.4
10
+ Requires-Dist: numpy>=1.19.0
10
11
  Provides-Extra: dev
11
12
  Requires-Dist: pytest>=7.0; extra == "dev"
13
+ Provides-Extra: ttest
14
+ Requires-Dist: scipy>=1.5.0; extra == "ttest"
12
15
  Provides-Extra: s3
13
16
  Requires-Dist: boto3>=1.28; extra == "s3"
14
17
  Provides-Extra: clickhouse
@@ -16,6 +19,10 @@ Requires-Dist: clickhouse-connect>=0.7; extra == "clickhouse"
16
19
  Provides-Extra: output
17
20
  Requires-Dist: boto3>=1.28; extra == "output"
18
21
  Requires-Dist: clickhouse-connect>=0.7; extra == "output"
22
+ Provides-Extra: full
23
+ Requires-Dist: scipy>=1.5.0; extra == "full"
24
+ Requires-Dist: boto3>=1.28; extra == "full"
25
+ Requires-Dist: clickhouse-connect>=0.7; extra == "full"
19
26
 
20
27
  # Batch Analytics
21
28
 
@@ -28,7 +35,8 @@ Only the files required for the batch analytics job runner:
28
35
  ```
29
36
  analytics/
30
37
  ├── pyproject.toml
31
- ├── requirements-batch.txt
38
+ ├── requirements.txt # core + scipy + boto3 + clickhouse-connect (single-file install)
39
+ ├── requirements-batch.txt # includes requirements.txt
32
40
  ├── README.md
33
41
  └── src/
34
42
  └── batch_analytics/
@@ -52,7 +60,10 @@ analytics/
52
60
 
53
61
  ```bash
54
62
  pip install -e .
55
- # or: pip install -r requirements-batch.txt && pip install -e .
63
+ # or install every runtime dependency used anywhere in the package, then editable:
64
+ pip install -r requirements.txt && pip install -e .
65
+ # PyPI wheel pulls numpy automatically (pyspark.ml); extras: ttest, s3, clickhouse, output, full
66
+ pip install "batch-analytics[full]"
56
67
  ```
57
68
 
58
69
  ## Run
@@ -1,4 +1,5 @@
1
1
  pyspark<3.6,>=3.4
2
+ numpy>=1.19.0
2
3
 
3
4
  [clickhouse]
4
5
  clickhouse-connect>=0.7
@@ -6,9 +7,17 @@ clickhouse-connect>=0.7
6
7
  [dev]
7
8
  pytest>=7.0
8
9
 
10
+ [full]
11
+ scipy>=1.5.0
12
+ boto3>=1.28
13
+ clickhouse-connect>=0.7
14
+
9
15
  [output]
10
16
  boto3>=1.28
11
17
  clickhouse-connect>=0.7
12
18
 
13
19
  [s3]
14
20
  boto3>=1.28
21
+
22
+ [ttest]
23
+ scipy>=1.5.0