markovgpu-rane 0.1.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,27 @@
+ name: Publish to PyPI
+
+ on:
+   release:
+     types: [published] # Runs ONLY when you click "Create Release" on GitHub
+
+ jobs:
+   pypi-publish:
+     name: Publish to PyPI
+     runs-on: ubuntu-latest
+     environment: pypi # Must match the environment name configured on PyPI
+     permissions:
+       id-token: write # REQUIRED for Trusted Publishing
+
+     steps:
+       - name: Checkout Code
+         uses: actions/checkout@v4
+
+       - name: Install uv
+         uses: astral-sh/setup-uv@v5
+
+       - name: Build Package
+         run: uv build
+
+       - name: Publish to PyPI
+         # This action uses the "Trusted" connection automatically
+         uses: pypa/gh-action-pypi-publish@release/v1
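Once Trusted Publishing is configured on PyPI for this repository, the workflow above needs no API token; publishing is gated entirely on the GitHub release event (e.g. `gh release create v0.1.0`). A pre-release sanity check could look like the sketch below; the dist/ file names are assumed from the normalized project name, not taken from the package:

    # check_dist.py -- hypothetical helper, run locally after `uv build`
    from pathlib import Path

    artifacts = sorted(p.name for p in Path("dist").iterdir())
    print(artifacts)
    # Expected something like:
    #   ['markovgpu_rane-0.1.0-py3-none-any.whl', 'markovgpu_rane-0.1.0.tar.gz']
    assert any(a.endswith(".whl") for a in artifacts), "wheel missing"
    assert any(a.endswith(".tar.gz") for a in artifacts), "sdist missing"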
@@ -0,0 +1,37 @@
+ name: CI (Test & Lint)
+
+ on:
+   push:
+     branches: [main]
+   pull_request:
+     branches: [main]
+
+ jobs:
+   test:
+     name: Test on Python ${{ matrix.python-version }}
+     runs-on: ubuntu-latest
+     strategy:
+       matrix:
+         python-version: ["3.12", "3.13"] # Must satisfy requires-python (>=3.12)
+
+     steps:
+       - name: Checkout Code
+         uses: actions/checkout@v4
+
+       - name: Install uv
+         uses: astral-sh/setup-uv@v5 # The official uv action
+
+       - name: Set up Python ${{ matrix.python-version }}
+         run: uv python install ${{ matrix.python-version }}
+
+       - name: Install Project
+         run: uv sync --all-extras --dev
+
+       - name: Lint Code (Ruff)
+         run: |
+           uv run ruff check .
+           uv run ruff format --check .
+
+       - name: Run Tests
+         # We assume CPU fallback works (GitHub runners don't have GPUs)
+         run: uv run pytest
+
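The final step assumes the test suite passes on plain NumPy, since hosted runners expose no OpenCL GPU. A minimal test in that spirit might look like this sketch (the file name and contents are illustrative, not part of the package):

    # tests/test_cpu_fallback.py -- hypothetical smoke test
    import numpy as np
    from markovgpu import MarkovEngine

    def test_step_matches_numpy():
        engine = MarkovEngine()  # falls back to NumPy if OpenCL init fails
        P = np.array([[0.9, 0.1], [0.5, 0.5]], dtype=np.float32)
        v = np.array([1.0, 0.0], dtype=np.float32)
        # With N=2 (< GPU_THRESHOLD) the engine takes the CPU path either way
        np.testing.assert_allclose(engine.step(P, v), v.dot(P), rtol=1e-5)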
@@ -0,0 +1,76 @@
+ # --- Python Basics ---
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # --- C Extensions & Shared Libraries ---
+ *.so
+ *.pyd
+ *.dylib
+
+ # --- Distribution / Packaging ---
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # --- Virtual Environments ---
+ # Common names for virtual envs
+ .venv
+ venv/
+ ENV/
+ env/
+
+ # --- Unit Test / Coverage ---
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py.cover
+ .hypothesis/
+ .pytest_cache/
+
+ # --- IDEs & Editors (Optional but Recommended) ---
+ # VS Code
+ .vscode/
+ !.vscode/settings.json
+ !.vscode/tasks.json
+ !.vscode/launch.json
+ !.vscode/extensions.json
+ *.code-workspace
+
+ # PyCharm / IntelliJ
+ .idea/
+
+ # Mac / Windows System Files
+ .DS_Store
+ Thumbs.db
+
+ # --- Environment Variables (Security) ---
+ # NEVER commit your secrets
+ .env
+ .env.local
+ .env.*.local
+
+ # --- Project Specific ---
+ # If your OpenCL kernels generate binary caches
+ *.cl.bin
@@ -0,0 +1 @@
+ 3.12
@@ -0,0 +1,22 @@
+ Metadata-Version: 2.4
+ Name: markovgpu-rane
+ Version: 0.1.0
+ Summary: High-performance Markov Chains & HMMs using OpenCL
+ Author-email: Sahil Rane <sahilrane249@gmail.com>
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: Intended Audience :: Financial and Insurance Industry
+ Classifier: Intended Audience :: Science/Research
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Classifier: Topic :: Scientific/Engineering :: Mathematics
+ Requires-Python: >=3.12
+ Requires-Dist: matplotlib>=3.8.0
+ Requires-Dist: numpy>=1.26.0
+ Requires-Dist: pyopencl>=2024.1
+ Requires-Dist: scipy>=1.11.0
+ Description-Content-Type: text/markdown
+
+ hello
@@ -0,0 +1 @@
+ hello
@@ -0,0 +1,58 @@
+ [project]
+ name = "markovgpu-rane"
+ version = "0.1.0"
+ description = "High-performance Markov Chains & HMMs using OpenCL"
+ readme = "README.md"
+ authors = [
+     { name = "Sahil Rane", email = "sahilrane249@gmail.com" }
+ ]
+ requires-python = ">=3.12"
+
+ # Your core libraries + visualization/stats tools you used in the demos
+ dependencies = [
+     "numpy>=1.26.0", # Relaxed version constraint slightly for better compatibility
+     "pyopencl>=2024.1",
+     "matplotlib>=3.8.0",
+     "scipy>=1.11.0",
+ ]
+
+ # Metadata tags to help people find your library on PyPI
+ classifiers = [
+     "Development Status :: 4 - Beta",
+     "Intended Audience :: Developers",
+     "Intended Audience :: Science/Research",
+     "Intended Audience :: Financial and Insurance Industry",
+     "License :: OSI Approved :: MIT License",
+     "Operating System :: OS Independent",
+     "Programming Language :: Python :: 3",
+     "Topic :: Scientific/Engineering :: Artificial Intelligence",
+     "Topic :: Scientific/Engineering :: Mathematics",
+ ]
+
+ # (Optional) Add your GitHub URL if you have pushed the code
+ # [project.urls]
+ # Repository = "https://github.com/yourusername/markovgpu"
+ # Issues = "https://github.com/yourusername/markovgpu/issues"
+
+ # ---------------------------------------------------------
+ # BUILD SYSTEM: Hatchling (Standard for uv)
+ # ---------------------------------------------------------
+ [build-system]
+ requires = ["hatchling"]
+ build-backend = "hatchling.build"
+
+ # ---------------------------------------------------------
+ # PACKAGING: Include the .cl Kernel file
+ # ---------------------------------------------------------
+ [tool.hatch.build.targets.wheel]
+ packages = ["src/markovgpu"]
+
+ [tool.hatch.build.targets.wheel.force-include]
+ # Map source path (left) to package destination (right)
+ "src/markovgpu/kernels.cl" = "markovgpu/kernels.cl"
+
+ [dependency-groups]
+ dev = [
+     "pytest>=9.0.2",
+     "ruff>=0.15.0",
+ ]
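The force-include mapping is what ships kernels.cl inside the wheel next to the Python sources. A quick post-build verification, as a sketch (the wheel name pattern is assumed from the project metadata):

    # verify_wheel.py -- hypothetical check that the kernel was packaged
    import glob
    import zipfile

    wheel = glob.glob("dist/markovgpu_rane-0.1.0-*.whl")[0]
    with zipfile.ZipFile(wheel) as zf:
        assert "markovgpu/kernels.cl" in zf.namelist(), "kernel missing from wheel"
    print("kernels.cl is packaged")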
@@ -0,0 +1,3 @@
+ from .backend import MarkovEngine
+
+ __all__ = ["MarkovEngine"]
@@ -0,0 +1,329 @@
+ import pyopencl as cl
+ import numpy as np
+ import os
+
+ # Locate the kernel file
+ MODULE_PATH = os.path.dirname(os.path.abspath(__file__))
+ KERNEL_PATH = os.path.join(MODULE_PATH, "kernels.cl")
+
+ # Threshold: Use GPU if states >= 64, otherwise CPU is faster
+ GPU_THRESHOLD = 64
+
+
+ class MarkovEngine:
+     def __init__(self):
+         self.use_gpu = False
+         self.ctx = None
+         self.queue = None
+         self.prg = None
+
+         # 1. Try to Connect to GPU
+         try:
+             platforms = cl.get_platforms()
+             gpu_devices = []
+             for p in platforms:
+                 gpu_devices.extend(p.get_devices(device_type=cl.device_type.GPU))
+
+             if gpu_devices:
+                 # Pick the discrete GPU (highest compute units)
+                 best_dev = sorted(
+                     gpu_devices, key=lambda d: d.max_compute_units, reverse=True
+                 )[0]
+                 self.ctx = cl.Context([best_dev])
+                 print(
+                     f"🔌 Connected to Accelerator: {best_dev.name} ({best_dev.max_compute_units} CUs)"
+                 )
+             else:
+                 self.ctx = cl.create_some_context(interactive=False)
+                 print(f"⚠️ No dedicated GPU found. Using: {self.ctx.devices[0].name}")
+
+             self.queue = cl.CommandQueue(self.ctx)
+
+             # 2. Compile Kernels
+             if not os.path.exists(KERNEL_PATH):
+                 raise FileNotFoundError(f"Kernel file missing at: {KERNEL_PATH}")
+
+             with open(KERNEL_PATH, "r") as f:
+                 self.prg = cl.Program(self.ctx, f.read()).build()
+
+             # 3. Cache Kernels (Robust Retrieval)
+             self.use_gpu = True
+             try:
+                 # Basic
+                 self.k_markov = self.prg.markov_step
+                 self.k_hmm_basic = self.prg.hmm_forward_step
+
+                 # Advanced / Viterbi
+                 self.k_hmm_log = self.prg.hmm_forward_log
+                 self.k_viterbi = self.prg.viterbi_step
+
+                 # Training
+                 self.k_hmm_back = self.prg.hmm_backward_log
+                 self.k_acc_trans = self.prg.accumulate_transitions
+                 self.k_acc_gamma = self.prg.accumulate_gammas
+
+             except AttributeError as e:
+                 print(f"❌ Kernel Warning: {e}")
+                 print("⚠️ Some GPU features may be disabled.")
+
+         except Exception as e:
+             print(f"⚠️ OpenCL initialization failed: {e}")
+             print("⚠️ Running in CPU-only mode (NumPy).")
+             self.use_gpu = False
+
+     # --- 1. Simulation ---
+     def step(self, P, v):
+         """Runs one step: v_new = v * P"""
+         N = len(v)
+
+         if not self.use_gpu or N < GPU_THRESHOLD:
+             return v.dot(P)
+
+         mf = cl.mem_flags
+         P = np.ascontiguousarray(P, dtype=np.float32)
+         v = np.ascontiguousarray(v, dtype=np.float32)
+         result = np.empty_like(v)
+
+         d_P = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=P)
+         d_v = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=v)
+         d_res = cl.Buffer(self.ctx, mf.WRITE_ONLY, size=result.nbytes)
+
+         self.k_markov(self.queue, (N,), None, np.int32(N), d_v, d_P, d_res)
+         cl.enqueue_copy(self.queue, result, d_res)
+         return result
+
+     def converge(self, P, start_v, tolerance=1e-5, max_steps=1000):
+         N = len(start_v)
+
+         # CPU Path
+         if not self.use_gpu or N < GPU_THRESHOLD:
+             # print(f"🔄 Converging on CPU (N={N})...")
+             current_v = start_v.copy()
+             for i in range(max_steps):
+                 new_v = current_v.dot(P)
+                 if np.sum(np.abs(new_v - current_v)) < tolerance:
+                     return new_v
+                 current_v = new_v
+             return current_v
+
+         # GPU Path
+         # print(f"🔄 Converging on GPU (N={N})...")
+         mf = cl.mem_flags
+         P = np.ascontiguousarray(P, dtype=np.float32)
+         start_v = np.ascontiguousarray(start_v, dtype=np.float32)
+
+         d_P = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=P)
+         d_v_read = cl.Buffer(
+             self.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=start_v
+         )
+         d_v_write = cl.Buffer(self.ctx, mf.READ_WRITE, size=start_v.nbytes)
+
+         current_v = start_v.copy()
+
+         for i in range(max_steps):
+             self.k_markov(self.queue, (N,), None, np.int32(N), d_v_read, d_P, d_v_write)
+
+             if i % 10 == 0:
+                 new_v = np.empty_like(current_v)
+                 cl.enqueue_copy(self.queue, new_v, d_v_write)
+                 if np.sum(np.abs(new_v - current_v)) < tolerance:
+                     return new_v
+                 current_v = new_v
+
+             d_v_read, d_v_write = d_v_write, d_v_read
+
+         print("⚠️ Reached max steps without full convergence.")
+         return current_v
+
+     # --- 2. Inference & Viterbi ---
+     def hmm_filter(self, transition_matrix, observation_probs):
+         """Standard HMM Filter (Returns Probabilities)"""
+         # Placeholder: the basic forward-pass wrapper is not wired up yet.
+         # For production use, prefer log-space to avoid underflow;
+         # this wrapper can be upgraded to use k_hmm_log if needed.
+         raise NotImplementedError("hmm_filter is a stub; use decode_regime() or fit().")
+
+     def decode_regime(self, transition_matrix, observation_probs):
+         """Viterbi Algorithm (Finds Most Likely Path)"""
+         T, N = observation_probs.shape
+         epsilon = 1e-20
+
+         # CPU Path
+         if not self.use_gpu or N < GPU_THRESHOLD:
+             log_trans = np.log(transition_matrix + epsilon)
+             log_emis = np.log(observation_probs + epsilon)
+             log_delta = np.zeros((T, N))
+             backpointers = np.zeros((T, N), dtype=int)
+
+             log_delta[0] = -np.log(N) + log_emis[0]
+
+             for t in range(1, T):
+                 for j in range(N):
+                     vals = log_delta[t - 1] + log_trans[:, j]
+                     best_prev = np.argmax(vals)
+                     backpointers[t, j] = best_prev
+                     log_delta[t, j] = vals[best_prev] + log_emis[t, j]
+
+             path = np.zeros(T, dtype=int)
+             path[-1] = np.argmax(log_delta[-1])
+             for t in range(T - 2, -1, -1):
+                 path[t] = backpointers[t + 1, path[t + 1]]
+             return path
+
+         # GPU Path
+         mf = cl.mem_flags
+         log_trans = np.log(transition_matrix + epsilon).astype(np.float32)
+         log_emis = np.log(observation_probs + epsilon).astype(np.float32)
+         # Seed with the t=0 emission so the kernel loop only applies transitions
+         log_delta = (-np.log(N) + log_emis[0]).astype(np.float32)
+
+         d_trans = cl.Buffer(
+             self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=log_trans
+         )
+         d_delta_in = cl.Buffer(
+             self.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=log_delta
+         )
+         d_delta_out = cl.Buffer(self.ctx, mf.READ_WRITE, size=log_delta.nbytes)
+
+         full_backpointer_history = np.zeros((T, N), dtype=np.int32)
+         d_backpointers = cl.Buffer(
+             self.ctx, mf.WRITE_ONLY, size=full_backpointer_history.nbytes // T
+         )
+
+         print(f"🕵️ Decoding {T} days (GPU Accelerated)...")
+
+         for t in range(1, T):
+             d_emis = cl.Buffer(
+                 self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=log_emis[t]
+             )
+
+             self.k_viterbi(
+                 self.queue,
+                 (N,),
+                 None,
+                 np.int32(N),
+                 d_delta_in,
+                 d_trans,
+                 d_emis,
+                 d_delta_out,
+                 d_backpointers,
+             )
+
+             step_pointers = np.empty(N, dtype=np.int32)
+             cl.enqueue_copy(self.queue, step_pointers, d_backpointers)
+             full_backpointer_history[t] = step_pointers
+
+             d_delta_in, d_delta_out = d_delta_out, d_delta_in
+
+         final_log_probs = np.empty(N, dtype=np.float32)
+         cl.enqueue_copy(self.queue, final_log_probs, d_delta_in)
+
+         best_path = np.zeros(T, dtype=np.int32)
+         best_path[-1] = np.argmax(final_log_probs)
+
+         for t in range(T - 2, -1, -1):
+             next_state = best_path[t + 1]
+             best_path[t] = full_backpointer_history[t + 1][next_state]
+
+         return best_path
+
+     # --- 3. Training (Baum-Welch) ---
+     def fit(self, observations, n_states, n_iters=10, tolerance=1e-4):
+         """Baum-Welch EM training. `observations` is a (T, N) array of per-state emission likelihoods."""
+         if not self.use_gpu:
+             raise RuntimeError("fit() requires a working OpenCL context.")
+         T = observations.shape[0]
+         N = n_states
+
+         # Random Init
+         log_trans = np.log(
+             np.full((N, N), 1.0 / N) + np.random.rand(N, N) * 0.01
+         ).astype(np.float32)
+         log_emis = np.log(observations + 1e-20).astype(np.float32)
+
+         mf = cl.mem_flags
+         d_trans = cl.Buffer(
+             self.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=log_trans
+         )
+         d_alpha = cl.Buffer(self.ctx, mf.READ_WRITE, size=T * N * 4)  # Full history
+         d_beta = cl.Buffer(self.ctx, mf.READ_WRITE, size=T * N * 4)  # Full history
+         d_emis = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=log_emis)
+
+         d_new_trans = cl.Buffer(self.ctx, mf.READ_WRITE, size=log_trans.nbytes)
+         d_gamma_sums = cl.Buffer(self.ctx, mf.READ_WRITE, size=N * 4)
+
+         prev_score = -np.inf
+
+         print(f"🧠 Training HMM ({N} States, {T} Steps)...")
+
+         for i in range(n_iters):
+             # 1. CPU Forward/Backward (Latency Optimized)
+             alpha_full, log_likelihood = self._cpu_forward(log_trans, log_emis)
+             beta_full = self._cpu_backward(log_trans, log_emis)
+
+             # 2. GPU Accumulation (Throughput Optimized)
+             cl.enqueue_copy(self.queue, d_alpha, alpha_full)
+             cl.enqueue_copy(self.queue, d_beta, beta_full)
+             cl.enqueue_copy(self.queue, d_trans, log_trans)
+
+             self.k_acc_trans(
+                 self.queue,
+                 (N, N),
+                 None,
+                 np.int32(T),
+                 np.int32(N),
+                 d_alpha,
+                 d_beta,
+                 d_emis,
+                 d_trans,
+                 d_new_trans,
+             )
+
+             self.k_acc_gamma(
+                 self.queue,
+                 (N,),
+                 None,
+                 np.int32(T),
+                 np.int32(N),
+                 d_alpha,
+                 d_beta,
+                 d_gamma_sums,
+             )
+
+             # 3. Update
+             new_log_trans_counts = np.empty_like(log_trans)
+             log_gamma_sums = np.empty(N, dtype=np.float32)
+
+             cl.enqueue_copy(self.queue, new_log_trans_counts, d_new_trans)
+             cl.enqueue_copy(self.queue, log_gamma_sums, d_gamma_sums)
+
+             log_trans = new_log_trans_counts - log_gamma_sums[:, None]
+
+             change = log_likelihood - prev_score
+             print(
+                 f"  Iter {i + 1}: Likelihood {log_likelihood:.2f} (Delta: {change:.4f})"
+             )
+             if abs(change) < tolerance:
+                 break
+             prev_score = log_likelihood
+
+         return np.exp(log_trans)
+
+     def _cpu_forward(self, log_trans, log_emis):
+         T, N = log_emis.shape
+         alpha = np.zeros((T, N), dtype=np.float32)
+         alpha[0] = -np.log(N) + log_emis[0]
+         for t in range(1, T):
+             for j in range(N):
+                 prev = alpha[t - 1] + log_trans[:, j]
+                 alpha[t, j] = np.logaddexp.reduce(prev) + log_emis[t, j]
+         return alpha, np.logaddexp.reduce(alpha[-1])
+
+     def _cpu_backward(self, log_trans, log_emis):
+         T, N = log_emis.shape
+         beta = np.zeros((T, N), dtype=np.float32)
+         for t in range(T - 2, -1, -1):
+             for i in range(N):
+                 terms = log_trans[i, :] + log_emis[t + 1] + beta[t + 1]
+                 beta[t, i] = np.logaddexp.reduce(terms)
+         return beta
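For reference, a short end-to-end sketch of the engine above (values illustrative; with N=2 states, well below GPU_THRESHOLD, everything runs on the NumPy paths):

    import numpy as np
    from markovgpu import MarkovEngine

    engine = MarkovEngine()

    # Row-stochastic transition matrix: P[i, j] = P(next=j | current=i)
    P = np.array([[0.9, 0.1],
                  [0.5, 0.5]], dtype=np.float32)
    v0 = np.array([1.0, 0.0], dtype=np.float32)

    pi = engine.converge(P, v0)   # stationary distribution, here ~[0.833, 0.167]
    print(pi, pi.dot(P))          # pi is (numerically) a fixed point of P

    # Viterbi decoding over T steps of per-state emission likelihoods
    obs = np.random.rand(50, 2).astype(np.float32)
    path = engine.decode_regime(P, obs)   # most likely state sequence, shape (50,)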