markovgpu_rane-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
markovgpu/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .backend import MarkovEngine
+
+ __all__ = ["MarkovEngine"]
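The package's public surface is this single re-export. A minimal smoke test, assuming the wheel is installed in the current environment (the snippet is illustrative, not part of the package):

    # Hypothetical smoke test: confirm the public API surface exported above
    from markovgpu import MarkovEngine

    engine = MarkovEngine()       # reports which OpenCL device, if any, was found
    print(type(engine).__name__)  # -> MarkovEngine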
markovgpu/backend.py ADDED
@@ -0,0 +1,326 @@
+ import pyopencl as cl
+ import numpy as np
+ import os
+
+ # Locate the kernel file relative to this module
+ MODULE_PATH = os.path.dirname(os.path.abspath(__file__))
+ KERNEL_PATH = os.path.join(MODULE_PATH, "kernels.cl")
+
+ # Threshold: use the GPU if states >= 64, otherwise the CPU is faster
+ GPU_THRESHOLD = 64
+
+
+ class MarkovEngine:
+     def __init__(self):
+         self.use_gpu = False
+         self.ctx = None
+         self.queue = None
+         self.prg = None
+
+         # 1. Try to Connect to GPU
+         try:
+             platforms = cl.get_platforms()
+             gpu_devices = []
+             for p in platforms:
+                 gpu_devices.extend(p.get_devices(device_type=cl.device_type.GPU))
+
+             if gpu_devices:
+                 # Prefer the device with the most compute units (usually the discrete GPU)
+                 best_dev = sorted(
+                     gpu_devices, key=lambda d: d.max_compute_units, reverse=True
+                 )[0]
+                 self.ctx = cl.Context([best_dev])
+                 print(
+                     f"🔌 Connected to Accelerator: {best_dev.name} ({best_dev.max_compute_units} CUs)"
+                 )
+             else:
+                 self.ctx = cl.create_some_context(interactive=False)
+                 print(f"⚠️ No Dedicated GPU found. Using: {self.ctx.devices[0].name}")
+
+             self.queue = cl.CommandQueue(self.ctx)
+
+             # 2. Compile Kernels
+             if not os.path.exists(KERNEL_PATH):
+                 raise FileNotFoundError(f"Kernel file missing at: {KERNEL_PATH}")
+
+             with open(KERNEL_PATH, "r") as f:
+                 self.prg = cl.Program(self.ctx, f.read()).build()
+
+             # 3. Cache Kernels (Robust Retrieval)
+             self.use_gpu = True
+             try:
+                 # Basic
+                 self.k_markov = self.prg.markov_step
+                 self.k_hmm_basic = self.prg.hmm_forward_step
+
+                 # Advanced / Viterbi
+                 self.k_hmm_log = self.prg.hmm_forward_log
+                 self.k_viterbi = self.prg.viterbi_step
+
+                 # Training
+                 self.k_hmm_back = self.prg.hmm_backward_log
+                 self.k_acc_trans = self.prg.accumulate_transitions
+                 self.k_acc_gamma = self.prg.accumulate_gammas
+
+             except AttributeError as e:
+                 print(f"❌ Kernel Warning: {e}")
+                 print("⚠️ Required kernel missing; falling back to CPU-Only Mode.")
+                 self.use_gpu = False
+
+         except Exception as e:
+             print(f"⚠️ OpenCL Initialization failed: {e}")
+             print("⚠️ Running in CPU-Only Mode (NumPy).")
+             self.use_gpu = False
+
+     # --- 1. Simulation ---
+     def step(self, P, v):
+         """Runs one step: v_new = v @ P (row vector times transition matrix)."""
+         N = len(v)
+
+         if not self.use_gpu or N < GPU_THRESHOLD:
+             return v.dot(P)
+
+         mf = cl.mem_flags
+         P = np.ascontiguousarray(P, dtype=np.float32)
+         v = np.ascontiguousarray(v, dtype=np.float32)
+         result = np.empty_like(v)
+
+         d_P = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=P)
+         d_v = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=v)
+         d_res = cl.Buffer(self.ctx, mf.WRITE_ONLY, size=result.nbytes)
+
+         self.k_markov(self.queue, (N,), None, np.int32(N), d_v, d_P, d_res)
+         cl.enqueue_copy(self.queue, result, d_res)
+         return result
+
+     def converge(self, P, start_v, tolerance=1e-5, max_steps=1000):
+         N = len(start_v)
+
+         # CPU Path
+         if not self.use_gpu or N < GPU_THRESHOLD:
+             current_v = start_v.copy()
+             for _ in range(max_steps):
+                 new_v = current_v.dot(P)
+                 if np.sum(np.abs(new_v - current_v)) < tolerance:
+                     return new_v
+                 current_v = new_v
+             return current_v
+
+         # GPU Path
+         mf = cl.mem_flags
+         P = np.ascontiguousarray(P, dtype=np.float32)
+         start_v = np.ascontiguousarray(start_v, dtype=np.float32)
+
+         d_P = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=P)
+         d_v_read = cl.Buffer(
+             self.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=start_v
+         )
+         d_v_write = cl.Buffer(self.ctx, mf.READ_WRITE, size=start_v.nbytes)
+
+         current_v = start_v.copy()
+
+         for i in range(max_steps):
+             self.k_markov(self.queue, (N,), None, np.int32(N), d_v_read, d_P, d_v_write)
+
+             # Check convergence every 10 steps to limit host<->device copies
+             if i % 10 == 0:
+                 new_v = np.empty_like(current_v)
+                 cl.enqueue_copy(self.queue, new_v, d_v_write)
+                 if np.sum(np.abs(new_v - current_v)) < tolerance:
+                     return new_v
+                 current_v = new_v
+
+             d_v_read, d_v_write = d_v_write, d_v_read
+
+         # Pull the latest state off the device; current_v may be up to 10 steps stale
+         cl.enqueue_copy(self.queue, current_v, d_v_read)
+         print("⚠️ Reached max steps without full convergence.")
+         return current_v
+
+     # --- 2. Inference & Viterbi ---
+     def hmm_filter(self, transition_matrix, observation_probs):
+         """Standard HMM Filter (Returns Probabilities) -- not yet implemented."""
+         # A basic probability-space forward pass would go here.
+         # For production use, prefer log-space (k_hmm_log) to avoid underflow.
+         raise NotImplementedError("hmm_filter is not implemented yet; see decode_regime and fit.")
+
+     def decode_regime(self, transition_matrix, observation_probs):
+         """Viterbi Algorithm (Finds Most Likely Path)"""
+         T, N = observation_probs.shape
+         epsilon = 1e-20
+
+         # CPU Path
+         if not self.use_gpu or N < GPU_THRESHOLD:
+             log_trans = np.log(transition_matrix + epsilon)
+             log_emis = np.log(observation_probs + epsilon)
+             log_delta = np.zeros((T, N))
+             backpointers = np.zeros((T, N), dtype=int)
+
+             # Uniform prior over states, plus the first emission
+             log_delta[0] = -np.log(N) + log_emis[0]
+
+             for t in range(1, T):
+                 for j in range(N):
+                     vals = log_delta[t - 1] + log_trans[:, j]
+                     best_prev = np.argmax(vals)
+                     backpointers[t, j] = best_prev
+                     log_delta[t, j] = vals[best_prev] + log_emis[t, j]
+
+             path = np.zeros(T, dtype=int)
+             path[-1] = np.argmax(log_delta[-1])
+             for t in range(T - 2, -1, -1):
+                 path[t] = backpointers[t + 1, path[t + 1]]
+             return path
+
+         # GPU Path
+         mf = cl.mem_flags
+         log_trans = np.log(transition_matrix + epsilon).astype(np.float32)
+         log_emis = np.log(observation_probs + epsilon).astype(np.float32)
+         # Match the CPU initialization: uniform prior plus the first emission
+         log_delta = (-np.log(N) + log_emis[0]).astype(np.float32)
+
+         d_trans = cl.Buffer(
+             self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=log_trans
+         )
+         d_delta_in = cl.Buffer(
+             self.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=log_delta
+         )
+         d_delta_out = cl.Buffer(self.ctx, mf.READ_WRITE, size=log_delta.nbytes)
+
+         full_backpointer_history = np.zeros((T, N), dtype=np.int32)
+         # One time-step of backpointers (N ints) lives on the device at a time
+         d_backpointers = cl.Buffer(
+             self.ctx, mf.WRITE_ONLY, size=full_backpointer_history.nbytes // T
+         )
+
+         print(f"🕵️ Decoding {T} steps (GPU Accelerated)...")
+
+         # t = 0 is the initialization above; transitions start at t = 1
+         for t in range(1, T):
+             d_emis = cl.Buffer(
+                 self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=log_emis[t]
+             )
+
+             self.k_viterbi(
+                 self.queue,
+                 (N,),
+                 None,
+                 np.int32(N),
+                 d_delta_in,
+                 d_trans,
+                 d_emis,
+                 d_delta_out,
+                 d_backpointers,
+             )
+
+             step_pointers = np.empty(N, dtype=np.int32)
+             cl.enqueue_copy(self.queue, step_pointers, d_backpointers)
+             full_backpointer_history[t] = step_pointers
+
+             d_delta_in, d_delta_out = d_delta_out, d_delta_in
+
+         final_log_probs = np.empty(N, dtype=np.float32)
+         cl.enqueue_copy(self.queue, final_log_probs, d_delta_in)
+
+         best_path = np.zeros(T, dtype=np.int32)
+         best_path[-1] = np.argmax(final_log_probs)
+
+         for t in range(T - 2, -1, -1):
+             next_state = best_path[t + 1]
+             best_path[t] = full_backpointer_history[t + 1][next_state]
+
+         return best_path
+
+     # --- 3. Training (Baum-Welch) ---
+     def fit(self, observations, n_states, n_iters=10, tolerance=1e-4):
+         """Baum-Welch Expectation Maximization (Training).
+
+         `observations` is a (T, N) array of per-state emission likelihoods.
+         """
+         if not self.use_gpu:
+             raise RuntimeError("fit() requires a working OpenCL context.")
+
+         T = observations.shape[0]
+         N = n_states
+
+         # Random Init: near-uniform transitions with a little noise
+         log_trans = np.log(
+             np.full((N, N), 1.0 / N) + np.random.rand(N, N) * 0.01
+         ).astype(np.float32)
+         log_emis = np.log(observations + 1e-20).astype(np.float32)
+
+         mf = cl.mem_flags
+         d_trans = cl.Buffer(
+             self.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=log_trans
+         )
+         d_alpha = cl.Buffer(self.ctx, mf.READ_WRITE, size=T * N * 4)  # Full history (float32)
+         d_beta = cl.Buffer(self.ctx, mf.READ_WRITE, size=T * N * 4)  # Full history (float32)
+         d_emis = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=log_emis)
+
+         d_new_trans = cl.Buffer(self.ctx, mf.READ_WRITE, size=log_trans.nbytes)
+         d_gamma_sums = cl.Buffer(self.ctx, mf.READ_WRITE, size=N * 4)
+
+         prev_score = -np.inf
+
+         print(f"🧠 Training HMM ({N} States, {T} Steps)...")
+
+         for i in range(n_iters):
+             # 1. CPU Forward/Backward (Latency Optimized)
+             alpha_full, log_likelihood = self._cpu_forward(log_trans, log_emis)
+             beta_full = self._cpu_backward(log_trans, log_emis)
+
+             # 2. GPU Accumulation (Throughput Optimized)
+             cl.enqueue_copy(self.queue, d_alpha, alpha_full)
+             cl.enqueue_copy(self.queue, d_beta, beta_full)
+             cl.enqueue_copy(self.queue, d_trans, log_trans)
+
+             self.k_acc_trans(
+                 self.queue,
+                 (N, N),
+                 None,
+                 np.int32(T),
+                 np.int32(N),
+                 d_alpha,
+                 d_beta,
+                 d_emis,
+                 d_trans,
+                 d_new_trans,
+             )
+
+             self.k_acc_gamma(
+                 self.queue,
+                 (N,),
+                 None,
+                 np.int32(T),
+                 np.int32(N),
+                 d_alpha,
+                 d_beta,
+                 d_gamma_sums,
+             )
+
+             # 3. Update: log a_ij = log(sum_t xi_t(i,j)) - log(sum_t gamma_t(i))
+             new_log_trans_counts = np.empty_like(log_trans)
+             log_gamma_sums = np.empty(N, dtype=np.float32)
+
+             cl.enqueue_copy(self.queue, new_log_trans_counts, d_new_trans)
+             cl.enqueue_copy(self.queue, log_gamma_sums, d_gamma_sums)
+
+             log_trans = new_log_trans_counts - log_gamma_sums[:, None]
+
+             change = log_likelihood - prev_score
+             print(
+                 f"   Iter {i + 1}: Likelihood {log_likelihood:.2f} (Delta: {change:.4f})"
+             )
+             if abs(change) < tolerance:
+                 break
+             prev_score = log_likelihood
+
+         return np.exp(log_trans)
+
+     def _cpu_forward(self, log_trans, log_emis):
+         T, N = log_emis.shape
+         alpha = np.zeros((T, N), dtype=np.float32)
+         alpha[0] = -np.log(N) + log_emis[0]
+         for t in range(1, T):
+             for j in range(N):
+                 prev = alpha[t - 1] + log_trans[:, j]
+                 alpha[t, j] = np.logaddexp.reduce(prev) + log_emis[t, j]
+         return alpha, np.logaddexp.reduce(alpha[-1])
+
+     def _cpu_backward(self, log_trans, log_emis):
+         T, N = log_emis.shape
+         beta = np.zeros((T, N), dtype=np.float32)
+         for t in range(T - 2, -1, -1):
+             for i in range(N):
+                 terms = log_trans[i, :] + log_emis[t + 1] + beta[t + 1]
+                 beta[t, i] = np.logaddexp.reduce(terms)
+         return beta
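For orientation, here is a minimal usage sketch of the backend above. The 3-state transition matrix and observation likelihoods are made-up illustrative values, not shipped with the package:

    import numpy as np
    from markovgpu import MarkovEngine

    engine = MarkovEngine()

    # Made-up 3-state chain; each row of P sums to 1
    P = np.array([[0.90, 0.08, 0.02],
                  [0.10, 0.80, 0.10],
                  [0.05, 0.15, 0.80]], dtype=np.float32)
    v = np.array([1.0, 0.0, 0.0], dtype=np.float32)

    v1 = engine.step(P, v)              # one step of v @ P
    stationary = engine.converge(P, v)  # iterate until the distribution settles

    # Viterbi over T=5 synthetic per-state observation likelihoods, shape (T, N)
    obs = np.random.rand(5, 3).astype(np.float32)
    obs /= obs.sum(axis=1, keepdims=True)
    path = engine.decode_regime(P, obs)  # most likely state index per time step

With N = 3 < GPU_THRESHOLD these calls take the NumPy paths; the same code moves to the GPU once N >= 64.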
markovgpu/kernels.cl ADDED
@@ -0,0 +1,169 @@
+ // kernels.cl - The Complete Suite
+
+ // --- HELPER: Log-Sum-Exp Trick ---
+ // Computes log(exp(log_a) + exp(log_b)) without overflow/underflow
+ float log_add(float log_a, float log_b) {
+     float max_val = max(log_a, log_b);
+     float min_val = min(log_a, log_b);
+     return max_val + log1p(exp(min_val - max_val));
+ }
+
+ // --- SECTION 1: Basic Markov Operations ---
+
+ // 1. Standard Markov Step: Next = Current * Matrix
+ __kernel void markov_step(
+     const int N,
+     __global const float *current_state,
+     __global const float *transition_mat,
+     __global float *next_state)
+ {
+     int id = get_global_id(0);
+     if (id < N) {
+         float sum = 0.0f;
+         for (int k = 0; k < N; k++) {
+             sum += current_state[k] * transition_mat[k * N + id];
+         }
+         next_state[id] = sum;
+     }
+ }
+
+ // 2. Standard HMM Filter (Probability Space)
+ // Used for simple "What state am I in?" queries without log-space
+ __kernel void hmm_forward_step(
+     const int N,
+     __global const float *alpha_prev,
+     __global const float *trans_mat,
+     __global const float *emissions,
+     __global float *alpha_new)
+ {
+     int id = get_global_id(0);
+     if (id < N) {
+         float sum = 0.0f;
+         for (int k = 0; k < N; k++) {
+             sum += alpha_prev[k] * trans_mat[k * N + id];
+         }
+         alpha_new[id] = sum * emissions[id];
+     }
+ }
+
+ // --- SECTION 2: Advanced Log-Space Operations (Stable) ---
+
+ // 3. Log-Space Forward (For Viterbi & Training)
+ __kernel void hmm_forward_log(
+     const int N,
+     __global const float *log_alpha_prev,
+     __global const float *log_trans_mat,
+     __global const float *log_emissions,
+     __global float *log_alpha_new)
+ {
+     int id = get_global_id(0);
+     if (id < N) {
+         float log_sum = -INFINITY;
+         for (int k = 0; k < N; k++) {
+             float val = log_alpha_prev[k] + log_trans_mat[k * N + id];
+             if (k == 0) log_sum = val;
+             else log_sum = log_add(log_sum, val);
+         }
+         log_alpha_new[id] = log_sum + log_emissions[id];
+     }
+ }
+
+ // 4. Log-Space Backward (For Training)
+ __kernel void hmm_backward_log(
+     const int N,
+     __global const float *beta_future,
+     __global const float *trans,
+     __global const float *emis_future,
+     __global float *beta_curr)
+ {
+     int id = get_global_id(0); // State 'i'
+     if (id < N) {
+         float log_sum = -INFINITY;
+         for (int j = 0; j < N; j++) {
+             // transition i->j + emission(t+1) + beta(t+1)
+             float val = trans[id * N + j] + emis_future[j] + beta_future[j];
+             if (j == 0) log_sum = val;
+             else log_sum = log_add(log_sum, val);
+         }
+         beta_curr[id] = log_sum;
+     }
+ }
+
+ // 5. Viterbi Algorithm (Finds best path)
+ __kernel void viterbi_step(
+     const int N,
+     __global const float *log_delta_prev,
+     __global const float *log_trans_mat,
+     __global const float *log_emissions,
+     __global float *log_delta_new,
+     __global int *backpointers)
+ {
+     int id = get_global_id(0);
+     if (id < N) {
+         float max_prob = -INFINITY;
+         int best_prev_state = 0;
+
+         for (int k = 0; k < N; k++) {
+             float prob = log_delta_prev[k] + log_trans_mat[k * N + id];
+             if (prob > max_prob) {
+                 max_prob = prob;
+                 best_prev_state = k;
+             }
+         }
+         log_delta_new[id] = max_prob + log_emissions[id];
+         backpointers[id] = best_prev_state;
+     }
+ }
+
+ // --- SECTION 3: Learning Accumulators (Baum-Welch) ---
+
+ // 6. Accumulate Transitions (E-Step)
+ // Condenses time T into an N*N summary matrix
+ __kernel void accumulate_transitions(
+     const int T, const int N,
+     __global const float *alpha_full,
+     __global const float *beta_full,
+     __global const float *emis_full,
+     __global const float *trans_mat,
+     __global float *new_trans_counts)
+ {
+     int row = get_global_id(1); // From State i
+     int col = get_global_id(0); // To State j
+
+     if (row < N && col < N) {
+         float log_sum_xi = -INFINITY;
+         float log_trans_val = trans_mat[row * N + col];
+
+         // Loop over time 0 to T-2
+         for (int t = 0; t < T - 1; t++) {
+             float log_xi = alpha_full[t*N + row] +
+                            log_trans_val +
+                            emis_full[(t+1)*N + col] +
+                            beta_full[(t+1)*N + col];
+
+             if (t == 0) log_sum_xi = log_xi;
+             else log_sum_xi = log_add(log_sum_xi, log_xi);
+         }
+         new_trans_counts[row * N + col] = log_sum_xi;
+     }
+ }
+
+ // 7. Accumulate Gammas (E-Step)
+ // Condenses time T into N summary counts
+ __kernel void accumulate_gammas(
+     const int T, const int N,
+     __global const float *alpha_full,
+     __global const float *beta_full,
+     __global float *log_gamma_sums)
+ {
+     int id = get_global_id(0);
+     if (id < N) {
+         float log_sum_gamma = -INFINITY;
+         // Sum over t = 0..T-2 so the denominator matches the xi numerator above
+         for (int t = 0; t < T - 1; t++) {
+             float val = alpha_full[t*N + id] + beta_full[t*N + id];
+             if (t == 0) log_sum_gamma = val;
+             else log_sum_gamma = log_add(log_sum_gamma, val);
+         }
+         log_gamma_sums[id] = log_sum_gamma;
+     }
+ }
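As a sketch of what the two core kernels compute, their host-side NumPy equivalents are tiny. This hypothetical cross-check (not shipped with the package) mirrors log_add and markov_step:

    import numpy as np

    # log_add(a, b) = log(exp(a) + exp(b)); NumPy's logaddexp is the reference
    a, b = -5.0, -7.0
    mine = max(a, b) + np.log1p(np.exp(min(a, b) - max(a, b)))
    assert np.isclose(mine, np.logaddexp(a, b))

    # markov_step: work-item `id` computes sum_k v[k] * P[k, id],
    # i.e. exactly the row-vector product v @ P
    N = 4
    P = np.random.rand(N, N); P /= P.sum(axis=1, keepdims=True)
    v = np.random.rand(N);    v /= v.sum()
    ref = np.array([sum(v[k] * P[k, j] for k in range(N)) for j in range(N)])
    assert np.allclose(ref, v @ P)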
markovgpu/py.typed ADDED
Empty file (PEP 561 type-checking marker)
markovgpu_rane-0.1.0.dist-info/METADATA ADDED
@@ -0,0 +1,22 @@
+ Metadata-Version: 2.4
+ Name: markovgpu-rane
+ Version: 0.1.0
+ Summary: High-performance Markov Chains & HMMs using OpenCL
+ Author-email: Sahil Rane <sahilrane249@gmail.com>
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: Intended Audience :: Financial and Insurance Industry
+ Classifier: Intended Audience :: Science/Research
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Classifier: Topic :: Scientific/Engineering :: Mathematics
+ Requires-Python: >=3.12
+ Requires-Dist: matplotlib>=3.8.0
+ Requires-Dist: numpy>=1.26.0
+ Requires-Dist: pyopencl>=2024.1
+ Requires-Dist: scipy>=1.11.0
+ Description-Content-Type: text/markdown
+
+ hello
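Assuming the wheel is installed, the metadata above can be read back at runtime with the standard library:

    from importlib.metadata import metadata, version

    print(version("markovgpu-rane"))              # -> 0.1.0
    print(metadata("markovgpu-rane")["Summary"])  # -> High-performance Markov Chains & HMMs using OpenCL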
markovgpu_rane-0.1.0.dist-info/RECORD ADDED
@@ -0,0 +1,7 @@
+ markovgpu/__init__.py,sha256=aGWvFGT6VaLCuFNO9T3ubnlhz2qgkBmNIcCy976YrqE,62
+ markovgpu/backend.py,sha256=bfYnge9MgMcDHmJ7CcCG2VGqVfsGxsCzXavGLUFdB2w,11733
+ markovgpu/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ markovgpu/kernels.cl,sha256=RXpt2jD6IRdh5YTunB_lwfajT1Cw9M95v3uxwMMmMvs,5141
+ markovgpu_rane-0.1.0.dist-info/METADATA,sha256=-CqRDK-d95CjNbsFpBIBut--zqLGHvvXPgsWdUe0Mtg,840
+ markovgpu_rane-0.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ markovgpu_rane-0.1.0.dist-info/RECORD,,
markovgpu_rane-0.1.0.dist-info/WHEEL ADDED
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: hatchling 1.28.0
+ Root-Is-Purelib: true
+ Tag: py3-none-any