markovgpu-rane 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
markovgpu/backend.py CHANGED
@@ -43,8 +43,11 @@ class MarkovEngine:
43
43
  if not os.path.exists(KERNEL_PATH):
44
44
  raise FileNotFoundError(f"Kernel file missing at: {KERNEL_PATH}")
45
45
 
46
+ # OPTIMIZATION: Fast Math Build Options
47
+ build_options = "-cl-mad-enable -cl-fast-relaxed-math"
48
+
46
49
  with open(KERNEL_PATH, "r") as f:
47
- self.prg = cl.Program(self.ctx, f.read()).build()
50
+ self.prg = cl.Program(self.ctx, f.read()).build(options=build_options)
48
51
 
49
52
  # 3. Cache Kernels (Robust Retrieval)
50
53
  self.use_gpu = True
@@ -80,19 +83,25 @@ class MarkovEngine:
80
83
  return v.dot(P)
81
84
 
82
85
  mf = cl.mem_flags
83
- P = np.ascontiguousarray(P, dtype=np.float32)
86
+ # OPTIMIZATION: Transpose P for coalesced access
87
+ # The kernel expects P_T[id][k] which maps to P[k][id]
88
+ P_T = np.ascontiguousarray(P.T, dtype=np.float32)
84
89
  v = np.ascontiguousarray(v, dtype=np.float32)
85
90
  result = np.empty_like(v)
86
91
 
87
- d_P = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=P)
92
+ d_P_T = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=P_T)
88
93
  d_v = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=v)
89
94
  d_res = cl.Buffer(self.ctx, mf.WRITE_ONLY, size=result.nbytes)
90
95
 
91
- self.k_markov(self.queue, (N,), None, np.int32(N), d_v, d_P, d_res)
96
+ self.k_markov(self.queue, (N,), None, np.int32(N), d_v, d_P_T, d_res)
92
97
  cl.enqueue_copy(self.queue, result, d_res)
93
98
  return result
94
99
 
95
100
  def converge(self, P, start_v, tolerance=1e-5, max_steps=1000):
101
+ # Note: 'converge' currently uses the iterative step approach.
102
+ # For maximum optimization, this loop should ideally be moved to a kernel,
103
+ # but for now, we rely on the optimized 'step' logic or the CPU fallback.
104
+ # Below is the robust hybrid implementation.
96
105
  N = len(start_v)
97
106
 
98
107
  # CPU Path
@@ -106,20 +115,20 @@ class MarkovEngine:
106
115
  return current_v
107
116
 
108
117
  # GPU Path
118
+ # We reuse the same buffers to avoid reallocation overhead inside the loop
109
119
  mf = cl.mem_flags
110
- P = np.ascontiguousarray(P, dtype=np.float32)
120
+ P_T = np.ascontiguousarray(P.T, dtype=np.float32)
111
121
  start_v = np.ascontiguousarray(start_v, dtype=np.float32)
112
122
 
113
- d_P = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=P)
114
- d_v_read = cl.Buffer(
115
- self.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=start_v
116
- )
123
+ d_P_T = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=P_T)
124
+ d_v_read = cl.Buffer(self.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=start_v)
117
125
  d_v_write = cl.Buffer(self.ctx, mf.READ_WRITE, size=start_v.nbytes)
118
126
 
119
127
  current_v = start_v.copy()
120
128
 
121
129
  for i in range(max_steps):
122
- self.k_markov(self.queue, (N,), None, np.int32(N), d_v_read, d_P, d_v_write)
130
+ # Use k_markov with Transposed Matrix
131
+ self.k_markov(self.queue, (N,), None, np.int32(N), d_v_read, d_P_T, d_v_write)
123
132
 
124
133
  if i % 10 == 0:
125
134
  new_v = np.empty_like(current_v)
@@ -163,16 +172,15 @@ class MarkovEngine:
163
172
 
164
173
  # GPU Path
165
174
  mf = cl.mem_flags
175
+ # OPTIMIZATION: Transpose Log-Transition Matrix
166
176
  log_trans = np.log(transition_matrix + epsilon).astype(np.float32)
177
+ log_trans_T = np.ascontiguousarray(log_trans.T, dtype=np.float32)
178
+
167
179
  log_emis = np.log(observation_probs + epsilon).astype(np.float32)
168
180
  log_delta = np.full(N, -np.log(N), dtype=np.float32)
169
181
 
170
- d_trans = cl.Buffer(
171
- self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=log_trans
172
- )
173
- d_delta_in = cl.Buffer(
174
- self.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=log_delta
175
- )
182
+ d_trans_T = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=log_trans_T)
183
+ d_delta_in = cl.Buffer(self.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=log_delta)
176
184
  d_delta_out = cl.Buffer(self.ctx, mf.READ_WRITE, size=log_delta.nbytes)
177
185
 
178
186
  full_backpointer_history = np.zeros((T, N), dtype=np.int32)
@@ -180,7 +188,7 @@ class MarkovEngine:
180
188
  self.ctx, mf.WRITE_ONLY, size=full_backpointer_history.nbytes // T
181
189
  )
182
190
 
183
- print(f"🕵️ Decoding {T} days (GPU Accelerated)...")
191
+ print(f"🕵️ Decoding {T} days (GPU Optimized)...")
184
192
 
185
193
  for t in range(T):
186
194
  d_emis = cl.Buffer(
@@ -193,7 +201,7 @@ class MarkovEngine:
193
201
  None,
194
202
  np.int32(N),
195
203
  d_delta_in,
196
- d_trans,
204
+ d_trans_T, # Pass Transposed Matrix
197
205
  d_emis,
198
206
  d_delta_out,
199
207
  d_backpointers,
@@ -231,8 +239,16 @@ class MarkovEngine:
231
239
  log_emis = np.log(observations + 1e-20).astype(np.float32)
232
240
 
233
241
  # 2. Allocate GPU Memory (VRAM)
234
- # We allocate FULL history on GPU to avoid copying back and forth
235
- d_trans = cl.Buffer(self.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=log_trans)
242
+ # We need TWO transition buffers for optimization:
243
+ # A. Original (Row-Major) for Backward Pass & Accumulation
244
+ # B. Transposed (Col-Major) for Forward Pass
245
+ d_trans = cl.Buffer(self.ctx, mf.READ_WRITE, size=log_trans.nbytes)
246
+ d_trans_T = cl.Buffer(self.ctx, mf.READ_WRITE, size=log_trans.nbytes)
247
+
248
+ # Initial Copy
249
+ cl.enqueue_copy(self.queue, d_trans, log_trans)
250
+ cl.enqueue_copy(self.queue, d_trans_T, np.ascontiguousarray(log_trans.T))
251
+
236
252
  d_emis = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=log_emis)
237
253
 
238
254
  d_alpha = cl.Buffer(self.ctx, mf.READ_WRITE, size=T * N * 4) # float32 = 4 bytes
@@ -252,11 +268,10 @@ class MarkovEngine:
252
268
  for i in range(n_iters):
253
269
 
254
270
  # --- A. Forward Pass (GPU Loop) ---
255
- # Init Alpha[0] on CPU then send (fast enough for 1 step)
271
+ # Uses Transposed Matrix (d_trans_T) for coalesced reads
256
272
  init_alpha[:] = -np.log(N) + log_emis[0]
257
- cl.enqueue_copy(self.queue, d_alpha, init_alpha, is_blocking=False) # Write to offset 0
273
+ cl.enqueue_copy(self.queue, d_alpha, init_alpha, is_blocking=False)
258
274
 
259
- # Loop t=1 to T
260
275
  for t in range(1, T):
261
276
  prev_offset = (t - 1) * N
262
277
  curr_offset = t * N
@@ -265,22 +280,20 @@ class MarkovEngine:
265
280
  self.k_hmm_log(
266
281
  self.queue, (N,), None,
267
282
  np.int32(N),
268
- d_alpha, # Full Buffer
283
+ d_alpha,
269
284
  np.int32(prev_offset),
270
285
  np.int32(curr_offset),
271
- d_trans,
272
- d_emis, # Full Buffer
286
+ d_trans_T, # <--- Optimized Read
287
+ d_emis,
273
288
  np.int32(emis_offset)
274
289
  )
275
290
 
276
291
  # --- B. Backward Pass (GPU Loop) ---
277
- # Init Beta[T-1] to 0.0 (log(1))
278
- # We can use clEnqueueFillBuffer, but pyopencl 2022+ is cleaner with copy
279
- init_beta_end = np.zeros(N, dtype=np.float32) # log(1) = 0
280
- beta_end_offset = (T - 1) * N * 4 # Bytes offset
292
+ # Uses Original Matrix (d_trans) because Backward pass logic matches Row-Major
293
+ init_beta_end = np.zeros(N, dtype=np.float32)
294
+ beta_end_offset = (T - 1) * N * 4
281
295
  cl.enqueue_copy(self.queue, d_beta, init_beta_end, dst_offset=beta_end_offset, is_blocking=False)
282
296
 
283
- # Loop t = T-2 down to 0
284
297
  for t in range(T - 2, -1, -1):
285
298
  curr_offset = t * N
286
299
  future_offset = (t + 1) * N
@@ -289,26 +302,23 @@ class MarkovEngine:
289
302
  self.k_hmm_back(
290
303
  self.queue, (N,), None,
291
304
  np.int32(N),
292
- d_beta, # Full Buffer
305
+ d_beta,
293
306
  np.int32(future_offset),
294
307
  np.int32(curr_offset),
295
- d_trans,
308
+ d_trans, # <--- Optimized Read (Backward needs Row-Major)
296
309
  d_emis,
297
310
  np.int32(future_emis_offset)
298
311
  )
299
312
 
300
313
  # --- C. Accumulation (GPU) ---
301
- # Wait for loops to finish
302
314
  self.queue.finish()
303
315
 
304
- # Condense Alpha/Beta/Emis into new Transition Matrix
305
316
  self.k_acc_trans(
306
317
  self.queue, (N, N), None,
307
318
  np.int32(T), np.int32(N),
308
319
  d_alpha, d_beta, d_emis, d_trans, d_new_trans
309
320
  )
310
321
 
311
- # Condense into Gamma Sums
312
322
  self.k_acc_gamma(
313
323
  self.queue, (N,), None,
314
324
  np.int32(T), np.int32(N),
@@ -316,15 +326,13 @@ class MarkovEngine:
316
326
  )
317
327
 
318
328
  # --- D. Update & Check Convergence (CPU) ---
319
- # We only read back the "Summary Statistics", not the T*N buffers
320
329
  new_log_trans_counts = np.empty_like(log_trans)
321
330
  log_gamma_sums = np.empty(N, dtype=np.float32)
322
331
 
323
332
  cl.enqueue_copy(self.queue, new_log_trans_counts, d_new_trans)
324
333
  cl.enqueue_copy(self.queue, log_gamma_sums, d_gamma_sums)
325
334
 
326
- # Calc Likelihood from Alpha[T-1] for convergence check
327
- # Read just the last N floats
335
+ # Calc Likelihood
328
336
  alpha_T_offset = (T - 1) * N * 4
329
337
  cl.enqueue_copy(self.queue, final_alpha_T, d_alpha, src_offset=alpha_T_offset)
330
338
  log_likelihood = np.logaddexp.reduce(final_alpha_T)
@@ -332,8 +340,9 @@ class MarkovEngine:
332
340
  # M-Step: Normalize
333
341
  log_trans = new_log_trans_counts - log_gamma_sums[:, None]
334
342
 
335
- # Update GPU Trans Matrix for next iteration
343
+ # Update BOTH GPU Buffers for next iteration
336
344
  cl.enqueue_copy(self.queue, d_trans, log_trans)
345
+ cl.enqueue_copy(self.queue, d_trans_T, np.ascontiguousarray(log_trans.T))
337
346
 
338
347
  change = log_likelihood - prev_score
339
348
  print(f" Iter {i + 1}: Likelihood {log_likelihood:.2f} (Delta: {change:.4f})")
markovgpu/kernels.cl CHANGED
@@ -1,4 +1,4 @@
1
- // kernels.cl - Fixed Write Permissions
1
+ // kernels.cl - Memory Optimized (Transposed Access) + Fixed Write Permissions
2
2
 
3
3
  // --- HELPER: Log-Sum-Exp Trick ---
4
4
  float log_add(float log_a, float log_b) {
@@ -12,14 +12,17 @@ float log_add(float log_a, float log_b) {
12
12
  __kernel void markov_step(
13
13
  const int N,
14
14
  __global const float *current_state,
15
- __global const float *transition_mat,
15
+ __global const float *trans_mat_T, // EXPECTS TRANSPOSED MATRIX
16
16
  __global float *next_state)
17
17
  {
18
- int id = get_global_id(0);
18
+ int id = get_global_id(0); // Target State (Row in Transposed Mat)
19
19
  if (id < N) {
20
20
  float sum = 0.0f;
21
+ int row_start = id * N; // Coalesced Start (Optimization)
22
+
21
23
  for (int k = 0; k < N; k++) {
22
- sum += current_state[k] * transition_mat[k * N + id];
24
+ // Read sequentially: P_T[id][k] corresponds to P[k][id]
25
+ sum += current_state[k] * trans_mat_T[row_start + k];
23
26
  }
24
27
  next_state[id] = sum;
25
28
  }
@@ -28,15 +31,17 @@ __kernel void markov_step(
28
31
  __kernel void hmm_forward_step(
29
32
  const int N,
30
33
  __global const float *alpha_prev,
31
- __global const float *trans_mat,
34
+ __global const float *trans_mat_T, // EXPECTS TRANSPOSED MATRIX
32
35
  __global const float *emissions,
33
36
  __global float *alpha_new)
34
37
  {
35
38
  int id = get_global_id(0);
36
39
  if (id < N) {
37
40
  float sum = 0.0f;
41
+ int row_start = id * N;
42
+
38
43
  for (int k = 0; k < N; k++) {
39
- sum += alpha_prev[k] * trans_mat[k * N + id];
44
+ sum += alpha_prev[k] * trans_mat_T[row_start + k];
40
45
  }
41
46
  alpha_new[id] = sum * emissions[id];
42
47
  }
@@ -44,47 +49,55 @@ __kernel void hmm_forward_step(
44
49
 
45
50
  // --- SECTION 2: Advanced Log-Space Operations ---
46
51
 
47
- // 3. Log-Space Forward (FIXED: Removed 'const' from log_alpha_full)
52
+ // 3. Log-Space Forward (Memory Optimized)
48
53
  __kernel void hmm_forward_log(
49
54
  const int N,
50
- __global float *log_alpha_full, // <--- FIX: Removed 'const' here
55
+ __global float *log_alpha_full, // NO CONST (Write Permission Fix Preserved)
51
56
  const int prev_offset,
52
57
  const int curr_offset,
53
- __global const float *log_trans_mat,
58
+ __global const float *log_trans_mat_T, // EXPECTS TRANSPOSED MATRIX
54
59
  __global const float *log_emissions,
55
60
  const int emis_offset)
56
61
  {
57
- int id = get_global_id(0);
62
+ int id = get_global_id(0); // Target State (Row in Transposed Mat)
58
63
  if (id < N) {
59
64
  float log_sum = -INFINITY;
60
- // Read from 'prev_offset' in the giant buffer
65
+ int row_start = id * N;
66
+
67
+ // Loop 'k' (Previous State).
68
+ // In Transposed Matrix, 'id' is the Row, 'k' is the Column.
69
+ // So we read P_T[id][k] which corresponds to P[k][id]
61
70
  for (int k = 0; k < N; k++) {
62
- float val = log_alpha_full[prev_offset + k] + log_trans_mat[k * N + id];
71
+ float val = log_alpha_full[prev_offset + k] + log_trans_mat_T[row_start + k];
63
72
  if (k == 0) log_sum = val;
64
73
  else log_sum = log_add(log_sum, val);
65
74
  }
75
+
66
76
  // Write to 'curr_offset'
67
- // Read emission from 'emis_offset'
68
77
  log_alpha_full[curr_offset + id] = log_sum + log_emissions[emis_offset + id];
69
78
  }
70
79
  }
71
80
 
72
- // 4. Log-Space Backward
81
+ // 4. Log-Space Backward (Memory Optimized - Uses ORIGINAL Matrix)
82
+ // Note: Backward pass needs P[i][j], which is naturally Row-Major.
83
+ // So we DO NOT use the Transposed matrix here. It is already optimized!
73
84
  __kernel void hmm_backward_log(
74
85
  const int N,
75
86
  __global float *beta_full,
76
87
  const int future_offset,
77
88
  const int curr_offset,
78
- __global const float *trans,
89
+ __global const float *trans, // ORIGINAL MATRIX (Row-Major)
79
90
  __global const float *emis_full,
80
91
  const int future_emis_offset)
81
92
  {
82
93
  int id = get_global_id(0); // State 'i'
83
94
  if (id < N) {
84
95
  float log_sum = -INFINITY;
96
+ int row_start = id * N;
97
+
85
98
  for (int j=0; j<N; j++) {
86
- // trans(i->j) + emis(t+1, j) + beta(t+1, j)
87
- float val = trans[id*N + j] +
99
+ // Read sequentially: trans[row_start + j]
100
+ float val = trans[row_start + j] +
88
101
  emis_full[future_emis_offset + j] +
89
102
  beta_full[future_offset + j];
90
103
 
@@ -95,11 +108,11 @@ __kernel void hmm_backward_log(
95
108
  }
96
109
  }
97
110
 
98
- // 5. Viterbi Algorithm
111
+ // 5. Viterbi Algorithm (Memory Optimized)
99
112
  __kernel void viterbi_step(
100
113
  const int N,
101
114
  __global const float *log_delta_prev,
102
- __global const float *log_trans_mat,
115
+ __global const float *log_trans_mat_T, // EXPECTS TRANSPOSED MATRIX
103
116
  __global const float *log_emissions,
104
117
  __global float *log_delta_new,
105
118
  __global int *backpointers)
@@ -108,8 +121,11 @@ __kernel void viterbi_step(
108
121
  if (id < N) {
109
122
  float max_prob = -INFINITY;
110
123
  int best_prev_state = 0;
124
+ int row_start = id * N;
125
+
111
126
  for (int k = 0; k < N; k++) {
112
- float prob = log_delta_prev[k] + log_trans_mat[k * N + id];
127
+ // Read sequentially: P_T[id][k]
128
+ float prob = log_delta_prev[k] + log_trans_mat_T[row_start + k];
113
129
  if (prob > max_prob) {
114
130
  max_prob = prob;
115
131
  best_prev_state = k;
@@ -120,7 +136,7 @@ __kernel void viterbi_step(
120
136
  }
121
137
  }
122
138
 
123
- // --- SECTION 3: Learning Accumulators ---
139
+ // --- SECTION 3: Learning Accumulators (Unchanged) ---
124
140
 
125
141
  // 6. Accumulate Transitions (E-Step)
126
142
  __kernel void accumulate_transitions(
@@ -128,11 +144,11 @@ __kernel void accumulate_transitions(
128
144
  __global const float *alpha_full,
129
145
  __global const float *beta_full,
130
146
  __global const float *emis_full,
131
- __global const float *trans_mat,
147
+ __global const float *trans_mat, // Original Matrix
132
148
  __global float *new_trans_counts)
133
149
  {
134
- int row = get_global_id(1); // From State i
135
- int col = get_global_id(0); // To State j
150
+ int row = get_global_id(1);
151
+ int col = get_global_id(0);
136
152
 
137
153
  if (row < N && col < N) {
138
154
  float log_sum_xi = -INFINITY;
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: markovgpu-rane
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: High-performance Markov Chains & HMMs using OpenCL
5
5
  Author-email: Sahil Rane <sahilrane249@gmail.com>
6
6
  Classifier: Development Status :: 4 - Beta
@@ -26,7 +26,7 @@ Description-Content-Type: text/markdown
26
26
  # ⚡ **MarkovGPU**
27
27
 
28
28
  ### *Massive Scale Markov Models on Consumer Hardware*
29
- <img width="100%" alt="MarkovGPU Hero" src="https://i.imgur.com/gK9J6hD.p" /
29
+ <img width="1024" height="338" alt="image" src="https://github.com/user-attachments/assets/b57dab80-ba03-4d1d-bb4d-6390e3f63f52" />
30
30
 
31
31
  > **Run million-state HMMs on your laptop GPU.**
32
32
  > **No CUDA required • Hybrid CPU/GPU Backend • Production Ready**
@@ -202,4 +202,4 @@ MarkovGPU doesn’t just crunch numbers.
202
202
 
203
203
  Made with 🧡 by Sahil Rane
204
204
 
205
- </div>
205
+ </div>
@@ -0,0 +1,8 @@
1
+ markovgpu/__init__.py,sha256=pCxM1YXY4faXxSm_LtdvL742NKkXKGMeNl61-hHcStU,121
2
+ markovgpu/backend.py,sha256=zbqKS0xjBvJRZ_Mu79y_6-HbpZkjbtA-1eQ_xDXc4lQ,13674
3
+ markovgpu/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ markovgpu/sklearn.py,sha256=5N6d4XVJwev4iH7OCPw4TT_nNTc71-CvNdfIW_S2kxI,3469
5
+ markovgpu/kernels.cl,sha256=DLrcHMg01UO6L1h8u9LM_6uwa9ec9hwdOclGdnxg768,6075
6
+ markovgpu_rane-0.3.0.dist-info/METADATA,sha256=y3soPxmx-IlAxPKGBpPvS0IeZTK7sD-8EBZUaBJOj6I,6622
7
+ markovgpu_rane-0.3.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
8
+ markovgpu_rane-0.3.0.dist-info/RECORD,,
@@ -1,8 +0,0 @@
1
- markovgpu/__init__.py,sha256=pCxM1YXY4faXxSm_LtdvL742NKkXKGMeNl61-hHcStU,121
2
- markovgpu/backend.py,sha256=tp4fwaLhy_dwedx8c4RhFaQsDXcMXTGd2CyHy6cPzd8,12861
3
- markovgpu/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- markovgpu/sklearn.py,sha256=5N6d4XVJwev4iH7OCPw4TT_nNTc71-CvNdfIW_S2kxI,3469
5
- markovgpu/kernels.cl,sha256=bOnwQZd92wzY7dfrzhhWm0LSw8yjqHip_3EpNSrbaJo,5188
6
- markovgpu_rane-0.2.0.dist-info/METADATA,sha256=hsMjX26Nc2AVZjqMS4lgm1Ujv1Kz8FcBAVBhOTpgVM4,6566
7
- markovgpu_rane-0.2.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
8
- markovgpu_rane-0.2.0.dist-info/RECORD,,