markovgpu-rane 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- markovgpu/backend.py +49 -40
- markovgpu/kernels.cl +40 -24
- {markovgpu_rane-0.2.0.dist-info → markovgpu_rane-0.3.0.dist-info}/METADATA +3 -3
- markovgpu_rane-0.3.0.dist-info/RECORD +8 -0
- markovgpu_rane-0.2.0.dist-info/RECORD +0 -8
- {markovgpu_rane-0.2.0.dist-info → markovgpu_rane-0.3.0.dist-info}/WHEEL +0 -0
markovgpu/backend.py
CHANGED
```diff
@@ -43,8 +43,11 @@ class MarkovEngine:
         if not os.path.exists(KERNEL_PATH):
             raise FileNotFoundError(f"Kernel file missing at: {KERNEL_PATH}")
 
+        # OPTIMIZATION: Fast Math Build Options
+        build_options = "-cl-mad-enable -cl-fast-relaxed-math"
+
         with open(KERNEL_PATH, "r") as f:
-            self.prg = cl.Program(self.ctx, f.read()).build()
+            self.prg = cl.Program(self.ctx, f.read()).build(options=build_options)
 
         # 3. Cache Kernels (Robust Retrieval)
         self.use_gpu = True
```
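The two flags do different things: `-cl-mad-enable` lets the compiler contract `a * b + c` into a single multiply-add instruction, while `-cl-fast-relaxed-math` additionally relaxes IEEE 754 conformance (reassociation, no NaN/Inf guarantees), so results can differ from a default build in the last few ulps. A minimal, self-contained sketch of the same build pattern — the kernel source and names below are illustrative, not part of the package:

```python
# Hypothetical sketch: build one kernel twice, with and without the
# fast-math flags used above, and compare the outputs.
import numpy as np
import pyopencl as cl

SRC = """
__kernel void scale(__global const float *x, __global float *y) {
    int i = get_global_id(0);
    y[i] = x[i] * 0.5f + x[i] * 0.25f;   // candidate for mad/fma contraction
}
"""

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

prg_default = cl.Program(ctx, SRC).build()
prg_fast = cl.Program(ctx, SRC).build(options="-cl-mad-enable -cl-fast-relaxed-math")

x = np.random.rand(1024).astype(np.float32)
mf = cl.mem_flags
d_x = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=x)
d_y = cl.Buffer(ctx, mf.WRITE_ONLY, size=x.nbytes)

out = np.empty_like(x)
for prg in (prg_default, prg_fast):
    prg.scale(queue, (x.size,), None, d_x, d_y)
    cl.enqueue_copy(queue, out, d_y)
    print(np.abs(out - 0.75 * x).max())  # both near 0; fast build may differ slightly
```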
```diff
@@ -80,19 +83,25 @@ class MarkovEngine:
             return v.dot(P)
 
         mf = cl.mem_flags
-
+        # OPTIMIZATION: Transpose P for coalesced access
+        # The kernel expects P_T[id][k] which maps to P[k][id]
+        P_T = np.ascontiguousarray(P.T, dtype=np.float32)
         v = np.ascontiguousarray(v, dtype=np.float32)
         result = np.empty_like(v)
 
-
+        d_P_T = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=P_T)
         d_v = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=v)
         d_res = cl.Buffer(self.ctx, mf.WRITE_ONLY, size=result.nbytes)
 
-        self.k_markov(self.queue, (N,), None, np.int32(N), d_v,
+        self.k_markov(self.queue, (N,), None, np.int32(N), d_v, d_P_T, d_res)
         cl.enqueue_copy(self.queue, result, d_res)
         return result
 
     def converge(self, P, start_v, tolerance=1e-5, max_steps=1000):
+        # Note: 'converge' currently uses the iterative step approach.
+        # For maximum optimization, this loop should ideally be moved to a kernel,
+        # but for now, we rely on the optimized 'step' logic implicitly or CPU fallback.
+        # Below is the robust hybrid implementation.
         N = len(start_v)
 
         # CPU Path
```
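The indexing claim in the comment — `P_T[id][k]` maps to `P[k][id]` — is easy to sanity-check on the host. A small NumPy sketch (illustrative, not package code) confirming that the kernel's per-element sum reproduces `v.dot(P)`:

```python
import numpy as np

N = 4
rng = np.random.default_rng(0)
P = rng.random((N, N)).astype(np.float32)
P /= P.sum(axis=1, keepdims=True)        # row-stochastic transition matrix
v = rng.random(N).astype(np.float32)

P_T = np.ascontiguousarray(P.T)
# What each work-item 'id' computes: sum over k of v[k] * P_T[id][k]
manual = np.array([sum(v[k] * P_T[i, k] for k in range(N)) for i in range(N)],
                  dtype=np.float32)

assert np.allclose(manual, v @ P, atol=1e-6)
```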
```diff
@@ -106,20 +115,20 @@ class MarkovEngine:
             return current_v
 
         # GPU Path
+        # We reuse the specific buffers to avoid reallocation overhead in loop
         mf = cl.mem_flags
-
+        P_T = np.ascontiguousarray(P.T, dtype=np.float32)
         start_v = np.ascontiguousarray(start_v, dtype=np.float32)
 
-
-        d_v_read = cl.Buffer(
-            self.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=start_v
-        )
+        d_P_T = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=P_T)
+        d_v_read = cl.Buffer(self.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=start_v)
         d_v_write = cl.Buffer(self.ctx, mf.READ_WRITE, size=start_v.nbytes)
 
         current_v = start_v.copy()
 
         for i in range(max_steps):
-
+            # Use k_markov with Transposed Matrix
+            self.k_markov(self.queue, (N,), None, np.int32(N), d_v_read, d_P_T, d_v_write)
 
             if i % 10 == 0:
                 new_v = np.empty_like(current_v)
```
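The GPU path allocates `d_v_read` / `d_v_write` once and launches the kernel repeatedly; the natural way to use such a pair is to swap their roles each step. A CPU-side sketch of that two-buffer ping-pong (an assumption about the loop's intent; names are hypothetical), with NumPy standing in for the kernel:

```python
import numpy as np

def power_iterate(P_T, v0, tol=1e-5, max_steps=1000, check_every=10):
    """Ping-pong between two preallocated arrays, mimicking the two
    cl.Buffer objects d_v_read / d_v_write in the diff above."""
    P_T = np.ascontiguousarray(P_T, dtype=np.float32)
    read_buf = np.asarray(v0, dtype=np.float32).copy()
    write_buf = np.empty_like(read_buf)
    prev = read_buf.copy()
    for i in range(max_steps):
        np.matmul(P_T, read_buf, out=write_buf)    # one markov_step
        read_buf, write_buf = write_buf, read_buf  # swap roles, no reallocation
        if i % check_every == 0:                   # occasional convergence check
            if np.abs(read_buf - prev).max() < tol:
                break
            prev[:] = read_buf
    return read_buf
```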
```diff
@@ -163,16 +172,15 @@ class MarkovEngine:
 
         # GPU Path
         mf = cl.mem_flags
+        # OPTIMIZATION: Transpose Log-Transition Matrix
         log_trans = np.log(transition_matrix + epsilon).astype(np.float32)
+        log_trans_T = np.ascontiguousarray(log_trans.T, dtype=np.float32)
+
         log_emis = np.log(observation_probs + epsilon).astype(np.float32)
         log_delta = np.full(N, -np.log(N), dtype=np.float32)
 
-
-
-        )
-        d_delta_in = cl.Buffer(
-            self.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=log_delta
-        )
+        d_trans_T = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=log_trans_T)
+        d_delta_in = cl.Buffer(self.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=log_delta)
         d_delta_out = cl.Buffer(self.ctx, mf.READ_WRITE, size=log_delta.nbytes)
 
         full_backpointer_history = np.zeros((T, N), dtype=np.int32)
```
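For reference, the recurrence this decode loop evaluates is the standard log-space Viterbi step (with `epsilon` added before the log to avoid $\log 0$):

$$
\log\delta_t(j) = \max_{k}\left[\log\delta_{t-1}(k) + \log a_{kj}\right] + \log b_j(o_t),
\qquad
\psi_t(j) = \operatorname*{arg\,max}_{k}\left[\log\delta_{t-1}(k) + \log a_{kj}\right]
$$

where $\psi$ is the backpointer table collected in `full_backpointer_history`.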
```diff
@@ -180,7 +188,7 @@ class MarkovEngine:
             self.ctx, mf.WRITE_ONLY, size=full_backpointer_history.nbytes // T
         )
 
-        print(f"🕵️ Decoding {T} days (GPU
+        print(f"🕵️ Decoding {T} days (GPU Optimized)...")
 
         for t in range(T):
             d_emis = cl.Buffer(
```
```diff
@@ -193,7 +201,7 @@ class MarkovEngine:
                 None,
                 np.int32(N),
                 d_delta_in,
-
+                d_trans_T,  # Pass Transposed Matrix
                 d_emis,
                 d_delta_out,
                 d_backpointers,
```
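Each scalar here is wrapped in `np.int32(...)` because pyopencl cannot infer the C type of a bare Python int at call time. An alternative (a sketch of pyopencl's `set_scalar_arg_dtypes`, shown with a hypothetical kernel; not something this diff uses) declares the types once:

```python
import numpy as np
import pyopencl as cl

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
prg = cl.Program(ctx, """
__kernel void fill(const int n, __global float *out) {
    int i = get_global_id(0);
    if (i < n) out[i] = (float)i;
}
""").build()

k_fill = prg.fill
k_fill.set_scalar_arg_dtypes([np.int32, None])   # None marks a buffer argument

out = np.empty(16, dtype=np.float32)
d_out = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, size=out.nbytes)
k_fill(queue, (out.size,), None, out.size, d_out)  # a plain Python int is now fine
cl.enqueue_copy(queue, out, d_out)
```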
```diff
@@ -231,8 +239,16 @@ class MarkovEngine:
         log_emis = np.log(observations + 1e-20).astype(np.float32)
 
         # 2. Allocate GPU Memory (VRAM)
-        # We
-
+        # We need TWO transition buffers for optimization:
+        # A. Original (Row-Major) for Backward Pass & Accumulation
+        # B. Transposed (Col-Major) for Forward Pass
+        d_trans = cl.Buffer(self.ctx, mf.READ_WRITE, size=log_trans.nbytes)
+        d_trans_T = cl.Buffer(self.ctx, mf.READ_WRITE, size=log_trans.nbytes)
+
+        # Initial Copy
+        cl.enqueue_copy(self.queue, d_trans, log_trans)
+        cl.enqueue_copy(self.queue, d_trans_T, np.ascontiguousarray(log_trans.T))
+
         d_emis = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=log_emis)
 
         d_alpha = cl.Buffer(self.ctx, mf.READ_WRITE, size=T * N * 4)  # float32 = 4 bytes
```
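The layout logic behind keeping two copies: the forward recurrence fixes the destination state $j$ (one work-item per $j$) and sums over source states $k$, which walks a *column* of the row-major matrix but a *row* of its transpose; the backward recurrence fixes the source state $i$ and sums over destinations $j$, which is already a row walk. Writing $\operatorname{LSE}_k[x_k] = \log\sum_k e^{x_k}$:

$$
\log\alpha_t(j) = \log b_j(o_t) + \operatorname{LSE}_{k}\left[\log\alpha_{t-1}(k) + \log a_{kj}\right]
$$

$$
\log\beta_t(i) = \operatorname{LSE}_{j}\left[\log a_{ij} + \log b_j(o_{t+1}) + \log\beta_{t+1}(j)\right]
$$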
```diff
@@ -252,11 +268,10 @@ class MarkovEngine:
         for i in range(n_iters):
 
             # --- A. Forward Pass (GPU Loop) ---
-            #
+            # Uses Transposed Matrix (d_trans_T) for coalesced reads
             init_alpha[:] = -np.log(N) + log_emis[0]
-            cl.enqueue_copy(self.queue, d_alpha, init_alpha, is_blocking=False)
+            cl.enqueue_copy(self.queue, d_alpha, init_alpha, is_blocking=False)
 
-            # Loop t=1 to T
             for t in range(1, T):
                 prev_offset = (t - 1) * N
                 curr_offset = t * N
```
```diff
@@ -265,22 +280,20 @@ class MarkovEngine:
                 self.k_hmm_log(
                     self.queue, (N,), None,
                     np.int32(N),
-                    d_alpha,
+                    d_alpha,
                     np.int32(prev_offset),
                     np.int32(curr_offset),
-
-                    d_emis,
+                    d_trans_T,  # <--- Optimized Read
+                    d_emis,
                     np.int32(emis_offset)
                 )
 
             # --- B. Backward Pass (GPU Loop) ---
-            #
-
-
-            beta_end_offset = (T - 1) * N * 4  # Bytes offset
+            # Uses Original Matrix (d_trans) because Backward pass logic matches Row-Major
+            init_beta_end = np.zeros(N, dtype=np.float32)
+            beta_end_offset = (T - 1) * N * 4
             cl.enqueue_copy(self.queue, d_beta, init_beta_end, dst_offset=beta_end_offset, is_blocking=False)
 
-            # Loop t = T-2 down to 0
             for t in range(T - 2, -1, -1):
                 curr_offset = t * N
                 future_offset = (t + 1) * N
```
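The alpha and beta buffers are flat `T * N` float arrays: time step `t` starts at element offset `t * N`, and the `* 4` in the byte offsets accounts for `float32`. A tiny host-side sketch of the same indexing (illustrative names, not package code):

```python
import numpy as np

T, N = 5, 3
alpha_flat = np.zeros(T * N, dtype=np.float32)   # mirrors the flat d_alpha buffer

t = 2
elem_offset = t * N                  # element offset passed to the kernels
byte_offset = t * N * 4              # byte offset used by enqueue_copy (float32 = 4 bytes)

alpha_flat[elem_offset:elem_offset + N] = 1.0    # "row t" of the (T, N) grid
assert byte_offset == elem_offset * alpha_flat.itemsize
assert np.all(alpha_flat.reshape(T, N)[t] == 1.0)
```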
```diff
@@ -289,26 +302,23 @@ class MarkovEngine:
                 self.k_hmm_back(
                     self.queue, (N,), None,
                     np.int32(N),
-                    d_beta,
+                    d_beta,
                     np.int32(future_offset),
                     np.int32(curr_offset),
-                    d_trans,
+                    d_trans,  # <--- Optimized Read (Backward needs Row-Major)
                     d_emis,
                     np.int32(future_emis_offset)
                 )
 
             # --- C. Accumulation (GPU) ---
-            # Wait for loops to finish
             self.queue.finish()
 
-            # Condense Alpha/Beta/Emis into new Transition Matrix
             self.k_acc_trans(
                 self.queue, (N, N), None,
                 np.int32(T), np.int32(N),
                 d_alpha, d_beta, d_emis, d_trans, d_new_trans
             )
 
-            # Condense into Gamma Sums
             self.k_acc_gamma(
                 self.queue, (N,), None,
                 np.int32(T), np.int32(N),
```
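Judging by the kernel names and arguments, the two accumulators condense the full $T \times N$ alpha/beta grids into the standard Baum-Welch E-step statistics (the accumulator kernels themselves are marked "(Unchanged)" in kernels.cl below):

$$
\xi_t(i,j) \propto \alpha_t(i)\, a_{ij}\, b_j(o_{t+1})\, \beta_{t+1}(j),
\qquad
\gamma_t(i) \propto \alpha_t(i)\, \beta_t(i)
$$

with `accumulate_transitions` summing $\xi_t(i,j)$ over $t$ for each $(i,j)$ pair and the gamma kernel summing $\gamma_t(i)$ over $t$ per state.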
```diff
@@ -316,15 +326,13 @@ class MarkovEngine:
             )
 
             # --- D. Update & Check Convergence (CPU) ---
-            # We only read back the "Summary Statistics", not the T*N buffers
             new_log_trans_counts = np.empty_like(log_trans)
             log_gamma_sums = np.empty(N, dtype=np.float32)
 
             cl.enqueue_copy(self.queue, new_log_trans_counts, d_new_trans)
             cl.enqueue_copy(self.queue, log_gamma_sums, d_gamma_sums)
 
-            # Calc Likelihood
-            # Read just the last N floats
+            # Calc Likelihood
             alpha_T_offset = (T - 1) * N * 4
             cl.enqueue_copy(self.queue, final_alpha_T, d_alpha, src_offset=alpha_T_offset)
             log_likelihood = np.logaddexp.reduce(final_alpha_T)
```
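`np.logaddexp.reduce` computes a numerically stable log-sum-exp, which is what the total likelihood needs: $\log P(O) = \log\sum_i e^{\log\alpha_{T-1}(i)}$. A quick standalone check of why the stable form matters for long sequences:

```python
import numpy as np

log_alpha_T = np.array([-1050.0, -1052.3, -1049.1])  # plausible log-alphas for large T

stable = np.logaddexp.reduce(log_alpha_T)        # what the diff uses
with np.errstate(divide="ignore"):
    naive = np.log(np.sum(np.exp(log_alpha_T)))  # exp() underflows to 0 -> -inf

assert np.isfinite(stable)
assert np.isneginf(naive)
```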
```diff
@@ -332,8 +340,9 @@ class MarkovEngine:
             # M-Step: Normalize
             log_trans = new_log_trans_counts - log_gamma_sums[:, None]
 
-            # Update GPU
+            # Update BOTH GPU Buffers for next iteration
             cl.enqueue_copy(self.queue, d_trans, log_trans)
+            cl.enqueue_copy(self.queue, d_trans_T, np.ascontiguousarray(log_trans.T))
 
             change = log_likelihood - prev_score
             print(f"  Iter {i + 1}: Likelihood {log_likelihood:.2f} (Delta: {change:.4f})")
```
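The subtraction `new_log_trans_counts - log_gamma_sums[:, None]` is the M-step row normalization carried out in log space:

$$
\log \hat a_{ij} = \log\Bigl(\sum_t \xi_t(i,j)\Bigr) - \log\Bigl(\sum_t \gamma_t(i)\Bigr)
\quad\Longleftrightarrow\quad
\hat a_{ij} = \frac{\sum_t \xi_t(i,j)}{\sum_t \gamma_t(i)}
$$

after which both device-side copies of the matrix are refreshed for the next iteration.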
markovgpu/kernels.cl
CHANGED
```diff
@@ -1,4 +1,4 @@
-// kernels.cl - Fixed Write Permissions
+// kernels.cl - Memory Optimized (Transposed Access) + Fixed Write Permissions
 
 // --- HELPER: Log-Sum-Exp Trick ---
 float log_add(float log_a, float log_b) {
```
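The `log_add` helper that the log-space kernels call is the two-term log-sum-exp trick; its body is untouched by this diff, but for reference the standard identity it implements is

$$
\operatorname{log\_add}(a, b) = \log\left(e^{a} + e^{b}\right) = \max(a, b) + \log\left(1 + e^{-\left|a - b\right|}\right),
$$

which stays finite even when $e^{a}$ and $e^{b}$ would individually underflow.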
```diff
@@ -12,14 +12,17 @@ float log_add(float log_a, float log_b) {
 __kernel void markov_step(
     const int N,
     __global const float *current_state,
-    __global const float *
+    __global const float *trans_mat_T,   // EXPECTS TRANSPOSED MATRIX
     __global float *next_state)
 {
-    int id = get_global_id(0);
+    int id = get_global_id(0);  // Target State (Row in Transposed Mat)
     if (id < N) {
         float sum = 0.0f;
+        int row_start = id * N;  // Coalesced Start (Optimization)
+
         for (int k = 0; k < N; k++) {
-
+            // Read sequentially: P_T[id][k] corresponds to P[k][id]
+            sum += current_state[k] * trans_mat_T[row_start + k];
         }
         next_state[id] = sum;
     }
```
```diff
@@ -28,15 +31,17 @@ __kernel void markov_step(
 __kernel void hmm_forward_step(
     const int N,
     __global const float *alpha_prev,
-    __global const float *
+    __global const float *trans_mat_T,   // EXPECTS TRANSPOSED MATRIX
     __global const float *emissions,
     __global float *alpha_new)
 {
     int id = get_global_id(0);
     if (id < N) {
         float sum = 0.0f;
+        int row_start = id * N;
+
         for (int k = 0; k < N; k++) {
-            sum += alpha_prev[k] *
+            sum += alpha_prev[k] * trans_mat_T[row_start + k];
         }
         alpha_new[id] = sum * emissions[id];
     }
```
```diff
@@ -44,47 +49,55 @@ __kernel void hmm_forward_step(
 
 // --- SECTION 2: Advanced Log-Space Operations ---
 
-// 3. Log-Space Forward (
+// 3. Log-Space Forward (Memory Optimized)
 __kernel void hmm_forward_log(
     const int N,
-    __global float *log_alpha_full,  //
+    __global float *log_alpha_full,  // NO CONST (Write Permission Fix Preserved)
     const int prev_offset,
     const int curr_offset,
-    __global const float *
+    __global const float *log_trans_mat_T,  // EXPECTS TRANSPOSED MATRIX
     __global const float *log_emissions,
     const int emis_offset)
 {
-    int id = get_global_id(0);
+    int id = get_global_id(0);  // Target State (Row in Transposed Mat)
     if (id < N) {
         float log_sum = -INFINITY;
-
+        int row_start = id * N;
+
+        // Loop 'k' (Previous State).
+        // In Transposed Matrix, 'id' is the Row, 'k' is the Column.
+        // So we read P_T[id][k] which corresponds to P[k][id]
         for (int k = 0; k < N; k++) {
-            float val = log_alpha_full[prev_offset + k] +
+            float val = log_alpha_full[prev_offset + k] + log_trans_mat_T[row_start + k];
             if (k == 0) log_sum = val;
             else log_sum = log_add(log_sum, val);
         }
+
         // Write to 'curr_offset'
-        // Read emission from 'emis_offset'
         log_alpha_full[curr_offset + id] = log_sum + log_emissions[emis_offset + id];
     }
 }
 
-// 4. Log-Space Backward
+// 4. Log-Space Backward (Memory Optimized - Uses ORIGINAL Matrix)
+// Note: Backward pass needs P[i][j], which is naturally Row-Major.
+// So we DO NOT use the Transposed matrix here. It is already optimized!
 __kernel void hmm_backward_log(
     const int N,
     __global float *beta_full,
     const int future_offset,
     const int curr_offset,
-    __global const float *trans,
+    __global const float *trans,  // ORIGINAL MATRIX (Row-Major)
     __global const float *emis_full,
     const int future_emis_offset)
 {
     int id = get_global_id(0);  // State 'i'
     if (id < N) {
         float log_sum = -INFINITY;
+        int row_start = id * N;
+
         for (int j=0; j<N; j++) {
-            //
-            float val = trans[
+            // Read sequentially: trans[row_start + j]
+            float val = trans[row_start + j] +
                         emis_full[future_emis_offset + j] +
                         beta_full[future_offset + j];
 
```
```diff
@@ -95,11 +108,11 @@ __kernel void hmm_backward_log(
         }
     }
 
-// 5. Viterbi Algorithm
+// 5. Viterbi Algorithm (Memory Optimized)
 __kernel void viterbi_step(
     const int N,
     __global const float *log_delta_prev,
-    __global const float *
+    __global const float *log_trans_mat_T,  // EXPECTS TRANSPOSED MATRIX
     __global const float *log_emissions,
     __global float *log_delta_new,
     __global int *backpointers)
```
```diff
@@ -108,8 +121,11 @@ __kernel void viterbi_step(
     if (id < N) {
         float max_prob = -INFINITY;
         int best_prev_state = 0;
+        int row_start = id * N;
+
         for (int k = 0; k < N; k++) {
-
+            // Read sequentially: P_T[id][k]
+            float prob = log_delta_prev[k] + log_trans_mat_T[row_start + k];
             if (prob > max_prob) {
                 max_prob = prob;
                 best_prev_state = k;
```
```diff
@@ -120,7 +136,7 @@ __kernel void viterbi_step(
         }
     }
 
-// --- SECTION 3: Learning Accumulators ---
+// --- SECTION 3: Learning Accumulators (Unchanged) ---
 
 // 6. Accumulate Transitions (E-Step)
 __kernel void accumulate_transitions(
```
```diff
@@ -128,11 +144,11 @@ __kernel void accumulate_transitions(
     __global const float *alpha_full,
     __global const float *beta_full,
     __global const float *emis_full,
-    __global const float *trans_mat,
+    __global const float *trans_mat,  // Original Matrix
     __global float *new_trans_counts)
 {
-    int row = get_global_id(1);
-    int col = get_global_id(0);
+    int row = get_global_id(1);
+    int col = get_global_id(0);
 
     if (row < N && col < N) {
         float log_sum_xi = -INFINITY;
```
{markovgpu_rane-0.2.0.dist-info → markovgpu_rane-0.3.0.dist-info}/METADATA
CHANGED
```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: markovgpu-rane
-Version: 0.2.0
+Version: 0.3.0
 Summary: High-performance Markov Chains & HMMs using OpenCL
 Author-email: Sahil Rane <sahilrane249@gmail.com>
 Classifier: Development Status :: 4 - Beta
```
```diff
@@ -26,7 +26,7 @@ Description-Content-Type: text/markdown
 # ⚡ **MarkovGPU**
 
 ### *Massive Scale Markov Models on Consumer Hardware*
-<img width="
+<img width="1024" height="338" alt="image" src="https://github.com/user-attachments/assets/b57dab80-ba03-4d1d-bb4d-6390e3f63f52" />
 
 > **Run million-state HMMs on your laptop GPU.**
 > **No CUDA required • Hybrid CPU/GPU Backend • Production Ready**
```
```diff
@@ -202,4 +202,4 @@ MarkovGPU doesn’t just crunch numbers.
 
 Made with 🧡 by Sahil Rane
 
-</div>
+</div>
```
markovgpu_rane-0.3.0.dist-info/RECORD
ADDED
```diff
@@ -0,0 +1,8 @@
+markovgpu/__init__.py,sha256=pCxM1YXY4faXxSm_LtdvL742NKkXKGMeNl61-hHcStU,121
+markovgpu/backend.py,sha256=zbqKS0xjBvJRZ_Mu79y_6-HbpZkjbtA-1eQ_xDXc4lQ,13674
+markovgpu/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+markovgpu/sklearn.py,sha256=5N6d4XVJwev4iH7OCPw4TT_nNTc71-CvNdfIW_S2kxI,3469
+markovgpu/kernels.cl,sha256=DLrcHMg01UO6L1h8u9LM_6uwa9ec9hwdOclGdnxg768,6075
+markovgpu_rane-0.3.0.dist-info/METADATA,sha256=y3soPxmx-IlAxPKGBpPvS0IeZTK7sD-8EBZUaBJOj6I,6622
+markovgpu_rane-0.3.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+markovgpu_rane-0.3.0.dist-info/RECORD,,
```
markovgpu_rane-0.2.0.dist-info/RECORD
DELETED
```diff
@@ -1,8 +0,0 @@
-markovgpu/__init__.py,sha256=pCxM1YXY4faXxSm_LtdvL742NKkXKGMeNl61-hHcStU,121
-markovgpu/backend.py,sha256=tp4fwaLhy_dwedx8c4RhFaQsDXcMXTGd2CyHy6cPzd8,12861
-markovgpu/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-markovgpu/sklearn.py,sha256=5N6d4XVJwev4iH7OCPw4TT_nNTc71-CvNdfIW_S2kxI,3469
-markovgpu/kernels.cl,sha256=bOnwQZd92wzY7dfrzhhWm0LSw8yjqHip_3EpNSrbaJo,5188
-markovgpu_rane-0.2.0.dist-info/METADATA,sha256=hsMjX26Nc2AVZjqMS4lgm1Ujv1Kz8FcBAVBhOTpgVM4,6566
-markovgpu_rane-0.2.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-markovgpu_rane-0.2.0.dist-info/RECORD,,
```
{markovgpu_rane-0.2.0.dist-info → markovgpu_rane-0.3.0.dist-info}/WHEEL
File without changes