markovgpu-rane 0.1.0__py3-none-any.whl β 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- markovgpu/__init__.py +3 -1
- markovgpu/backend.py +113 -85
- markovgpu/kernels.cl +60 -43
- markovgpu/sklearn.py +94 -0
- markovgpu_rane-0.3.0.dist-info/METADATA +205 -0
- markovgpu_rane-0.3.0.dist-info/RECORD +8 -0
- markovgpu_rane-0.1.0.dist-info/METADATA +0 -22
- markovgpu_rane-0.1.0.dist-info/RECORD +0 -7
- {markovgpu_rane-0.1.0.dist-info β markovgpu_rane-0.3.0.dist-info}/WHEEL +0 -0
markovgpu/__init__.py
CHANGED
markovgpu/backend.py
CHANGED
|
@@ -43,8 +43,11 @@ class MarkovEngine:
|
|
|
43
43
|
if not os.path.exists(KERNEL_PATH):
|
|
44
44
|
raise FileNotFoundError(f"Kernel file missing at: {KERNEL_PATH}")
|
|
45
45
|
|
|
46
|
+
# OPTIMIZATION: Fast Math Build Options
|
|
47
|
+
build_options = "-cl-mad-enable -cl-fast-relaxed-math"
|
|
48
|
+
|
|
46
49
|
with open(KERNEL_PATH, "r") as f:
|
|
47
|
-
self.prg = cl.Program(self.ctx, f.read()).build()
|
|
50
|
+
self.prg = cl.Program(self.ctx, f.read()).build(options=build_options)
|
|
48
51
|
|
|
49
52
|
# 3. Cache Kernels (Robust Retrieval)
|
|
50
53
|
self.use_gpu = True
|
|
@@ -80,24 +83,29 @@ class MarkovEngine:
|
|
|
80
83
|
return v.dot(P)
|
|
81
84
|
|
|
82
85
|
mf = cl.mem_flags
|
|
83
|
-
|
|
86
|
+
# OPTIMIZATION: Transpose P for coalesced access
|
|
87
|
+
# The kernel expects P_T[id][k] which maps to P[k][id]
|
|
88
|
+
P_T = np.ascontiguousarray(P.T, dtype=np.float32)
|
|
84
89
|
v = np.ascontiguousarray(v, dtype=np.float32)
|
|
85
90
|
result = np.empty_like(v)
|
|
86
91
|
|
|
87
|
-
|
|
92
|
+
d_P_T = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=P_T)
|
|
88
93
|
d_v = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=v)
|
|
89
94
|
d_res = cl.Buffer(self.ctx, mf.WRITE_ONLY, size=result.nbytes)
|
|
90
95
|
|
|
91
|
-
self.k_markov(self.queue, (N,), None, np.int32(N), d_v,
|
|
96
|
+
self.k_markov(self.queue, (N,), None, np.int32(N), d_v, d_P_T, d_res)
|
|
92
97
|
cl.enqueue_copy(self.queue, result, d_res)
|
|
93
98
|
return result
|
|
94
99
|
|
|
95
100
|
def converge(self, P, start_v, tolerance=1e-5, max_steps=1000):
|
|
101
|
+
# Note: 'converge' currently uses the iterative step approach.
|
|
102
|
+
# For maximum optimization, this loop should ideally be moved to a kernel,
|
|
103
|
+
# but for now, we rely on the optimized 'step' logic implicitly or CPU fallback.
|
|
104
|
+
# Below is the robust hybrid implementation.
|
|
96
105
|
N = len(start_v)
|
|
97
106
|
|
|
98
107
|
# CPU Path
|
|
99
108
|
if not self.use_gpu or N < GPU_THRESHOLD:
|
|
100
|
-
# print(f"π Converging on CPU (N={N})...")
|
|
101
109
|
current_v = start_v.copy()
|
|
102
110
|
for i in range(max_steps):
|
|
103
111
|
new_v = current_v.dot(P)
|
|
@@ -107,21 +115,20 @@ class MarkovEngine:
|
|
|
107
115
|
return current_v
|
|
108
116
|
|
|
109
117
|
# GPU Path
|
|
110
|
-
#
|
|
118
|
+
# We reuse the specific buffers to avoid reallocation overhead in loop
|
|
111
119
|
mf = cl.mem_flags
|
|
112
|
-
|
|
120
|
+
P_T = np.ascontiguousarray(P.T, dtype=np.float32)
|
|
113
121
|
start_v = np.ascontiguousarray(start_v, dtype=np.float32)
|
|
114
122
|
|
|
115
|
-
|
|
116
|
-
d_v_read = cl.Buffer(
|
|
117
|
-
self.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=start_v
|
|
118
|
-
)
|
|
123
|
+
d_P_T = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=P_T)
|
|
124
|
+
d_v_read = cl.Buffer(self.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=start_v)
|
|
119
125
|
d_v_write = cl.Buffer(self.ctx, mf.READ_WRITE, size=start_v.nbytes)
|
|
120
126
|
|
|
121
127
|
current_v = start_v.copy()
|
|
122
128
|
|
|
123
129
|
for i in range(max_steps):
|
|
124
|
-
|
|
130
|
+
# Use k_markov with Transposed Matrix
|
|
131
|
+
self.k_markov(self.queue, (N,), None, np.int32(N), d_v_read, d_P_T, d_v_write)
|
|
125
132
|
|
|
126
133
|
if i % 10 == 0:
|
|
127
134
|
new_v = np.empty_like(current_v)
|
|
@@ -136,13 +143,6 @@ class MarkovEngine:
|
|
|
136
143
|
return current_v
|
|
137
144
|
|
|
138
145
|
# --- 2. Inference & Viterbi ---
|
|
139
|
-
def hmm_filter(self, transition_matrix, observation_probs):
|
|
140
|
-
"""Standard HMM Filter (Returns Probabilities)"""
|
|
141
|
-
# Simplification: Running basic HMM forward pass
|
|
142
|
-
# For production use, usually prefer Log-Space to avoid underflow.
|
|
143
|
-
# This wrapper can be upgraded to use k_hmm_log if needed.
|
|
144
|
-
pass
|
|
145
|
-
|
|
146
146
|
def decode_regime(self, transition_matrix, observation_probs):
|
|
147
147
|
"""Viterbi Algorithm (Finds Most Likely Path)"""
|
|
148
148
|
T, N = observation_probs.shape
|
|
@@ -172,16 +172,15 @@ class MarkovEngine:
|
|
|
172
172
|
|
|
173
173
|
# GPU Path
|
|
174
174
|
mf = cl.mem_flags
|
|
175
|
+
# OPTIMIZATION: Transpose Log-Transition Matrix
|
|
175
176
|
log_trans = np.log(transition_matrix + epsilon).astype(np.float32)
|
|
177
|
+
log_trans_T = np.ascontiguousarray(log_trans.T, dtype=np.float32)
|
|
178
|
+
|
|
176
179
|
log_emis = np.log(observation_probs + epsilon).astype(np.float32)
|
|
177
180
|
log_delta = np.full(N, -np.log(N), dtype=np.float32)
|
|
178
181
|
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
)
|
|
182
|
-
d_delta_in = cl.Buffer(
|
|
183
|
-
self.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=log_delta
|
|
184
|
-
)
|
|
182
|
+
d_trans_T = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=log_trans_T)
|
|
183
|
+
d_delta_in = cl.Buffer(self.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=log_delta)
|
|
185
184
|
d_delta_out = cl.Buffer(self.ctx, mf.READ_WRITE, size=log_delta.nbytes)
|
|
186
185
|
|
|
187
186
|
full_backpointer_history = np.zeros((T, N), dtype=np.int32)
|
|
@@ -189,7 +188,7 @@ class MarkovEngine:
|
|
|
189
188
|
self.ctx, mf.WRITE_ONLY, size=full_backpointer_history.nbytes // T
|
|
190
189
|
)
|
|
191
190
|
|
|
192
|
-
print(f"π΅οΈ Decoding {T} days (GPU
|
|
191
|
+
print(f"π΅οΈ Decoding {T} days (GPU Optimized)...")
|
|
193
192
|
|
|
194
193
|
for t in range(T):
|
|
195
194
|
d_emis = cl.Buffer(
|
|
@@ -202,7 +201,7 @@ class MarkovEngine:
|
|
|
202
201
|
None,
|
|
203
202
|
np.int32(N),
|
|
204
203
|
d_delta_in,
|
|
205
|
-
|
|
204
|
+
d_trans_T, # Pass Transposed Matrix
|
|
206
205
|
d_emis,
|
|
207
206
|
d_delta_out,
|
|
208
207
|
d_backpointers,
|
|
@@ -231,96 +230,125 @@ class MarkovEngine:
|
|
|
231
230
|
"""Baum-Welch Expectation Maximization (Training)"""
|
|
232
231
|
T = observations.shape[0]
|
|
233
232
|
N = n_states
|
|
233
|
+
mf = cl.mem_flags
|
|
234
234
|
|
|
235
|
-
#
|
|
235
|
+
# 1. Initialize Params (Log Space)
|
|
236
236
|
log_trans = np.log(
|
|
237
237
|
np.full((N, N), 1.0 / N) + np.random.rand(N, N) * 0.01
|
|
238
238
|
).astype(np.float32)
|
|
239
239
|
log_emis = np.log(observations + 1e-20).astype(np.float32)
|
|
240
240
|
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
)
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
241
|
+
# 2. Allocate GPU Memory (VRAM)
|
|
242
|
+
# We need TWO transition buffers for optimization:
|
|
243
|
+
# A. Original (Row-Major) for Backward Pass & Accumulation
|
|
244
|
+
# B. Transposed (Col-Major) for Forward Pass
|
|
245
|
+
d_trans = cl.Buffer(self.ctx, mf.READ_WRITE, size=log_trans.nbytes)
|
|
246
|
+
d_trans_T = cl.Buffer(self.ctx, mf.READ_WRITE, size=log_trans.nbytes)
|
|
247
|
+
|
|
248
|
+
# Initial Copy
|
|
249
|
+
cl.enqueue_copy(self.queue, d_trans, log_trans)
|
|
250
|
+
cl.enqueue_copy(self.queue, d_trans_T, np.ascontiguousarray(log_trans.T))
|
|
248
251
|
|
|
252
|
+
d_emis = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=log_emis)
|
|
253
|
+
|
|
254
|
+
d_alpha = cl.Buffer(self.ctx, mf.READ_WRITE, size=T * N * 4) # float32 = 4 bytes
|
|
255
|
+
d_beta = cl.Buffer(self.ctx, mf.READ_WRITE, size=T * N * 4)
|
|
256
|
+
|
|
249
257
|
d_new_trans = cl.Buffer(self.ctx, mf.READ_WRITE, size=log_trans.nbytes)
|
|
250
258
|
d_gamma_sums = cl.Buffer(self.ctx, mf.READ_WRITE, size=N * 4)
|
|
251
259
|
|
|
252
260
|
prev_score = -np.inf
|
|
253
261
|
|
|
254
|
-
print(f"π§ Training HMM ({N} States, {T} Steps)...")
|
|
262
|
+
print(f"π§ Training HMM ({N} States, {T} Steps) on GPU...")
|
|
263
|
+
|
|
264
|
+
# Host buffers for initial checks and final readback
|
|
265
|
+
init_alpha = np.zeros(N, dtype=np.float32)
|
|
266
|
+
final_alpha_T = np.zeros(N, dtype=np.float32)
|
|
255
267
|
|
|
256
268
|
for i in range(n_iters):
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
269
|
+
|
|
270
|
+
# --- A. Forward Pass (GPU Loop) ---
|
|
271
|
+
# Uses Transposed Matrix (d_trans_T) for coalesced reads
|
|
272
|
+
init_alpha[:] = -np.log(N) + log_emis[0]
|
|
273
|
+
cl.enqueue_copy(self.queue, d_alpha, init_alpha, is_blocking=False)
|
|
260
274
|
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
275
|
+
for t in range(1, T):
|
|
276
|
+
prev_offset = (t - 1) * N
|
|
277
|
+
curr_offset = t * N
|
|
278
|
+
emis_offset = t * N
|
|
279
|
+
|
|
280
|
+
self.k_hmm_log(
|
|
281
|
+
self.queue, (N,), None,
|
|
282
|
+
np.int32(N),
|
|
283
|
+
d_alpha,
|
|
284
|
+
np.int32(prev_offset),
|
|
285
|
+
np.int32(curr_offset),
|
|
286
|
+
d_trans_T, # <--- Optimized Read
|
|
287
|
+
d_emis,
|
|
288
|
+
np.int32(emis_offset)
|
|
289
|
+
)
|
|
290
|
+
|
|
291
|
+
# --- B. Backward Pass (GPU Loop) ---
|
|
292
|
+
# Uses Original Matrix (d_trans) because Backward pass logic matches Row-Major
|
|
293
|
+
init_beta_end = np.zeros(N, dtype=np.float32)
|
|
294
|
+
beta_end_offset = (T - 1) * N * 4
|
|
295
|
+
cl.enqueue_copy(self.queue, d_beta, init_beta_end, dst_offset=beta_end_offset, is_blocking=False)
|
|
296
|
+
|
|
297
|
+
for t in range(T - 2, -1, -1):
|
|
298
|
+
curr_offset = t * N
|
|
299
|
+
future_offset = (t + 1) * N
|
|
300
|
+
future_emis_offset = (t + 1) * N
|
|
301
|
+
|
|
302
|
+
self.k_hmm_back(
|
|
303
|
+
self.queue, (N,), None,
|
|
304
|
+
np.int32(N),
|
|
305
|
+
d_beta,
|
|
306
|
+
np.int32(future_offset),
|
|
307
|
+
np.int32(curr_offset),
|
|
308
|
+
d_trans, # <--- Optimized Read (Backward needs Row-Major)
|
|
309
|
+
d_emis,
|
|
310
|
+
np.int32(future_emis_offset)
|
|
311
|
+
)
|
|
312
|
+
|
|
313
|
+
# --- C. Accumulation (GPU) ---
|
|
314
|
+
self.queue.finish()
|
|
265
315
|
|
|
266
316
|
self.k_acc_trans(
|
|
267
|
-
self.queue,
|
|
268
|
-
(
|
|
269
|
-
|
|
270
|
-
np.int32(T),
|
|
271
|
-
np.int32(N),
|
|
272
|
-
d_alpha,
|
|
273
|
-
d_beta,
|
|
274
|
-
d_emis,
|
|
275
|
-
d_trans,
|
|
276
|
-
d_new_trans,
|
|
317
|
+
self.queue, (N, N), None,
|
|
318
|
+
np.int32(T), np.int32(N),
|
|
319
|
+
d_alpha, d_beta, d_emis, d_trans, d_new_trans
|
|
277
320
|
)
|
|
278
321
|
|
|
279
322
|
self.k_acc_gamma(
|
|
280
|
-
self.queue,
|
|
281
|
-
(N
|
|
282
|
-
|
|
283
|
-
np.int32(T),
|
|
284
|
-
np.int32(N),
|
|
285
|
-
d_alpha,
|
|
286
|
-
d_beta,
|
|
287
|
-
d_gamma_sums,
|
|
323
|
+
self.queue, (N,), None,
|
|
324
|
+
np.int32(T), np.int32(N),
|
|
325
|
+
d_alpha, d_beta, d_gamma_sums
|
|
288
326
|
)
|
|
289
327
|
|
|
290
|
-
#
|
|
328
|
+
# --- D. Update & Check Convergence (CPU) ---
|
|
291
329
|
new_log_trans_counts = np.empty_like(log_trans)
|
|
292
330
|
log_gamma_sums = np.empty(N, dtype=np.float32)
|
|
293
331
|
|
|
294
332
|
cl.enqueue_copy(self.queue, new_log_trans_counts, d_new_trans)
|
|
295
333
|
cl.enqueue_copy(self.queue, log_gamma_sums, d_gamma_sums)
|
|
334
|
+
|
|
335
|
+
# Calc Likelihood
|
|
336
|
+
alpha_T_offset = (T - 1) * N * 4
|
|
337
|
+
cl.enqueue_copy(self.queue, final_alpha_T, d_alpha, src_offset=alpha_T_offset)
|
|
338
|
+
log_likelihood = np.logaddexp.reduce(final_alpha_T)
|
|
296
339
|
|
|
340
|
+
# M-Step: Normalize
|
|
297
341
|
log_trans = new_log_trans_counts - log_gamma_sums[:, None]
|
|
342
|
+
|
|
343
|
+
# Update BOTH GPU Buffers for next iteration
|
|
344
|
+
cl.enqueue_copy(self.queue, d_trans, log_trans)
|
|
345
|
+
cl.enqueue_copy(self.queue, d_trans_T, np.ascontiguousarray(log_trans.T))
|
|
298
346
|
|
|
299
347
|
change = log_likelihood - prev_score
|
|
300
|
-
print(
|
|
301
|
-
|
|
302
|
-
)
|
|
348
|
+
print(f" Iter {i + 1}: Likelihood {log_likelihood:.2f} (Delta: {change:.4f})")
|
|
349
|
+
|
|
303
350
|
if abs(change) < tolerance:
|
|
304
351
|
break
|
|
305
352
|
prev_score = log_likelihood
|
|
306
353
|
|
|
307
|
-
return np.exp(log_trans)
|
|
308
|
-
|
|
309
|
-
def _cpu_forward(self, log_trans, log_emis):
|
|
310
|
-
T, N = log_emis.shape
|
|
311
|
-
alpha = np.zeros((T, N), dtype=np.float32)
|
|
312
|
-
alpha[0] = -np.log(N) + log_emis[0]
|
|
313
|
-
for t in range(1, T):
|
|
314
|
-
for j in range(N):
|
|
315
|
-
prev = alpha[t - 1] + log_trans[:, j]
|
|
316
|
-
alpha[t, j] = np.logaddexp.reduce(prev) + log_emis[t, j]
|
|
317
|
-
return alpha, np.logaddexp.reduce(alpha[-1])
|
|
318
|
-
|
|
319
|
-
def _cpu_backward(self, log_trans, log_emis):
|
|
320
|
-
T, N = log_emis.shape
|
|
321
|
-
beta = np.zeros((T, N), dtype=np.float32)
|
|
322
|
-
for t in range(T - 2, -1, -1):
|
|
323
|
-
for i in range(N):
|
|
324
|
-
terms = log_trans[i, :] + log_emis[t + 1] + beta[t + 1]
|
|
325
|
-
beta[t, i] = np.logaddexp.reduce(terms)
|
|
326
|
-
return beta
|
|
354
|
+
return np.exp(log_trans)
|
markovgpu/kernels.cl
CHANGED
|
@@ -1,146 +1,164 @@
|
|
|
1
|
-
// kernels.cl -
|
|
1
|
+
// kernels.cl - Memory Optimized (Transposed Access) + Fixed Write Permissions
|
|
2
2
|
|
|
3
3
|
// --- HELPER: Log-Sum-Exp Trick ---
|
|
4
|
-
// Prevents overflow when adding log-probabilities
|
|
5
4
|
float log_add(float log_a, float log_b) {
|
|
6
5
|
float max_val = max(log_a, log_b);
|
|
7
6
|
float min_val = min(log_a, log_b);
|
|
8
7
|
return max_val + log1p(exp(min_val - max_val));
|
|
9
8
|
}
|
|
10
9
|
|
|
11
|
-
// --- SECTION 1: Basic
|
|
10
|
+
// --- SECTION 1: Basic Operations ---
|
|
12
11
|
|
|
13
|
-
// 1. Standard Markov Step: Next = Current * Matrix
|
|
14
12
|
__kernel void markov_step(
|
|
15
13
|
const int N,
|
|
16
14
|
__global const float *current_state,
|
|
17
|
-
__global const float *
|
|
15
|
+
__global const float *trans_mat_T, // EXPECTS TRANSPOSED MATRIX
|
|
18
16
|
__global float *next_state)
|
|
19
17
|
{
|
|
20
|
-
int id = get_global_id(0);
|
|
18
|
+
int id = get_global_id(0); // Target State (Row in Transposed Mat)
|
|
21
19
|
if (id < N) {
|
|
22
20
|
float sum = 0.0f;
|
|
21
|
+
int row_start = id * N; // Coalesced Start (Optimization)
|
|
22
|
+
|
|
23
23
|
for (int k = 0; k < N; k++) {
|
|
24
|
-
|
|
24
|
+
// Read sequentially: P_T[id][k] corresponds to P[k][id]
|
|
25
|
+
sum += current_state[k] * trans_mat_T[row_start + k];
|
|
25
26
|
}
|
|
26
27
|
next_state[id] = sum;
|
|
27
28
|
}
|
|
28
29
|
}
|
|
29
30
|
|
|
30
|
-
// 2. Standard HMM Filter (Probability Space)
|
|
31
|
-
// Used for simple "What state am I in?" queries without log-space
|
|
32
31
|
__kernel void hmm_forward_step(
|
|
33
32
|
const int N,
|
|
34
33
|
__global const float *alpha_prev,
|
|
35
|
-
__global const float *
|
|
34
|
+
__global const float *trans_mat_T, // EXPECTS TRANSPOSED MATRIX
|
|
36
35
|
__global const float *emissions,
|
|
37
36
|
__global float *alpha_new)
|
|
38
37
|
{
|
|
39
38
|
int id = get_global_id(0);
|
|
40
39
|
if (id < N) {
|
|
41
40
|
float sum = 0.0f;
|
|
41
|
+
int row_start = id * N;
|
|
42
|
+
|
|
42
43
|
for (int k = 0; k < N; k++) {
|
|
43
|
-
sum += alpha_prev[k] *
|
|
44
|
+
sum += alpha_prev[k] * trans_mat_T[row_start + k];
|
|
44
45
|
}
|
|
45
46
|
alpha_new[id] = sum * emissions[id];
|
|
46
47
|
}
|
|
47
48
|
}
|
|
48
49
|
|
|
49
|
-
// --- SECTION 2: Advanced Log-Space Operations
|
|
50
|
+
// --- SECTION 2: Advanced Log-Space Operations ---
|
|
50
51
|
|
|
51
|
-
// 3. Log-Space Forward (
|
|
52
|
+
// 3. Log-Space Forward (Memory Optimized)
|
|
52
53
|
__kernel void hmm_forward_log(
|
|
53
54
|
const int N,
|
|
54
|
-
__global
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
__global float *
|
|
55
|
+
__global float *log_alpha_full, // NO CONST (Write Permission Fix Preserved)
|
|
56
|
+
const int prev_offset,
|
|
57
|
+
const int curr_offset,
|
|
58
|
+
__global const float *log_trans_mat_T, // EXPECTS TRANSPOSED MATRIX
|
|
59
|
+
__global const float *log_emissions,
|
|
60
|
+
const int emis_offset)
|
|
58
61
|
{
|
|
59
|
-
int id = get_global_id(0);
|
|
62
|
+
int id = get_global_id(0); // Target State (Row in Transposed Mat)
|
|
60
63
|
if (id < N) {
|
|
61
64
|
float log_sum = -INFINITY;
|
|
65
|
+
int row_start = id * N;
|
|
66
|
+
|
|
67
|
+
// Loop 'k' (Previous State).
|
|
68
|
+
// In Transposed Matrix, 'id' is the Row, 'k' is the Column.
|
|
69
|
+
// So we read P_T[id][k] which corresponds to P[k][id]
|
|
62
70
|
for (int k = 0; k < N; k++) {
|
|
63
|
-
float val =
|
|
71
|
+
float val = log_alpha_full[prev_offset + k] + log_trans_mat_T[row_start + k];
|
|
64
72
|
if (k == 0) log_sum = val;
|
|
65
73
|
else log_sum = log_add(log_sum, val);
|
|
66
74
|
}
|
|
67
|
-
|
|
75
|
+
|
|
76
|
+
// Write to 'curr_offset'
|
|
77
|
+
log_alpha_full[curr_offset + id] = log_sum + log_emissions[emis_offset + id];
|
|
68
78
|
}
|
|
69
79
|
}
|
|
70
80
|
|
|
71
|
-
// 4. Log-Space Backward (
|
|
81
|
+
// 4. Log-Space Backward (Memory Optimized - Uses ORIGINAL Matrix)
|
|
82
|
+
// Note: Backward pass needs P[i][j], which is naturally Row-Major.
|
|
83
|
+
// So we DO NOT use the Transposed matrix here. It is already optimized!
|
|
72
84
|
__kernel void hmm_backward_log(
|
|
73
|
-
const int N,
|
|
74
|
-
__global
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
__global float *
|
|
85
|
+
const int N,
|
|
86
|
+
__global float *beta_full,
|
|
87
|
+
const int future_offset,
|
|
88
|
+
const int curr_offset,
|
|
89
|
+
__global const float *trans, // ORIGINAL MATRIX (Row-Major)
|
|
90
|
+
__global const float *emis_full,
|
|
91
|
+
const int future_emis_offset)
|
|
78
92
|
{
|
|
79
93
|
int id = get_global_id(0); // State 'i'
|
|
80
94
|
if (id < N) {
|
|
81
95
|
float log_sum = -INFINITY;
|
|
96
|
+
int row_start = id * N;
|
|
97
|
+
|
|
82
98
|
for (int j=0; j<N; j++) {
|
|
83
|
-
//
|
|
84
|
-
float val = trans[
|
|
99
|
+
// Read sequentially: trans[row_start + j]
|
|
100
|
+
float val = trans[row_start + j] +
|
|
101
|
+
emis_full[future_emis_offset + j] +
|
|
102
|
+
beta_full[future_offset + j];
|
|
103
|
+
|
|
85
104
|
if (j==0) log_sum = val;
|
|
86
105
|
else log_sum = log_add(log_sum, val);
|
|
87
106
|
}
|
|
88
|
-
|
|
107
|
+
beta_full[curr_offset + id] = log_sum;
|
|
89
108
|
}
|
|
90
109
|
}
|
|
91
110
|
|
|
92
|
-
// 5. Viterbi Algorithm (
|
|
111
|
+
// 5. Viterbi Algorithm (Memory Optimized)
|
|
93
112
|
__kernel void viterbi_step(
|
|
94
113
|
const int N,
|
|
95
114
|
__global const float *log_delta_prev,
|
|
96
|
-
__global const float *
|
|
115
|
+
__global const float *log_trans_mat_T, // EXPECTS TRANSPOSED MATRIX
|
|
97
116
|
__global const float *log_emissions,
|
|
98
|
-
__global float *log_delta_new,
|
|
99
|
-
__global int *backpointers)
|
|
117
|
+
__global float *log_delta_new,
|
|
118
|
+
__global int *backpointers)
|
|
100
119
|
{
|
|
101
120
|
int id = get_global_id(0);
|
|
102
121
|
if (id < N) {
|
|
103
122
|
float max_prob = -INFINITY;
|
|
104
123
|
int best_prev_state = 0;
|
|
124
|
+
int row_start = id * N;
|
|
105
125
|
|
|
106
126
|
for (int k = 0; k < N; k++) {
|
|
107
|
-
|
|
127
|
+
// Read sequentially: P_T[id][k]
|
|
128
|
+
float prob = log_delta_prev[k] + log_trans_mat_T[row_start + k];
|
|
108
129
|
if (prob > max_prob) {
|
|
109
130
|
max_prob = prob;
|
|
110
131
|
best_prev_state = k;
|
|
111
132
|
}
|
|
112
133
|
}
|
|
113
134
|
log_delta_new[id] = max_prob + log_emissions[id];
|
|
114
|
-
backpointers[id] = best_prev_state;
|
|
135
|
+
backpointers[id] = best_prev_state;
|
|
115
136
|
}
|
|
116
137
|
}
|
|
117
138
|
|
|
118
|
-
// --- SECTION 3: Learning Accumulators (
|
|
139
|
+
// --- SECTION 3: Learning Accumulators (Unchanged) ---
|
|
119
140
|
|
|
120
141
|
// 6. Accumulate Transitions (E-Step)
|
|
121
|
-
// Condenses time T into N*N summary matrix
|
|
122
142
|
__kernel void accumulate_transitions(
|
|
123
143
|
const int T, const int N,
|
|
124
144
|
__global const float *alpha_full,
|
|
125
145
|
__global const float *beta_full,
|
|
126
146
|
__global const float *emis_full,
|
|
127
|
-
__global const float *trans_mat,
|
|
147
|
+
__global const float *trans_mat, // Original Matrix
|
|
128
148
|
__global float *new_trans_counts)
|
|
129
149
|
{
|
|
130
|
-
int row = get_global_id(1);
|
|
131
|
-
int col = get_global_id(0);
|
|
150
|
+
int row = get_global_id(1);
|
|
151
|
+
int col = get_global_id(0);
|
|
132
152
|
|
|
133
153
|
if (row < N && col < N) {
|
|
134
154
|
float log_sum_xi = -INFINITY;
|
|
135
155
|
float log_trans_val = trans_mat[row * N + col];
|
|
136
156
|
|
|
137
|
-
// Loop over time 0 to T-2
|
|
138
157
|
for (int t = 0; t < T - 1; t++) {
|
|
139
158
|
float log_xi = alpha_full[t*N + row] +
|
|
140
159
|
log_trans_val +
|
|
141
160
|
emis_full[(t+1)*N + col] +
|
|
142
161
|
beta_full[(t+1)*N + col];
|
|
143
|
-
|
|
144
162
|
if (t == 0) log_sum_xi = log_xi;
|
|
145
163
|
else log_sum_xi = log_add(log_sum_xi, log_xi);
|
|
146
164
|
}
|
|
@@ -149,7 +167,6 @@ __kernel void accumulate_transitions(
|
|
|
149
167
|
}
|
|
150
168
|
|
|
151
169
|
// 7. Accumulate Gammas (E-Step)
|
|
152
|
-
// Condenses time T into N summary counts
|
|
153
170
|
__kernel void accumulate_gammas(
|
|
154
171
|
const int T, const int N,
|
|
155
172
|
__global const float *alpha_full,
|
markovgpu/sklearn.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from sklearn.base import BaseEstimator, TransformerMixin
|
|
3
|
+
from sklearn.utils.validation import check_array, check_is_fitted
|
|
4
|
+
from scipy.stats import norm
|
|
5
|
+
from .backend import MarkovEngine
|
|
6
|
+
|
|
7
|
+
class GpuHMM(BaseEstimator, TransformerMixin):
|
|
8
|
+
"""
|
|
9
|
+
Scikit-Learn compatible Wrapper for MarkovGPU.
|
|
10
|
+
Allows use in Pipelines, GridSearchCV, and Cross-Validation.
|
|
11
|
+
"""
|
|
12
|
+
def __init__(self, n_states=2, n_iter=100, tolerance=1e-4, verbose=False):
|
|
13
|
+
self.n_states = n_states
|
|
14
|
+
self.n_iter = n_iter
|
|
15
|
+
self.tolerance = tolerance
|
|
16
|
+
self.verbose = verbose
|
|
17
|
+
self.engine = MarkovEngine()
|
|
18
|
+
|
|
19
|
+
# Learned Parameters
|
|
20
|
+
self.trans_mat_ = None
|
|
21
|
+
self.start_prob_ = None
|
|
22
|
+
|
|
23
|
+
def fit(self, X, y=None):
|
|
24
|
+
"""
|
|
25
|
+
Trains the HMM on the GPU.
|
|
26
|
+
X: array-like of shape (n_samples, n_features) OR (n_samples,)
|
|
27
|
+
For now, we assume X represents 'Observation Probabilities'
|
|
28
|
+
OR raw data we can model as Gaussian emissions.
|
|
29
|
+
"""
|
|
30
|
+
# 1. Input Validation
|
|
31
|
+
X = check_array(X, ensure_2d=False)
|
|
32
|
+
|
|
33
|
+
# 2. Heuristic: If X is 1D (Raw Data), we convert to Emission Probs
|
|
34
|
+
# using a simple Gaussian mixture assumption for convenience.
|
|
35
|
+
if X.ndim == 1 or X.shape[1] == 1:
|
|
36
|
+
if self.verbose:
|
|
37
|
+
print(f"βΉοΈ Auto-converting raw data to {self.n_states} Gaussian states.")
|
|
38
|
+
X_flat = X.ravel()
|
|
39
|
+
obs_probs = self._auto_gaussian_emissions(X_flat)
|
|
40
|
+
else:
|
|
41
|
+
# Assume X is already [Probability of State 0, Prob of State 1, ...]
|
|
42
|
+
if X.shape[1] != self.n_states:
|
|
43
|
+
raise ValueError(f"Input has {X.shape[1]} columns, but n_states={self.n_states}. "
|
|
44
|
+
"If passing raw probabilities, cols must match n_states.")
|
|
45
|
+
obs_probs = X
|
|
46
|
+
|
|
47
|
+
# 3. Train on GPU
|
|
48
|
+
if self.verbose:
|
|
49
|
+
print(f"π Offloading to GPU: {X.shape[0]} samples, {self.n_states} states")
|
|
50
|
+
|
|
51
|
+
self.trans_mat_ = self.engine.fit(
|
|
52
|
+
obs_probs,
|
|
53
|
+
n_states=self.n_states,
|
|
54
|
+
n_iters=self.n_iter,
|
|
55
|
+
tolerance=self.tolerance
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
# Set is_fitted flag
|
|
59
|
+
self.is_fitted_ = True
|
|
60
|
+
return self
|
|
61
|
+
|
|
62
|
+
def predict(self, X):
|
|
63
|
+
"""
|
|
64
|
+
Returns the most likely hidden state path (Viterbi).
|
|
65
|
+
"""
|
|
66
|
+
check_is_fitted(self, ['trans_mat_'])
|
|
67
|
+
X = check_array(X, ensure_2d=False)
|
|
68
|
+
|
|
69
|
+
if X.ndim == 1 or X.shape[1] == 1:
|
|
70
|
+
obs_probs = self._auto_gaussian_emissions(X.ravel())
|
|
71
|
+
else:
|
|
72
|
+
obs_probs = X
|
|
73
|
+
|
|
74
|
+
return self.engine.decode_regime(self.trans_mat_, obs_probs)
|
|
75
|
+
|
|
76
|
+
def _auto_gaussian_emissions(self, data):
|
|
77
|
+
"""
|
|
78
|
+
Helper: Splits data into N quantiles and assumes Gaussian emissions.
|
|
79
|
+
This makes the class 'Just Work' for simple 1D data.
|
|
80
|
+
"""
|
|
81
|
+
T = len(data)
|
|
82
|
+
N = self.n_states
|
|
83
|
+
|
|
84
|
+
# Smart Init: Sort data and split into N chunks to guess means
|
|
85
|
+
sorted_data = np.sort(data)
|
|
86
|
+
chunk_size = T // N
|
|
87
|
+
means = [np.mean(sorted_data[i*chunk_size : (i+1)*chunk_size]) for i in range(N)]
|
|
88
|
+
std = np.std(data) * 0.5 # Heuristic width
|
|
89
|
+
|
|
90
|
+
probs = np.zeros((T, N), dtype=np.float32)
|
|
91
|
+
for k in range(N):
|
|
92
|
+
probs[:, k] = norm.pdf(data, loc=means[k], scale=std)
|
|
93
|
+
|
|
94
|
+
return probs
|
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: markovgpu-rane
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: High-performance Markov Chains & HMMs using OpenCL
|
|
5
|
+
Author-email: Sahil Rane <sahilrane249@gmail.com>
|
|
6
|
+
Classifier: Development Status :: 4 - Beta
|
|
7
|
+
Classifier: Intended Audience :: Developers
|
|
8
|
+
Classifier: Intended Audience :: Financial and Insurance Industry
|
|
9
|
+
Classifier: Intended Audience :: Science/Research
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
14
|
+
Classifier: Topic :: Scientific/Engineering :: Mathematics
|
|
15
|
+
Requires-Python: >=3.12
|
|
16
|
+
Requires-Dist: matplotlib>=3.8.0
|
|
17
|
+
Requires-Dist: numpy>=1.26.0
|
|
18
|
+
Requires-Dist: pyopencl>=2024.1
|
|
19
|
+
Requires-Dist: scikit-learn>=1.8.0
|
|
20
|
+
Requires-Dist: scipy>=1.11.0
|
|
21
|
+
Requires-Dist: yfinance>=1.1.0
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
|
|
24
|
+
<div align="center">
|
|
25
|
+
|
|
26
|
+
# β‘ **MarkovGPU**
|
|
27
|
+
|
|
28
|
+
### *Massive Scale Markov Models on Consumer Hardware*
|
|
29
|
+
<img width="1024" height="338" alt="image" src="https://github.com/user-attachments/assets/b57dab80-ba03-4d1d-bb4d-6390e3f63f52" />
|
|
30
|
+
|
|
31
|
+
> **Run million-state HMMs on your laptop GPU.**
|
|
32
|
+
> **No CUDA required β’ Hybrid CPU/GPU Backend β’ Production Ready**
|
|
33
|
+
|
|
34
|
+
[](https://pypi.org/project/markovgpu-rane/)
|
|
35
|
+
[](https://www.python.org/downloads/)
|
|
36
|
+
[](https://opensource.org/licenses/MIT)
|
|
37
|
+
[](https://github.com/wizardwithcodehazard/markov/actions)
|
|
38
|
+
|
|
39
|
+
</div>
|
|
40
|
+
|
|
41
|
+
---
|
|
42
|
+
|
|
43
|
+
## π **The Engine for Stochastic Intelligence**
|
|
44
|
+
|
|
45
|
+
**MarkovGPU** is a high-performance probabilistic modeling library built for speed. It breaks the "NVIDIA Monopoly" by using **OpenCL** to accelerate **Hidden Markov Models (HMM)** and **Markov Chains** on *any* GPUβincluding AMD Radeon, Intel Arc, and Apple Silicon.
|
|
46
|
+
|
|
47
|
+
It doesn't just run; it *thinks*. The **Smart Hybrid Backend** automatically routes small tasks to the CPU (NumPy) and massive workloads to the GPU, giving you optimal performance at every scale.
|
|
48
|
+
|
|
49
|
+
---
|
|
50
|
+
|
|
51
|
+
## π **Core Superpowers**
|
|
52
|
+
|
|
53
|
+
| Feature | Magic Behind It |
|
|
54
|
+
|-------|----------------|
|
|
55
|
+
| β‘ **Hardware Agnostic** | Built on **OpenCL** β runs on AMD, Intel, NVIDIA, and Apple M1/M2/M3 chips. |
|
|
56
|
+
| π§ **Smart Hybrid Backend** | Auto-detects problem size ($N$). Uses **NumPy** for speed on small data, **GPU** for massive throughput. |
|
|
57
|
+
| π **Log-Space Stability** | Implements **Log-Sum-Exp** kernels to prevent underflow on long time-series (1M+ steps). |
|
|
58
|
+
| π΅οΈ **Viterbi Decoding** | Finds the "Hidden Truth" in noisy data (e.g., market regimes, DNA sequences) in milliseconds. |
|
|
59
|
+
| π **Unsupervised Learning** | **Baum-Welch (EM)** algorithm trains models directly on the GPU, learning rules from raw data. |
|
|
60
|
+
| π¦ **Zero-Config Install** | `pip install markovgpu-rane`. No driver hell. No CUDA toolkit nightmares. |
|
|
61
|
+
|
|
62
|
+
---
|
|
63
|
+
|
|
64
|
+
## ποΈ **Architecture: The Hybrid Pipeline**
|
|
65
|
+
|
|
66
|
+
```mermaid
|
|
67
|
+
graph LR
|
|
68
|
+
A[User Code] -->|Request Fit/Predict| B{Smart Dispatcher}
|
|
69
|
+
B -->|Small N < 64| C["CPU Engine
|
|
70
|
+
(NumPy AVX2)"]
|
|
71
|
+
B -->|Large N >= 64| D["GPU Engine
|
|
72
|
+
(OpenCL Kernels)"]
|
|
73
|
+
C --> E[Result]
|
|
74
|
+
D --> E
|
|
75
|
+
subgraph GPU_Acceleration[GPU Acceleration]
|
|
76
|
+
D --> F[Matrix Multiply]
|
|
77
|
+
D --> G[Log-Sum-Exp]
|
|
78
|
+
D --> H[Parallel Viterbi]
|
|
79
|
+
end
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
The library handles the hardware. You handle the math.
|
|
83
|
+
|
|
84
|
+
## β‘ Performance: Benchmarks
|
|
85
|
+
|
|
86
|
+
**Task**: Viterbi Decoding (64 Hidden States, 5000 Days of Data).
|
|
87
|
+
**Hardware**: AMD Ryzen 680M (Integrated Graphics).
|
|
88
|
+
|
|
89
|
+
| Engine | Execution Time | Speedup |
|
|
90
|
+
|--------|---------------|---------|
|
|
91
|
+
| π’ CPU (NumPy Optimized) | 5.06s | 1x |
|
|
92
|
+
| π GPU (MarkovGPU) | 0.82s | **6.2x** |
|
|
93
|
+
|
|
94
|
+
---
|
|
95
|
+
|
|
96
|
+
## βοΈ Quick Start in 30 Seconds
|
|
97
|
+
|
|
98
|
+
### Installation
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
# Production
|
|
102
|
+
pip install markovgpu-rane
|
|
103
|
+
|
|
104
|
+
# Or for local development
|
|
105
|
+
uv pip install markovgpu-rane
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
### 1. Market Regime Detection (Viterbi)
|
|
109
|
+
Identify hidden "Bull" vs. "Bear" markets from noisy stock returns.
|
|
110
|
+
|
|
111
|
+
```python
|
|
112
|
+
import numpy as np
|
|
113
|
+
from markovgpu import MarkovEngine
|
|
114
|
+
|
|
115
|
+
# 1. Setup the Rules (Transition Matrix)
|
|
116
|
+
# "Bull markets tend to stay Bullish (95%)"
|
|
117
|
+
trans_mat = np.array([[0.95, 0.05],
|
|
118
|
+
[0.10, 0.90]], dtype=np.float32)
|
|
119
|
+
|
|
120
|
+
# 2. Feed the Data (Observation Likelihoods)
|
|
121
|
+
# Shape: (1000 Days, 2 States)
|
|
122
|
+
obs_probs = np.random.rand(1000, 2).astype(np.float32)
|
|
123
|
+
|
|
124
|
+
# 3. Ignite the Engine
|
|
125
|
+
engine = MarkovEngine()
|
|
126
|
+
predicted_states = engine.decode_regime(trans_mat, obs_probs)
|
|
127
|
+
|
|
128
|
+
print("Detected Regimes:", predicted_states)
|
|
129
|
+
# Output: [0, 0, 0, 1, 1, 1, 0 ...]
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
### 2. Unsupervised Learning (Baum-Welch)
|
|
133
|
+
Train the AI to discover the hidden rules from raw data.
|
|
134
|
+
|
|
135
|
+
```python
|
|
136
|
+
# The engine learns the Transition Matrix automatically
|
|
137
|
+
learned_matrix = engine.fit(
|
|
138
|
+
obs_probs,
|
|
139
|
+
n_states=2,
|
|
140
|
+
n_iters=100,
|
|
141
|
+
tolerance=1e-4
|
|
142
|
+
)
|
|
143
|
+
|
|
144
|
+
print("Discovered Rules:")
|
|
145
|
+
print(learned_matrix)
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
---
|
|
149
|
+
|
|
150
|
+
## π¬ Technical Brilliance
|
|
151
|
+
|
|
152
|
+
### 1. The Log-Sum-Exp Kernel
|
|
153
|
+
Standard HMMs crash on long sequences because probabilities like $0.9^{1000}$ vanish to zero.
|
|
154
|
+
We solved this by rewriting the entire GPU kernel in Log-Space:
|
|
155
|
+
|
|
156
|
+
```c
|
|
157
|
+
// Actual OpenCL Kernel snippet
|
|
158
|
+
float log_add(float log_a, float log_b) {
|
|
159
|
+
float max_val = max(log_a, log_b);
|
|
160
|
+
return max_val + log1p(exp(min(log_a, log_b) - max_val));
|
|
161
|
+
}
|
|
162
|
+
```
|
|
163
|
+
✅ **Result**: You can process arbitrarily long sequences without numerical collapse.
|
|
164
|
+
|
|
165
|
+
### 2. Parallel Viterbi
|
|
166
|
+
Instead of a slow Python loop, we launch $N$ threads (one per state) for every time step on the GPU, calculating the optimal path in parallel.
|
|
167
|
+
|
|
168
|
+
---
|
|
169
|
+
|
|
170
|
+
## 🛠️ Project Structure
|
|
171
|
+
|
|
172
|
+
```
|
|
173
|
+
markovgpu/
|
|
174
|
+
├── src/markovgpu/
|
|
175
|
+
│   ├── backend.py    # The Brain (Smart Dispatcher)
|
|
176
|
+
│   ├── kernels.cl    # The Muscle (OpenCL C Code)
|
|
177
|
+
│   └── __init__.py
|
|
178
|
+
├── tests/            # Unit Tests
|
|
179
|
+
├── pyproject.toml    # Modern Packaging Config
|
|
180
|
+
└── README.md
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
## 🌱 Contributing
|
|
184
|
+
|
|
185
|
+
We welcome forks, issues, and PRs!
|
|
186
|
+
|
|
187
|
+
```bash
|
|
188
|
+
git clone https://github.com/wizardwithcodehazard/markov.git
|
|
189
|
+
cd markov
|
|
190
|
+
uv sync --dev
|
|
191
|
+
uv run pytest
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
## 📜 License
|
|
195
|
+
|
|
196
|
+
**MIT License** — Free to use, modify, and ship in commercial products.
|
|
197
|
+
|
|
198
|
+
<div align="center">
|
|
199
|
+
|
|
200
|
+
MarkovGPU doesn't just crunch numbers.
|
|
201
|
+
### It discovers the hidden structure of reality.
|
|
202
|
+
|
|
203
|
+
Made with 🧡 by Sahil Rane
|
|
204
|
+
|
|
205
|
+
</div>
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
markovgpu/__init__.py,sha256=pCxM1YXY4faXxSm_LtdvL742NKkXKGMeNl61-hHcStU,121
|
|
2
|
+
markovgpu/backend.py,sha256=zbqKS0xjBvJRZ_Mu79y_6-HbpZkjbtA-1eQ_xDXc4lQ,13674
|
|
3
|
+
markovgpu/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
+
markovgpu/sklearn.py,sha256=5N6d4XVJwev4iH7OCPw4TT_nNTc71-CvNdfIW_S2kxI,3469
|
|
5
|
+
markovgpu/kernels.cl,sha256=DLrcHMg01UO6L1h8u9LM_6uwa9ec9hwdOclGdnxg768,6075
|
|
6
|
+
markovgpu_rane-0.3.0.dist-info/METADATA,sha256=y3soPxmx-IlAxPKGBpPvS0IeZTK7sD-8EBZUaBJOj6I,6622
|
|
7
|
+
markovgpu_rane-0.3.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
8
|
+
markovgpu_rane-0.3.0.dist-info/RECORD,,
|
|
@@ -1,22 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: markovgpu-rane
|
|
3
|
-
Version: 0.1.0
|
|
4
|
-
Summary: High-performance Markov Chains & HMMs using OpenCL
|
|
5
|
-
Author-email: Sahil Rane <sahilrane249@gmail.com>
|
|
6
|
-
Classifier: Development Status :: 4 - Beta
|
|
7
|
-
Classifier: Intended Audience :: Developers
|
|
8
|
-
Classifier: Intended Audience :: Financial and Insurance Industry
|
|
9
|
-
Classifier: Intended Audience :: Science/Research
|
|
10
|
-
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
-
Classifier: Operating System :: OS Independent
|
|
12
|
-
Classifier: Programming Language :: Python :: 3
|
|
13
|
-
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
14
|
-
Classifier: Topic :: Scientific/Engineering :: Mathematics
|
|
15
|
-
Requires-Python: >=3.12
|
|
16
|
-
Requires-Dist: matplotlib>=3.8.0
|
|
17
|
-
Requires-Dist: numpy>=1.26.0
|
|
18
|
-
Requires-Dist: pyopencl>=2024.1
|
|
19
|
-
Requires-Dist: scipy>=1.11.0
|
|
20
|
-
Description-Content-Type: text/markdown
|
|
21
|
-
|
|
22
|
-
hello
|
|
@@ -1,7 +0,0 @@
|
|
|
1
|
-
markovgpu/__init__.py,sha256=aGWvFGT6VaLCuFNO9T3ubnlhz2qgkBmNIcCy976YrqE,62
|
|
2
|
-
markovgpu/backend.py,sha256=bfYnge9MgMcDHmJ7CcCG2VGqVfsGxsCzXavGLUFdB2w,11733
|
|
3
|
-
markovgpu/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
-
markovgpu/kernels.cl,sha256=RXpt2jD6IRdh5YTunB_lwfajT1Cw9M95v3uxwMMmMvs,5141
|
|
5
|
-
markovgpu_rane-0.1.0.dist-info/METADATA,sha256=-CqRDK-d95CjNbsFpBIBut--zqLGHvvXPgsWdUe0Mtg,840
|
|
6
|
-
markovgpu_rane-0.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
7
|
-
markovgpu_rane-0.1.0.dist-info/RECORD,,
|
|
File without changes
|