markovgpu-rane 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- markovgpu_rane-0.1.0/.github/workflows/publish.yml +27 -0
- markovgpu_rane-0.1.0/.github/workflows/test.yml +37 -0
- markovgpu_rane-0.1.0/.gitignore +76 -0
- markovgpu_rane-0.1.0/.python-version +1 -0
- markovgpu_rane-0.1.0/PKG-INFO +22 -0
- markovgpu_rane-0.1.0/README.md +1 -0
- markovgpu_rane-0.1.0/pyproject.toml +58 -0
- markovgpu_rane-0.1.0/src/markovgpu/__init__.py +3 -0
- markovgpu_rane-0.1.0/src/markovgpu/backend.py +326 -0
- markovgpu_rane-0.1.0/src/markovgpu/kernels.cl +169 -0
- markovgpu_rane-0.1.0/src/markovgpu/py.typed +0 -0
- markovgpu_rane-0.1.0/tests/test_basic.py +17 -0
- markovgpu_rane-0.1.0/uv.lock +693 -0
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published] # Runs ONLY when you click "Create Release" on GitHub
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
pypi-publish:
|
|
9
|
+
name: Publish to PyPI
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
environment: pypi # Matches what you typed in PyPI website
|
|
12
|
+
permissions:
|
|
13
|
+
id-token: write # REQUIRED for Trusted Publishing
|
|
14
|
+
|
|
15
|
+
steps:
|
|
16
|
+
- name: Checkout Code
|
|
17
|
+
uses: actions/checkout@v4
|
|
18
|
+
|
|
19
|
+
- name: Install uv
|
|
20
|
+
uses: astral-sh/setup-uv@v5
|
|
21
|
+
|
|
22
|
+
- name: Build Package
|
|
23
|
+
run: uv build
|
|
24
|
+
|
|
25
|
+
- name: Publish to PyPI
|
|
26
|
+
# This action uses the "Trusted" connection automatically
|
|
27
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
name: CI (Test & Lint)
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
name: Test on Python ${{ matrix.python-version }}
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
strategy:
|
|
14
|
+
matrix:
|
|
15
|
+
python-version: ["3.10", "3.11", "3.12"] # Test multiple versions
|
|
16
|
+
|
|
17
|
+
steps:
|
|
18
|
+
- name: Checkout Code
|
|
19
|
+
uses: actions/checkout@v4
|
|
20
|
+
|
|
21
|
+
- name: Install uv
|
|
22
|
+
uses: astral-sh/setup-uv@v5 # The official uv action
|
|
23
|
+
|
|
24
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
25
|
+
run: uv python install ${{ matrix.python-version }}
|
|
26
|
+
|
|
27
|
+
- name: Install Project
|
|
28
|
+
run: uv sync --all-extras --dev
|
|
29
|
+
|
|
30
|
+
- name: Lint Code (Ruff)
|
|
31
|
+
run: |
|
|
32
|
+
uv run ruff check .
|
|
33
|
+
uv run ruff format --check .
|
|
34
|
+
|
|
35
|
+
- name: Run Tests
|
|
36
|
+
# We assume CPU fallback works (GitHub runners don't have GPUs)
|
|
37
|
+
run: uv run pytest
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# --- Python Basics ---
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# --- C Extensions & Shared Libraries ---
|
|
7
|
+
*.so
|
|
8
|
+
*.pyd
|
|
9
|
+
*.dylib
|
|
10
|
+
|
|
11
|
+
# --- Distribution / Packaging ---
|
|
12
|
+
.Python
|
|
13
|
+
build/
|
|
14
|
+
develop-eggs/
|
|
15
|
+
dist/
|
|
16
|
+
downloads/
|
|
17
|
+
eggs/
|
|
18
|
+
.eggs/
|
|
19
|
+
lib/
|
|
20
|
+
lib64/
|
|
21
|
+
parts/
|
|
22
|
+
sdist/
|
|
23
|
+
var/
|
|
24
|
+
wheels/
|
|
25
|
+
share/python-wheels/
|
|
26
|
+
*.egg-info/
|
|
27
|
+
.installed.cfg
|
|
28
|
+
*.egg
|
|
29
|
+
MANIFEST
|
|
30
|
+
|
|
31
|
+
# --- Virtual Environments ---
|
|
32
|
+
# Common names for virtual envs
|
|
33
|
+
.venv
|
|
34
|
+
venv/
|
|
35
|
+
ENV/
|
|
36
|
+
env/
|
|
37
|
+
|
|
38
|
+
# --- Unit Test / Coverage ---
|
|
39
|
+
htmlcov/
|
|
40
|
+
.tox/
|
|
41
|
+
.nox/
|
|
42
|
+
.coverage
|
|
43
|
+
.coverage.*
|
|
44
|
+
.cache
|
|
45
|
+
nosetests.xml
|
|
46
|
+
coverage.xml
|
|
47
|
+
*.cover
|
|
48
|
+
*.py.cover
|
|
49
|
+
.hypothesis/
|
|
50
|
+
.pytest_cache/
|
|
51
|
+
|
|
52
|
+
# --- IDEs & Editors (Optional but Recommended) ---
|
|
53
|
+
# VS Code
|
|
54
|
+
.vscode/
|
|
55
|
+
!.vscode/settings.json
|
|
56
|
+
!.vscode/tasks.json
|
|
57
|
+
!.vscode/launch.json
|
|
58
|
+
!.vscode/extensions.json
|
|
59
|
+
*.code-workspace
|
|
60
|
+
|
|
61
|
+
# PyCharm / IntelliJ
|
|
62
|
+
.idea/
|
|
63
|
+
|
|
64
|
+
# Mac / Windows System Files
|
|
65
|
+
.DS_Store
|
|
66
|
+
Thumbs.db
|
|
67
|
+
|
|
68
|
+
# --- Environment Variables (Security) ---
|
|
69
|
+
# NEVER commit your secrets
|
|
70
|
+
.env
|
|
71
|
+
.env.local
|
|
72
|
+
.env.*.local
|
|
73
|
+
|
|
74
|
+
# --- Project Specific ---
|
|
75
|
+
# If your OpenCL kernels generate binary caches
|
|
76
|
+
*.cl.bin
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.12
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: markovgpu-rane
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: High-performance Markov Chains & HMMs using OpenCL
|
|
5
|
+
Author-email: Sahil Rane <sahilrane249@gmail.com>
|
|
6
|
+
Classifier: Development Status :: 4 - Beta
|
|
7
|
+
Classifier: Intended Audience :: Developers
|
|
8
|
+
Classifier: Intended Audience :: Financial and Insurance Industry
|
|
9
|
+
Classifier: Intended Audience :: Science/Research
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
14
|
+
Classifier: Topic :: Scientific/Engineering :: Mathematics
|
|
15
|
+
Requires-Python: >=3.12
|
|
16
|
+
Requires-Dist: matplotlib>=3.8.0
|
|
17
|
+
Requires-Dist: numpy>=1.26.0
|
|
18
|
+
Requires-Dist: pyopencl>=2024.1
|
|
19
|
+
Requires-Dist: scipy>=1.11.0
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
|
|
22
|
+
hello
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
hello
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "markovgpu-rane"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "High-performance Markov Chains & HMMs using OpenCL"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
authors = [
|
|
7
|
+
{ name = "Sahil Rane", email = "sahilrane249@gmail.com" }
|
|
8
|
+
]
|
|
9
|
+
requires-python = ">=3.12"
|
|
10
|
+
|
|
11
|
+
# Your core libraries + visualization/stats tools you used in the demos
|
|
12
|
+
dependencies = [
|
|
13
|
+
"numpy>=1.26.0", # Relaxed version constraint slightly for better compatibility
|
|
14
|
+
"pyopencl>=2024.1",
|
|
15
|
+
"matplotlib>=3.8.0",
|
|
16
|
+
"scipy>=1.11.0",
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
# Metadata tags to help people find your library on PyPI
|
|
20
|
+
classifiers = [
|
|
21
|
+
"Development Status :: 4 - Beta",
|
|
22
|
+
"Intended Audience :: Developers",
|
|
23
|
+
"Intended Audience :: Science/Research",
|
|
24
|
+
"Intended Audience :: Financial and Insurance Industry",
|
|
25
|
+
"License :: OSI Approved :: MIT License",
|
|
26
|
+
"Operating System :: OS Independent",
|
|
27
|
+
"Programming Language :: Python :: 3",
|
|
28
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
29
|
+
"Topic :: Scientific/Engineering :: Mathematics",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
# (Optional) Add your GitHub URL if you have pushed the code
|
|
33
|
+
# [project.urls]
|
|
34
|
+
# Repository = "https://github.com/yourusername/markovgpu"
|
|
35
|
+
# Issues = "https://github.com/yourusername/markovgpu/issues"
|
|
36
|
+
|
|
37
|
+
# ---------------------------------------------------------
|
|
38
|
+
# BUILD SYSTEM: Hatchling (Standard for uv)
|
|
39
|
+
# ---------------------------------------------------------
|
|
40
|
+
[build-system]
|
|
41
|
+
requires = ["hatchling"]
|
|
42
|
+
build-backend = "hatchling.build"
|
|
43
|
+
|
|
44
|
+
# ---------------------------------------------------------
|
|
45
|
+
# PACKAGING: Include the .cl Kernel file
|
|
46
|
+
# ---------------------------------------------------------
|
|
47
|
+
[tool.hatch.build.targets.wheel]
|
|
48
|
+
packages = ["src/markovgpu"]
|
|
49
|
+
|
|
50
|
+
[tool.hatch.build.targets.wheel.force-include]
|
|
51
|
+
# Map source path (left) to package destination (right)
|
|
52
|
+
"src/markovgpu/kernels.cl" = "markovgpu/kernels.cl"
|
|
53
|
+
|
|
54
|
+
[dependency-groups]
|
|
55
|
+
dev = [
|
|
56
|
+
"pytest>=9.0.2",
|
|
57
|
+
"ruff>=0.15.0",
|
|
58
|
+
]
|
|
@@ -0,0 +1,326 @@
|
|
|
1
|
+
import pyopencl as cl
import numpy as np
import os

# Locate the kernel file: kernels.cl is shipped alongside this module
# (see the hatch force-include mapping in pyproject.toml), so resolve it
# relative to this file's directory rather than the working directory.
MODULE_PATH = os.path.dirname(os.path.abspath(__file__))
KERNEL_PATH = os.path.join(MODULE_PATH, "kernels.cl")

# Threshold: Use GPU if states >= 64, otherwise CPU is faster
# (below this, host<->device transfer overhead dominates the kernel win).
GPU_THRESHOLD = 64
class MarkovEngine:
    """Markov-chain / HMM engine with an OpenCL fast path and NumPy fallback.

    On construction, tries to build an OpenCL context, compile ``kernels.cl``
    and cache its kernels; any failure downgrades the instance to CPU-only
    mode (``self.use_gpu = False``).  Public methods dispatch between the GPU
    and a pure-NumPy implementation based on ``use_gpu`` and the problem size
    relative to ``GPU_THRESHOLD``.
    """

    def __init__(self):
        # CPU-only defaults; overwritten below if OpenCL init succeeds.
        self.use_gpu = False
        self.ctx = None
        self.queue = None
        self.prg = None

        # 1. Try to Connect to GPU
        try:
            platforms = cl.get_platforms()
            gpu_devices = []
            for p in platforms:
                gpu_devices.extend(p.get_devices(device_type=cl.device_type.GPU))

            if gpu_devices:
                # Pick the discrete GPU (highest compute units)
                best_dev = sorted(
                    gpu_devices, key=lambda d: d.max_compute_units, reverse=True
                )[0]
                self.ctx = cl.Context([best_dev])
                print(
                    f"🔌 Connected to Accelerator: {best_dev.name} ({best_dev.max_compute_units} CUs)"
                )
            else:
                # No GPU at all: let pyopencl pick any device (e.g. a CPU driver).
                self.ctx = cl.create_some_context(interactive=False)
                print(f"⚠️ No Dedicated GPU found. Using: {self.ctx.devices[0].name}")

            self.queue = cl.CommandQueue(self.ctx)

            # 2. Compile Kernels
            if not os.path.exists(KERNEL_PATH):
                raise FileNotFoundError(f"Kernel file missing at: {KERNEL_PATH}")

            with open(KERNEL_PATH, "r") as f:
                self.prg = cl.Program(self.ctx, f.read()).build()

            # 3. Cache Kernels (Robust Retrieval)
            # use_gpu is set before kernel lookup: a missing individual kernel
            # only prints a warning, it does not disable the GPU path as a whole.
            self.use_gpu = True
            try:
                # Basic
                self.k_markov = self.prg.markov_step
                self.k_hmm_basic = self.prg.hmm_forward_step

                # Advanced / Viterbi
                self.k_hmm_log = self.prg.hmm_forward_log
                self.k_viterbi = self.prg.viterbi_step

                # Training
                self.k_hmm_back = self.prg.hmm_backward_log
                self.k_acc_trans = self.prg.accumulate_transitions
                self.k_acc_gamma = self.prg.accumulate_gammas

            except AttributeError as e:
                # NOTE(review): methods that use a missing kernel attribute will
                # raise AttributeError later despite use_gpu being True.
                print(f"❌ Kernel Warning: {e}")
                print("⚠️ Some GPU features may be disabled.")

        except Exception as e:
            # Any OpenCL failure (no ICD, build error, ...) → pure NumPy mode.
            print(f"⚠️ OpenCL Initialization failed: {e}")
            print("⚠️ Running in CPU-Only Mode (NumPy).")
            self.use_gpu = False

    # --- 1. Simulation ---
    def step(self, P, v):
        """Runs one step: v_new = v * P

        P: (N, N) transition matrix; v: length-N state distribution.
        Small problems (N < GPU_THRESHOLD) or CPU-only mode use NumPy.
        GPU path casts inputs to contiguous float32, so the result dtype
        differs from the CPU path when the inputs are float64.
        """
        N = len(v)

        if not self.use_gpu or N < GPU_THRESHOLD:
            return v.dot(P)

        mf = cl.mem_flags
        P = np.ascontiguousarray(P, dtype=np.float32)
        v = np.ascontiguousarray(v, dtype=np.float32)
        result = np.empty_like(v)

        d_P = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=P)
        d_v = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=v)
        d_res = cl.Buffer(self.ctx, mf.WRITE_ONLY, size=result.nbytes)

        # Kernel signature (N, v, P, out) — assumed to match kernels.cl; TODO confirm.
        self.k_markov(self.queue, (N,), None, np.int32(N), d_v, d_P, d_res)
        cl.enqueue_copy(self.queue, result, d_res)
        return result

    def converge(self, P, start_v, tolerance=1e-5, max_steps=1000):
        """Iterate v <- v·P until the L1 change is below ``tolerance``.

        Returns the (approximately) stationary distribution, or the last
        iterate after ``max_steps`` with a warning if it never converged.
        """
        N = len(start_v)

        # CPU Path
        if not self.use_gpu or N < GPU_THRESHOLD:
            # print(f"🔄 Converging on CPU (N={N})...")
            current_v = start_v.copy()
            for i in range(max_steps):
                new_v = current_v.dot(P)
                if np.sum(np.abs(new_v - current_v)) < tolerance:
                    return new_v
                current_v = new_v
            return current_v

        # GPU Path
        # print(f"🔄 Converging on GPU (N={N})...")
        mf = cl.mem_flags
        P = np.ascontiguousarray(P, dtype=np.float32)
        start_v = np.ascontiguousarray(start_v, dtype=np.float32)

        d_P = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=P)
        d_v_read = cl.Buffer(
            self.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=start_v
        )
        d_v_write = cl.Buffer(self.ctx, mf.READ_WRITE, size=start_v.nbytes)

        current_v = start_v.copy()

        for i in range(max_steps):
            self.k_markov(self.queue, (N,), None, np.int32(N), d_v_read, d_P, d_v_write)

            # Convergence is only checked every 10 kernel launches to amortize
            # the device->host copy.  NOTE(review): the comparison is therefore
            # against the vector from 10 steps earlier, not the previous step,
            # so the effective tolerance differs from the CPU path.
            if i % 10 == 0:
                new_v = np.empty_like(current_v)
                cl.enqueue_copy(self.queue, new_v, d_v_write)
                if np.sum(np.abs(new_v - current_v)) < tolerance:
                    return new_v
                current_v = new_v

            # Ping-pong buffers: this iteration's output feeds the next launch.
            d_v_read, d_v_write = d_v_write, d_v_read

        print("⚠️ Reached max steps without full convergence.")
        return current_v

    # --- 2. Inference & Viterbi ---
    def hmm_filter(self, transition_matrix, observation_probs):
        """Standard HMM Filter (Returns Probabilities)

        NOTE(review): not implemented yet — this stub returns None.
        """
        # Simplification: Running basic HMM forward pass
        # For production use, usually prefer Log-Space to avoid underflow.
        # This wrapper can be upgraded to use k_hmm_log if needed.
        pass

    def decode_regime(self, transition_matrix, observation_probs):
        """Viterbi Algorithm (Finds Most Likely Path)

        transition_matrix: (N, N); observation_probs: (T, N) per-step emission
        probabilities (already evaluated for the observed sequence).
        Returns a length-T integer array of most-likely hidden states.
        """
        T, N = observation_probs.shape
        # Additive floor before log() so zero probabilities stay finite.
        epsilon = 1e-20

        # CPU Path
        if not self.use_gpu or N < GPU_THRESHOLD:
            log_trans = np.log(transition_matrix + epsilon)
            log_emis = np.log(observation_probs + epsilon)
            log_delta = np.zeros((T, N))
            backpointers = np.zeros((T, N), dtype=int)

            # Uniform prior over initial states: log(1/N).
            log_delta[0] = -np.log(N) + log_emis[0]

            for t in range(1, T):
                for j in range(N):
                    vals = log_delta[t - 1] + log_trans[:, j]
                    best_prev = np.argmax(vals)
                    backpointers[t, j] = best_prev
                    log_delta[t, j] = vals[best_prev] + log_emis[t, j]

            # Backtrack from the best final state.
            path = np.zeros(T, dtype=int)
            path[-1] = np.argmax(log_delta[-1])
            for t in range(T - 2, -1, -1):
                path[t] = backpointers[t + 1, path[t + 1]]
            return path

        # GPU Path
        mf = cl.mem_flags
        log_trans = np.log(transition_matrix + epsilon).astype(np.float32)
        log_emis = np.log(observation_probs + epsilon).astype(np.float32)
        log_delta = np.full(N, -np.log(N), dtype=np.float32)

        d_trans = cl.Buffer(
            self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=log_trans
        )
        d_delta_in = cl.Buffer(
            self.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=log_delta
        )
        d_delta_out = cl.Buffer(self.ctx, mf.READ_WRITE, size=log_delta.nbytes)

        # Device buffer holds one timestep of backpointers (N int32s);
        # the full (T, N) history is accumulated on the host below.
        full_backpointer_history = np.zeros((T, N), dtype=np.int32)
        d_backpointers = cl.Buffer(
            self.ctx, mf.WRITE_ONLY, size=full_backpointer_history.nbytes // T
        )

        print(f"🕵️ Decoding {T} days (GPU Accelerated)...")

        for t in range(T):
            # NOTE(review): a fresh emission buffer is allocated every step;
            # reusing one buffer with enqueue_copy would reduce allocations.
            d_emis = cl.Buffer(
                self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=log_emis[t]
            )

            self.k_viterbi(
                self.queue,
                (N,),
                None,
                np.int32(N),
                d_delta_in,
                d_trans,
                d_emis,
                d_delta_out,
                d_backpointers,
            )

            step_pointers = np.empty(N, dtype=np.int32)
            cl.enqueue_copy(self.queue, step_pointers, d_backpointers)
            full_backpointer_history[t] = step_pointers

            # Ping-pong delta buffers for the next timestep.
            d_delta_in, d_delta_out = d_delta_out, d_delta_in

        # After the final swap, d_delta_in holds the last timestep's deltas.
        final_log_probs = np.empty(N, dtype=np.float32)
        cl.enqueue_copy(self.queue, final_log_probs, d_delta_in)

        best_path = np.zeros(T, dtype=np.int32)
        best_path[-1] = np.argmax(final_log_probs)

        for t in range(T - 2, -1, -1):
            next_state = best_path[t + 1]
            best_path[t] = full_backpointer_history[t + 1][next_state]

        return best_path

    # --- 3. Training (Baum-Welch) ---
    def fit(self, observations, n_states, n_iters=10, tolerance=1e-4):
        """Baum-Welch Expectation Maximization (Training)

        observations: (T, N) per-step emission probabilities; returns the
        learned (N, N) transition matrix in probability space.

        NOTE(review): unlike the other methods, this has no CPU fallback and
        never checks ``self.use_gpu`` — in CPU-only mode ``self.ctx`` is None
        and the cl.Buffer calls below will fail.
        """
        T = observations.shape[0]
        N = n_states

        # Random Init: near-uniform transitions with a small random
        # perturbation to break symmetry between states.
        log_trans = np.log(
            np.full((N, N), 1.0 / N) + np.random.rand(N, N) * 0.01
        ).astype(np.float32)
        log_emis = np.log(observations + 1e-20).astype(np.float32)

        mf = cl.mem_flags
        d_trans = cl.Buffer(
            self.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=log_trans
        )
        # 4 bytes per float32 element.
        d_alpha = cl.Buffer(self.ctx, mf.READ_WRITE, size=T * N * 4)  # Full history
        d_beta = cl.Buffer(self.ctx, mf.READ_WRITE, size=T * N * 4)  # Full history
        d_emis = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=log_emis)

        d_new_trans = cl.Buffer(self.ctx, mf.READ_WRITE, size=log_trans.nbytes)
        d_gamma_sums = cl.Buffer(self.ctx, mf.READ_WRITE, size=N * 4)

        prev_score = -np.inf

        print(f"🧠 Training HMM ({N} States, {T} Steps)...")

        for i in range(n_iters):
            # 1. CPU Forward/Backward (Latency Optimized)
            alpha_full, log_likelihood = self._cpu_forward(log_trans, log_emis)
            beta_full = self._cpu_backward(log_trans, log_emis)

            # 2. GPU Accumulation (Throughput Optimized)
            cl.enqueue_copy(self.queue, d_alpha, alpha_full)
            cl.enqueue_copy(self.queue, d_beta, beta_full)
            cl.enqueue_copy(self.queue, d_trans, log_trans)

            # Accumulate expected transition counts (one work-item per (i, j)).
            # Argument order assumed to match kernels.cl — TODO confirm.
            self.k_acc_trans(
                self.queue,
                (N, N),
                None,
                np.int32(T),
                np.int32(N),
                d_alpha,
                d_beta,
                d_emis,
                d_trans,
                d_new_trans,
            )

            # Accumulate per-state gamma sums used as the normalizer below.
            self.k_acc_gamma(
                self.queue,
                (N,),
                None,
                np.int32(T),
                np.int32(N),
                d_alpha,
                d_beta,
                d_gamma_sums,
            )

            # 3. Update
            new_log_trans_counts = np.empty_like(log_trans)
            log_gamma_sums = np.empty(N, dtype=np.float32)

            cl.enqueue_copy(self.queue, new_log_trans_counts, d_new_trans)
            cl.enqueue_copy(self.queue, log_gamma_sums, d_gamma_sums)

            # Row-wise normalization in log space: log(count_ij / gamma_i).
            log_trans = new_log_trans_counts - log_gamma_sums[:, None]

            change = log_likelihood - prev_score
            print(
                f"    Iter {i + 1}: Likelihood {log_likelihood:.2f} (Delta: {change:.4f})"
            )
            if abs(change) < tolerance:
                break
            prev_score = log_likelihood

        return np.exp(log_trans)

    def _cpu_forward(self, log_trans, log_emis):
        """Log-space forward pass; returns (alpha (T, N), total log-likelihood)."""
        T, N = log_emis.shape
        alpha = np.zeros((T, N), dtype=np.float32)
        # Uniform initial distribution: log(1/N).
        alpha[0] = -np.log(N) + log_emis[0]
        for t in range(1, T):
            for j in range(N):
                prev = alpha[t - 1] + log_trans[:, j]
                # logaddexp.reduce = numerically stable log-sum-exp.
                alpha[t, j] = np.logaddexp.reduce(prev) + log_emis[t, j]
        return alpha, np.logaddexp.reduce(alpha[-1])

    def _cpu_backward(self, log_trans, log_emis):
        """Log-space backward pass; returns beta (T, N), with beta[T-1] = 0."""
        T, N = log_emis.shape
        beta = np.zeros((T, N), dtype=np.float32)
        for t in range(T - 2, -1, -1):
            for i in range(N):
                terms = log_trans[i, :] + log_emis[t + 1] + beta[t + 1]
                beta[t, i] = np.logaddexp.reduce(terms)
        return beta