physkan 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- physkan-0.1.0/.gitignore +1 -0
- physkan-0.1.0/.ipynb_checkpoints/demo-checkpoint.py +246 -0
- physkan-0.1.0/.ipynb_checkpoints/demo_deep-checkpoint.py +147 -0
- physkan-0.1.0/LICENSE +22 -0
- physkan-0.1.0/PKG-INFO +184 -0
- physkan-0.1.0/README.md +173 -0
- physkan-0.1.0/demo.py +246 -0
- physkan-0.1.0/demo_deep.py +173 -0
- physkan-0.1.0/pyproject.toml +39 -0
- physkan-0.1.0/src/physkan/__init__.py +16 -0
- physkan-0.1.0/src/physkan/demonstrator.py +92 -0
- physkan-0.1.0/src/physkan/interaction.py +116 -0
- physkan-0.1.0/src/physkan/kan.py +260 -0
- physkan-0.1.0/test.py +38 -0
physkan-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
**/__pycache__/
|
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
# %% [markdown]
|
|
2
|
+
# # Bounded-KAN: Architecture Demonstrations
|
|
3
|
+
# This suite systematically proves the uncertainty-forwarding and gradient
|
|
4
|
+
# firewall mechanics of the Bounded-KAN architecture.
|
|
5
|
+
|
|
6
|
+
# %%
|
|
7
|
+
import torch
|
|
8
|
+
import torch.nn as nn
|
|
9
|
+
from physkan import KAN, KANDemonstrator, KANLinear
|
|
10
|
+
|
|
11
|
+
torch.manual_seed(42)
|
|
12
|
+
|
|
13
|
+
# Helper: Generate raw physical state (x)
|
|
14
|
+
def generate_x_data(x_min, x_max, steps=200):
|
|
15
|
+
return torch.linspace(x_min, x_max, steps).unsqueeze(1)
|
|
16
|
+
|
|
17
|
+
# Helper: Generate raw physical state (x) and angle (theta)
|
|
18
|
+
def generate_x_theta_train(steps=400):
|
|
19
|
+
x = torch.rand(steps, 1) * 2 - 1
|
|
20
|
+
# Full phase [-pi, pi] to break collinearity and ensure cos(theta) spans [-1, 1]
|
|
21
|
+
theta = torch.rand(steps, 1) * 2 * torch.pi - torch.pi
|
|
22
|
+
return torch.cat([x, theta], dim=1)
|
|
23
|
+
|
|
24
|
+
def generate_x_theta_eval(x_min, x_max, steps=200):
|
|
25
|
+
x = torch.linspace(x_min, x_max, steps).unsqueeze(1)
|
|
26
|
+
# Lock theta at 1.5 rad (~86 deg) so cos(theta) is near 0.07.
|
|
27
|
+
# This specifically exposes the naive multiplication trap for evaluation.
|
|
28
|
+
theta = torch.full((steps, 1), 1.5)
|
|
29
|
+
return torch.cat([x, theta], dim=1)
|
|
30
|
+
|
|
31
|
+
# %%
|
|
32
|
+
# %matplotlib inline
|
|
33
|
+
|
|
34
|
+
# %% [markdown]
|
|
35
|
+
# # 0a. Standard KAN Vulnerability (Arbitrary OOB)
|
|
36
|
+
# **Goal:** Mimic an unprotected KAN using narrow nominal bounds `(-1.0, 1.0)` and
|
|
37
|
+
# the default `SiLU` base activation. We train on the nominal range and extrapolate.
|
|
38
|
+
#
|
|
39
|
+
# **Result:** While our native clamp turns the violent out-of-bounds discontinuity
|
|
40
|
+
# into a plateau, the asymmetric nature of `SiLU` (linear for positive, zero for
|
|
41
|
+
# negative) makes extrapolation arbitrary. It grows on the right but flatlines on
|
|
42
|
+
# the left.
|
|
43
|
+
|
|
44
|
+
# %%
|
|
45
|
+
model_0a = KANLinear(
|
|
46
|
+
in_features=1,
|
|
47
|
+
out_features=1,
|
|
48
|
+
grid_size=5,
|
|
49
|
+
spline_order=3,
|
|
50
|
+
grid_range=(-1.0, 1.0),
|
|
51
|
+
base_activation=nn.SiLU
|
|
52
|
+
)
|
|
53
|
+
demo_0a = KANDemonstrator(model=model_0a, target_fn=lambda x: x**2)
|
|
54
|
+
|
|
55
|
+
demo_0a.train(generate_x_data(-1.0, 1.0, 100))
|
|
56
|
+
demo_0a.plot(generate_x_data(-4.0, 4.0, 200), "0a. Narrow Bounds (Arbitrary SiLU Asymmetry)")
|
|
57
|
+
|
|
58
|
+
# %% [markdown]
|
|
59
|
+
# # 0b. The "Wide Grid" Fallacy (Untrained Knot Collapse)
|
|
60
|
+
# **Goal:** Mimic a practitioner trying to fix 0a by expanding the bounds to cover
|
|
61
|
+
# the extrapolation limits `(-4.0, 4.0)`. We increase `grid_size` proportionally
|
|
62
|
+
# to maintain resolution.
|
|
63
|
+
#
|
|
64
|
+
# **Result:** B-splines have strictly local support. The knots in the `(1.0, 4.0)`
|
|
65
|
+
# region receive absolutely zero gradient updates during training. The prediction
|
|
66
|
+
# completely detaches from the physics and outputs chaotic initialization noise,
|
|
67
|
+
# proving why expanding bounds without data is mathematically unsafe.
|
|
68
|
+
|
|
69
|
+
# %%
|
|
70
|
+
model_0b = KANLinear(
|
|
71
|
+
in_features=1,
|
|
72
|
+
out_features=1,
|
|
73
|
+
grid_size=20, # Increased to maintain resolution over wider bounds
|
|
74
|
+
spline_order=3,
|
|
75
|
+
grid_range=(-4.0, 4.0), # The practitioner's "fix"
|
|
76
|
+
base_activation=nn.SiLU
|
|
77
|
+
)
|
|
78
|
+
demo_0b = KANDemonstrator(model=model_0b, target_fn=lambda x: x**2)
|
|
79
|
+
|
|
80
|
+
demo_0b.train(generate_x_data(-1.0, 1.0, 100))
|
|
81
|
+
demo_0b.plot(generate_x_data(-4.0, 4.0, 200), "0b. Wide Bounds Fallacy (Untrained Extrapolation)")
|
|
82
|
+
|
|
83
|
+
# %% [markdown]
|
|
84
|
+
# # 0c. The Data Sparsity Vulnerability (The Interpolation Hole)
|
|
85
|
+
# **Goal:** The practitioner now tries to train across the full wide grid `(-1.0, 4.0)`.
|
|
86
|
+
# However, real physical data has gaps. We filter out all training data between `1.5`
|
|
87
|
+
# and `3.5` to simulate a sparse transition regime (e.g., ships avoiding marginal weather).
|
|
88
|
+
#
|
|
89
|
+
# **Result:** Even though the bounds enclose all the data, the knots *inside the hole* receive zero gradient updates. Instead of bridging the gap smoothly, the prediction
|
|
90
|
+
# violently collapses into the void, outputting untrained noise. This proves that
|
|
91
|
+
# relying purely on splines across sparse datasets destroys physical identification.
|
|
92
|
+
|
|
93
|
+
# %%
|
|
94
|
+
model_0c = KANLinear(
|
|
95
|
+
in_features=1,
|
|
96
|
+
out_features=1,
|
|
97
|
+
grid_size=20,
|
|
98
|
+
spline_order=3,
|
|
99
|
+
grid_range=(-1.0, 4.0),
|
|
100
|
+
base_activation=nn.SiLU
|
|
101
|
+
)
|
|
102
|
+
demo_0c = KANDemonstrator(model=model_0c, target_fn=lambda x: x**2)
|
|
103
|
+
|
|
104
|
+
# Generate full data, then keep only x < 1.5 or x > 3.5, masking out the (1.5 to 3.5) transition regime
|
|
105
|
+
x_train_0c = generate_x_data(-1.0, 4.0, steps=200)
|
|
106
|
+
x_train_sparse = x_train_0c[(x_train_0c[:, 0] < 1.5) | (x_train_0c[:, 0] > 3.5)]
|
|
107
|
+
|
|
108
|
+
demo_0c.train(x_train_sparse)
|
|
109
|
+
demo_0c.plot(generate_x_data(-4.0, 4.0, 200), "0c. Data Sparsity (Interpolation Hole Collapse)")
|
|
110
|
+
|
|
111
|
+
# %% [markdown]
|
|
112
|
+
# # 1. Spline Plateau (Symmetric Linear Track)
|
|
113
|
+
# **Goal:** Show how Bounded-KAN behaves with the strict `Identity` linear baseline.
|
|
114
|
+
#
|
|
115
|
+
# **Result:** Inside the bounds, the splines perfectly fit the curve. Out of bounds,
|
|
116
|
+
# the mechanical clamp safely freezes the splines to prevent chaotic oscillation.
|
|
117
|
+
# However, notice that the left-side extrapolation actually looks slightly *worse* than the naive SiLU in Case 0a!
|
|
118
|
+
#
|
|
119
|
+
# **Why?** The symmetry exists in the training data, but *we structurally
|
|
120
|
+
# enforced* an asymmetric linear asymptote by using the strict `Identity` base track.
|
|
121
|
+
# The splines easily fit the symmetric parabola locally, while the base track absorbs
|
|
122
|
+
# a slight residual slope. When the splines clamp out of bounds, that raw linear slope
|
|
123
|
+
# is exposed. We intentionally trade the arbitrary, "lucky" flatlining of SiLU for
|
|
124
|
+
# strict, predictable linear extrapolation.
|
|
125
|
+
#
|
|
126
|
+
# **Try this:** If you know the physical domain is symmetric, you can pass `base_activation=torch.abs` when initializing the model
|
|
127
|
+
# to structurally enforce a symmetric V-shape out of bounds. While this makes the baseline extrapolation look slightly better,
|
|
128
|
+
# it is still just a linear approximation. In general, the explicit feature engineering demonstrated in step 2 is the preferred
|
|
129
|
+
# approach.
|
|
130
|
+
|
|
131
|
+
# %%
|
|
132
|
+
model_1 = KAN(layer_dims=[1, 1], grid_size=5, spline_order=3)
|
|
133
|
+
demo_1 = KANDemonstrator(model=model_1, target_fn=lambda x: x**2)
|
|
134
|
+
|
|
135
|
+
demo_1.train(generate_x_data(-1.0, 1.0, 100))
|
|
136
|
+
demo_1.plot(generate_x_data(-4.0, 4.0, 200), "1. Spline Plateau (Symmetric Linear Track)")
|
|
137
|
+
|
|
138
|
+
# %% [markdown]
|
|
139
|
+
# # 2a. Linear Recovery via Feature Engineering
|
|
140
|
+
# **Goal:** Provide $x^2$ as an engineered feature. Show that extrapolation now
|
|
141
|
+
# works perfectly because the unbroken linear track carries the out-of-bounds scaling.
|
|
142
|
+
|
|
143
|
+
# %%
|
|
144
|
+
model_2a = KAN(layer_dims=[2, 1], grid_size=5, spline_order=3)
|
|
145
|
+
demo_2a = KANDemonstrator(
|
|
146
|
+
model=model_2a,
|
|
147
|
+
target_fn=lambda x: x**2,
|
|
148
|
+
feature_fn=lambda x: torch.cat([x, x**2], dim=1)
|
|
149
|
+
)
|
|
150
|
+
|
|
151
|
+
demo_2a.train(generate_x_data(-1.0, 1.0, 100))
|
|
152
|
+
demo_2a.plot(generate_x_data(-4.0, 4.0, 200), "2a. Linear Recovery (Engineered $x^2$)")
|
|
153
|
+
|
|
154
|
+
# %% [markdown]
|
|
155
|
+
# # 2b. Interval Protection (The Collinearity Fix)
|
|
156
|
+
# **Goal:** Use the `KANInteraction` module to compute the product using interval
|
|
157
|
+
# arithmetic. We feed the network $x^2$, $\cos(\theta)$, and their interaction.
|
|
158
|
+
#
|
|
159
|
+
# **Result:** Look at the bottom plot—the firewall worked perfectly! It recognized
|
|
160
|
+
# the massive out-of-bounds variance of $x$ and slammed the severity $D$ up to 6.0.
|
|
161
|
+
# However, the physical prediction (top plot) overshoots. Why? Spurious correlation.
|
|
162
|
+
# During training, the network got lazy. Instead of relying purely on the interaction
|
|
163
|
+
# feature, it put weight on the raw $x^2$ feature, and used the splines to cancel
|
|
164
|
+
# out the error. When extrapolated, the splines clamped, the cancellation stopped,
|
|
165
|
+
# and the raw $x^2$ error shot up. This proves why severity tracking is non-negotiable!
|
|
166
|
+
|
|
167
|
+
# %%
|
|
168
|
+
model_2b = KAN(layer_dims=[1, 1], interaction_map=[[0, 0]], grid_size=5, spline_order=3)
|
|
169
|
+
demo_2b = KANDemonstrator(
|
|
170
|
+
model=model_2b,
|
|
171
|
+
target_fn=lambda x: x**2,
|
|
172
|
+
feature_fn=lambda x: x
|
|
173
|
+
)
|
|
174
|
+
|
|
175
|
+
demo_2b.train(generate_x_data(-1.0, 1.0, 100))
|
|
176
|
+
demo_2b.plot(generate_x_data(-4.0, 4.0, 200), "2b. Interval Protection (The Collinearity Fix)")
|
|
177
|
+
|
|
178
|
+
# %% [markdown]
|
|
179
|
+
# # 2c. The Dropout Fix (Forcing Physical Isolation)
|
|
180
|
+
# **Goal:** How do we stop the network from using splines as a crutch to hide bad
|
|
181
|
+
# linear weights? We introduce **Spline Dropout**. By randomly zeroing out the
|
|
182
|
+
# splines during training, the linear track is forced to explain as much as possible of the physical features.
|
|
183
|
+
#
|
|
184
|
+
# **Result:** The linear track sets the weight of pure $x^2$ to zero, and the weight
|
|
185
|
+
# of the interaction feature to 1.0. The physical prediction is now perfectly flat
|
|
186
|
+
# (matching the true physics), AND the severity firewall remains fully active.
|
|
187
|
+
|
|
188
|
+
# %%
|
|
189
|
+
model_2c = KAN(layer_dims=[1, 1], interaction_map=[[0, 0]], grid_size=5, spline_order=3, spline_dropout=0.05)
|
|
190
|
+
demo_2c = KANDemonstrator(
|
|
191
|
+
model=model_2c,
|
|
192
|
+
target_fn=lambda x: x**2,
|
|
193
|
+
feature_fn=lambda x: x
|
|
194
|
+
)
|
|
195
|
+
|
|
196
|
+
demo_2c.train(generate_x_data(-1.0, 1.0, 100))
|
|
197
|
+
demo_2c.plot(generate_x_data(-4.0, 4.0, 200), "2c. The Dropout Fix (Perfect Physics + Firewall)")
|
|
198
|
+
|
|
199
|
+
# %% [markdown]
|
|
200
|
+
# # 3a. Protected Interaction Layer
|
|
201
|
+
# **Goal:** Use the `KANInteraction` module to compute the product.
|
|
202
|
+
# The interval arithmetic accurately assesses the high variance, raises a severe dual $D$,
|
|
203
|
+
# and slams the gradient firewall shut. Extrapolation plateaus safely.
|
|
204
|
+
|
|
205
|
+
# %%
|
|
206
|
+
model_3a = KAN(layer_dims=[2, 1], interaction_map=[[0, 1]], grid_size=5, spline_order=3, spline_dropout=0.05)
|
|
207
|
+
demo_3a = KANDemonstrator(
|
|
208
|
+
model=model_3a,
|
|
209
|
+
target_fn=lambda x: (x[:, 0:1]**2) * torch.cos(x[:, 1:2]),
|
|
210
|
+
feature_fn=lambda x: torch.cat([x[:, 0:1]**2, torch.cos(x[:, 1:2])], dim=1)
|
|
211
|
+
)
|
|
212
|
+
|
|
213
|
+
demo_3a.train(generate_x_theta_train())
|
|
214
|
+
demo_3a.plot(generate_x_theta_eval(-4.0, 4.0), "3a. Interval Protection (Interaction Firewall)")
|
|
215
|
+
|
|
216
|
+
# %% [markdown]
|
|
217
|
+
# # 3b. Deep Network Feature Discovery
|
|
218
|
+
# **Goal:** Remove explicit interaction mapping. Provide just $x^2$ and $\cos(\theta)$
|
|
219
|
+
# to a deeper network (`[2, 4, 1]`) to let it learn the interaction. Show that
|
|
220
|
+
# the dual mathematically compounds through the linear matrices, protecting the entire depth.
|
|
221
|
+
|
|
222
|
+
# %%
|
|
223
|
+
model_3b = KAN(layer_dims=[2, 4, 1], grid_size=5, spline_order=3)
|
|
224
|
+
demo_3b = KANDemonstrator(
|
|
225
|
+
model=model_3b,
|
|
226
|
+
target_fn=lambda x: (x[:, 0:1]**2) * torch.cos(x[:, 1:2]),
|
|
227
|
+
feature_fn=lambda x: torch.cat([x[:, 0:1]**2, torch.cos(x[:, 1:2])], dim=1)
|
|
228
|
+
)
|
|
229
|
+
|
|
230
|
+
demo_3b.train(generate_x_theta_train(steps=800), epochs=1000)
|
|
231
|
+
demo_3b.plot(generate_x_theta_eval(-4.0, 4.0), "3b. Deep Discovery (Matrix Dual Routing)")
|
|
232
|
+
|
|
233
|
+
# %% [markdown]
|
|
234
|
+
# **The Takeaway:** The severity firewall ($D$) still spikes perfectly, alerting us that
|
|
235
|
+
# we have left the data-driven regime. However, because the deep network relies on
|
|
236
|
+
# fragile, unconstrained spline combinations to fake multiplication, the physical
|
|
237
|
+
# extrapolation shape becomes erratic.
|
|
238
|
+
#
|
|
239
|
+
# This highlights a crucial philosophical point: high severity ($D$) doesn't inherently
|
|
240
|
+
# mean "danger"—it simply means the model is now relying entirely on its structural priors.
|
|
241
|
+
# If those priors are unconstrained deep networks, extrapolation is chaotic. But if we
|
|
242
|
+
# engineer those priors correctly, we can extrapolate safely and indefinitely.
|
|
243
|
+
#
|
|
244
|
+
# If deep KANs and automated feature discovery are part of your plans, please proceed
|
|
245
|
+
# to `demo_deep.py` to see how we leash the beast!
|
|
246
|
+
# %%
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
# %% [markdown]
|
|
2
|
+
# # Bounded-KAN: Architecture Demonstrations
|
|
3
|
+
# This suite systematically proves the uncertainty-forwarding and gradient
|
|
4
|
+
# firewall mechanics of the Bounded-KAN architecture.
|
|
5
|
+
|
|
6
|
+
# %%
|
|
7
|
+
import torch
|
|
8
|
+
import torch.nn as nn
|
|
9
|
+
from physkan import KAN, KANDemonstrator, KANLinear
|
|
10
|
+
|
|
11
|
+
torch.manual_seed(42)
|
|
12
|
+
|
|
13
|
+
# Helper: Generate raw physical state (x)
|
|
14
|
+
def generate_x_data(x_min, x_max, steps=200):
|
|
15
|
+
return torch.linspace(x_min, x_max, steps).unsqueeze(1)
|
|
16
|
+
|
|
17
|
+
# Helper: Generate raw physical state (x) and angle (theta)
|
|
18
|
+
def generate_x_theta_train(steps=400):
|
|
19
|
+
x = torch.rand(steps, 1) * 2 - 1
|
|
20
|
+
# Full phase [-pi, pi] to break collinearity and ensure cos(theta) spans [-1, 1]
|
|
21
|
+
theta = torch.rand(steps, 1) * 2 * torch.pi - torch.pi
|
|
22
|
+
return torch.cat([x, theta], dim=1)
|
|
23
|
+
|
|
24
|
+
def generate_x_theta_eval(x_min, x_max, steps=200):
|
|
25
|
+
x = torch.linspace(x_min, x_max, steps).unsqueeze(1)
|
|
26
|
+
# Lock theta at 1.5 rad (~86 deg) so cos(theta) is near 0.07.
|
|
27
|
+
# This specifically exposes the naive multiplication trap for evaluation.
|
|
28
|
+
theta = torch.full((steps, 1), 1.5)
|
|
29
|
+
return torch.cat([x, theta], dim=1)
|
|
30
|
+
|
|
31
|
+
# %%
|
|
32
|
+
# %matplotlib inline
|
|
33
|
+
|
|
34
|
+
# %% [markdown]
|
|
35
|
+
# # 3b. Deep Network Feature Discovery
|
|
36
|
+
# We repeat this final example from `demo.py` for context, with takeaway
|
|
37
|
+
# "... high severity ($D$) doesn't inherently
|
|
38
|
+
# mean "danger"—it simply means the model is now relying entirely on its structural priors.
|
|
39
|
+
# If those priors are unconstrained deep networks, extrapolation is chaotic. But if we
|
|
40
|
+
# engineer those priors correctly, we can extrapolate safely and indefinitely.".
|
|
41
|
+
|
|
42
|
+
# %%
|
|
43
|
+
torch.manual_seed(42)
|
|
44
|
+
model_3b = KAN(layer_dims=[2, 4, 1], grid_size=5, spline_order=3, spline_dropout=0.1)
|
|
45
|
+
demo_3b = KANDemonstrator(
|
|
46
|
+
model=model_3b,
|
|
47
|
+
target_fn=lambda x: (x[:, 0:1]**2) * torch.cos(x[:, 1:2]),
|
|
48
|
+
feature_fn=lambda x: torch.cat([x[:, 0:1]**2, torch.cos(x[:, 1:2])], dim=1)
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
demo_3b.train(generate_x_theta_train(steps=800), epochs=1000)
|
|
52
|
+
demo_3b.plot(generate_x_theta_eval(-4.0, 4.0), "3b. Deep Discovery (Matrix Dual Routing)")
|
|
53
|
+
|
|
54
|
+
# %% [markdown]
|
|
55
|
+
# # 3c. Deep Network Feature Discovery, a hybrid approach
|
|
56
|
+
# ...hybrid polynomial/kan...
|
|
57
|
+
|
|
58
|
+
# %%
|
|
59
|
+
torch.manual_seed(42)
|
|
60
|
+
model_3c = KAN(layer_dims=[2, 4, 1], grid_size=5, spline_order=3, symbolic_order=2, spline_dropout=0.8)
|
|
61
|
+
demo_3c = KANDemonstrator(
|
|
62
|
+
model=model_3c,
|
|
63
|
+
target_fn=lambda x: (x[:, 0:1]**2) * torch.cos(x[:, 1:2]),
|
|
64
|
+
feature_fn=lambda x: torch.cat([x[:, 0:1]**2, torch.cos(x[:, 1:2])], dim=1)
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
demo_3c.train(generate_x_theta_train(steps=800), epochs=1000)
|
|
68
|
+
demo_3c.plot(generate_x_theta_eval(-4.0, 4.0), "3c. Hybrid Deep Discovery")
|
|
69
|
+
|
|
70
|
+
# %% [markdown]
|
|
71
|
+
# # 4a. Multi-target Surgical Detachment
|
|
72
|
+
#
|
|
73
|
+
# **Goal:** Demonstrate that the Dual Severity Tracker is not a global panic button, but a surgical, node-specific diagnostic tool.
|
|
74
|
+
#
|
|
75
|
+
# We will map a system with two outputs:
|
|
76
|
+
# * $y_1$ relies heavily on an $x^2$ anomaly.
|
|
77
|
+
# * $y_2$ is highly insulated, relying on a stable $\cos(\theta)$ feature and a tiny fractional coefficient of $x$.
|
|
78
|
+
#
|
|
79
|
+
# **The Expectation:** When $x$ goes violently out of bounds, the network should aggressively firewall $y_1$ (high severity) while leaving $y_2$ almost completely untouched. The severity is quarantined because the underlying linear weights strictly dictate the localized interval routing ($|W| \cdot D$).
|
|
80
|
+
|
|
81
|
+
# %%
|
|
82
|
+
torch.manual_seed(42)
|
|
83
|
+
|
|
84
|
+
def target_multi(x):
|
|
85
|
+
# y1 is highly sensitive to the out-of-bounds explosion
|
|
86
|
+
y1 = x[:, 0:1]**2
|
|
87
|
+
# y2 is insulated, relying mostly on bounded cos(theta)
|
|
88
|
+
y2 = (1e-3 * x[:, 0:1]) + torch.cos(x[:, 1:2])
|
|
89
|
+
return torch.cat([y1, y2], dim=1)
|
|
90
|
+
|
|
91
|
+
def feature_multi(x):
|
|
92
|
+
# Provide the exact bases so the symbolic track can perfectly map the weights
|
|
93
|
+
return torch.cat([
|
|
94
|
+
x[:, 0:1], # Raw x
|
|
95
|
+
x[:, 0:1]**2, # The x^2 anomaly
|
|
96
|
+
torch.cos(x[:, 1:2]) # The bounded periodic feature
|
|
97
|
+
], dim=1)
|
|
98
|
+
|
|
99
|
+
# We use 3 inputs for the 3 explicit features.
|
|
100
|
+
# symbolic_order=1 lets the global skip-connection effortlessly lock onto the correct features.
|
|
101
|
+
model_4a = KAN(
|
|
102
|
+
layer_dims=[3, 4, 2],
|
|
103
|
+
grid_size=5,
|
|
104
|
+
spline_order=3,
|
|
105
|
+
symbolic_order=1,
|
|
106
|
+
spline_dropout=0.8
|
|
107
|
+
)
|
|
108
|
+
|
|
109
|
+
demo_4a = KANDemonstrator(
|
|
110
|
+
model=model_4a,
|
|
111
|
+
target_fn=target_multi,
|
|
112
|
+
feature_fn=feature_multi
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
demo_4a.train(generate_x_theta_train(steps=800), epochs=1000)
|
|
116
|
+
demo_4a.plot(generate_x_theta_eval(-4.0, 4.0), "4a. Multi-target Surgical Detachment")
|
|
117
|
+
|
|
118
|
+
# %%
|
|
119
|
+
# 4b. Perfect Surgical Detachment (Shallow Architecture)
|
|
120
|
+
def target_multi(x):
|
|
121
|
+
# y1 is highly sensitive to the out-of-bounds explosion
|
|
122
|
+
y1 = x[:, 0:1]**2
|
|
123
|
+
# y2 is insulated, relying mostly on bounded cos(theta)
|
|
124
|
+
y2 = (1e-3 * x[:, 0:1]) + torch.cos(x[:, 1:2])
|
|
125
|
+
return torch.cat([y1, y2], dim=1)
|
|
126
|
+
|
|
127
|
+
def feature_multi(x):
|
|
128
|
+
# Provide the exact bases so the symbolic track can perfectly map the weights
|
|
129
|
+
return torch.cat([
|
|
130
|
+
x[:, 0:1], # Raw x
|
|
131
|
+
x[:, 0:1]**2, # The x^2 anomaly
|
|
132
|
+
torch.cos(x[:, 1:2]) # The bounded periodic feature
|
|
133
|
+
], dim=1)
|
|
134
|
+
|
|
135
|
+
model_4b = KAN(
|
|
136
|
+
layer_dims=[3, 2], # NO hidden layers. Pure direct mapping.
|
|
137
|
+
grid_size=5,
|
|
138
|
+
spline_order=3,
|
|
139
|
+
symbolic_order=1,
|
|
140
|
+
spline_dropout=0.8
|
|
141
|
+
)
|
|
142
|
+
|
|
143
|
+
demo_4b = KANDemonstrator(model=model_4b, target_fn=target_multi, feature_fn=feature_multi)
|
|
144
|
+
demo_4b.train(generate_x_theta_train(steps=800), epochs=1000)
|
|
145
|
+
demo_4b.plot(generate_x_theta_eval(-4.0, 4.0), "4b. Perfect Quarantine (Shallow)")
|
|
146
|
+
|
|
147
|
+
# %%
|
physkan-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Based on efficient-kan, copyright (c) 2024 Huanqi Cao.
|
|
4
|
+
Modifications copyright (c) 2026 Simula Research Laboratory.
|
|
5
|
+
|
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
7
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
8
|
+
in the Software without restriction, including without limitation the rights
|
|
9
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
10
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
11
|
+
furnished to do so, subject to the following conditions:
|
|
12
|
+
|
|
13
|
+
The above copyright notice and this permission notice shall be included in all
|
|
14
|
+
copies or substantial portions of the Software.
|
|
15
|
+
|
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
17
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
18
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
19
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
20
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
21
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
22
|
+
SOFTWARE.
|
physkan-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: physkan
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A physics-constrained Kolmogorov-Arnold Network with bounded latent spaces.
|
|
5
|
+
License-File: LICENSE
|
|
6
|
+
Requires-Python: >=3.12
|
|
7
|
+
Requires-Dist: torch>=2.9.0
|
|
8
|
+
Provides-Extra: dev
|
|
9
|
+
Requires-Dist: ruff; extra == 'dev'
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
|
|
12
|
+
# PhysKAN
|
|
13
|
+
|
|
14
|
+
**Physics-constrained Kolmogorov-Arnold Networks for stable system identification**
|
|
15
|
+
|
|
16
|
+
This repository provides a structural adaptation of the B-spline Kolmogorov-Arnold Network (KAN) architecture, designed for physical system identification, digital twins, and robust regression.
|
|
17
|
+
|
|
18
|
+
While standard KANs perform well at function approximation in purely mathematical domains, applying them to physical telemetry often requires interventions, like dynamic grid updates or statistical normalization such as LayerNorm, to handle out-of-bounds (OOB) anomalies.
|
|
19
|
+
In this context, OOB refers to any data point that exceeds the nominal operational range of the system, whether caused by a real but long-tail phenomenon (e.g., unseen weather regimes) or a transient sensor failure (e.g., signal spikes).
|
|
20
|
+
Unfortunately, these standard deep learning techniques remove the spatial meaning of the network's internal variables.
|
|
21
|
+
|
|
22
|
+
This architecture addresses this by freezing the spatial grid and enforcing strict physical bounds natively, prioritizing metric stability and OOB safety over localized curve-fitting flexibility.
|
|
23
|
+
It also uses forward uncertainty propagation with interval arithmetic to track the OOB state through the network.
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
## Core design philosophy
|
|
27
|
+
|
|
28
|
+
PhysKAN is built on three central ideas, meant to bridge the gap between theoretical non-linear mapping and the robust fail-safes required for physical engineering:
|
|
29
|
+
|
|
30
|
+
1. **Progressive Koopman-style unbending:** Rather than relying on black-box MLP node activations, the model acts as a structural filter.
|
|
31
|
+
It uses constrained B-splines to progressively unbend non-linear physical inputs layer-by-layer, lifting them into a linearized latent space (analogous to finding "observables" in Koopman Operator Theory).
|
|
32
|
+
|
|
33
|
+
2. **Embrace out-of-bounds (OOB) values:** Real-world physics does not stay neatly within standardized grids.
|
|
34
|
+
Instead of arbitrarily squashing long-tail events or sensor glitches with clamps or global activations, the architecture uses the grid range to explicitly define the boundary between the dense, well-modeled operational regime and the sparse, asymptotic tail.
|
|
35
|
+
OOB states are safely clamped on the non-linear spline track and routed unclamped through a parallel linear track, ensuring mathematically stable extrapolation.
|
|
36
|
+
|
|
37
|
+
3. **Epistemic uncertainty tracking:** The network computes a continuous dual property alongside the physical prediction.
|
|
38
|
+
This signal forward-propagates the mathematical severity of any out-of-bounds state, providing a deterministic measure of when the network is forced to extrapolate.
|
|
39
|
+
|
|
40
|
+
---
|
|
41
|
+
|
|
42
|
+
## Under the hood: the OOB routing mechanism
|
|
43
|
+
|
|
44
|
+
To safely execute this philosophy, the network requires a specific mental model for how it routes data—especially during the backward pass.
|
|
45
|
+
|
|
46
|
+
In standard implementations, out-of-bounds data either "falls off" the spline grid entirely (dropping to zero) or requires the input to be clamped or bounded.
|
|
47
|
+
However, if clamped *without* gradient detachment, the boundary knot absorbs the training loss for all out-of-bounds states.
|
|
48
|
+
It becomes a wastebasket for outlying values, compressing the long-tail distribution into a single coordinate and warping predictions for nominal operations.
|
|
49
|
+
|
|
50
|
+
The PhysKAN architecture acts as a traffic cop for physical regimes:
|
|
51
|
+
* **The nominal regime (non-linear track):** Dense, expected data operates inside the grid, shaping the non-linear B-splines.
|
|
52
|
+
* **The out-of-bounds regime (linear track):** OOB data are clamped on the non-linear track (with detached gradients to protect the nominal-range knots).
|
|
53
|
+
The excess signal flows entirely through the linear track.
|
|
54
|
+
|
|
55
|
+
This ensures the non-linear splines strictly learn the nominal physics, while the linear track safely catches long-tail events.
|
|
56
|
+
|
|
57
|
+
## Architectural constraints
|
|
58
|
+
|
|
59
|
+
To maintain the absolute physical meaning of these latent observables during deployment, the model relies on two structural constraints:
|
|
60
|
+
|
|
61
|
+
### 1. Static grid boundaries
|
|
62
|
+
|
|
63
|
+
KAN architectures often rely on dynamic grid updates (knot insertion or movement) during training.
|
|
64
|
+
This architecture disables this.
|
|
65
|
+
Dynamic updates shift the underlying coordinate system of the network mid-training, causing downstream layers to lose their physical calibration.
|
|
66
|
+
By enforcing a static grid, the model sacrifices some theoretical curve-fitting capacity to guarantee that a specific latent state retains its exact metric meaning from initialization to deployment.
|
|
67
|
+
|
|
68
|
+
### 2. Linear skip connections as safety valves
|
|
69
|
+
Because the spline gradients are detached for OOB values, the network routes the excess gradients entirely through the parallel linear skip connection.
|
|
70
|
+
This serves as a vital safety valve: it protects the non-linear splines from gradient pollution, and it ensures that OOB inputs extrapolate linearly and predictably.
|
|
71
|
+
This limits the downstream impact of anomalies, making system filtering more reliable.
|
|
72
|
+
|
|
73
|
+
#### Justification for linear extrapolation (physical basis functions)
|
|
74
|
+
|
|
75
|
+
While real-world OOB events often exhibit higher-order scaling (e.g., cubic wave resistance), the model enforces a linear default for OOB extrapolation.
|
|
76
|
+
This is a deliberate design choice to prevent mathematical instability caused by sensor faults.
|
|
77
|
+
|
|
78
|
+
To safely capture higher-order OOB physics, domain knowledge should be embedded directly via feature engineering.
|
|
79
|
+
As long as the input features form a sufficient physical basis, particularly for asymptotic behaviours, the linear skip connection will naturally capture higher-order OOB phenomena as a linear combination of features without compromising the nominal operating region.
|
|
80
|
+
|
|
81
|
+
Applying a post-summation node activation (such as `SiLU` or `tanh`) fundamentally sabotages this mechanism.
|
|
82
|
+
A non-linear activation will warp the magnitude of the OOB event, rendering the linear skip connection unable to model it.
|
|
83
|
+
For this reason, activations are disabled by default (using `Identity`).
|
|
84
|
+
Other activations may be selected, but beware that the guarantees provided by "standard" PhysKAN may be weakened or destroyed.
|
|
85
|
+
|
|
86
|
+
## Feature engineering and explicit interactions
|
|
87
|
+
|
|
88
|
+
Deep architectures can theoretically learn multiplicative interactions (such as computing `x * y` by combining multiple layers).
|
|
89
|
+
Making the network deduce these relationships from scratch consumes capacity and degrades poorly when out-of-bounds.
|
|
90
|
+
Instead, to capture known physical behaviors, domain knowledge should be embedded directly via feature engineering.
|
|
91
|
+
Providing the network with a dictionary of physical basis functions (e.g., `x^2` or `cos(θ)`) allows the linear skip connection to latch onto these engineered features as a stable baseline.
|
|
92
|
+
This leaves the splines to map the local residuals, ensuring safe extrapolation when the splines saturate.
|
|
93
|
+
|
|
94
|
+
However, combining features naively can mask out-of-bounds anomalies.
|
|
95
|
+
If you manually pre-compute an interaction like `wave_height * cos(wind_dir)` and pass it to the network as a raw input, the anomaly signal is suppressed.
|
|
96
|
+
For instance, if `wave_height` is OOB (e.g., twice nominal range) but `cos(wind_dir)` is near zero, their product is well within nominal bounds.
|
|
97
|
+
The model treats this as a regular in-bounds prediction, and uses the data point to update its nominal-range spline.
|
|
98
|
+
|
|
99
|
+
To prevent this suppression, the network requires interaction terms to be defined internally via an `interaction_map` rather than expanded manually beforehand.
|
|
100
|
+
|
|
101
|
+
The network computes a continuous dual property alongside the standard physical prediction.
|
|
102
|
+
This dual represents the mathematical severity of the out-of-bounds state.
|
|
103
|
+
* The *physical prediction* is computed using the non-linear splines and the linear track.
|
|
104
|
+
* The *dual severity* strictly bypasses the splines and propagates via the absolute values of the linear weights, ensuring that uncertainties compound and never cancel out.
|
|
105
|
+
|
|
106
|
+
By defining interactions explicitly through the `interaction_map`, the model correctly applies the uncertainty product rule to the input features before they enter the network.
|
|
107
|
+
If a large wave anomaly interacts with a nominal-range cosine, the resulting interaction term inherits a proportional severity score.
|
|
108
|
+
This deterministic distress signal persists through the entire depth of the network, ensuring that the non-linear splines are firewalled from learning from the anomaly, while the linear track safely handles the extrapolated magnitude.
|
|
109
|
+
It also provides downstream consumers with a clear indicator of when the model is operating on dodgy data.
|
|
110
|
+
|
|
111
|
+
### Defining the nominal range: data density vs. physical limits
|
|
112
|
+
|
|
113
|
+
When defining the `grid_range` and normalizing inputs, the boundaries should reflect the density of the training data rather than the theoretical limits of the physical system.
|
|
114
|
+
|
|
115
|
+
B-splines require consistent data distribution across their internal grid to form a stable curve.
|
|
116
|
+
If for example a physical feature (such as wave height) has a theoretical operational limit of 5.0 meters, but the training dataset becomes sparse above 2.0 meters, setting the spline boundary to 5.0 meters forces the model to fit curves in an under-constrained region.
|
|
117
|
+
This often causes the splines to oscillate or overfit to a handful of isolated data points.
|
|
118
|
+
|
|
119
|
+
Instead, the grid boundary should be placed where the data density noticeably drops off (e.g., at 2.0 meters).
|
|
120
|
+
By treating the sparse region as out-of-bounds, the network safely clamps the splines in the dense region and relies on the linear track to extrapolate smoothly through the sparse tail.
|
|
121
|
+
The working principle is to treat the nominal range strictly as the bounds of the dense training data.
|
|
122
|
+
|
|
123
|
+
## Installation
|
|
124
|
+
|
|
125
|
+
You can install the package directly from GitHub:
|
|
126
|
+
|
|
127
|
+
```bash
|
|
128
|
+
pip install git+https://github.com/simula/physkan.git
```
|
|
129
|
+
|
|
130
|
+
## Usage example
|
|
131
|
+
|
|
132
|
+
The model handles explicit feature expansion and interval arithmetic internally. A standard linear layer should be used as the final readout.
|
|
133
|
+
|
|
134
|
+
```python
|
|
135
|
+
import torch
|
|
136
|
+
import torch.nn as nn
|
|
137
|
+
from physkan import KAN
|
|
138
|
+
|
|
139
|
+
# Define explicit cross-terms using indices
|
|
140
|
+
# e.g., for features [wave, wind, cos_dir]:
|
|
141
|
+
# [0, 0] adds wave^2
|
|
142
|
+
# [0, 2] adds wave * cos_dir
|
|
143
|
+
interactions = [[0, 0], [0, 2]]
|
|
144
|
+
|
|
145
|
+
# The KAN model automatically expands the initial input dimension
|
|
146
|
+
# and sets up the continuous dual routing.
|
|
147
|
+
kan_encoder = KAN(
|
|
148
|
+
layers_dims=[3, 16, 8], # Input dim is 3 (wave, wind, cos_dir)
|
|
149
|
+
grid_range=(0.0, 1.0),
|
|
150
|
+
interaction_map=interactions
|
|
151
|
+
)
|
|
152
|
+
|
|
153
|
+
# The readout: Linear combination of the final observables of a zero-at-rest (unbiased) system
|
|
154
|
+
linear_mixer = nn.Linear(in_features=8, out_features=1, bias=False)
|
|
155
|
+
|
|
156
|
+
model = nn.Sequential(
|
|
157
|
+
kan_encoder,
|
|
158
|
+
linear_mixer
|
|
159
|
+
)
|
|
160
|
+
|
|
161
|
+
# Nominal physical data
|
|
162
|
+
x_nominal = torch.tensor([[0.5, 0.8, 0.1]])
|
|
163
|
+
|
|
164
|
+
# Pass data through the encoder, requesting the dual distress signal
|
|
165
|
+
latent_features, severity_signal = kan_encoder(x_nominal, return_dual=True)
|
|
166
|
+
prediction = linear_mixer(latent_features)
|
|
167
|
+
|
|
168
|
+
# For an out-of-bounds event (e.g., wave height sensor reads 5.0)
|
|
169
|
+
x_oob = torch.tensor([[5.0, 0.8, 0.1]])
|
|
170
|
+
latent_oob, severity_oob = kan_encoder(x_oob, return_dual=True)
|
|
171
|
+
|
|
172
|
+
# severity_oob > 0 indicates the prediction relies on mathematically
|
|
173
|
+
# extrapolated values, allowing downstream logic to trigger heuristics.
|
|
174
|
+
if severity_oob.mean() > 0.0:
|
|
175
|
+
print("Warning: operating in uncharted physical regime.")
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
## Attribution
|
|
179
|
+
|
|
180
|
+
This repository is an adaptation of the excellent **[efficient-kan](https://github.com/Blealtan/efficient-kan)** library by Blealtan.
|
|
181
|
+
|
|
182
|
+
The core B-spline evaluation mechanics, memory-efficient tensor formulation, and foundational matrix operations are directly derived from `efficient-kan`.
|
|
183
|
+
The modifications introduced here are strictly architectural (specifically the detached routing, strict boundary clamping, interval arithmetic dual, and default identity activations) and are designed to constrain the network for physical system identification.
|
|
184
|
+
Full credit for the underlying efficiency and base implementation belongs to the original author.
|