python-fedci 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Maximilian Hahn
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,112 @@
1
+ Metadata-Version: 2.2
2
+ Name: python-fedci
3
+ Version: 0.1.0
4
+ Summary: A small package for federated independence tests
5
+ Author-email: Maximilian Hahn <max.hahn@gmx.de>
6
+ License: MIT License
7
+
8
+ Copyright (c) 2025 Maximilian Hahn
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Project-URL: Homepage, https://github.com/maxhahn/fedci
29
+ Project-URL: Repository, https://github.com/maxhahn/fedci
30
+ Requires-Python: >=3.10
31
+ Description-Content-Type: text/markdown
32
+ License-File: LICENSE
33
+ Requires-Dist: polars>=1.22.0
34
+ Requires-Dist: pyarrow>=19.0.0
35
+ Requires-Dist: rpyc>=6.0.1
36
+ Requires-Dist: scipy>=1.15.1
37
+ Requires-Dist: statsmodels>=0.14.4
38
+
39
+ # About this Project
40
+
41
+ This project aims to create an accessible platform and user interface to obtain causal knowledge from distributed datasets.
42
+ Different parties can share key insights about their data without compromising their privacy.
43
+ Results come in the form of causal graphs (PAGs).
44
+
45
+ There are two main algorithms this project supports:
46
+ * rIOD
47
+ * FedGLM
48
+
49
+ ## General
50
+
51
+ This application is made up of two main components.
52
+ There is a streamlit UI and a litestar server.
53
+
54
+ Via the streamlit UI, one can connect to an existing server instance and run distributed/federated algorithms with peers connected to the same server.
55
+
56
+ It is designed to be easily self-hostable, and is fully contained within a docker container.
57
+
58
+ **Beware**: As of now, this client-server architecture communicates via http (_not_ https!).
59
+ As such, a malicious agent may spoof your identity or steal data that is transmitted over the network.
60
+
61
+ When hosting this application, you may use a reverse-proxy and your own SSL certificates to enable https.
62
+
63
+ ## rIOD
64
+
65
+ This project implements the IOD algorithm created by [Tillman and Spirtes](http://proceedings.mlr.press/v15/tillman11a.html).
66
+
67
+ When running rIOD, (conditional) independence tests are performed and the resulting p-values are transmitted to the server.
68
+ On the server-side, these p-values are aggregated to give insights about the independences in the distributed dataset.
69
+
70
+ rIOD only supports numerical (float) features.
71
+ As per the IOD algorithm, not all participating parties have to have identical features in their dataset.
72
+
73
+ Without code modifications, this application will only ever transmit p-values and PAG adjacency matrices over the network.
74
+ The original dataset is kept completely private and is never transmitted over the network.
75
+ (This can be easily checked inside the source code)
76
+
77
+ ## FedGLM
78
+
79
+ This project implements an algorithm (titled FedGLM for now) which utilizes federated learning to create linear models, which are then used for likelihood-ratio tests in order to obtain independence information about the dataset.
80
+
81
+ FedGLM supports numerical (float), categorical (string), and ordinal (int) features.
82
+ Similar to IOD, here as well, not all participating parties have to have the exact same feature set.
83
+
84
+ This algorithm requires the transmission of the expression levels of ordinal and categorical variables.
85
+ Additionally, when the algorithm is running, linear model coefficients are transmitted, as well as matrices that do not contain recreatable information about the data (see algorithm 2 in [Cellamare et al.](https://www.mdpi.com/1999-4893/15/7/243)).
86
+ As such, data privacy is preserved.
87
+
88
+ ## Setup
89
+
90
+ First, install docker and docker-compose.
91
+
92
+ Use the following commands to build and run the application:
93
+
94
+ * Run client and server on same machine:
95
+ `docker-compose up`
96
+
97
+ * Run client or server only:
98
+ `docker-compose up client`
99
+ `docker-compose up server`
100
+
101
+ * The client can be accessed via `localhost:8081` on the host machine.
102
+ When hosting the client on a machine within the same network, use its IP address and ensure proper connectivity between the two machines.
103
+
104
+ * The first step within the client is to connect to a server.
105
+ A server hosted by us can be used with the following URL: `heiderlab.com:8080`
106
+
107
+ ## Configuration
108
+
109
+ No major changes of this setup should be required.
110
+ All configurations can be changed in `docker-compose.yml`.
111
+
112
+ The only reasonable change is the port mapping, if your host machine requires specific ports to be exposed.
@@ -0,0 +1,74 @@
1
+ # About this Project
2
+
3
+ This project aims to create an accessible platform and user interface to obtain causal knowledge from distributed datasets.
4
+ Different parties can share key insights about their data without compromising their privacy.
5
+ Results come in the form of causal graphs (PAGs).
6
+
7
+ There are two main algorithms this project supports:
8
+ * rIOD
9
+ * FedGLM
10
+
11
+ ## General
12
+
13
+ This application is made up of two main components.
14
+ There is a streamlit UI and a litestar server.
15
+
16
+ Via the streamlit UI, one can connect to an existing server instance and run distributed/federated algorithms with peers connected to the same server.
17
+
18
+ It is designed to be easily self-hostable, and is fully contained within a docker container.
19
+
20
+ **Beware**: As of now, this client-server architecture communicates via http (_not_ https!).
21
+ As such, a malicious agent may spoof your identity or steal data that is transmitted over the network.
22
+
23
+ When hosting this application, you may use a reverse-proxy and your own SSL certificates to enable https.
24
+
25
+ ## rIOD
26
+
27
+ This project implements the IOD algorithm created by [Tillman and Spirtes](http://proceedings.mlr.press/v15/tillman11a.html).
28
+
29
+ When running rIOD, (conditional) independence tests are performed and the resulting p-values are transmitted to the server.
30
+ On the server-side, these p-values are aggregated to give insights about the independences in the distributed dataset.
31
+
32
+ rIOD only supports numerical (float) features.
33
+ As per the IOD algorithm, not all participating parties have to have identical features in their dataset.
34
+
35
+ Without code modifications, this application will only ever transmit p-values and PAG adjacency matrices over the network.
36
+ The original dataset is kept completely private and is never transmitted over the network.
37
+ (This can be easily checked inside the source code)
38
+
39
+ ## FedGLM
40
+
41
+ This project implements an algorithm (titled FedGLM for now) which utilizes federated learning to create linear models, which are then used for likelihood-ratio tests in order to obtain independence information about the dataset.
42
+
43
+ FedGLM supports numerical (float), categorical (string), and ordinal (int) features.
44
+ Similar to IOD, here as well, not all participating parties have to have the exact same feature set.
45
+
46
+ This algorithm requires the transmission of the expression levels of ordinal and categorical variables.
47
+ Additionally, when the algorithm is running, linear model coefficients are transmitted, as well as matrices that do not contain recreatable information about the data (see algorithm 2 in [Cellamare et al.](https://www.mdpi.com/1999-4893/15/7/243)).
48
+ As such, data privacy is preserved.
49
+
50
+ ## Setup
51
+
52
+ First, install docker and docker-compose.
53
+
54
+ Use the following commands to build and run the application:
55
+
56
+ * Run client and server on same machine:
57
+ `docker-compose up`
58
+
59
+ * Run client or server only:
60
+ `docker-compose up client`
61
+ `docker-compose up server`
62
+
63
+ * The client can be accessed via `localhost:8081` on the host machine.
64
+ When hosting the client on a machine within the same network, use its IP address and ensure proper connectivity between the two machines.
65
+
66
+ * The first step within the client is to connect to a server.
67
+ A server hosted by us can be used with the following URL: `heiderlab.com:8080`
68
+
69
+ ## Configuration
70
+
71
+ No major changes of this setup should be required.
72
+ All configurations can be changed in `docker-compose.yml`.
73
+
74
+ The only reasonable change is the port mapping, if your host machine requires specific ports to be exposed.
@@ -0,0 +1,4 @@
1
+ from .server import Server, ProxyServer
2
+ from .client import Client, ProxyClient
3
+ from .utils import VariableType
4
+ from .evaluation import get_symmetric_likelihood_tests
@@ -0,0 +1,405 @@
1
+ from scipy.sparse.linalg._eigen.lobpcg.lobpcg import LinAlgError
2
+ from .utils import VariableType, ClientResponseData, BetaUpdateData
3
+ import polars as pl
4
+ import numpy as np
5
+ import scipy
6
+ import rpyc
7
+
8
+ from typing import Dict, List
9
+
10
+ from .env import DEBUG, EXPAND_ORDINALS, RIDGE, LR, OVR
11
+
12
+ import statsmodels.api as sm
13
+ from statsmodels.genmod.generalized_linear_model import GLMResults
14
+ from statsmodels.genmod.families import family
15
+
16
class ComputationHelper():
    """Stateless helpers that evaluate a GLM at a fixed coefficient vector.

    Nothing is fitted here: a statsmodels ``GLMResults`` object is built around
    externally supplied coefficients, and the weighted cross-products used for
    an iteratively-reweighted-least-squares update are derived from it.
    """

    @staticmethod
    def get_regression_model(y, X, beta, glm_family):
        """Wrap ``beta`` in a ``GLMResults`` for the GLM of ``y`` on ``X``.

        Args:
            y: response vector.
            X: design matrix.
            beta: coefficient vector handed to ``GLMResults`` as its params.
            glm_family: statsmodels family instance passed to ``sm.GLM``.

        Returns:
            A ``GLMResults`` created with ``normalized_cov_params=None`` and
            ``scale=None``.
        """
        model = sm.GLM(y, X, family=glm_family)
        return GLMResults(model, beta, normalized_cov_params=None, scale=None)

    @staticmethod
    def run_model(y, X, model):
        """Compute likelihood figures and the ``X'WX`` / ``X'Wz`` statistics.

        Args:
            y: response vector the model was built on.
            X: design matrix the model was built on (rows are observations).
            model: results object as returned by ``get_regression_model``.

        Returns:
            Dict with keys ``'llf'``, ``'deviance'``, ``'xwx'`` and ``'xwz'``.

        Raises:
            Exception: if the model family is neither Gaussian nor Binomial.
        """
        llf = model.llf
        deviance = model.deviance

        # Linear predictor, mean, and derivative of the inverse link.
        eta = model.predict(which='linear')
        link = model.family.link
        mu = link.inverse(eta)
        # Clipped so the division below cannot hit zero.
        dmu_deta = np.clip(link.inverse_deriv(eta), 1e-8, 1 - 1e-8)

        # Working response; LR scales the step away from the current eta.
        z = eta + LR * (y - mu) / dmu_deta

        if isinstance(model.family, family.Gaussian):
            var_y = np.var(y - mu)
        elif isinstance(model.family, family.Binomial):
            var_y = dmu_deta
        else:
            raise Exception(f'Cannot handle model family {model.family.__class__.__name__}')

        # W is diagonal, so scale the columns of X.T by broadcasting instead
        # of materialising a dense N x N matrix with np.diag.
        weights = (dmu_deta ** 2) / var_y
        xw = X.T * weights
        xwx = xw @ X
        xwz = xw @ z

        return {'llf': llf, 'deviance': deviance, 'xwx': xwx, 'xwz': xwz}

    @classmethod
    def run_regression(cls, y, X, beta, glm_family):
        """Build the model for ``beta`` and return ``run_model``'s result."""
        model = cls.get_regression_model(y, X, beta, glm_family)
        return cls.run_model(y, X, model)
61
+
62
class ComputationUnit():
    """Common interface of the per-variable-type regression computations."""

    @staticmethod
    def compute(data, y_label, X_labels, beta):
        """Evaluate the regression of ``y_label`` on ``X_labels`` at ``beta``.

        Subclasses override this; the base implementation always raises
        ``NotImplementedError``.
        """
        raise NotImplementedError
66
+
67
class ContinousComputationUnit(ComputationUnit):
    """Gaussian GLM statistics for a continuous response."""

    @staticmethod
    def compute(data, y_label, X_labels, beta):
        """Evaluate a Gaussian regression of ``y_label`` on ``X_labels``.

        Args:
            data: frame exposing ``to_pandas()``.
            y_label: name of the response column.
            X_labels: predictor column names; they are sorted before use and
                a constant column of ones is appended as the last column.
            beta: mapping holding exactly one coefficient vector.

        Returns:
            The dict produced by ``ComputationHelper.run_regression``.
        """
        assert len(beta) == 1, 'Continuos regression called with more than one beta'
        beta = list(beta.values())[0]

        # Convert once and assemble the design matrix in NumPy. Assigning a
        # new column to a column-subset DataFrame can trigger pandas'
        # SettingWithCopyWarning.
        frame = data.to_pandas()
        predictors = frame[sorted(X_labels)].to_numpy().astype(float)
        X = np.column_stack([predictors, np.ones(len(frame))])

        y = frame[y_label].to_numpy().astype(float)

        return ComputationHelper.run_regression(
            y=y,
            X=X,
            beta=beta,
            glm_family=family.Gaussian()
        )
86
+
87
class BinaryComputationUnit(ComputationUnit):
    """Binomial GLM statistics for a binary response."""

    @staticmethod
    def compute(data, y_label, X_labels, beta):
        """Evaluate a Binomial regression of ``y_label`` on ``X_labels``.

        Args:
            data: frame exposing ``to_pandas()``.
            y_label: name of the response column.
            X_labels: predictor column names; they are sorted before use and
                a constant column of ones is appended as the last column.
            beta: mapping holding exactly one coefficient vector.

        Returns:
            The dict produced by ``ComputationHelper.run_regression``.
        """
        assert len(beta) == 1, 'Binary regression called with more than one beta'
        beta = list(beta.values())[0]

        # Convert once and assemble the design matrix in NumPy. Assigning a
        # new column to a column-subset DataFrame can trigger pandas'
        # SettingWithCopyWarning.
        frame = data.to_pandas()
        predictors = frame[sorted(X_labels)].to_numpy().astype(float)
        X = np.column_stack([predictors, np.ones(len(frame))])

        y = frame[y_label].to_numpy().astype(float)

        return ComputationHelper.run_regression(
            y=y,
            X=X,
            beta=beta,
            glm_family=family.Binomial()
        )
106
+
107
class CategoricalComputationUnit(ComputationUnit):
    """Multinomial (softmax) regression statistics for a categorical response.

    The first ``{y_label}__cat__*`` column found in ``data.columns`` acts as
    the reference category; coefficients exist for the other ``J - 1`` only.
    """

    @staticmethod
    def compute(data, y_label, X_labels, beta):
        """Return log-likelihood, deviance and the ``xwx`` / ``xwz`` statistics.

        Args:
            data: frame exposing ``columns`` and ``to_pandas()``; it must hold
                the one-hot columns ``{y_label}__cat__<value>``.
            y_label: name of the categorical response.
            X_labels: predictor column names; they are sorted before use and
                a constant column of ones is appended as the last column.
            beta: mapping with exactly one entry, keyed by ``y_label``: a flat
                vector reshaped column-major into ``K x (J-1)``.

        Returns:
            Dict with ``'llf'``, ``'deviance'`` and ``'beta_update_data'``,
            the latter being ``{y_label: {'xwx': ..., 'xwz': ...}}``.
        """
        assert len(beta) == 1, 'Multinomial regression called with more than one beta'
        beta = beta[y_label]

        # Convert to pandas once; every matrix below is sliced from this frame.
        frame = data.to_pandas()

        # Identify the dummy columns for the response
        y_dummy_columns = [c for c in data.columns if c.startswith(f'{y_label}__cat__')]

        # Design matrix with the constant as the last column
        X = np.column_stack([
            frame[sorted(X_labels)].to_numpy().astype(float),
            np.ones(len(frame)),
        ])

        num_categories = len(y_dummy_columns)  # J
        num_features = len(X_labels) + 1  # K
        num_params = num_features * (num_categories - 1)

        def softmax(eta):
            # A zero column is prepended for the reference category.
            exp_eta = np.exp(np.hstack([np.zeros((eta.shape[0], 1)), eta]))
            return exp_eta / exp_eta.sum(axis=1, keepdims=True)

        Y_full = frame[y_dummy_columns].to_numpy()  # N x J
        Y = Y_full[:, 1:]  # N x (J-1), reference category dropped

        # Reshape beta (K x (J-1))
        beta = beta.reshape(num_features, -1, order='F')

        # Compute eta and mu
        eta = np.clip(X @ beta, -350, 350)  # N x (J-1)
        mu = np.clip(softmax(eta), 1e-8, 1 - 1e-8)  # N x J
        mu_reduced = mu[:, 1:]  # N x (J-1)

        # Per-observation weight blocks W_i = diag(p_i) - p_i p_i^T,
        # stacked into an N x (J-1) x (J-1) array.
        W = (
            mu_reduced[:, :, None] * np.eye(num_categories - 1)
            - mu_reduced[:, :, None] * mu_reduced[:, None, :]
        )

        # With z_i = eta_i + W_i^{-1} (y_i - p_i), the product W_i z_i equals
        # W_i eta_i + (y_i - p_i). Computing it this way avoids inverting W_i,
        # which is near-singular when the probabilities sit at the clip bounds.
        Wz = np.einsum('njk,nk->nj', W, eta) + (Y - mu_reduced)

        # X_i = kron(I, x_i), so X_i^T W_i X_i = kron(W_i, x_i^T x_i) and
        # X_i^T v = kron(v, x_i^T). Sum over observations; the flattened index
        # is category * K + feature, matching the order='F' reshape of beta.
        XWX = np.einsum('njk,na,nb->jakb', W, X, X).reshape(num_params, num_params)
        XWz = np.einsum('nj,na->ja', Wz, X).reshape(num_params)

        # Compute log-likelihood and deviance (mu is already clipped above)
        llf = np.sum(Y_full * np.log(mu))
        deviance = -2 * llf

        results = {y_label: {'xwx': XWX, 'xwz': XWz}}

        return {
            'llf': llf,
            'deviance': deviance,
            'beta_update_data': results
        }
175
+
176
+
177
class CategoricalOVRComputationUnit(ComputationUnit):
    """One-vs-rest treatment of a categorical response.

    One Binomial model is evaluated per key of ``betas``; their linear
    predictors are then combined into multinomial probabilities to obtain a
    single log-likelihood.
    """

    @staticmethod
    def compute(data, y_label, X_labels, betas):
        """Return log-likelihood, deviance and per-category update statistics.

        Args:
            data: frame exposing ``to_pandas()`` and ``len()``; each key of
                ``betas`` must be a column of it.
            y_label: name of the categorical response (not read here; the
                indicator columns are addressed through the keys of ``betas``).
            X_labels: predictor column names; they are sorted before use and
                a constant column of ones is appended as the last column.
            betas: mapping from indicator column name to coefficient vector.

        Returns:
            Dict with ``'llf'``, ``'deviance'`` and ``'beta_update_data'``,
            the latter mapping each category to ``{'xwx': ..., 'xwz': ...}``.
        """
        # Convert to pandas once instead of once per category and loop.
        frame = data.to_pandas()
        X = np.column_stack([
            frame[sorted(X_labels)].to_numpy().astype(float),
            np.ones(len(frame)),
        ])

        # Indicator vector per category, reused by both passes below.
        indicators = {
            category: frame[category].to_numpy().astype(float)
            for category in betas
        }

        models = {}
        results = {}
        for category, y in indicators.items():
            models[category] = ComputationHelper.get_regression_model(
                y=y,
                X=X,
                beta=betas[category],
                glm_family=family.Binomial()
            )
            current_result = ComputationHelper.run_model(
                y=y,
                X=X,
                model=models[category]
            )
            results[category] = {'xwx': current_result['xwx'], 'xwz': current_result['xwz']}

        # calculate multinomial llf
        etas = {c: np.clip(m.predict(which='linear'), -350, 350) for c, m in models.items()}
        denom = 1 + sum(np.exp(eta) for eta in etas.values())
        mus = {c: np.clip(np.exp(eta) / denom, 1e-8, 1 - 1e-8) for c, eta in etas.items()}

        llf = 0
        # The saturated log-likelihood is taken as 0, so deviance == -2 * llf.
        llf_saturated = 0
        # Ends up 1 for rows whose indicators are all zero (reference category).
        reference_category_indices = np.ones(len(data))
        for category, y in indicators.items():
            reference_category_indices = reference_category_indices * (y == 0)
            llf += np.sum(np.log(np.take(mus[category], np.nonzero(y)[0])))

        # Contribution of the rows that fall into the reference category.
        llf += np.sum(np.log(np.take(1 / denom, reference_category_indices.nonzero()[0])))

        deviance = 2 * (llf_saturated - llf)

        return {
            'llf': llf,
            'deviance': deviance,
            'beta_update_data': results
        }
232
+
233
class OrdinalComputationUnit(ComputationUnit):
    """Ordinal response handled as a series of cumulative Binomial models.

    Each key of ``betas`` ends in ``__ord__<int>``; its model is evaluated on
    the indicator ``y <= <int>``. Differences of successive cumulative
    probabilities give the per-level probabilities used for the likelihood.
    """

    @staticmethod
    def compute(data, y_label, X_labels, betas):
        """Return log-likelihood, deviance and per-level update statistics.

        Args:
            data: frame exposing ``to_pandas()``, ``len()`` and column access
                via ``data[y_label]``.
            y_label: name of the ordinal response column.
            X_labels: predictor column names; they are sorted before use and
                a constant column of ones is appended as the last column.
            betas: mapping from level name (``...__ord__<int>``) to
                coefficient vector.

        Returns:
            Dict with ``'llf'``, ``'deviance'`` and ``'beta_update_data'``,
            the latter mapping each level to ``{'xwx': ..., 'xwz': ...}``.
        """
        def level_of(name):
            # Integer level encoded after the '__ord__' separator.
            return int(name.split('__ord__')[-1])

        # Convert to pandas once instead of once per level.
        frame = data.to_pandas()
        X = np.column_stack([
            frame[sorted(X_labels)].to_numpy().astype(float),
            np.ones(len(frame)),
        ])
        y_raw = frame[y_label].to_numpy()

        models = {}
        results = {}
        for level in betas:
            y = (y_raw <= level_of(level)).astype(float)

            models[level] = ComputationHelper.get_regression_model(
                y=y,
                X=X,
                beta=betas[level],
                glm_family=family.Binomial()
            )
            current_result = ComputationHelper.run_model(
                y=y,
                X=X,
                model=models[level]
            )
            results[level] = {'xwx': current_result['xwx'], 'xwz': current_result['xwz']}

        # Cumulative probabilities ordered by level
        mus = [(level, model.predict()) for level, model
               in sorted(models.items(), key=lambda item: level_of(item[0]))]
        # get diffs of mus of successive levels
        mus_diff = [mus[0]]
        mus_diff.extend((mus[i][0], mus[i][1] - mus[i - 1][1]) for i in range(1, len(mus)))
        mus_diff.append(('ref', 1 - mus[-1][1]))

        # fix negative probs: the independently evaluated cumulative models
        # need not be monotone, so a difference can come out negative.
        sign_fix = np.column_stack([probs for _, probs in mus_diff])
        bad_rows = (sign_fix < 0).any(axis=1)
        if bad_rows.any():
            problem_probs = np.abs(sign_fix[bad_rows])
            row_sums = np.clip(np.sum(problem_probs, axis=1, keepdims=True), 1e-8, None)
            sign_fix[bad_rows] = problem_probs / row_sums
            mus_diff = [(mus_diff[i][0], sign_fix[:, i]) for i in range(len(mus_diff))]
        mus_diff = [(name, np.clip(probs, 1e-8, None)) for name, probs in mus_diff]

        llf = 0
        # The saturated log-likelihood is taken as 0, so deviance == -2 * llf.
        llf_saturated = 0
        observed = data[y_label].to_numpy()
        # Ends up 1 for rows that match none of the modelled levels.
        reference_level_indices = np.ones(len(data))
        for level, mu_diff in mus_diff[:-1]:
            current_level_indices = observed == level_of(level)
            reference_level_indices = reference_level_indices * (1 - current_level_indices)
            llf += np.sum(np.log(np.take(mu_diff, current_level_indices.nonzero()[0])))

        _, mu_diff = mus_diff[-1]
        llf += np.sum(np.log(np.take(mu_diff, reference_level_indices.nonzero()[0])))
        deviance = 2 * (llf_saturated - llf)

        return {
            'llf': llf,
            'deviance': deviance,
            'beta_update_data': results
        }
297
+
298
# Polars column dtype -> variable type used to pick the regression routine.
polars_dtype_map = {
    pl.Float64: VariableType.CONTINUOS,
    pl.Boolean: VariableType.BINARY,
    pl.String: VariableType.CATEGORICAL,
}
# Both integer widths are treated as ordinal.
polars_dtype_map.update({int_dtype: VariableType.ORDINAL for int_dtype in (pl.Int32, pl.Int64)})

# OVR == 0 selects the joint multinomial unit, anything else one-vs-rest.
if OVR == 0:
    _categorical_unit = CategoricalComputationUnit
else:
    _categorical_unit = CategoricalOVRComputationUnit

# Variable type -> computation unit whose compute() handles that response type.
regression_computation_map = {
    VariableType.CONTINUOS: ContinousComputationUnit,
    VariableType.BINARY: BinaryComputationUnit,
    VariableType.CATEGORICAL: _categorical_unit,
    VariableType.ORDINAL: OrdinalComputationUnit,
}
312
+
313
class Client():
    """Holds one party's local dataset and answers regression requests on it."""

    def __init__(self, data: pl.DataFrame, _network_fetch_function=lambda x: x):
        """Derive the variable schema and the local dummy-column names.

        Args:
            data: the local dataset.
            _network_fetch_function: applied to ``beta`` in ``compute`` before
                use; the default is the identity.

        Raises:
            KeyError: if a column's dtype is not a key of ``polars_dtype_map``.
        """
        self._network_fetch_function = _network_fetch_function
        self.data: pl.DataFrame = data

        self.schema: Dict[str, VariableType] = {}
        for column, dtype in dict(self.data.schema).items():
            try:
                self.schema[column] = polars_dtype_map[dtype]
            except KeyError:
                # Same exception type as before, but naming the column.
                supported = ', '.join(str(t) for t in polars_dtype_map)
                raise KeyError(
                    f"Column '{column}' has unsupported dtype {dtype}; supported dtypes: {supported}"
                ) from None

        self.categorical_expressions: Dict[str, List[str]] = {
            column: self.data.select(column).to_dummies(separator='__cat__').columns
            for column, dtype in self.schema.items() if dtype == VariableType.CATEGORICAL
        }
        self.ordinal_expressions: Dict[str, List[str]] = {
            column: self.data.select(pl.col(column)).to_dummies(separator='__ord__').columns
            for column, dtype in self.schema.items() if dtype == VariableType.ORDINAL
        }

        # Filled in by provide_expressions().
        self.server_categorical_expressions: Dict[str, List[str]] = None
        self.server_ordinal_expressions: Dict[str, List[str]] = None
        self.expanded_data: pl.DataFrame = None

    def get_data_schema(self):
        """Return the column name -> ``VariableType`` mapping."""
        return self.schema

    def get_categorical_expressions(self):
        """Return the local ``__cat__`` dummy-column names per categorical column."""
        return self.categorical_expressions

    def get_ordinal_expressions(self):
        """Return the local ``__ord__`` dummy-column names per ordinal column."""
        return self.ordinal_expressions

    def provide_expressions(
        self,
        categorical_expressions: Dict[str, List[str]],
        ordinal_expressions: Dict[str, List[str]]
    ):
        """Store the given expression lists and build ``self.expanded_data``.

        Categorical columns (and, when ``EXPAND_ORDINALS == 1``, ordinal ones)
        are one-hot expanded; dummy columns named in the arguments but absent
        locally are added as constant ``0.0``. The original categorical and
        ordinal columns are kept next to their dummies.
        """
        self.server_categorical_expressions = categorical_expressions
        self.server_ordinal_expressions = ordinal_expressions

        categorical_columns = [c for c, dtype in self.schema.items() if dtype == VariableType.CATEGORICAL]
        ordinal_columns = [c for c, dtype in self.schema.items() if dtype == VariableType.ORDINAL]

        # expand categoricals
        all_possible_categorical_expressions = {
            expr for exprs in categorical_expressions.values() for expr in exprs
        }
        temp = self.data.select(categorical_columns)
        _data = self.data.to_dummies(categorical_columns, separator='__cat__')
        # keep original categorical variables
        _data = _data.with_columns(temp)
        missing_cols = list(all_possible_categorical_expressions - set(_data.columns))
        _data = _data.with_columns(*[pl.lit(0.0).alias(c) for c in missing_cols])

        if EXPAND_ORDINALS == 1:
            # expand ordinals
            all_possible_ordinal_expressions = {
                expr for exprs in ordinal_expressions.values() for expr in exprs
            }
            _data = _data.to_dummies(ordinal_columns, separator='__ord__')
            missing_cols = list(all_possible_ordinal_expressions - set(_data.columns))
            _data = _data.with_columns(*[pl.lit(0.0).alias(c) for c in missing_cols])

        # keep original ordinal variables
        _data = _data.with_columns(self.data.select(ordinal_columns))

        self.expanded_data = _data

    def compute(self, y_label: str, X_labels, beta):
        """Run the regression computation matching the type of ``y_label``.

        Args:
            y_label: response column; must be part of the local schema.
            X_labels: predictor column names of the expanded data.
            beta: mapping of coefficient vectors; it is passed through
                ``_network_fetch_function`` first.

        Returns:
            A ``ClientResponseData`` carrying ``llf``, ``deviance`` and one
            ``BetaUpdateData`` per entry of the update data.

        Raises:
            AssertionError: if ``y_label`` is not a local column.
            RuntimeError: if ``provide_expressions`` has not been called yet.
        """
        # NOTE: preemptive cast for netref arrays
        beta = self._network_fetch_function(beta)
        assert y_label in self.schema, f"Unknown response variable '{y_label}'"
        if self.expanded_data is None:
            # Otherwise this fails deep inside a computation unit with an
            # AttributeError on None.
            raise RuntimeError('provide_expressions() must be called before compute()')

        variable_type = self.schema[y_label]
        result = regression_computation_map[variable_type].compute(self.expanded_data, y_label, X_labels, beta)

        if variable_type in [VariableType.CONTINUOS, VariableType.BINARY]:
            # Single-model units return xwx/xwz at the top level; key them by
            # the (only) beta key.
            beta_update_data = {
                list(beta.keys())[0]: BetaUpdateData(xwx=result['xwx'], xwz=result['xwz'])
            }
        else:
            beta_update_data = {
                category: BetaUpdateData(xwx=data['xwx'], xwz=data['xwz'])
                for category, data in result['beta_update_data'].items()
            }

        response: ClientResponseData = ClientResponseData(
            llf=result['llf'],
            deviance=result['deviance'],
            beta_update_data=beta_update_data
        )
        return response
384
+
385
class ProxyClient(rpyc.Service):
    """rpyc service that exposes a local ``Client`` to remote callers."""

    def __init__(self, data):
        """Create the wrapped ``Client`` for ``data``; no server is started."""
        # Set first so that close() / __del__ stay safe even when Client()
        # raises and the instance is only partially initialised.
        self.server: rpyc.utils.server.ThreadedServer = None
        self.client = Client(data, _network_fetch_function=rpyc.classic.obtain)

    def __del__(self):
        self.close()

    def start(self, port):
        """Start a ``ThreadedServer`` for this service on ``port``.

        A server started earlier by this instance is closed first.
        NOTE(review): ``ThreadedServer.start()`` presumably blocks until the
        server is closed - confirm against the rpyc documentation.
        """
        if self.server is not None:
            self.close()
        # SECURITY: 'allow_pickle' together with rpyc.classic.obtain means
        # pickled data from the peer is deserialised here. Only connect to
        # trusted peers.
        self.server = rpyc.utils.server.ThreadedServer(
            self,
            port=port,
            protocol_config={'allow_public_attrs': True, 'allow_pickle': True}
        )
        self.server.start()

    def close(self):
        """Close the running server, if any; safe to call repeatedly."""
        # getattr: __del__ may run on an instance whose __init__ never got
        # as far as assigning self.server.
        server = getattr(self, 'server', None)
        if server is None:
            return
        server.close()
        self.server = None

    # expose all functions of client
    def on_connect(self, conn):
        """Bind every public callable of the client as ``exposed_<name>``."""
        for name in dir(self.client):
            if callable(getattr(self.client, name)) and not name.startswith("_"):
                setattr(self, f"exposed_{name}", getattr(self.client, name))
@@ -0,0 +1,6 @@
1
+ import os
2
def _from_env(name, default, cast):
    """Return ``cast`` applied to environment variable ``name``, or ``default`` when it is unset."""
    raw = os.getenv(name)
    if raw is None:
        return default
    return cast(raw)


# Runtime switches, each overridable through an environment variable of the
# same name. Defaults are returned as-is, without passing through the cast.
DEBUG = _from_env("DEBUG", 0, int)
EXPAND_ORDINALS = _from_env("EXPAND_ORDINALS", 1, int)
LR = _from_env("LR", 1, float)
RIDGE = _from_env("RIDGE", 0, float)
OVR = _from_env("OVR", 0, int)