python-fedci 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fedci/__init__.py +4 -0
- fedci/client.py +405 -0
- fedci/env.py +6 -0
- fedci/evaluation.py +154 -0
- fedci/server.py +105 -0
- fedci/testing.py +275 -0
- fedci/utils.py +26 -0
- python_fedci-0.1.0.dist-info/LICENSE +21 -0
- python_fedci-0.1.0.dist-info/METADATA +112 -0
- python_fedci-0.1.0.dist-info/RECORD +12 -0
- python_fedci-0.1.0.dist-info/WHEEL +5 -0
- python_fedci-0.1.0.dist-info/top_level.txt +1 -0
fedci/__init__.py
ADDED
fedci/client.py
ADDED
|
@@ -0,0 +1,405 @@
|
|
|
1
|
+
from scipy.sparse.linalg._eigen.lobpcg.lobpcg import LinAlgError
|
|
2
|
+
from .utils import VariableType, ClientResponseData, BetaUpdateData
|
|
3
|
+
import polars as pl
|
|
4
|
+
import numpy as np
|
|
5
|
+
import scipy
|
|
6
|
+
import rpyc
|
|
7
|
+
|
|
8
|
+
from typing import Dict, List
|
|
9
|
+
|
|
10
|
+
from .env import DEBUG, EXPAND_ORDINALS, RIDGE, LR, OVR
|
|
11
|
+
|
|
12
|
+
import statsmodels.api as sm
|
|
13
|
+
from statsmodels.genmod.generalized_linear_model import GLMResults
|
|
14
|
+
from statsmodels.genmod.families import family
|
|
15
|
+
|
|
16
|
+
class ComputationHelper():
    """Helpers that evaluate a GLM at a caller-supplied coefficient vector.

    Nothing is fitted here. The methods wrap the given coefficients in a
    statsmodels result object and derive the quantities of one IRLS step
    (``X'WX`` and ``X'Wz``) plus the log-likelihood and deviance.
    """

    @staticmethod
    def get_regression_model(y, X, beta, glm_family):
        """Return a ``GLMResults`` for ``sm.GLM(y, X, family=glm_family)`` at ``beta``.

        ``normalized_cov_params`` and ``scale`` are passed as ``None``; no
        fitting takes place.
        """
        model = sm.GLM(y, X, family=glm_family)
        return GLMResults(model, beta, normalized_cov_params=None, scale=None)

    @staticmethod
    def run_model(y, X, model):
        """Compute the IRLS building blocks for ``model``.

        Args:
            y: Response values.
            X: Design matrix (NumPy array, one row per observation).
            model: Result object as produced by ``get_regression_model``.

        Returns:
            Dict with keys ``'llf'``, ``'deviance'``, ``'xwx'`` and ``'xwz'``.

        Raises:
            Exception: If the family is neither Gaussian nor Binomial.
        """
        llf = model.llf
        deviance = model.deviance

        # Linear predictor and mean at the current coefficients.
        eta = model.predict(which='linear')
        glm_family = model.family
        link = glm_family.link
        mu = link.inverse(eta)

        # Derivative of the inverse link, clipped away from 0 because it is
        # used as a divisor below.
        dmu_deta = np.clip(link.inverse_deriv(eta), 1e-8, 1-1e-8)

        # Working response; LR scales the step taken away from eta.
        z = eta + LR*(y - mu)/dmu_deta

        if isinstance(glm_family, family.Gaussian):
            var_y = np.var(y-mu)
        elif isinstance(glm_family, family.Binomial):
            var_y = dmu_deta
        else:
            raise Exception(f'Cannot handle model family {model.family.__class__.__name__}')

        # Per-observation weights. Scaling the columns of X.T by broadcasting
        # is equivalent to X.T @ diag(weights) without allocating an N x N matrix.
        weights = (dmu_deta**2)/var_y
        xw = X.T * weights
        xwx = xw @ X
        xwz = xw @ z

        return {'llf': llf, 'deviance': deviance, 'xwx': xwx, 'xwz': xwz}

    @classmethod
    def run_regression(cls, y, X, beta, glm_family):
        """Build the result object for ``beta`` and return ``run_model`` on it."""
        model = cls.get_regression_model(y, X, beta, glm_family)
        return cls.run_model(y, X, model)
|
|
61
|
+
|
|
62
|
+
class ComputationUnit():
    """Interface of the per-variable-type computation units.

    Subclasses override ``compute``; the base implementation only raises.
    """

    @staticmethod
    def compute(data, y_label, X_labels, beta):
        """Raise ``NotImplementedError``; concrete units provide the logic."""
        raise NotImplementedError
|
|
66
|
+
|
|
67
|
+
class ContinousComputationUnit(ComputationUnit):
    """Computation unit that runs a Gaussian-family regression."""

    @staticmethod
    def compute(data, y_label, X_labels, beta):
        """Run one Gaussian regression step on ``data``.

        Args:
            data: Frame exposing ``to_pandas()``.
            y_label: Name of the response column.
            X_labels: Regressor column names; they are used in sorted order
                and a constant column is appended last.
            beta: Mapping holding exactly one coefficient vector.

        Returns:
            The dict produced by ``ComputationHelper.run_regression``.
        """
        assert len(beta) == 1, 'Continuos regression called with more than one beta'
        beta = list(beta.values())[0]

        # Convert once; the response and the regressors come from the same frame.
        frame = data.to_pandas()

        # Build the intercept with NumPy instead of assigning a new column to a
        # column-subset frame, which pandas may flag as chained assignment.
        X = frame[sorted(X_labels)].to_numpy().astype(float)
        X = np.column_stack([X, np.ones(len(frame))])

        y = frame[y_label].to_numpy().astype(float)

        return ComputationHelper.run_regression(
            y=y,
            X=X,
            beta=beta,
            glm_family=family.Gaussian()
        )
|
|
86
|
+
|
|
87
|
+
class BinaryComputationUnit(ComputationUnit):
    """Computation unit that runs a Binomial-family regression."""

    @staticmethod
    def compute(data, y_label, X_labels, beta):
        """Run one Binomial regression step on ``data``.

        Args:
            data: Frame exposing ``to_pandas()``.
            y_label: Name of the response column.
            X_labels: Regressor column names; they are used in sorted order
                and a constant column is appended last.
            beta: Mapping holding exactly one coefficient vector.

        Returns:
            The dict produced by ``ComputationHelper.run_regression``.
        """
        assert len(beta) == 1, 'Binary regression called with more than one beta'
        beta = list(beta.values())[0]

        # Convert once; the response and the regressors come from the same frame.
        frame = data.to_pandas()

        # Build the intercept with NumPy instead of assigning a new column to a
        # column-subset frame, which pandas may flag as chained assignment.
        X = frame[sorted(X_labels)].to_numpy().astype(float)
        X = np.column_stack([X, np.ones(len(frame))])

        y = frame[y_label].to_numpy().astype(float)

        return ComputationHelper.run_regression(
            y=y,
            X=X,
            beta=beta,
            glm_family=family.Binomial()
        )
|
|
106
|
+
|
|
107
|
+
class CategoricalComputationUnit(ComputationUnit):
    """Multinomial-logit IRLS statistics for a dummy-encoded response."""

    @staticmethod
    def compute(data, y_label, X_labels, beta):
        """Accumulate ``X'WX`` / ``X'Wz`` and the log-likelihood for one step.

        Args:
            data: Frame exposing ``columns`` and ``to_pandas()``; the response
                is read from the columns named ``f'{y_label}__cat__...'``.
            y_label: Name of the categorical response.
            X_labels: Regressor column names; used in sorted order with a
                constant column appended last.
            beta: Mapping with the single key ``y_label`` holding the stacked
                coefficient vector (reshaped column-major to K x (J-1)).

        Returns:
            Dict with ``'llf'``, ``'deviance'`` and ``'beta_update_data'``, the
            latter mapping ``y_label`` to its ``'xwx'`` / ``'xwz'`` arrays.
        """
        assert len(beta) == 1, 'Multinomial regression called with more than one beta'
        beta = beta[y_label]

        # Dummy columns of the response. The first entry acts as the reference
        # category, so the order has to be identical on every client. Raw column
        # order is not: Client.provide_expressions appends categories a client
        # has never observed at the end of the frame. Sorting makes it stable.
        y_dummy_columns = sorted(c for c in data.columns if c.startswith(f'{y_label}__cat__'))

        # Convert once and reuse for X, Y and the full response matrix.
        frame = data.to_pandas()

        # Design matrix with the intercept as last column.
        X = frame[sorted(X_labels)].to_numpy().astype(float)
        X = np.column_stack([X, np.ones(len(frame))])

        num_categories = len(y_dummy_columns)  # J
        num_features = len(X_labels) + 1  # K

        def softmax(eta):
            # Prepend a zero column for the reference category.
            exp_eta = np.exp(np.hstack([np.zeros((eta.shape[0], 1)), eta]))
            return exp_eta / exp_eta.sum(axis=1, keepdims=True)

        # Response matrix without the reference category (N x (J-1)).
        Y = frame[y_dummy_columns[1:]].to_numpy()

        # Coefficients as K x (J-1).
        beta = beta.reshape(num_features, -1, order='F')

        # Linear predictor and class probabilities, clipped for stability.
        eta = np.clip(X @ beta, -350, 350)  # N x (J-1)
        mu = np.clip(softmax(eta), 1e-8, 1-1e-8)  # N x J
        mu_reduced = mu[:, 1:]  # N x (J-1)

        block = num_features * (num_categories - 1)
        XWX = np.zeros((block, block))
        XWz = np.zeros(block)
        identity = np.eye(num_categories - 1)

        for i in range(Y.shape[0]):
            yi = Y[i]  # (J-1)
            pi = mu_reduced[i]
            var_i = np.diag(pi) - np.outer(pi, pi)  # (J-1) x (J-1)

            # Fall back to the pseudo-inverse when the covariance is singular.
            try:
                var_i_inv = np.linalg.inv(var_i)
            except np.linalg.LinAlgError:
                var_i_inv = np.linalg.pinv(var_i)

            z_i = eta[i] + var_i_inv @ (yi - pi)  # (J-1)

            # Observation i's contribution; var_i is the weight matrix.
            Xi = np.kron(identity, X[i:i+1])  # (J-1) x (J-1)*K
            XWX += Xi.T @ var_i @ Xi
            XWz += Xi.T @ var_i @ z_i

        # Log-likelihood over all J categories; deviance is -2 * llf.
        Y_full = frame[y_dummy_columns].to_numpy()  # N x J
        logprob = np.log(np.clip(mu, 1e-8, 1))
        llf = np.sum(Y_full * logprob)
        deviance = -2 * llf

        results = {y_label: {'xwx': XWX, 'xwz': XWz}}

        return {
            'llf': llf,
            'deviance': deviance,
            'beta_update_data': results
        }
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
class CategoricalOVRComputationUnit(ComputationUnit):
    """One-vs-rest variant: one Binomial regression per category column."""

    @staticmethod
    def compute(data, y_label, X_labels, betas):
        """Run one Binomial step per key of ``betas`` and combine the likelihood.

        Args:
            data: Frame exposing ``to_pandas()`` and ``len()``.
            y_label: Name of the categorical response (not read here; the
                response columns are the keys of ``betas``).
            X_labels: Regressor column names; used in sorted order with a
                constant column appended last.
            betas: Mapping from response dummy column to coefficient vector.

        Returns:
            Dict with ``'llf'``, ``'deviance'`` and ``'beta_update_data'``
            (per-category ``'xwx'`` / ``'xwz'``).
        """
        # Convert once instead of once per category and loop.
        frame = data.to_pandas()

        X = frame[sorted(X_labels)].to_numpy().astype(float)
        X = np.column_stack([X, np.ones(len(frame))])

        models = {}
        results = {}
        targets = {}
        for category in betas.keys():
            y = frame[category].to_numpy().astype(float)
            targets[category] = y

            models[category] = ComputationHelper.get_regression_model(
                y=y,
                X=X,
                beta=betas[category],
                glm_family=family.Binomial()
            )
            current_result = ComputationHelper.run_model(
                y=y,
                X=X,
                model=models[category]
            )
            results[category] = {'xwx': current_result['xwx'], 'xwz': current_result['xwz']}

        # Multinomial probabilities from the per-category linear predictors;
        # the implicit reference category contributes the 1 in the denominator.
        etas = {c: np.clip(m.predict(which='linear'), -350, 350) for c, m in models.items()}
        denom = 1 + sum(np.exp(eta) for eta in etas.values())
        mus = {c: np.clip(np.exp(eta)/denom, 1e-8, 1-1e-8) for c, eta in etas.items()}

        llf = 0
        llf_saturated = 0  # never updated, so deviance reduces to -2 * llf
        # Stays 1 only for rows that belong to none of the modelled categories.
        reference_category_indices = np.ones(len(data))
        for category in betas.keys():
            y = targets[category]
            mu = mus[category]
            reference_category_indices = reference_category_indices * (y == 0)
            llf += np.sum(np.log(np.take(mu, np.nonzero(y)[0])))

        # Rows of the reference category.
        llf += np.sum(np.log(np.take(1/denom, reference_category_indices.nonzero()[0])))

        deviance = 2 * (llf_saturated - llf)

        return {
            'llf': llf,
            'deviance': deviance,
            'beta_update_data': results
        }
|
|
232
|
+
|
|
233
|
+
class OrdinalComputationUnit(ComputationUnit):
    """Ordinal response handled as cumulative Binomial regressions per level."""

    @staticmethod
    def compute(data, y_label, X_labels, betas):
        """Run one Binomial step per level threshold and combine the likelihood.

        Args:
            data: Frame exposing ``to_pandas()``, ``len()`` and column access.
            y_label: Name of the ordinal response column.
            X_labels: Regressor column names; used in sorted order with a
                constant column appended last.
            betas: Mapping from level key (``'...__ord__<int>'``) to the
                coefficient vector of the regression on ``y <= <int>``.

        Returns:
            Dict with ``'llf'``, ``'deviance'`` and ``'beta_update_data'``
            (per-level ``'xwx'`` / ``'xwz'``).
        """
        # Loop-invariant conversions, done once.
        frame = data.to_pandas()
        X = frame[sorted(X_labels)].to_numpy().astype(float)
        X = np.column_stack([X, np.ones(len(frame))])
        y_raw = frame[y_label].to_numpy()

        models = {}
        results = {}
        for level in betas.keys():
            level_int = int(level.split('__ord__')[-1])
            # Cumulative indicator for this threshold.
            y = (y_raw <= level_int).astype(float)

            models[level] = ComputationHelper.get_regression_model(
                y=y,
                X=X,
                beta=betas[level],
                glm_family=family.Binomial()
            )
            current_result = ComputationHelper.run_model(
                y=y,
                X=X,
                model=models[level]
            )
            results[level] = {'xwx': current_result['xwx'], 'xwz': current_result['xwz']}

        # Cumulative probabilities ordered by level value.
        mus = [(level, model.predict()) for level, model
               in sorted(models.items(), key=lambda lvl: int(lvl[0].split('__ord__')[-1]))]

        # Differences of successive cumulative probabilities give per-level
        # probabilities; the remainder is assigned to the 'ref' entry.
        mus_diff = [mus[0]]
        mus_diff.extend([(mus[i][0], mus[i][1] - mus[i-1][1]) for i in range(1, len(mus))])
        mus_diff.append(('ref', 1-mus[-1][1]))

        # The separately estimated curves may cross, producing negative
        # differences. Affected rows are replaced by their normalised absolute
        # values.
        sign_fix = np.column_stack([e[1] for e in mus_diff])
        problematic_indices = np.where(sign_fix < 0)[0]
        if len(problematic_indices) > 0:
            problem_probs = np.abs(sign_fix[problematic_indices])
            row_sums = np.clip(np.sum(problem_probs, axis=1, keepdims=True), 1e-8, None)
            sign_fix[problematic_indices] = problem_probs / row_sums
            mus_diff = [(mus_diff[i][0], sign_fix[:, i]) for i in range(len(mus_diff))]
        mus_diff = [(l, np.clip(p, 1e-8, None)) for l, p in mus_diff]

        llf = 0
        llf_saturated = 0  # never updated, so deviance reduces to -2 * llf
        # Stays 1 only for rows whose level is not one of the modelled ones.
        reference_level_indices = np.ones(len(data))
        y_levels = data[y_label].to_numpy()
        for level, mu_diff in mus_diff[:-1]:
            level_int = int(level.split('__ord__')[-1])
            current_level_indices = y_levels == level_int
            reference_level_indices = reference_level_indices * (1-current_level_indices)

            llf += np.sum(np.log(np.take(mu_diff, current_level_indices.nonzero()[0])))

        _, mu_diff = mus_diff[-1]
        llf += np.sum(np.log(np.take(mu_diff, reference_level_indices.nonzero()[0])))
        deviance = 2 * (llf_saturated - llf)

        return {
            'llf': llf,
            'deviance': deviance,
            'beta_update_data': results
        }
|
|
297
|
+
|
|
298
|
+
# Variable type assigned to each supported polars column dtype.
# Both integer widths are treated as ordinal.
polars_dtype_map = {
    pl.Float64: VariableType.CONTINUOS,
    pl.Boolean: VariableType.BINARY,
    pl.String: VariableType.CATEGORICAL,
    **dict.fromkeys((pl.Int32, pl.Int64), VariableType.ORDINAL),
}

# Computation unit used for a response of each variable type.
# A non-zero OVR switches categorical responses to the one-vs-rest unit.
regression_computation_map = {
    VariableType.CONTINUOS: ContinousComputationUnit,
    VariableType.BINARY: BinaryComputationUnit,
    VariableType.CATEGORICAL: (
        CategoricalOVRComputationUnit if OVR != 0 else CategoricalComputationUnit
    ),
    VariableType.ORDINAL: OrdinalComputationUnit,
}
|
|
312
|
+
|
|
313
|
+
class Client():
    """Holds one participant's local data and answers regression requests."""

    def __init__(self, data: pl.DataFrame, _network_fetch_function=lambda x: x):
        """Derive the schema and the locally observed dummy-column names.

        Args:
            data: Local data set.
            _network_fetch_function: Applied to ``beta`` in ``compute`` before
                use; the default is the identity.

        Raises:
            KeyError: If a column dtype is not a key of ``polars_dtype_map``.
        """
        self._network_fetch_function = _network_fetch_function
        self.data: pl.DataFrame = data
        self.schema: Dict[str, VariableType] = {
            column: polars_dtype_map[dtype]
            for column, dtype in dict(self.data.schema).items()
        }
        # Dummy-column names as produced by to_dummies on this client's data.
        self.categorical_expressions: Dict[str, List[str]] = {
            column: self.data.select(column).to_dummies(separator='__cat__').columns
            for column in self._columns_of(VariableType.CATEGORICAL)
        }
        self.ordinal_expressions: Dict[str, List[str]] = {
            column: self.data.select(pl.col(column)).to_dummies(separator='__ord__').columns
            for column in self._columns_of(VariableType.ORDINAL)
        }

        # Filled in by provide_expressions.
        self.server_categorical_expressions: Dict[str, List[str]] = None
        self.server_ordinal_expressions: Dict[str, List[str]] = None
        self.expanded_data: pl.DataFrame = None

    def _columns_of(self, variable_type):
        """Return the column names whose schema entry equals ``variable_type``."""
        return [column for column, dtype in self.schema.items() if dtype == variable_type]

    def get_data_schema(self):
        """Return the column-name to variable-type mapping."""
        return self.schema

    def get_categorical_expressions(self):
        """Return the locally observed categorical dummy-column names."""
        return self.categorical_expressions

    def get_ordinal_expressions(self):
        """Return the locally observed ordinal dummy-column names."""
        return self.ordinal_expressions

    def provide_expressions(
        self,
        categorical_expressions: Dict[str, List[str]],
        ordinal_expressions: Dict[str, List[str]]
    ):
        """Store the given expression lists and build ``expanded_data``.

        Dummy columns named in the arguments but absent locally are added as
        constant ``0.0`` columns. The original categorical and ordinal columns
        are kept next to their dummies.
        """
        self.server_categorical_expressions = categorical_expressions
        self.server_ordinal_expressions = ordinal_expressions

        categorical_columns = self._columns_of(VariableType.CATEGORICAL)
        ordinal_columns = self._columns_of(VariableType.ORDINAL)

        # expand categoricals
        all_possible_categorical_expressions = {
            expression
            for expressions in categorical_expressions.values()
            for expression in expressions
        }
        originals = self.data.select(categorical_columns)
        _data = self.data.to_dummies(categorical_columns, separator='__cat__')
        _data = _data.with_columns(originals)
        # Sorted so the appended columns come in a reproducible order instead
        # of set-iteration order.
        missing_cols = sorted(all_possible_categorical_expressions - set(_data.columns))
        _data = _data.with_columns(*[pl.lit(0.0).alias(c) for c in missing_cols])

        if EXPAND_ORDINALS == 1:
            # expand ordinals
            all_possible_ordinal_expressions = {
                expression
                for expressions in ordinal_expressions.values()
                for expression in expressions
            }
            _data = _data.to_dummies(ordinal_columns, separator='__ord__')
            missing_cols = sorted(all_possible_ordinal_expressions - set(_data.columns))
            _data = _data.with_columns(*[pl.lit(0.0).alias(c) for c in missing_cols])

        # keep original ordinal variables
        _data = _data.with_columns(self.data.select(ordinal_columns))

        self.expanded_data = _data

    def compute(self, y_label: str, X_labels, beta):
        """Run the computation unit for ``y_label`` and wrap its result.

        Args:
            y_label: Response column; must be part of the schema.
            X_labels: Regressor column names, passed through to the unit.
            beta: Mapping of coefficient vectors; passed through
                ``_network_fetch_function`` first.

        Returns:
            A ``ClientResponseData`` with the log-likelihood, the deviance and
            one ``BetaUpdateData`` per coefficient vector.
        """
        # NOTE: preemptive cast for netref arrays
        beta = self._network_fetch_function(beta)
        assert y_label in self.schema

        y_type = self.schema[y_label]
        result = regression_computation_map[y_type].compute(self.expanded_data, y_label, X_labels, beta)

        if y_type in [VariableType.CONTINUOS, VariableType.BINARY]:
            # These units return a flat result for their single beta.
            beta_update_data = {
                list(beta.keys())[0]: BetaUpdateData(xwx=result['xwx'], xwz=result['xwz'])
            }
        else:
            beta_update_data = {
                category: BetaUpdateData(xwx=data['xwx'], xwz=data['xwz'])
                for category, data in result['beta_update_data'].items()
            }

        response: ClientResponseData = ClientResponseData(
            llf=result['llf'],
            deviance=result['deviance'],
            beta_update_data=beta_update_data
        )
        return response
|
|
384
|
+
|
|
385
|
+
class ProxyClient(rpyc.Service):
    """rpyc service that exposes the public methods of a wrapped ``Client``."""

    def __init__(self, data):
        """Wrap ``data`` in a ``Client`` that fetches arguments via ``rpyc.classic.obtain``."""
        self.client = Client(data, _network_fetch_function=rpyc.classic.obtain)
        self.server: rpyc.utils.server.ThreadedServer = None

    def __del__(self):
        self.close()

    def start(self, port):
        """Close any existing server, then create and start one on ``port``."""
        if self.server is not None:
            self.close()
        self.server = rpyc.utils.server.ThreadedServer(self, port=port, protocol_config={'allow_public_attrs': True, 'allow_pickle': True})
        self.server.start()

    def close(self):
        """Close the server if one exists; safe to call repeatedly."""
        # getattr: __del__ also runs when __init__ raised before ``server``
        # was assigned, in which case the attribute does not exist.
        server = getattr(self, 'server', None)
        if server is None:
            return
        server.close()
        self.server = None

    # expose all functions of client
    def on_connect(self, conn):
        """Publish every public callable of the client as ``exposed_<name>``."""
        for name in dir(self.client):
            member = getattr(self.client, name)
            if callable(member) and not name.startswith("_"):
                setattr(self, f"exposed_{name}", member)
|
fedci/env.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import os
|
|
2
|
+
# Runtime switches, each read once from the environment at import time.
# A set variable is parsed with int()/float(); an unset one uses the default.
DEBUG = int(v) if (v := os.getenv("DEBUG")) is not None else 0
EXPAND_ORDINALS = int(v) if (v := os.getenv("EXPAND_ORDINALS")) is not None else 1
LR = float(v) if (v := os.getenv("LR")) is not None else 1
RIDGE = float(v) if (v := os.getenv("RIDGE")) is not None else 0
OVR = int(v) if (v := os.getenv("OVR")) is not None else 0
|
fedci/evaluation.py
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
from .testing import Test
|
|
2
|
+
from .env import DEBUG
|
|
3
|
+
|
|
4
|
+
import scipy
|
|
5
|
+
import numpy as np
|
|
6
|
+
import polars as pl
|
|
7
|
+
import polars.selectors as cs
|
|
8
|
+
import os
|
|
9
|
+
from typing import List
|
|
10
|
+
|
|
11
|
+
class LikelihoodRatioTest():
    """Likelihood-ratio test between two nested regressions on the same response.

    ``t1`` must use exactly one regressor more than ``t0``. That regressor
    becomes ``x_label`` and the shared regressors become ``s_labels``.
    """

    def __init__(self, t0: "Test", t1: "Test") -> None:
        """Validate that the tests are nested and compute the p value.

        Raises:
            AssertionError: If the responses differ, ``t0`` is not nested in
                ``t1``, or they differ by more than one regressor.
        """
        assert t0.y_label == t1.y_label, 'Provided tests do not predict the same variable'
        t0_req_labels = t0.get_required_labels() - {t0.y_label}
        t1_req_labels = t1.get_required_labels() - {t0.y_label}
        assert t0_req_labels.issubset(t1_req_labels), 'Provided tests are not nested'
        assert len(t0_req_labels)+1 == len(t1_req_labels), 'Provided tests differ by more than one regressor variable'

        self.y_label = t0.y_label
        self.x_label = list(t1_req_labels - t0_req_labels)[0]
        self.s_labels = sorted(list(t0_req_labels))

        self.p_val = self._run_ci_test(t0, t1)

    def _run_ci_test(self, t0: "Test", t1: "Test"):
        """Return the chi-square survival value of ``2 * (llf1 - llf0)``.

        Both log-likelihoods are taken over the clients that provide ``t1``.
        """
        client_subset = t1.get_providing_clients()
        t0_llf = t0.get_llf(client_subset)
        t1_llf = t1.get_llf(client_subset)

        t0_dof = t0.get_degrees_of_freedom()
        t1_dof = t1.get_degrees_of_freedom()

        p_val = scipy.stats.chi2.sf(2*(t1_llf - t0_llf), t1_dof-t0_dof)

        if DEBUG >= 2:
            print(f'*** Calculating p value for independence of {self.y_label} from {self.x_label} given {self.s_labels}')
            print(f'{t1_dof-t0_dof} DOFs = {t1_dof} T1 DOFs - {t0_dof} T0 DOFs')
            print(f'{2*(t1_llf - t0_llf):.4f} Test statistic = 2*({t1_llf:.4f} T1 LLF - {t0_llf:.4f} T0 LLF)')
            print(f'p value = {p_val:.6f}')
        return p_val

    def __repr__(self):
        return f"LikelihoodRatioTest - y: {self.y_label}, x: {self.x_label}, S: {self.s_labels}, p: {self.p_val:.4f}"

    @staticmethod
    def _sort_key(test):
        """Ordering key: conditioning-set size, response, regressor, conditioning set."""
        return (len(test.s_labels), test.y_label, test.x_label, tuple(sorted(test.s_labels)))

    def __lt__(self, other):
        # Strict ordering: objects with equal keys are not less than each
        # other, so ``a < a`` is False.
        return self._sort_key(self) < self._sort_key(other)
|
|
68
|
+
|
|
69
|
+
class SymmetricLikelihoodRatioTest():
    """Combination of the two directed likelihood-ratio tests for a variable pair."""

    def __init__(self, lrt0: "LikelihoodRatioTest", lrt1: "LikelihoodRatioTest"):
        """Pair two mirrored tests and combine their p values.

        The combined value is ``min(2 * min(p0, p1), max(p0, p1))``.

        Raises:
            AssertionError: If the tests do not swap response and regressor
                or use different conditioning sets.
        """
        assert lrt0.y_label == lrt1.x_label and lrt1.y_label == lrt0.x_label and sorted(lrt0.s_labels) == sorted(lrt1.s_labels), 'Tests do not match'

        self.lrt0: "LikelihoodRatioTest" = lrt0
        self.lrt1: "LikelihoodRatioTest" = lrt1

        self.v0, self.v1 = sorted([lrt0.y_label, lrt1.y_label])
        self.conditioning_set = sorted(lrt0.s_labels)

        self.p_val = min(2*min(self.lrt0.p_val, self.lrt1.p_val), max(self.lrt0.p_val, self.lrt1.p_val))
        if DEBUG >= 2:
            print(f'*** Combining p values for symmetry of tests between {self.v0} and {self.v1} given {self.conditioning_set}')
            print(f'p value {self.lrt0.y_label}: {self.lrt0.p_val}')
            print(f'p value {self.lrt1.y_label}: {self.lrt1.p_val}')
            print(f'p value = {self.p_val:.4f}')

    def __repr__(self):
        return f"SymmetricLikelihoodRatioTest - v0: {self.v0}, v1: {self.v1}, conditioning set: {self.conditioning_set}, p: {self.p_val:.4f}\n\t- {self.lrt0}\n\t- {self.lrt1}"

    @staticmethod
    def _sort_key(test):
        """Ordering key: conditioning-set size, v0, v1, conditioning set."""
        return (len(test.conditioning_set), test.v0, test.v1, tuple(test.conditioning_set))

    def __lt__(self, other):
        # Lexicographic comparison of the keys; equal keys compare as not-less.
        return self._sort_key(self) < self._sort_key(other)

    def __eq__(self, other):
        # Decline the comparison for objects that do not carry the compared
        # attributes, so ``test == None`` is False instead of raising.
        if not all(hasattr(other, attr) for attr in ('v0', 'v1', 'conditioning_set')):
            return NotImplemented
        return self.v0 == other.v0 and self.v1 == other.v1 and self.conditioning_set == other.conditioning_set
|
|
116
|
+
|
|
117
|
+
class EmptyLikelihoodRatioTest(SymmetricLikelihoodRatioTest):
    """Result record with a directly supplied p value and no underlying tests.

    The parent initialiser is not called, so ``lrt0`` and ``lrt1`` are never set.
    """

    def __init__(self, v0, v1, conditioning_set, p_val):
        """Store the pair in sorted order with the given set and p value."""
        ordered_pair = sorted([v0, v1])
        self.v0 = ordered_pair[0]
        self.v1 = ordered_pair[1]
        self.conditioning_set = conditioning_set
        self.p_val = p_val

    def __repr__(self):
        return (
            f"EmptyLikelihoodRatioTest - v0: {self.v0}, v1: {self.v1}, "
            f"conditioning set: {self.conditioning_set}, p: {self.p_val:.4f}"
        )
|
|
125
|
+
|
|
126
|
+
def get_likelihood_tests(tests: "List[Test]"):
    """Build a ``LikelihoodRatioTest`` for every regression and dropped regressor.

    For each test and each of its regressors, the regression on the same
    response with that regressor removed is looked up. If none exists a
    message is printed and the pair is skipped.

    Raises:
        AssertionError: If more than one matching nested regression exists.
    """
    # Index once by (response, regressor set) so each lookup is a dictionary
    # access rather than a scan over all tests.
    by_signature = {}
    for candidate in tests:
        response = candidate.y_label
        signature = (response, frozenset(candidate.get_required_labels() - {response}))
        by_signature.setdefault(signature, []).append(candidate)

    likelihood_tests = []
    for test in tests:
        curr_y = test.y_label
        curr_X = test.get_required_labels() - {curr_y}
        for x_var in curr_X:
            curr_conditioning_set = curr_X - {x_var}
            nested_test = by_signature.get((curr_y, frozenset(curr_conditioning_set)), [])
            if len(nested_test) == 0:
                print(f'No test for\n{test}\nin\n{tests}')
                continue
            assert len(nested_test) == 1, 'There is more than one nested test'
            likelihood_tests.append(LikelihoodRatioTest(nested_test[0], test))
    return likelihood_tests
|
|
140
|
+
|
|
141
|
+
def get_symmetric_likelihood_tests(tests, test_targets=None):
    """Pair every directed likelihood-ratio test with its mirrored counterpart.

    Only directed tests with ``x_label < y_label`` start a pair. When
    ``test_targets`` is given, a pair is built only if
    ``(x_label, y_label, tuple(sorted(s_labels)))`` is contained in it.
    Tests without a counterpart are skipped.
    """
    directed = get_likelihood_tests(tests)
    paired = []

    for candidate in directed:
        # Take each unordered pair once, from the x < y direction.
        if not (candidate.x_label < candidate.y_label):
            continue

        if test_targets is not None:
            target = (candidate.x_label, candidate.y_label, tuple(sorted(candidate.s_labels)))
            if target not in test_targets:
                continue

        mirrors = [
            t for t in directed
            if (t.y_label == candidate.x_label)
            and (t.x_label == candidate.y_label)
            and (t.s_labels == candidate.s_labels)
        ]
        if not mirrors:
            continue
        assert len(mirrors) == 1, 'There is more than one matching counterpart test'
        paired.append(SymmetricLikelihoodRatioTest(candidate, mirrors[0]))

    return paired
|
fedci/server.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
from .client import Client
|
|
2
|
+
from .testing import TestEngine
|
|
3
|
+
from .env import DEBUG
|
|
4
|
+
from typing import List, Dict
|
|
5
|
+
import rpyc
|
|
6
|
+
|
|
7
|
+
class Server():
    """Coordinates the clients and drives the test engine until it is finished."""

    def __init__(self, clients: Dict[str, Client], max_regressors=None, test_targets=None, max_iterations=25, _network_fetch_function=lambda x: x):
        """Collect schemas and expressions from all clients and set up the engine.

        Args:
            clients: Mapping from client id to client object.
            max_regressors: Passed through to ``TestEngine``.
            test_targets: Passed through to ``TestEngine``.
            max_iterations: Passed through to ``TestEngine``.
            _network_fetch_function: Applied to every client result in
                ``run``; the default is the identity.

        Raises:
            AssertionError: If two clients report different types for a column.
        """
        self._network_fetch_function = _network_fetch_function
        self.clients = clients
        self.client_schemas = {}
        self.schema = {}

        # Merge the client schemas; a column must have one type everywhere.
        for client_id, client in self.clients.items():
            client_schema = client.get_data_schema()
            self.client_schemas[client_id] = client_schema
            for column, dtype in client_schema.items():
                if column not in self.schema:
                    self.schema[column] = dtype
                    continue
                assert self.schema[column] == dtype, f'Schema mismatch between clients detected for variable {column}!'

        # Union of the dummy-column names over all clients, kept sorted
        # (ordinal levels by their integer suffix).
        self.category_expressions = {}
        self.ordinal_expressions = {}
        for client in self.clients.values():
            for feature, expressions in client.get_categorical_expressions().items():
                merged = set(self.category_expressions.get(feature, [])).union(set(expressions))
                self.category_expressions[feature] = sorted(merged)
            for feature, levels in client.get_ordinal_expressions().items():
                merged = set(self.ordinal_expressions.get(feature, [])).union(set(levels))
                self.ordinal_expressions[feature] = sorted(merged, key=lambda x: int(x.split('__ord__')[-1]))

        for client in self.clients.values():
            client.provide_expressions(self.category_expressions, self.ordinal_expressions)

        self.test_engine: TestEngine = TestEngine(
            schema=self.schema,
            category_expressions=self.category_expressions,
            ordinal_expressions=self.ordinal_expressions,
            max_regressors=max_regressors,
            max_iterations=max_iterations,
            test_targets=test_targets
        )

    def run(self):
        """Process tests until the engine reports it is finished; return its tests.

        A test for which no client holds all required columns is removed
        from the engine.
        """
        while not self.test_engine.is_finished():
            required_labels = self.test_engine.get_currently_required_labels()
            selected_clients = {
                client_id: client
                for client_id, client in self.clients.items()
                if set(required_labels).issubset(self.client_schemas[client_id].keys())
            }
            if len(selected_clients) == 0:
                self.test_engine.remove_current_test()
                continue

            y_label, X_labels, beta = self.test_engine.get_current_test_parameters()

            results = {client_id: client.compute(y_label, X_labels, beta) for client_id, client in selected_clients.items()}
            # NOTE: fetch ClientResponseData over network if necessary
            results = {client_id: self._network_fetch_function(result) for client_id, result in results.items()}
            self.test_engine.update_current_test(results)

        if DEBUG >= 1:
            print("*** All tests")
            for test in self.test_engine.tests:
                print(test)
        return self.test_engine.tests

    def get_tests(self):
        """Return the engine's tests."""
        return self.test_engine.tests
|
|
65
|
+
|
|
66
|
+
class ProxyServerBuilder():
    """Fluent builder that connects to remote clients and creates the server."""

    def __init__(self, cls, max_regressors=None, max_iterations=25):
        """Remember the class to build and the initial settings.

        Args:
            cls: Callable invoked by ``build`` as
                ``cls(clients, max_regressors=..., max_iterations=...)``.
            max_regressors: Initial value; may be changed with the setter.
            max_iterations: Initial value; may be changed with the setter.
        """
        self.clients = []
        # ``clients`` holds connection objects, so the addresses are tracked
        # separately to recognise a repeated (hostname, port).
        self._client_addresses = []
        self.cls = cls
        self.max_regressors = max_regressors
        self.max_iterations = max_iterations

    def set_max_regressors(self, max_regressors):
        """Set ``max_regressors`` and return the builder."""
        self.max_regressors = max_regressors
        return self

    def set_max_iterations(self, max_iterations):
        """Set ``max_iterations`` and return the builder."""
        self.max_iterations = max_iterations
        return self

    def add_client(self, hostname, port):
        """Connect to ``hostname:port`` unless that address was already added."""
        address = (hostname, port)
        if address in self._client_addresses:
            print('Client exists already')
            return self
        client = rpyc.connect(hostname, port, config={'allow_public_attrs': True, 'allow_pickle': True})
        self.clients.append(client)
        self._client_addresses.append(address)
        return self

    def build(self):
        """Create the server from the collected connections and settings."""
        return self.cls(self.clients, max_regressors=self.max_regressors, max_iterations=self.max_iterations)
|
|
87
|
+
|
|
88
|
+
class ProxyServer():
    """``Server`` front end working on rpyc connections instead of local clients."""

    @classmethod
    def builder(cls, **kwargs):
        """Return a ``ProxyServerBuilder`` for this class."""
        return ProxyServerBuilder(cls, **kwargs)

    def __init__(self, clients, max_regressors, max_iterations):
        """Wrap the ``root`` of every connection in a ``Server``.

        Clients are keyed by their position in ``clients``; results are fetched
        with ``rpyc.classic.obtain``.
        """
        self.clients = {i: c.root for i, c in enumerate(clients)}
        self.server = Server(
            self.clients,
            _network_fetch_function=rpyc.classic.obtain,
            max_regressors=max_regressors,
            max_iterations=max_iterations
        )

    def __getattr__(self, name):
        # Called only when normal lookup fails. If ``server`` itself is missing
        # (e.g. __init__ did not complete), reading ``self.server`` below would
        # re-enter this method without end, so fail with AttributeError.
        if name == 'server':
            raise AttributeError(name)
        return getattr(self.server, name)

    def run(self):
        """Delegate to ``Server.run``."""
        return self.server.run()

    def get_tests(self):
        """Delegate to ``Server.get_tests``."""
        return self.server.get_tests()
|
fedci/testing.py
ADDED
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
|
|
2
|
+
import numpy as np
|
|
3
|
+
from typing import List, Dict
|
|
4
|
+
from itertools import chain, combinations
|
|
5
|
+
|
|
6
|
+
from .env import EXPAND_ORDINALS, OVR, RIDGE
|
|
7
|
+
from .utils import BetaUpdateData, ClientResponseData, VariableType
|
|
8
|
+
|
|
9
|
+
class RegressionTest():
    """Coefficient state of one regression ``y_label ~ X_labels + 1``."""

    @classmethod
    def create_and_overwrite_beta(cls, y_label, X_labels, beta):
        """Create an instance and replace its initial coefficients with ``beta``."""
        c = cls(y_label, X_labels)
        c.beta = beta
        return c

    def __init__(self, y_label: str, X_labels: List[str]):
        """Start with zero coefficients, one per regressor plus the intercept."""
        self.y_label = y_label
        self.X_labels = X_labels
        self.beta = np.zeros(len(X_labels) + 1)

    def update_beta(self, data: "List[BetaUpdateData]"):
        """Solve for new coefficients from the summed client contributions.

        ``xwx`` and ``xwz`` are summed over ``data``. With ``RIDGE > 0`` the
        penalty ``RIDGE * I`` is added to ``xwx`` and the previous coefficients
        enter as ``RIDGE * inv @ beta``. A singular system falls back to the
        pseudo-inverse.
        """
        xwx = sum([d.xwx for d in data])
        xwz = sum([d.xwz for d in data])

        if RIDGE > 0:
            penalty_matrix = RIDGE * np.eye(len(xwx))
            xwx += penalty_matrix

        try:
            xwx_inv = np.linalg.inv(xwx)
        except np.linalg.LinAlgError:
            xwx_inv = np.linalg.pinv(xwx)

        if RIDGE > 0:
            self.beta = (xwx_inv @ xwz) + RIDGE * xwx_inv @ self.beta
        else:
            self.beta = xwx_inv @ xwz

    @staticmethod
    def _sort_key(test):
        """Ordering key: number of regressors, response, sorted regressors."""
        return (len(test.X_labels), test.y_label, tuple(sorted(test.X_labels)))

    def __lt__(self, other):
        # Strict ordering: objects with equal keys are not less than each
        # other, so ``a < a`` is False.
        return self._sort_key(self) < self._sort_key(other)

    def __repr__(self):
        return f'RegressionTest {self.y_label} ~ {", ".join(self.X_labels + ["1"])} - beta: {self.beta}'
|
|
53
|
+
|
|
54
|
+
class Test():
    """Iteratively fitted regression of ``y_label`` on ``X_labels``.

    Holds one ``RegressionTest`` per entry of ``y_labels`` and tracks the
    per-client log-likelihoods, the summed deviance and the iteration count
    across calls to ``update_betas``.
    """

    def __init__(self,
        y_label,
        X_labels: List[str],
        y_labels: List[str] = None,
        max_iterations=25,
        y_type=None
    ):
        """Set up the regression tests and reset the fitting state.

        Args:
            y_label: Name of the response variable.
            X_labels: Names of the regressor columns.
            y_labels: Response labels to create one ``RegressionTest`` for
                each; defaults to ``[y_label]``.
            max_iterations: Upper bound on the number of beta updates.
            y_type: ``VariableType`` of the response. For ``CATEGORICAL`` with
                ``OVR == 0`` the per-label betas are concatenated into a
                single ``RegressionTest`` keyed by ``y_label``.
        """
        self.y_label = y_label
        self.X_labels = X_labels
        if y_labels is None: y_labels = [y_label]
        self.y_labels = y_labels
        self.tests: Dict[str, RegressionTest] = {_y_label: RegressionTest(_y_label, X_labels) for _y_label in y_labels}

        if y_type == VariableType.CATEGORICAL and OVR == 0:
            _beta = np.concatenate([t.beta for t in self.tests.values()])
            self.tests = {y_label: RegressionTest.create_and_overwrite_beta(y_label, X_labels, _beta)}

        self.llf = None
        self.last_deviance = None
        self.deviance = 0
        self.iterations = 0
        self.max_iterations = max_iterations

    def is_finished(self):
        """Return True once the deviance change is below 1e-3 or the iteration limit is hit."""
        return self.get_change_in_deviance() < 1e-3 or self.iterations >= self.max_iterations

    def update_betas(self, data: Dict[str, ClientResponseData]):
        """Record the client responses and, unless finished, update all betas.

        Args:
            data: Mapping of client id to that client's response.
        """
        self.llf = {client_id: client_response.llf for client_id, client_response in data.items()}
        self.last_deviance = self.deviance
        self.deviance = sum(client_response.deviance for client_response in data.values())

        if self.is_finished():
            return

        beta_update_data = [client_response.beta_update_data for client_response in data.values()]
        # Transform data from list of dicts to dict of lists => all data for one y_label grouped together
        beta_update_data = {k: [dic[k] for dic in beta_update_data] for k in beta_update_data[0]}

        for y_label, _data in beta_update_data.items():
            self.tests[y_label].update_beta(_data)
        self.iterations += 1

    def get_degrees_of_freedom(self):
        """Return ``len(y_labels) * (len(X_labels) + 1)``."""
        # len tests -> num_cats -1
        # len X_labels + 1 -> x vars, intercept
        return len(self.y_labels)*(len(self.X_labels) + 1)

    def get_llf(self, client_subset=None):
        """Sum the recorded log-likelihoods, optionally over a subset of clients.

        Args:
            client_subset: Container of client ids to include, or None for all.

        Returns:
            The summed log-likelihood, or 0 when no responses were recorded yet.
        """
        # Guard first so the subset path does not fail on ``None.items()``
        # before the first call to update_betas.
        if self.llf is None:
            return 0
        if client_subset is not None:
            return sum([llf for client_id, llf in self.llf.items() if client_id in client_subset])
        return sum([llf for llf in self.llf.values()])

    def get_providing_clients(self):
        """Return the ids of clients that contributed to the last update.

        Returns an empty list before the first update and a set afterwards.
        """
        if self.llf is None:
            return []
        return set(self.llf.keys())

    def get_beta(self):
        """Return a mapping of each regression's ``y_label`` to its beta."""
        return {t.y_label: t.beta for t in self.tests.values()}

    def get_required_labels(self):
        """Return the set of variable names this test needs.

        Regressor names containing ``__cat__`` or ``__ord__`` are reduced to
        the part before that marker.
        """
        labels = {self.y_label}
        for var in self.X_labels:
            if '__cat__' in var:
                labels.add(var.split('__cat__')[0])
            elif '__ord__' in var:
                labels.add(var.split('__ord__')[0])
            else:
                labels.add(var)
        return labels

    def get_change_in_deviance(self):
        """Return the absolute deviance change of the last update (1 before any update)."""
        if self.last_deviance is None:
            return 1
        return abs(self.deviance - self.last_deviance)

    def get_relative_change_in_deviance(self):
        """Return the deviance change relative to the current deviance (1 before any update)."""
        if self.last_deviance is None:
            return 1
        # The small constant keeps the division defined when deviance is 0.
        return abs(self.deviance - self.last_deviance) / (1e-5 + abs(self.deviance))

    def __repr__(self):
        test_string = "\n\t- " + "\n\t- ".join([str(t) for t in sorted(self.tests.values())])
        test_title = f'{self.y_label} ~ {",".join(list(set([l.split("__")[0] for l in self.X_labels])))},1'
        return f'Test {test_title} - llf: {self.get_llf()}, deviance: {self.deviance}, {self.iterations}/{self.max_iterations} iterations{test_string}'

    def __eq__(self, other):
        """Equal when response, sorted regressors and required-label count match."""
        # Let Python fall back to its default handling for foreign operands
        # (e.g. ``test == None``) instead of failing on a missing attribute.
        if not isinstance(other, Test):
            return NotImplemented
        req_labels = self.get_required_labels()
        other_labels = other.get_required_labels()
        return (
            len(req_labels) == len(other_labels) and
            self.y_label == other.y_label and
            tuple(sorted(self.X_labels)) == tuple(sorted(other.X_labels))
        )

    def __lt__(self, other):
        """Order by required-label count, then ``y_label``, then sorted ``X_labels``."""
        if not isinstance(other, Test):
            return NotImplemented
        own_key = (
            len(self.get_required_labels()),
            self.y_label,
            tuple(sorted(self.X_labels)),
        )
        other_key = (
            len(other.get_required_labels()),
            other.y_label,
            tuple(sorted(other.X_labels)),
        )
        # Tuple comparison is strict: identical keys yield False.
        return own_key < other_key
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
class TestEngine():
    """Builds the list of regression tests for a schema and steps through it.

    Tests are created for every response variable and every regressor subset
    up to the allowed size, sorted, and then processed one at a time via
    ``current_test_index``.
    """

    @staticmethod
    def _replace_with_expressions(variable_set, expression_map):
        """Swap each variable found in ``expression_map`` for all but its first expression."""
        for _var, expressions in expression_map.items():
            if _var in variable_set:
                kept_expressions = sorted(list(expressions)[1:])  # drop first cat
                variable_set = (set(variable_set) - {_var}) | set(kept_expressions)
        return variable_set

    def __init__(self,
        schema,
        category_expressions,
        ordinal_expressions,
        max_regressors=None,
        max_iterations=25,
        test_targets=None
    ):
        """Enumerate and sort all tests.

        Args:
            schema: Mapping of variable name to ``VariableType``.
            category_expressions: Mapping of categorical variable name to its
                expression labels.
            ordinal_expressions: Mapping of ordinal variable name to its
                expression labels.
            max_regressors: Optional cap on the regressor subset size.
            max_iterations: Iteration limit handed to every ``Test``.
            test_targets: Optional iterable of 3-tuples; only variables that
                appear in the first two positions are used as responses.

        Raises:
            AssertionError: If a categorical/ordinal response has no entry in
                the corresponding expression mapping.
            Exception: If a variable has an unknown type.
        """
        self.tests = []
        self.max_iterations = max_iterations
        self.bad_tests = []

        variables = set(schema.keys())
        max_conditioning_set_size = len(variables)-1
        if max_regressors is not None:
            max_conditioning_set_size = min(len(variables)-1, max_regressors)

        all_test_targets = None
        if test_targets is not None:
            all_test_targets = set()
            for a, b, _ in test_targets:
                all_test_targets.update((a, b))

        for y_var in variables:
            if all_test_targets is not None and y_var not in all_test_targets:
                continue
            set_of_possible_regressors = variables - {y_var}

            # Enumerate regressor subsets by increasing size and expand
            # categorical (and optionally ordinal) features in each of them.
            expanded_regressor_sets = []
            for subset_size in range(0, max_conditioning_set_size+1):
                for variable_set in combinations(set_of_possible_regressors, subset_size):
                    variable_set = self._replace_with_expressions(variable_set, category_expressions)
                    if EXPAND_ORDINALS:
                        variable_set = self._replace_with_expressions(variable_set, ordinal_expressions)
                    expanded_regressor_sets.append(variable_set)

            # Pick the mapping that provides the response labels, if any.
            if schema[y_var] == VariableType.CONTINUOS or schema[y_var] == VariableType.BINARY:
                label_source = None
            elif schema[y_var] == VariableType.CATEGORICAL:
                assert y_var in category_expressions, f'Categorical variable {y_var} is not in expression mapping'
                label_source = category_expressions
            elif schema[y_var] == VariableType.ORDINAL:
                assert y_var in ordinal_expressions, f'Ordinal variable {y_var} is not in expression mapping'
                label_source = ordinal_expressions
            else:
                raise Exception(f'Unknown variable type {schema[y_var]} encountered!')

            for x_vars in expanded_regressor_sets:
                # A fresh slice per test keeps the label lists independent.
                response_labels = None if label_source is None else label_source[y_var][:-1]
                self.tests.append(Test(
                    y_label=y_var,
                    X_labels=sorted(list(x_vars)),
                    y_labels=response_labels,
                    max_iterations=max_iterations,
                    y_type=schema[y_var]
                ))

        self.tests: List[Test] = sorted(self.tests)
        self.current_test_index = 0

    def is_finished(self):
        """Return True when every test has been processed."""
        return len(self.tests) <= self.current_test_index

    def get_currently_required_labels(self):
        """Return the labels needed by the current test, or None when finished."""
        if self.is_finished():
            return None
        return self.tests[self.current_test_index].get_required_labels()

    def get_current_test_parameters(self):
        """Return ``(y_label, X_labels, beta)`` of the current test, or three Nones when finished."""
        if self.is_finished():
            return None, None, None
        active = self.tests[self.current_test_index]
        return active.y_label, active.X_labels, active.get_beta()

    def update_current_test(self, client_responses: Dict[str, ClientResponseData]):
        """Feed client responses to the current test and advance once it is finished."""
        if self.is_finished():
            return
        active = self.tests[self.current_test_index]
        active.update_betas(client_responses)
        if active.is_finished():
            self.current_test_index += 1

    def remove_current_test(self):
        """Move the current test to ``bad_tests`` and drop it from the queue."""
        if self.is_finished():
            return
        position = self.current_test_index
        self.bad_tests.append(self.tests[position])
        # Build a new list rather than deleting in place.
        self.tests = [t for idx, t in enumerate(self.tests) if idx != position]
|
fedci/utils.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import enum
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import Dict
|
|
4
|
+
|
|
5
|
+
class VariableType(enum.Enum):
    """Kind of a dataset variable.

    Equality and hashing are based on the member value, restricted to
    operands of exactly the same class.
    NOTE(review): presumably overridden so members still compare equal after
    crossing a process/serialization boundary - confirm.
    """
    CONTINUOS = 0
    BINARY = 1
    CATEGORICAL = 2
    ORDINAL = 3

    def __eq__(self, other):
        # Operands of a different class are left to Python's default handling.
        if self.__class__ is not other.__class__:
            return NotImplemented
        return self.value == other.value

    def __hash__(self):
        # Kept consistent with __eq__: hash by value.
        return hash(self.value)
|
|
16
|
+
|
|
17
|
+
@dataclass
class BetaUpdateData:
    """Pair of terms a client contributes to one beta update.

    ``RegressionTest.update_beta`` sums both members over all clients and
    sets beta to ``inv(xwx) @ xwz``.
    """
    # Matrix term; summed across clients and inverted by the server.
    xwx: object
    # Vector term; summed across clients and multiplied by the inverse of xwx.
    xwz: object
|
|
21
|
+
|
|
22
|
+
@dataclass
class ClientResponseData:
    """Result a client returns for one iteration of a test."""
    # Log-likelihood reported by the client.
    llf: float
    # Deviance reported by the client; summed over clients by Test.update_betas.
    deviance: float
    # Beta update terms, keyed by the response label they belong to.
    beta_update_data: Dict[str, BetaUpdateData]
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Maximilian Hahn
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
|
+
Name: python-fedci
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A small package for federated independence tests
|
|
5
|
+
Author-email: Maximilian Hahn <max.hahn@gmx.de>
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2025 Maximilian Hahn
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
|
|
28
|
+
Project-URL: Homepage, https://github.com/maxhahn/fedci
|
|
29
|
+
Project-URL: Repository, https://github.com/maxhahn/fedci
|
|
30
|
+
Requires-Python: >=3.10
|
|
31
|
+
Description-Content-Type: text/markdown
|
|
32
|
+
License-File: LICENSE
|
|
33
|
+
Requires-Dist: polars>=1.22.0
|
|
34
|
+
Requires-Dist: pyarrow>=19.0.0
|
|
35
|
+
Requires-Dist: rpyc>=6.0.1
|
|
36
|
+
Requires-Dist: scipy>=1.15.1
|
|
37
|
+
Requires-Dist: statsmodels>=0.14.4
|
|
38
|
+
|
|
39
|
+
# About this Project
|
|
40
|
+
|
|
41
|
+
This project aims to create an accessible platform and user interface to obtain causal knowledge from distributed datasets.
|
|
42
|
+
Different parties can share key insights about their data without compromising their privacy.
|
|
43
|
+
Results come in the form of causal graphs (PAGs).
|
|
44
|
+
|
|
45
|
+
There are two main algorithms this project supports:
|
|
46
|
+
* rIOD
|
|
47
|
+
* FedGLM
|
|
48
|
+
|
|
49
|
+
## General
|
|
50
|
+
|
|
51
|
+
This application is made up of two main components.
|
|
52
|
+
There is a streamlit UI and a litestar server.
|
|
53
|
+
|
|
54
|
+
Via the streamlit UI, one can connect to an existing server instance and run distributed/federated algorithms with peers connected to the same server.
|
|
55
|
+
|
|
56
|
+
It is designed to be easily self-hostable, and is fully contained within a docker container.
|
|
57
|
+
|
|
58
|
+
**Beware**: As of now, this client-server architecture communicates via http (_not_ https!).
|
|
59
|
+
As such, a malicious agent may spoof your identity or steal data that is transmitted over the network.
|
|
60
|
+
|
|
61
|
+
When hosting this application, you may use a reverse-proxy and your own SSL certificates to enable https.
|
|
62
|
+
|
|
63
|
+
## rIOD
|
|
64
|
+
|
|
65
|
+
This project implements the IOD algorithm created by [Tillman and Spirtes](http://proceedings.mlr.press/v15/tillman11a.html).
|
|
66
|
+
|
|
67
|
+
When running rIOD, (conditional) independence tests are performed and the resulting p-values are transmitted to the server.
|
|
68
|
+
On the server-side, these p-values are aggregated to give insights about the independences in the distributed dataset.
|
|
69
|
+
|
|
70
|
+
rIOD only supports numerical (float) features.
|
|
71
|
+
As per the IOD algorithm, not all participating parties have to have identical features in their dataset.
|
|
72
|
+
|
|
73
|
+
Without code modifications, this application will only ever transmit p-values and PAG adjacency matrices over the network.
|
|
74
|
+
The original dataset is kept completely private and is never transmitted over the network.
|
|
75
|
+
(This can be easily checked inside the source code)
|
|
76
|
+
|
|
77
|
+
## FedGLM
|
|
78
|
+
|
|
79
|
+
This project implements an algorithm (titled FedGLM for now) which utilizes federated learning to create linear models, which are then used for likelihood-ratio tests in order to obtain independence information about the dataset.
|
|
80
|
+
|
|
81
|
+
FedGLM supports numerical (float), categorical (string), and ordinal (int) features.
|
|
82
|
+
Similar to IOD, here as well, not all participating parties have to have the exact same feature set.
|
|
83
|
+
|
|
84
|
+
This algorithm requires the transmission of the expression levels of ordinal and categorical variables.
|
|
85
|
+
Additionally, when the algorithm is running, linear model coefficients are transmitted, as well as matrices that do not contain recreatable information about the data (see algorithm 2 in [Cellamare et al.](https://www.mdpi.com/1999-4893/15/7/243)).
|
|
86
|
+
As such, data privacy is preserved.
|
|
87
|
+
|
|
88
|
+
## Setup
|
|
89
|
+
|
|
90
|
+
First, install docker and docker-compose.
|
|
91
|
+
|
|
92
|
+
Use the following commands to build and run the application:
|
|
93
|
+
|
|
94
|
+
* Run client and server on same machine:
|
|
95
|
+
`docker-compose up`
|
|
96
|
+
|
|
97
|
+
* Run client or server only:
|
|
98
|
+
`docker-compose up client`
|
|
99
|
+
`docker-compose up server`
|
|
100
|
+
|
|
101
|
+
* The client can be accessed via `localhost:8081` on the host machine.
|
|
102
|
+
When hosting the client on a machine within the same network, use its IP address and ensure proper connectivity between the two machines.
|
|
103
|
+
|
|
104
|
+
* The first step within the client is to connect to a server.
|
|
105
|
+
A server hosted by us can be used with the following URL: `heiderlab.com:8080`
|
|
106
|
+
|
|
107
|
+
## Configuration
|
|
108
|
+
|
|
109
|
+
No major changes of this setup should be required.
|
|
110
|
+
All configurations can be changed in `docker-compose.yml`.
|
|
111
|
+
|
|
112
|
+
The only reasonable change is the port mapping, if your host machine requires specific ports to be exposed.
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
fedci/__init__.py,sha256=v0ZdJg0rQaCGcfrKuFuUV0s22DDHpN0iRJaCcMo00aY,167
|
|
2
|
+
fedci/client.py,sha256=yYPT6I2Ek4C5GKKvKIwV4XQ_kfghi_pNFqX29FhCuhA,15939
|
|
3
|
+
fedci/env.py,sha256=BqLlyUxC8of89P96bH_H5_2vzlVoUupwAPA7po7IvHs,309
|
|
4
|
+
fedci/evaluation.py,sha256=vHqIdPfo7JI-ux0PL6byl2ltJ4nwN57OPAxVmvIWVGk,6293
|
|
5
|
+
fedci/server.py,sha256=eXhkwEwnPDGmTvfupxTrobx_d2uAPHqtAXc7H6-vBmE,4711
|
|
6
|
+
fedci/testing.py,sha256=x5lVmwD8ZK2QZ-jhncfQNcBXk5ZovM2Lp_XMc4A_D7M,10988
|
|
7
|
+
fedci/utils.py,sha256=GQ8xeU7SRqUgnoJdynMnDCT8_qn_6j6XnWsoar4LiPg,564
|
|
8
|
+
python_fedci-0.1.0.dist-info/LICENSE,sha256=zm3uXqpOuah2iueuXaCJZxXk1xZNzJzfo_csVwG56ZU,1072
|
|
9
|
+
python_fedci-0.1.0.dist-info/METADATA,sha256=TSl-tWX8TEbXBMgIieitpEiSaNHNnX1fCADCHOfipPQ,5253
|
|
10
|
+
python_fedci-0.1.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
11
|
+
python_fedci-0.1.0.dist-info/top_level.txt,sha256=cqbR8aI0m4KvYOXgv94Lg1XmKxIBBAFKyDmJD1zefDQ,6
|
|
12
|
+
python_fedci-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
fedci
|