python-fedci 0.1.5__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. {python_fedci-0.1.5 → python_fedci-0.2.1}/LICENSE +0 -0
  2. python_fedci-0.2.1/PKG-INFO +319 -0
  3. python_fedci-0.2.1/README.md +298 -0
  4. python_fedci-0.2.1/fedci/__init__.py +37 -0
  5. python_fedci-0.2.1/fedci/client.py +395 -0
  6. python_fedci-0.2.1/fedci/computing.py +279 -0
  7. python_fedci-0.2.1/fedci/env.py +3 -0
  8. python_fedci-0.2.1/fedci/masking.py +103 -0
  9. python_fedci-0.2.1/fedci/modeling.py +309 -0
  10. python_fedci-0.2.1/fedci/network.py +170 -0
  11. python_fedci-0.2.1/fedci/results.py +210 -0
  12. python_fedci-0.2.1/fedci/server.py +293 -0
  13. python_fedci-0.2.1/fedci/testing.py +147 -0
  14. python_fedci-0.2.1/fedci/utils.py +98 -0
  15. {python_fedci-0.1.5 → python_fedci-0.2.1}/pyproject.toml +13 -12
  16. python_fedci-0.2.1/python_fedci.egg-info/PKG-INFO +319 -0
  17. {python_fedci-0.1.5 → python_fedci-0.2.1}/python_fedci.egg-info/SOURCES.txt +6 -1
  18. python_fedci-0.2.1/python_fedci.egg-info/requires.txt +9 -0
  19. python_fedci-0.2.1/tests/testing.py +179 -0
  20. python_fedci-0.1.5/PKG-INFO +0 -690
  21. python_fedci-0.1.5/README.md +0 -3
  22. python_fedci-0.1.5/fedci/__init__.py +0 -4
  23. python_fedci-0.1.5/fedci/client.py +0 -842
  24. python_fedci-0.1.5/fedci/env.py +0 -9
  25. python_fedci-0.1.5/fedci/server.py +0 -320
  26. python_fedci-0.1.5/fedci/testing.py +0 -573
  27. python_fedci-0.1.5/fedci/utils.py +0 -52
  28. python_fedci-0.1.5/python_fedci.egg-info/PKG-INFO +0 -690
  29. python_fedci-0.1.5/python_fedci.egg-info/requires.txt +0 -11
  30. python_fedci-0.1.5/tests/test.py +0 -296
  31. {python_fedci-0.1.5 → python_fedci-0.2.1}/python_fedci.egg-info/dependency_links.txt +0 -0
  32. {python_fedci-0.1.5 → python_fedci-0.2.1}/python_fedci.egg-info/top_level.txt +0 -0
  33. {python_fedci-0.1.5 → python_fedci-0.2.1}/setup.cfg +0 -0
File without changes
@@ -0,0 +1,319 @@
1
+ Metadata-Version: 2.4
2
+ Name: python-fedci
3
+ Version: 0.2.1
4
+ Summary: A small package for federated independence tests
5
+ Author-email: Maximilian Hahn <max.hahn@gmx.de>
6
+ License-Expression: AGPL-3.0-or-later
7
+ Project-URL: Homepage, https://github.com/maxhahn/fedci
8
+ Project-URL: Repository, https://github.com/maxhahn/fedci
9
+ Requires-Python: >=3.12
10
+ Description-Content-Type: text/markdown
11
+ License-File: LICENSE
12
+ Requires-Dist: matplotlib>=3.11.0
13
+ Requires-Dist: numpy>=2.4.6
14
+ Requires-Dist: polars>=1.41.2
15
+ Requires-Dist: scipy>=1.17.1
16
+ Requires-Dist: tqdm>=4.68.3
17
+ Requires-Dist: pyzmq>=27.1.0
18
+ Provides-Extra: dev
19
+ Requires-Dist: pytest>=9.1.0; extra == "dev"
20
+ Dynamic: license-file
21
+
22
+ # fedci
23
+
24
+ Federated conditional independence (CI) testing via likelihood ratio tests. Data never leaves each client — only aggregated sufficient statistics are exchanged with the central server.
25
+
26
+ ---
27
+
28
+ ## Installation
29
+
30
+ ```bash
31
+ uv add python-fedci # or: pip install python-fedci
32
+ ```
33
+
34
+ Requires Python 3.12+. ZMQ networking uses `pyzmq`, which is installed automatically as a dependency.
35
+
36
+ ---
37
+
38
+ ## Core concepts
39
+
40
+ | Class | Role |
41
+ |---|---|
42
+ | `Client` | Holds a local dataset; answers statistical queries from the server |
43
+ | `Server` | Orchestrates tests across all clients; never sees raw data |
44
+ | `TestResult` | Result of one CI test (p-value, Bayes factors) |
45
+ | `RepeatedTestResult` | Aggregated statistics over many bootstrap runs |
46
+
47
+ ---
48
+
49
+ ## Basic workflow
50
+
51
+ ```python
52
+ import polars as pl
53
+ from fedci import Client, Server
54
+
55
+ # Each site wraps its own data
56
+ df1 = pl.read_parquet("site1.parquet")
57
+ df2 = pl.read_parquet("site2.parquet")
58
+
59
+ client1 = Client("site1", df1)
60
+ client2 = Client("site2", df2)
61
+
62
+ server = Server([client1, client2])
63
+ ```
64
+
65
+ ### Test a single conditional independence
66
+
67
+ ```python
68
+ # Is X independent of Y given {Z, W}?
69
+ result = server.test("X", "Y", {"Z", "W"})
70
+
71
+ result.p_value # float
72
+ result.log10_bic_bf # log10 Bayes factor (BIC approximation)
73
+ result.log10_bff # log10 Bayes factor (BFF method)
74
+ result.pH0_bic # P(H0) = 1 / (1 + BF) via BIC
75
+ result.pH0_bff # P(H0) = 1 / (1 + BF) via BFF
76
+ result.n_samples # total observations used
77
+ ```
78
+
79
+ ### Run all pairwise tests
80
+
81
+ ```python
82
+ # All pairs up to conditioning set size 2, 4 parallel workers
83
+ results = server.run(max_cond_size=2, workers=4, progress_bar=True)
84
+
85
+ # results is Dict[(x, y, frozenset(s)), TestResult]
86
+ for (x, y, s), r in results.items():
87
+ print(f"{x} ⊥ {y} | {s} p={r.p_value:.4f} log10BF={r.log10_bic_bf:.2f}")
88
+ ```
89
+
90
+ `workers` parallelises at the test level — each CI test runs in its own thread. NumPy releases the GIL during computation, so multiple cores are used effectively.
91
+
92
+ ### Repeated tests with subsampling
93
+
94
+ Bootstrap stability analysis: run the same test many times on random subsamples to assess reliability.
95
+
96
+ ```python
97
+ rtr = server.test_repeatedly(
98
+ "X", "Y", {"Z"},
99
+ num_runs=100,
100
+ sample_fraction=0.80,
101
+ workers=4, # parallel runs (local and network clients; see Networking)
102
+ progress_bar=True,
103
+ )
104
+
105
+ rtr.n_runs # 100
106
+ rtr.mean_p # mean p-value across runs
107
+ rtr.mean_log10_bic_bf # mean log10 BF
108
+ rtr.std_log10_bic_bf # spread — lower means more stable
109
+ rtr.rate_dependence_bic # fraction of runs where BF > 1 (evidence for dependence)
110
+ rtr.mean_pH0_bic # mean P(H0) across runs
111
+ ```
112
+
113
+ Run the full test suite repeatedly:
114
+
115
+ ```python
116
+ repeated = server.run_repeatedly(
117
+ num_runs=50,
118
+ sample_fraction=0.80,
119
+ max_cond_size=2,
120
+ workers=4, # parallel tests within each run
121
+ progress_bar=True,
122
+ )
123
+ # Dict[(x, y, frozenset(s)), RepeatedTestResult]
124
+ ```
125
+
126
+ ---
127
+
128
+ ## Interpreting results
129
+
130
+ Two Bayes factor estimates are available alongside the p-value.
131
+
132
+ ### BIC Bayes factor
133
+
134
+ ```
135
+ log10(BF) ≈ ½ (2·ΔLL − Δk·log n) / log(10)
136
+ ```
137
+
138
+ Closed-form, conservative, low variance across subsamples.
139
+
140
+ ### BFF — Bayes Factor Function
141
+
142
+ Maximises a chi² Bayes factor over a prior width grid.
143
+
144
+ ### Decision guide
145
+
146
+ | log10(BF) | Interpretation |
147
+ |---|---|
148
+ | > 1.0 | Strong evidence for dependence |
149
+ | 0 to 1.0 | Weak / inconclusive |
150
+ | < 0 | Evidence for independence |
151
+
152
+ `pH0 = 1 / (1 + BF)` converts a Bayes factor (in favour of dependence) into a posterior probability of the null, assuming equal prior odds for H0 and H1.
153
+
154
+ For stability assessment across repeated runs, prefer `rate_dependence_bff` (fraction of runs where BF > 1) and `std_log10_bff_bf` (spread on log scale) over a single p-value threshold or the corresponding `_bic` statistics.
155
+
156
+ ---
157
+
158
+ ## Model configuration
159
+
160
+ ### Default — GLM
161
+
162
+ Logistic regression for binary/categorical responses; linear regression for continuous. No extra configuration required.
163
+
164
+ ```python
165
+ server = Server([client1, client2])
166
+ ```
167
+
168
+ ### GAM — nonlinear continuous relationships
169
+
170
+ Cubic B-spline basis expansion for each continuous predictor. Custom knot placement is supported, but it is preferable to normalise the data to **[0, 1]** before passing it to clients, so that a single fixed knot grid is valid across all sites.
171
+
172
+ ```python
173
+ import numpy as np
174
+ from fedci import Server, Client, GAMConfiguration, ModelType
175
+
176
+ # Knot arithmetic:
177
+ # num_knots=8, num_degrees=3 → n_basis = num_knots + num_degrees - 2 = 9
178
+ # knot vector length = num_knots + 2*num_degrees - 1 = 13 (5 interior knots)
179
+ N_KNOTS = 8
180
+ DEGREE = 3
181
+ interior = list(np.linspace(0.0, 1.0, N_KNOTS - DEGREE + 2)[1:-1])
182
+ knot_vec = [0.0] * (DEGREE + 1) + interior + [1.0] * (DEGREE + 1)
183
+
184
+ # Normalise to [0, 1] on the combined dataset before splitting into clients
185
+ mins = df_full.min()
186
+ maxs = df_full.max()
187
+ df_norm = df_full.with_columns([
188
+ ((pl.col(c) - mins[c][0]) / (maxs[c][0] - mins[c][0])).alias(c)
189
+ for c in df_full.columns
190
+ ])
191
+
192
+ gam_config = GAMConfiguration(
193
+ type=ModelType.GAM,
194
+ num_knots=N_KNOTS,
195
+ num_degrees=DEGREE,
196
+ knots={v: knot_vec for v in df_norm.columns}, # include all variables
197
+ )
198
+
199
+ server = Server([client1, client2], model_configuration=gam_config)
200
+ ```
201
+
202
+ Including a variable in `knots` is safe even when it appears as the response in some tests — the spline basis is only applied to predictors, never to the response.
203
+
204
+ ### Site heterogeneity
205
+
206
+ When datasets across sites differ systematically (e.g. batch effects or recruitment differences), enable random effects:
207
+
208
+ ```python
209
+ from fedci import HeterogenietyType
210
+
211
+ # Random intercepts only (recommended — far fewer parameters)
212
+ server = Server(
213
+ [client1, client2],
214
+ heterogeniety=HeterogenietyType.GLOBAL,
215
+ random_intercept_only=True, # one shift per site per model
216
+ local_ridge_coefficient=1.0, # starting prior strength; adapted via EM
217
+ )
218
+
219
+ # Full random effects on all coefficients (intercept + slopes)
220
+ server = Server(
221
+ [client1, client2],
222
+ heterogeniety=HeterogenietyType.GLOBAL,
223
+ random_intercept_only=False, # site-specific shift on every coefficient
224
+ local_ridge_coefficient=1.0,
225
+ )
226
+ ```
227
+
228
+ `random_intercept_only=False` is heavily overparameterised for most datasets — a warning is raised when used with GAM, where the spline basis already has many coefficients. Prefer `True` unless you have a strong reason for full random slopes.
229
+
230
+ | `HeterogenietyType` | Behaviour |
231
+ |---|---|
232
+ | `NONE` (default) | Pooled model, no site effects |
233
+ | `GLOBAL` | Random effects with shared variance across sites, estimated by EM; calculates effective DoF |
234
+ | `LOCAL` | Independent random effects per site, not shared; uses regular DoF |
235
+
236
+ ---
237
+
238
+ ## Additive masking
239
+
240
+ Additive masking protects intermediate aggregates: each client adds a per-pair random mask to its output; masks cancel exactly in the sum, so the server only ever sees the correct aggregate — never any individual client's contribution.
241
+
242
+ ### Local masking
243
+
244
+ For multiple clients running in the same process:
245
+
246
+ ```python
247
+ server = Server([client1, client2], additive_masking=True)
248
+ results = server.run(max_cond_size=1)
249
+ # Numerically identical to unmasked results
250
+ ```
251
+
252
+ ### Network masking
253
+
254
+ When clients run on separate machines, `additive_masking=True` triggers a peer-to-peer seed exchange: the server passes each client's address to all others, peers connect directly to agree on shared random seeds, and thereafter each masks its own output independently. The server never participates in seed exchange.
255
+
256
+ ```python
257
+ from fedci import connect_client, Server
258
+
259
+ nc1 = connect_client("192.168.1.10", 5555)
260
+ nc2 = connect_client("192.168.1.11", 5555)
261
+
262
+ server = Server([nc1, nc2], additive_masking=True)
263
+ results = server.run(max_cond_size=1)
264
+ ```
265
+
266
+ ---
267
+
268
+ ## Networking
269
+
270
+ Each client runs `serve_client` on its own machine. The orchestrating server uses `connect_client` and receives a `NetworkClient` that has the same interface as a local `Client`.
271
+
272
+ ### On each client machine
273
+
274
+ ```python
275
+ import polars as pl
276
+ from fedci import Client, serve_client
277
+
278
+ df = pl.read_parquet("local_data.parquet")
279
+ client = Client("site1", df)
280
+
281
+ serve_client(client, port=5555) # blocks; run as a separate process or service
282
+ ```
283
+
284
+ ### On the server machine
285
+
286
+ ```python
287
+ from fedci import Server, connect_client
288
+
289
+ nc1 = connect_client("192.168.1.10", 5555)
290
+ nc2 = connect_client("192.168.1.11", 5555)
291
+
292
+ server = Server([nc1, nc2])
293
+ results = server.run(max_cond_size=2, workers=4)
294
+ ```
295
+
296
+ ### How it works
297
+
298
+ Communication uses **ZeroMQ** (ROUTER/REQ pattern) with pickle serialisation, which handles numpy arrays, dataclasses, and sets without extra schema definitions. Each calling thread gets its own ZMQ socket, so multiple concurrent tests can call the same remote client without conflicts. The remote `serve_client` loop processes requests sequentially.
299
+
300
+ ### Parallelism with network clients
301
+
302
+ `workers` in `server.run()` and `server.test_repeatedly()` parallelises across threads and works transparently with both local and network clients.
303
+
304
+ Each parallel run in `test_repeatedly` registers its subsample on every client under a unique run-ID, then passes that ID with every compute call. The remote `serve_client` loop processes requests sequentially but each request carries its own run-ID, so concurrent runs never interfere — they simply reference different subsample entries in the client's registry. The registry is cleaned up automatically when each run completes.
305
+
306
+ ---
307
+
308
+ ## Variable types
309
+
310
+ `Client` infers variable types from the Polars schema automatically:
311
+
312
+ | Polars dtype | Treated as | Model |
313
+ |---|---|---|
314
+ | `Float32` / `Float64` | Continuous | Gaussian GLM / GAM |
315
+ | `Boolean` | Binary | Logistic regression |
316
+ | `Utf8` / `Categorical` | Categorical | Multinomial logistic |
317
+ | `Int*` | Ordinal | Proportional-odds |
318
+
319
+ When sites have different category levels for the same variable, the server takes the union across all clients automatically.
@@ -0,0 +1,298 @@
1
+ # fedci
2
+
3
+ Federated conditional independence (CI) testing via likelihood ratio tests. Data never leaves each client — only aggregated sufficient statistics are exchanged with the central server.
4
+
5
+ ---
6
+
7
+ ## Installation
8
+
9
+ ```bash
10
+ uv add python-fedci # or: pip install python-fedci
11
+ ```
12
+
13
+ Requires Python 3.12+. ZMQ networking uses `pyzmq`, which is installed automatically as a dependency.
14
+
15
+ ---
16
+
17
+ ## Core concepts
18
+
19
+ | Class | Role |
20
+ |---|---|
21
+ | `Client` | Holds a local dataset; answers statistical queries from the server |
22
+ | `Server` | Orchestrates tests across all clients; never sees raw data |
23
+ | `TestResult` | Result of one CI test (p-value, Bayes factors) |
24
+ | `RepeatedTestResult` | Aggregated statistics over many bootstrap runs |
25
+
26
+ ---
27
+
28
+ ## Basic workflow
29
+
30
+ ```python
31
+ import polars as pl
32
+ from fedci import Client, Server
33
+
34
+ # Each site wraps its own data
35
+ df1 = pl.read_parquet("site1.parquet")
36
+ df2 = pl.read_parquet("site2.parquet")
37
+
38
+ client1 = Client("site1", df1)
39
+ client2 = Client("site2", df2)
40
+
41
+ server = Server([client1, client2])
42
+ ```
43
+
44
+ ### Test a single conditional independence
45
+
46
+ ```python
47
+ # Is X independent of Y given {Z, W}?
48
+ result = server.test("X", "Y", {"Z", "W"})
49
+
50
+ result.p_value # float
51
+ result.log10_bic_bf # log10 Bayes factor (BIC approximation)
52
+ result.log10_bff # log10 Bayes factor (BFF method)
53
+ result.pH0_bic # P(H0) = 1 / (1 + BF) via BIC
54
+ result.pH0_bff # P(H0) = 1 / (1 + BF) via BFF
55
+ result.n_samples # total observations used
56
+ ```
57
+
58
+ ### Run all pairwise tests
59
+
60
+ ```python
61
+ # All pairs up to conditioning set size 2, 4 parallel workers
62
+ results = server.run(max_cond_size=2, workers=4, progress_bar=True)
63
+
64
+ # results is Dict[(x, y, frozenset(s)), TestResult]
65
+ for (x, y, s), r in results.items():
66
+ print(f"{x} ⊥ {y} | {s} p={r.p_value:.4f} log10BF={r.log10_bic_bf:.2f}")
67
+ ```
68
+
69
+ `workers` parallelises at the test level — each CI test runs in its own thread. NumPy releases the GIL during computation, so multiple cores are used effectively.
70
+
71
+ ### Repeated tests with subsampling
72
+
73
+ Bootstrap stability analysis: run the same test many times on random subsamples to assess reliability.
74
+
75
+ ```python
76
+ rtr = server.test_repeatedly(
77
+ "X", "Y", {"Z"},
78
+ num_runs=100,
79
+ sample_fraction=0.80,
80
+ workers=4, # parallel runs (local and network clients; see Networking)
81
+ progress_bar=True,
82
+ )
83
+
84
+ rtr.n_runs # 100
85
+ rtr.mean_p # mean p-value across runs
86
+ rtr.mean_log10_bic_bf # mean log10 BF
87
+ rtr.std_log10_bic_bf # spread — lower means more stable
88
+ rtr.rate_dependence_bic # fraction of runs where BF > 1 (evidence for dependence)
89
+ rtr.mean_pH0_bic # mean P(H0) across runs
90
+ ```
91
+
92
+ Run the full test suite repeatedly:
93
+
94
+ ```python
95
+ repeated = server.run_repeatedly(
96
+ num_runs=50,
97
+ sample_fraction=0.80,
98
+ max_cond_size=2,
99
+ workers=4, # parallel tests within each run
100
+ progress_bar=True,
101
+ )
102
+ # Dict[(x, y, frozenset(s)), RepeatedTestResult]
103
+ ```
104
+
105
+ ---
106
+
107
+ ## Interpreting results
108
+
109
+ Two Bayes factor estimates are available alongside the p-value.
110
+
111
+ ### BIC Bayes factor
112
+
113
+ ```
114
+ log10(BF) ≈ ½ (2·ΔLL − Δk·log n) / log(10)
115
+ ```
116
+
117
+ Closed-form, conservative, low variance across subsamples.
118
+
119
+ ### BFF — Bayes Factor Function
120
+
121
+ Maximises a chi² Bayes factor over a prior width grid.
122
+
123
+ ### Decision guide
124
+
125
+ | log10(BF) | Interpretation |
126
+ |---|---|
127
+ | > 1.0 | Strong evidence for dependence |
128
+ | 0 to 1.0 | Weak / inconclusive |
129
+ | < 0 | Evidence for independence |
130
+
131
+ `pH0 = 1 / (1 + BF)` converts a Bayes factor (in favour of dependence) into a posterior probability of the null, assuming equal prior odds for H0 and H1.
132
+
133
+ For stability assessment across repeated runs, prefer `rate_dependence_bff` (fraction of runs where BF > 1) and `std_log10_bff_bf` (spread on log scale) over a single p-value threshold or the corresponding `_bic` statistics.
134
+
135
+ ---
136
+
137
+ ## Model configuration
138
+
139
+ ### Default — GLM
140
+
141
+ Logistic regression for binary/categorical responses; linear regression for continuous. No extra configuration required.
142
+
143
+ ```python
144
+ server = Server([client1, client2])
145
+ ```
146
+
147
+ ### GAM — nonlinear continuous relationships
148
+
149
+ Cubic B-spline basis expansion for each continuous predictor. Custom knot placement is supported, but it is preferable to normalise the data to **[0, 1]** before passing it to clients, so that a single fixed knot grid is valid across all sites.
150
+
151
+ ```python
152
+ import numpy as np
153
+ from fedci import Server, Client, GAMConfiguration, ModelType
154
+
155
+ # Knot arithmetic:
156
+ # num_knots=8, num_degrees=3 → n_basis = num_knots + num_degrees - 2 = 9
157
+ # knot vector length = num_knots + 2*num_degrees - 1 = 13 (5 interior knots)
158
+ N_KNOTS = 8
159
+ DEGREE = 3
160
+ interior = list(np.linspace(0.0, 1.0, N_KNOTS - DEGREE + 2)[1:-1])
161
+ knot_vec = [0.0] * (DEGREE + 1) + interior + [1.0] * (DEGREE + 1)
162
+
163
+ # Normalise to [0, 1] on the combined dataset before splitting into clients
164
+ mins = df_full.min()
165
+ maxs = df_full.max()
166
+ df_norm = df_full.with_columns([
167
+ ((pl.col(c) - mins[c][0]) / (maxs[c][0] - mins[c][0])).alias(c)
168
+ for c in df_full.columns
169
+ ])
170
+
171
+ gam_config = GAMConfiguration(
172
+ type=ModelType.GAM,
173
+ num_knots=N_KNOTS,
174
+ num_degrees=DEGREE,
175
+ knots={v: knot_vec for v in df_norm.columns}, # include all variables
176
+ )
177
+
178
+ server = Server([client1, client2], model_configuration=gam_config)
179
+ ```
180
+
181
+ Including a variable in `knots` is safe even when it appears as the response in some tests — the spline basis is only applied to predictors, never to the response.
182
+
183
+ ### Site heterogeneity
184
+
185
+ When datasets across sites differ systematically (e.g. batch effects or recruitment differences), enable random effects:
186
+
187
+ ```python
188
+ from fedci import HeterogenietyType
189
+
190
+ # Random intercepts only (recommended — far fewer parameters)
191
+ server = Server(
192
+ [client1, client2],
193
+ heterogeniety=HeterogenietyType.GLOBAL,
194
+ random_intercept_only=True, # one shift per site per model
195
+ local_ridge_coefficient=1.0, # starting prior strength; adapted via EM
196
+ )
197
+
198
+ # Full random effects on all coefficients (intercept + slopes)
199
+ server = Server(
200
+ [client1, client2],
201
+ heterogeniety=HeterogenietyType.GLOBAL,
202
+ random_intercept_only=False, # site-specific shift on every coefficient
203
+ local_ridge_coefficient=1.0,
204
+ )
205
+ ```
206
+
207
+ `random_intercept_only=False` is heavily overparameterised for most datasets — a warning is raised when used with GAM, where the spline basis already has many coefficients. Prefer `True` unless you have a strong reason for full random slopes.
208
+
209
+ | `HeterogenietyType` | Behaviour |
210
+ |---|---|
211
+ | `NONE` (default) | Pooled model, no site effects |
212
+ | `GLOBAL` | Random effects with shared variance across sites, estimated by EM; calculates effective DoF |
213
+ | `LOCAL` | Independent random effects per site, not shared; uses regular DoF |
214
+
215
+ ---
216
+
217
+ ## Additive masking
218
+
219
+ Additive masking protects intermediate aggregates: each client adds a per-pair random mask to its output; masks cancel exactly in the sum, so the server only ever sees the correct aggregate — never any individual client's contribution.
220
+
221
+ ### Local masking
222
+
223
+ For multiple clients running in the same process:
224
+
225
+ ```python
226
+ server = Server([client1, client2], additive_masking=True)
227
+ results = server.run(max_cond_size=1)
228
+ # Numerically identical to unmasked results
229
+ ```
230
+
231
+ ### Network masking
232
+
233
+ When clients run on separate machines, `additive_masking=True` triggers a peer-to-peer seed exchange: the server passes each client's address to all others, peers connect directly to agree on shared random seeds, and thereafter each masks its own output independently. The server never participates in seed exchange.
234
+
235
+ ```python
236
+ from fedci import connect_client, Server
237
+
238
+ nc1 = connect_client("192.168.1.10", 5555)
239
+ nc2 = connect_client("192.168.1.11", 5555)
240
+
241
+ server = Server([nc1, nc2], additive_masking=True)
242
+ results = server.run(max_cond_size=1)
243
+ ```
244
+
245
+ ---
246
+
247
+ ## Networking
248
+
249
+ Each client runs `serve_client` on its own machine. The orchestrating server uses `connect_client` and receives a `NetworkClient` that has the same interface as a local `Client`.
250
+
251
+ ### On each client machine
252
+
253
+ ```python
254
+ import polars as pl
255
+ from fedci import Client, serve_client
256
+
257
+ df = pl.read_parquet("local_data.parquet")
258
+ client = Client("site1", df)
259
+
260
+ serve_client(client, port=5555) # blocks; run as a separate process or service
261
+ ```
262
+
263
+ ### On the server machine
264
+
265
+ ```python
266
+ from fedci import Server, connect_client
267
+
268
+ nc1 = connect_client("192.168.1.10", 5555)
269
+ nc2 = connect_client("192.168.1.11", 5555)
270
+
271
+ server = Server([nc1, nc2])
272
+ results = server.run(max_cond_size=2, workers=4)
273
+ ```
274
+
275
+ ### How it works
276
+
277
+ Communication uses **ZeroMQ** (ROUTER/REQ pattern) with pickle serialisation, which handles numpy arrays, dataclasses, and sets without extra schema definitions. Each calling thread gets its own ZMQ socket, so multiple concurrent tests can call the same remote client without conflicts. The remote `serve_client` loop processes requests sequentially.
278
+
279
+ ### Parallelism with network clients
280
+
281
+ `workers` in `server.run()` and `server.test_repeatedly()` parallelises across threads and works transparently with both local and network clients.
282
+
283
+ Each parallel run in `test_repeatedly` registers its subsample on every client under a unique run-ID, then passes that ID with every compute call. The remote `serve_client` loop processes requests sequentially but each request carries its own run-ID, so concurrent runs never interfere — they simply reference different subsample entries in the client's registry. The registry is cleaned up automatically when each run completes.
284
+
285
+ ---
286
+
287
+ ## Variable types
288
+
289
+ `Client` infers variable types from the Polars schema automatically:
290
+
291
+ | Polars dtype | Treated as | Model |
292
+ |---|---|---|
293
+ | `Float32` / `Float64` | Continuous | Gaussian GLM / GAM |
294
+ | `Boolean` | Binary | Logistic regression |
295
+ | `Utf8` / `Categorical` | Categorical | Multinomial logistic |
296
+ | `Int*` | Ordinal | Proportional-odds |
297
+
298
+ When sites have different category levels for the same variable, the server takes the union across all clients automatically.
@@ -0,0 +1,37 @@
1
+ from fedci.client import Client
2
+ from fedci.server import Server
3
+ from fedci.network import NetworkClient, connect_client, serve_client
4
+ from fedci.results import (
5
+ LRTResult,
6
+ SymmetricLRTResult,
7
+ TestResult,
8
+ RepeatedTestResult,
9
+ )
10
+ from fedci.utils import (
11
+ VariableType,
12
+ ModelType,
13
+ HeterogenietyType,
14
+ ModelConfiguration,
15
+ GAMConfiguration,
16
+ )
17
+
18
+ __all__ = [
19
+ # Core
20
+ "Client",
21
+ "Server",
22
+ # Networking
23
+ "NetworkClient",
24
+ "connect_client",
25
+ "serve_client",
26
+ # Results
27
+ "LRTResult",
28
+ "SymmetricLRTResult",
29
+ "TestResult",
30
+ "RepeatedTestResult",
31
+ # Configuration
32
+ "VariableType",
33
+ "ModelType",
34
+ "HeterogenietyType",
35
+ "ModelConfiguration",
36
+ "GAMConfiguration",
37
+ ]