PyTDLM 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
TDLM/tdlm.py ADDED
@@ -0,0 +1,821 @@
1
+ """
2
+ TDLM: Trip Distribution Law Models Library
3
+
4
+ Author: Maxime Lenormand (2015)
5
+ Converted to Python with enhanced parallel processing support
6
+
7
+ This program is free software: you can redistribute it and/or modify
8
+ it under the terms of the GNU General Public License version 3.
9
+
10
+ This program is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ GNU General Public License for more details.
14
+ """
15
+
16
+ import numpy as np
17
+ import pandas as pd
18
+ from tqdm import tqdm
19
+ import multiprocessing as mp
20
+ from typing import Union, Optional, List, Dict
21
+ import warnings
22
+
23
+
24
class TDLMError(Exception):
    """Error raised by the TDLM library (e.g. for invalid user inputs)."""
27
+
28
+
29
def compute_opportunity(
    mass_destination: np.ndarray,
    distance: np.ndarray,
    processes: Optional[int] = None
) -> np.ndarray:
    """
    Compute the opportunity matrix Sij: number of opportunities located in a
    circle of radius dij centered in i (excluding the source and the destination).

    Parameters
    ----------
    mass_destination : np.ndarray
        Number of inhabitants at destination (mj). Any array-like is accepted.
    distance : np.ndarray
        Distance matrix (n x n). Any array-like is accepted.
    processes : int, optional
        Number of processes for parallel computation. Default: CPU count - 2
        (at least 1). With exactly one process the rows are computed in the
        calling process and no worker pool is created.

    Returns
    -------
    np.ndarray
        Opportunity matrix Sij of shape (n, n)

    Raises
    ------
    TDLMError
        If ``distance`` is not an n x n matrix, n being ``len(mass_destination)``.
    """
    # Coerce up front so that plain lists are validated (and later indexed)
    # the same way as arrays instead of failing on a missing ``.shape``.
    mass_destination = np.asarray(mass_destination)
    distance = np.asarray(distance)
    n = len(mass_destination)

    # Validate inputs
    if distance.shape != (n, n):
        raise TDLMError(f"distance matrix must be {n}x{n}")

    print(f"Computing opportunity matrix for {n} regions...")

    # Setup multiprocessing
    num_processes = processes if processes is not None else max(1, mp.cpu_count() - 2)
    print(f'Using {num_processes} parallel processes')

    # One task per origin row; every task carries references to the inputs.
    args_list = [(i, distance, mass_destination, n) for i in range(n)]

    S = np.zeros((n, n))
    if num_processes == 1:
        # A single worker gains nothing from a pool and would still pay for
        # pickling the full distance matrix once per task.
        row_results = map(_process_opportunity_row, args_list)
        for i, row_S in tqdm(row_results, total=n, desc="Computing opportunities"):
            S[i, :] = row_S
    else:
        with mp.Pool(processes=num_processes) as pool:
            # Rows are written as they arrive; imap must be consumed while
            # the pool is still alive.
            row_results = pool.imap(_process_opportunity_row, args_list)
            for i, row_S in tqdm(row_results, total=n, desc="Computing opportunities"):
                S[i, :] = row_S

    print("Done\n")
    return S
79
+
80
+
81
def _process_opportunity_row(args):
    """
    Compute one row of the opportunity matrix S.

    Parameters
    ----------
    args : tuple
        ``(i, dij, mj, n)``: origin index, distance matrix (n x n), destination
        masses (length n) and number of regions. ``dij`` and ``mj`` may be any
        array-like.

    Returns
    -------
    tuple
        ``(i, row_S)`` where ``row_S[j]`` is the summed mass of every region l
        with ``dij[i, l] <= dij[i, j]``, excluding ``l == i`` and ``l == j``.
        ``row_S[i]`` is always 0.
    """
    i, dij, mj, n = args

    # Distances from the origin i to every region, and the masses as a flat
    # vector (reshape also checks that exactly n masses were supplied).
    dist_from_i = np.asarray(dij)[i, :]
    masses = np.asarray(mj).reshape(n)

    # within[j, l] is True when region l is at most as far from i as j is.
    within = dist_from_i[np.newaxis, :] <= dist_from_i[:, np.newaxis]

    # Exclude the source (l == i) and the destination itself (l == j).
    within[:, i] = False
    np.fill_diagonal(within, False)

    # Mask-vector product: sums the selected masses for every j without
    # materialising an n x n matrix of masses.
    row_S = within @ masses

    # No opportunities are defined for the trip from i to itself.
    row_S[i] = 0

    return i, row_S
123
+
124
+
125
def run_law_model(
    law: str,
    mass_origin: np.ndarray,
    mass_destination: np.ndarray,
    distance: np.ndarray,
    opportunity: Optional[np.ndarray] = None,
    exponent: Union[float, np.ndarray] = 1.0,
    return_proba: bool = False,
    model: str = "UM",
    out_trips: Optional[np.ndarray] = None,
    in_trips: Optional[np.ndarray] = None,
    repli: int = 1,
    processes: Optional[int] = None,
    random_seed: Optional[int] = None
) -> Union[np.ndarray, Dict[str, np.ndarray]]:
    """
    Run trip distribution law model simulations.

    Parameters
    ----------
    law : str
        Trip distribution law. One of: "GravExp", "NGravExp", "GravPow",
        "NGravPow", "Schneider", "Rad", "RadExt", "Rand"
    mass_origin : np.ndarray
        Number of inhabitants at origin (mi)
    mass_destination : np.ndarray
        Number of inhabitants at destination (mj)
    distance : np.ndarray
        Distance matrix (n x n)
    opportunity : np.ndarray, optional
        Matrix of opportunities (n x n). Required for "Rad", "RadExt", "Schneider".
        If not provided and required, will be computed automatically.
    exponent : float or np.ndarray
        Exponent parameter(s) for the distribution law
    return_proba : bool, default False
        Whether to return probability matrices
    model : str, default "UM"
        Distribution model. One of: "UM", "PCM", "ACM", "DCM"
    out_trips : np.ndarray, optional
        Number of out-commuters (Oi). Required for constrained models
    in_trips : np.ndarray, optional
        Number of in-commuters (Dj). Required for ACM and DCM models
    repli : int, default 1
        Number of replications
    processes : int, optional
        Number of processes for parallel computation. Default: CPU count - 2
    random_seed : int, optional
        Random seed for reproducibility. The seed is applied with
        ``np.random.seed`` in the calling process only.
        NOTE(review): worker processes of the parallel path are not seeded
        here - confirm whether multi-exponent runs need to be reproducible.

    Returns
    -------
    Union[np.ndarray, Dict[str, np.ndarray]]
        If single exponent: the result for that exponent (simulations array of
        shape (repli, n, n), or the result dict when ``return_proba`` is True).
        If multiple exponents: Dict with the exponent values as keys.
    """
    # Accept any array-like: downstream helpers index these with numpy-only
    # syntax (e.g. ``mi[:, np.newaxis]``) and would fail on plain lists.
    mass_origin = np.asarray(mass_origin)
    mass_destination = np.asarray(mass_destination)
    distance = np.asarray(distance)
    if opportunity is not None:
        opportunity = np.asarray(opportunity)
    if out_trips is not None:
        out_trips = np.asarray(out_trips)
    if in_trips is not None:
        in_trips = np.asarray(in_trips)

    # Check if opportunity matrix is needed and compute if not provided
    laws_requiring_opportunity = ["Rad", "RadExt", "Schneider"]
    if law in laws_requiring_opportunity and opportunity is None:
        print(f"Law '{law}' requires opportunity matrix. Computing automatically...")
        opportunity = compute_opportunity(mass_destination, distance, processes)

    # Input validation
    _validate_inputs(law, model, mass_origin, mass_destination, distance,
                     opportunity, out_trips, in_trips)

    # Set random seed if provided
    if random_seed is not None:
        np.random.seed(random_seed)

    # Handle single vs multiple exponents
    exponents = np.atleast_1d(exponent)
    single_exponent = len(exponents) == 1

    # Setup data tuple shared by every task
    n = len(mass_origin)
    data = (n, mass_origin, mass_destination, out_trips, in_trips, distance, opportunity)
    params = [(data, law, model, beta, repli, return_proba) for beta in exponents]

    # Setup multiprocessing
    num_processes = processes if processes is not None else max(1, mp.cpu_count() - 2)

    if single_exponent:
        # A single exponent is always processed in the calling process.
        print(f'Simulating matrix for {law} β = {exponents[0]:.2g} with {model}')
        results = [_process_exponent(params[0])]
    elif num_processes > 1:
        # Parallel processing for multiple exponents
        print(f'Running simulations for {law} with {model} model ({repli} replications)')
        print(f'Using {num_processes} parallel processes')
        with mp.Pool(processes=num_processes) as pool:
            results = list(tqdm(pool.imap(_process_exponent, params),
                                total=len(exponents), desc='Computing exponents'))
    else:
        # Sequential processing for multiple exponents
        print(f'Running simulations for {law} with {model} model ({repli} replications)')
        results = [_process_exponent(task)
                   for task in tqdm(params, desc='Computing exponents')]

    # Keep the full result dict only when probabilities were requested.
    output = {
        beta: (result if return_proba else result['simulations'])
        for beta, result in zip(exponents, results)
    }
    print('Done\n')

    # Return format based on input
    if single_exponent:
        return next(iter(output.values()))
    return output
254
+
255
+
256
def gof(
    sim: Union[np.ndarray, Dict[str, np.ndarray]],
    obs: np.ndarray,
    distance: np.ndarray,
    measures: Union[str, List[str]] = "all",
    processes: Optional[int] = None
) -> Union[pd.DataFrame, Dict[str, pd.DataFrame]]:
    """
    Compute goodness-of-fit measures between simulated and observed trips.

    Parameters
    ----------
    sim : np.ndarray or Dict[str, np.ndarray]
        Simulated trip matrices. A dict maps exponent values to matrices.
    obs : np.ndarray
        Observed trip matrix (n x n)
    distance : np.ndarray
        Distance matrix (n x n)
    measures : str or List[str], default "all"
        "all", a single measure name, or a list drawn from:
        ["CPC", "CPL", "CPCd", "KS_stat", "KS_pval", "KL_div", "RMSE"]
    processes : int, optional
        Number of processes for parallel computation. Default: CPU count - 2

    Returns
    -------
    Union[pd.DataFrame, Dict[str, pd.DataFrame]]
        A DataFrame when ``sim`` is an array; otherwise a dict with the same
        keys as ``sim`` and one DataFrame per key.

    Raises
    ------
    TDLMError
        If an unknown measure name is requested.
    """
    known_measures = ["CPC", "CPL", "CPCd", "KS_stat", "KS_pval", "KL_div", "RMSE"]

    if measures == "all":
        selected_measures = known_measures
    else:
        selected_measures = measures if isinstance(measures, list) else [measures]
        unknown = set(selected_measures) - set(known_measures)
        if unknown:
            raise TDLMError(f"Invalid measures: {unknown}. Available: {['all']+known_measures}")

    num_processes = processes if processes is not None else max(1, mp.cpu_count() - 2)

    # Plain array: one set of simulations, nothing to dispatch.
    if not isinstance(sim, dict):
        print('Calculating GOF measures')
        frame = _calculate_gof(sim, obs, distance, selected_measures)
        print('Done\n')
        return frame

    exponents = list(sim.keys())
    count = len(exponents)

    # Several exponents and several workers: one task per exponent.
    if count > 1 and num_processes > 1:
        print(f'Calculating GOF measures for {count} exponents')
        print(f'Using {num_processes} parallel processes')

        with mp.Pool(processes=num_processes) as pool:
            tasks = [(key, matrices, obs, distance, selected_measures)
                     for key, matrices in sim.items()]
            frames = list(tqdm(pool.imap(_process_gof_exponent, tasks),
                               total=count, desc='Computing GOF measures'))

        # imap preserves task order, which is the dict's key order.
        output = dict(zip(exponents, frames))
        print('Done\n')
        return output

    # Everything else runs in the calling process.
    results = {}
    if count == 1:
        only = exponents[0]
        print(f'Calculating GOF measures for exponent {only}')
        results[only] = _calculate_gof(sim[only], obs, distance, selected_measures)
    else:
        print(f'Calculating GOF measures for {count} exponents')
        for key in tqdm(exponents, desc='Computing GOF measures'):
            results[key] = _calculate_gof(sim[key], obs, distance, selected_measures)

    print('Done\n')
    return results
344
+
345
+
346
def _process_gof_exponent(params):
    """
    Pool worker: compute the GOF table for one exponent.

    ``params`` is ``(exponent, sim_matrices, obs, distance, selected_measures)``;
    the exponent is carried along by the caller but not used here.
    """
    _exponent, sim_matrices, obs, distance, selected_measures = params
    gof_table = _calculate_gof(sim_matrices, obs, distance, selected_measures)
    return gof_table
350
+
351
+
352
+
353
def _validate_inputs(law, model, mass_origin, mass_destination, distance,
                     opportunity, out_trips, in_trips):
    """
    Validate the arguments of a simulation run.

    Parameters
    ----------
    law, model : str
        Names of the trip distribution law and of the constrained model.
    mass_origin, mass_destination : array
        Masses at origin / destination; must have the same length n.
    distance : np.ndarray
        Distance matrix, must be n x n.
    opportunity : np.ndarray or None
        Opportunity matrix, required (n x n) for "Rad", "RadExt", "Schneider".
    out_trips, in_trips : array or None
        Out-/in-commuters, each of length n when provided.

    Raises
    ------
    TDLMError
        On the first failed check. Returns ``None`` when everything is valid.
    """
    valid_laws = ["GravExp", "NGravExp", "GravPow", "NGravPow", "Schneider", "Rad", "RadExt", "Rand"]
    valid_models = ["UM", "PCM", "ACM", "DCM"]

    if law not in valid_laws:
        raise TDLMError(f"Invalid law '{law}'. Must be one of: {valid_laws}")

    if model not in valid_models:
        raise TDLMError(f"Invalid model '{model}'. Must be one of: {valid_models}")

    # Check array dimensions
    n = len(mass_origin)
    if len(mass_destination) != n:
        raise TDLMError("mass_origin and mass_destination must have same length")

    if distance.shape != (n, n):
        raise TDLMError(f"distance matrix must be {n}x{n}")

    # Check opportunity matrix for relevant laws
    if law in ["Rad", "RadExt", "Schneider"]:
        if opportunity is None:
            raise TDLMError(f"opportunity matrix required for law '{law}'")
        if opportunity.shape != (n, n):
            raise TDLMError(f"opportunity matrix must be {n}x{n}")

    # Check trip constraints for models. The unconstrained model is included:
    # it draws sum(out_trips) commuters, so a missing out_trips would otherwise
    # only surface later as an unrelated TypeError inside the simulation.
    if model in ["UM", "PCM", "DCM"] and out_trips is None:
        raise TDLMError(f"out_trips required for model '{model}'")

    if model in ["ACM", "DCM"] and in_trips is None:
        raise TDLMError(f"in_trips required for model '{model}'")

    if out_trips is not None and len(out_trips) != n:
        raise TDLMError("out_trips must have same length as mass arrays")

    if in_trips is not None and len(in_trips) != n:
        raise TDLMError("in_trips must have same length as mass arrays")
393
+
394
+
395
def _process_exponent(params):
    """
    Simulate ``repli`` networks for one exponent value.

    ``params`` is ``(data, law, model, beta, repli, return_proba)`` where
    ``data`` is ``(n, mi, mj, Oi, Dj, dij, sij)``. Returns a dict holding the
    stacked simulations under 'simulations' and, when ``return_proba`` is
    true, the weight matrix scaled to sum to one under 'probabilities'.
    """
    data, law, model, beta, repli, return_proba = params
    n, mi, mj, Oi, Dj, dij, sij = data

    # Weight matrix of the law; shared by all replications.
    pij = _proba(law, dij, sij, mi, mj, beta)

    def draw_network():
        # One simulated OD matrix for the requested constrained model.
        if model == "UM":  # Unconstrained model
            return _UM(pij, Oi)
        if model == "PCM":  # Production constrained model
            return _PCM(pij, Oi)
        if model == "ACM":  # Attraction constrained model
            return _ACM(pij, Dj)
        if model == "DCM":  # Doubly constrained model
            return _DCM(pij, Oi, Dj, 50, 0.01)
        # Any other model name yields an empty network.
        return np.zeros((n, n))

    simulations = np.array([draw_network() for _ in range(repli)])

    outcome = {'simulations': simulations}
    if return_proba:
        # Normalize pij (left untouched when it sums to zero)
        total_weight = np.sum(pij)
        outcome['probabilities'] = pij / total_weight if total_weight > 0 else pij
    return outcome
434
+
435
+
436
def _calculate_gof(sim_matrices, obs, distance, measures):
    """
    Calculate goodness-of-fit measures for each simulated matrix.

    Parameters
    ----------
    sim_matrices : array-like
        Simulated trips, either (n, n) or (replications, n, n).
    obs : array-like
        Observed trip matrix (n x n).
    distance : array-like
        Distance matrix (n x n); trips are binned by ``floor(distance / 2)``
        for the CPCd measure.
    measures : list of str
        Subset of "CPC", "CPL", "CPCd", "KS_stat", "KS_pval", "KL_div", "RMSE".

    Returns
    -------
    pd.DataFrame
        One row per replication with a "Replication" column followed by the
        requested measures.
    """
    sim_matrices = np.asarray(sim_matrices)
    obs = np.asarray(obs)
    distance = np.asarray(distance)

    # Ensure sim_matrices is 3D (replications, n, n)
    if sim_matrices.ndim == 2:
        sim_matrices = sim_matrices[np.newaxis, ...]
    repli = sim_matrices.shape[0]

    # Observed data, prepared once for all replications.
    obs_flat = obs.flatten()
    dist_flat = distance.flatten()
    nb = np.sum(obs)
    # An empty observed matrix has no distribution: use zeros rather than
    # dividing by zero (which would emit a RuntimeWarning and produce NaN).
    pobs = obs_flat / nb if nb > 0 else np.zeros(obs_flat.shape)
    T_range = np.max(obs) - np.min(obs)
    obs_links = obs != 0

    # Distance bins for CPCd and the observed trips per bin
    indices = np.floor(distance / 2).astype(int).flatten()
    max_index = indices.max() + 1
    CDD_R = np.bincount(indices, weights=obs_flat, minlength=max_index)

    want_ks = "KS_stat" in measures or "KS_pval" in measures
    results = []

    for r in range(repli):
        S = sim_matrices[r]
        sim_links = S != 0
        result_dict = {"Replication": r}

        if "CPC" in measures:
            # CPC - Common Part of Commuters
            common = obs_links & sim_links
            cpc = np.minimum(obs[common], S[common]).sum() / nb if nb > 0 else 0
            result_dict["CPC"] = cpc

        if "CPL" in measures:
            # CPL - Common Part of Links
            nbNL = (~obs_links & sim_links).sum()  # Number of new links
            nbML = (obs_links & ~sim_links).sum()  # Number of missing links
            nbCL = (obs_links & sim_links).sum()   # Number of common links
            total_links = nbNL + 2 * nbCL + nbML
            result_dict["CPL"] = 2 * nbCL / total_links if total_links > 0 else 0

        if "CPCd" in measures:
            # CPCd - Common Part of Commuters by distance
            CDD_S = np.bincount(indices, weights=S.flatten(), minlength=max_index)
            gap = (np.abs(CDD_S - CDD_R) / nb).sum() if nb > 0 else 0
            result_dict["CPCd"] = 1 - 0.5 * gap

        if want_ks:
            # KS - Kolmogorov-Smirnov test on the trip-weighted distances
            ks_statistic, ks_pvalue = _ks_weighted(
                data1=dist_flat,
                wei1=obs_flat,
                wei2=S.flatten()
            )
            if "KS_stat" in measures:
                result_dict["KS_stat"] = ks_statistic
            if "KS_pval" in measures:
                result_dict["KS_pval"] = ks_pvalue

        if "KL_div" in measures:
            # KL - Kullback-Leibler divergence
            ppred = (S / S.sum()).flatten() if S.sum() > 0 else S.flatten()
            with warnings.catch_warnings():
                # log(0) and 0/0 are expected here; the resulting NaN/inf
                # terms are dropped from the sum below.
                warnings.simplefilter("ignore")
                kl_div_array = pobs * np.log(pobs / ppred)
            kl_div = np.nan_to_num(kl_div_array, nan=0., posinf=0., neginf=0.).sum()
            result_dict["KL_div"] = kl_div

        if "RMSE" in measures:
            # Root of the squared error normalised by the observed trips
            if T_range > 0 and nb > 0:
                nrmse = np.sqrt(np.sum((obs - S) ** 2) / nb)
            else:
                nrmse = 0
            result_dict["RMSE"] = nrmse

        results.append(result_dict)

    return pd.DataFrame(results)
515
+
516
+
517
+ # Utility functions carried over from the original scripts
518
def _proba(law, dij, sij, mi, mj, beta):
    """
    Generate the weight matrix pij according to the law.

    Parameters
    ----------
    law : str
        One of "GravExp", "NGravExp", "GravPow", "NGravPow", "Schneider",
        "Rad", "RadExt", "Rand". Any other name yields an all-zero matrix.
    dij : array-like
        Distance matrix (n x n).
    sij : array-like or None
        Opportunity matrix (n x n); only read by "Schneider", "Rad", "RadExt".
    mi, mj : array-like
        Masses at origin / destination (length n).
    beta : float
        Exponent of the law.

    Returns
    -------
    np.ndarray
        (n, n) float matrix with a zero diagonal. For every law except
        "GravExp", "GravPow" and "Rand", each row with a non-zero sum is
        rescaled so that it sums to ``mi[i]``.
    """
    # Work in floating point: an integer distance matrix raised to a negative
    # integer power is rejected by numpy, and plain lists cannot be indexed
    # with ``[:, np.newaxis]``.
    mi = np.asarray(mi, dtype=float)
    mj = np.asarray(mj, dtype=float)
    dij = np.asarray(dij, dtype=float)
    n = len(mi)
    mi_col = mi[:, np.newaxis]
    if law in ("Schneider", "Rad", "RadExt"):
        sij = np.asarray(sij, dtype=float)

    W = np.zeros((n, n))

    # Zero distances (at least the diagonal) and zero masses legitimately
    # produce inf/NaN intermediates that are overwritten below, so the
    # corresponding floating-point warnings are silenced.
    with np.errstate(divide="ignore", invalid="ignore"):
        if law == "GravExp":
            W = np.outer(mi, mj) * np.exp(-dij * beta)

        elif law == "NGravExp":
            W = mj * np.exp(-dij * beta)

        elif law == "GravPow":
            W = np.outer(mi, mj) * dij ** (-beta)

        elif law == "NGravPow":
            W = mj * dij ** (-beta)

        elif law == "Schneider":
            W = np.exp(-beta * sij) - np.exp(-beta * (sij + mj))
            W[np.isnan(W)] = 0

        elif law == "Rad":
            W = np.outer(mi, mj) / ((mi_col + sij) * (mi_col + mj + sij))
            W[np.isnan(W)] = 0

        elif law == "RadExt":
            outer_pow = (mi_col + mj + sij) ** beta
            inner_pow = (mi_col + sij) ** beta
            numerator = (outer_pow - inner_pow) * (mi ** beta + 1)[:, np.newaxis]
            denominator = (outer_pow + 1) * (inner_pow + 1)
            W = numerator / denominator
            W[np.isnan(W)] = 0

        elif law == "Rand":
            # Uniform weight over the n^2 - n off-diagonal cells; a single
            # region has no such cell.
            W = np.full((n, n), 1.0 / (n * n - n)) if n > 1 else np.zeros((n, n))

    # No trips from a region to itself.
    np.fill_diagonal(W, 0)

    # Row normalization if needed
    if law not in ["GravExp", "GravPow", "Rand"]:
        Wi = np.sum(W, axis=1)
        mask = Wi != 0
        W[mask] = mi[mask, np.newaxis] * W[mask] / Wi[mask, np.newaxis]

    return W
567
+
568
+
569
def _UM(pij, Oi):
    """
    Generate the network using the Unconstrained Model.

    ``sum(Oi)`` trips are spread over the matrix proportionally to ``pij``:
    the integer parts are assigned deterministically and the leftover trips
    are drawn at random with probabilities proportional to ``pij``.
    """
    size = pij.shape[0]
    total_trips = np.sum(Oi)
    weight_total = np.sum(pij)
    row_totals = np.sum(pij, axis=1)

    # Deterministic part: floor of the expected number of trips per cell.
    if weight_total > 0:
        S = np.floor(total_trips * pij / weight_total)
    else:
        S = np.zeros_like(pij)

    leftover = int(total_trips - np.sum(S))
    if leftover <= 0:
        return S

    # Random part: one (row, column) draw per leftover trip.
    draws = _Multinomial_ij(leftover, pij, row_totals)
    flat_cells = draws[:, 0] * size + draws[:, 1]
    S += np.bincount(flat_cells, minlength=size * size).reshape(size, size)
    return S
587
+
588
+
589
def _PCM(pij, Oi):
    """
    Generate the network using the Production Constrained Model.

    Each row i receives ``Oi[i]`` trips distributed proportionally to
    ``pij[i]``: integer parts first, then random draws for the remainder.
    Rows whose weights sum to zero stay empty.
    """
    n = len(Oi)
    row_totals = np.sum(pij, axis=1)

    # 1 / row sum, with 0 for rows that carry no weight at all.
    inverse_totals = np.zeros(n)
    has_weight = row_totals > 0
    inverse_totals[has_weight] = 1.0 / row_totals[has_weight]

    # Deterministic allocation of the integer parts.
    shares = pij * inverse_totals[:, np.newaxis]
    S = np.floor(Oi[:, np.newaxis] * shares)

    # Random allocation of whatever is still missing in each row.
    allocated = np.sum(S, axis=1).astype(int)
    for i, (target, done) in enumerate(zip(Oi, allocated)):
        remaining = target - done
        if remaining > 0 and row_totals[i] > 0:
            picks = _Multinomial_i(remaining, pij[i], row_totals[i])
            S[i] += np.bincount(picks, minlength=n)

    return S
614
+
615
+
616
def _ACM(pij, Dj):
    """
    Generate the network using the Attraction Constrained Model.

    Each column j receives ``Dj[j]`` trips distributed proportionally to
    ``pij[:, j]``: integer parts first, then random draws for the remainder.
    Columns whose weights sum to zero stay empty.
    """
    n = len(Dj)

    # Work on the transpose so that destinations are rows.
    by_destination = pij.T
    column_totals = np.sum(by_destination, axis=1)

    # 1 / column sum, with 0 for destinations that carry no weight.
    inverse_totals = np.zeros(n)
    has_weight = column_totals > 0
    inverse_totals[has_weight] = 1.0 / column_totals[has_weight]

    # Deterministic allocation of the integer parts, back in (i, j) layout.
    shares = by_destination * inverse_totals[:, np.newaxis]
    S = np.floor(Dj[:, np.newaxis] * shares).T

    # Random allocation of whatever is still missing in each column.
    allocated = np.sum(S, axis=0).astype(int)
    for j, (target, done) in enumerate(zip(Dj, allocated)):
        remaining = target - done
        if remaining > 0 and column_totals[j] > 0:
            picks = _Multinomial_i(remaining, by_destination[j], column_totals[j])
            S[:, j] += np.bincount(picks, minlength=n)

    return S
643
+
644
+
645
def _DCM(pij, Oi, Dj, max_iter, closure):
    """
    Generate the network using the Doubly Constrained Model.

    The weights are balanced against both margins by iterative proportional
    fitting, then the network is drawn from the fitted weights with ``_UM``.
    Margins and weights are floored at 0.01 before fitting. Fitting stops when
    both maximum relative margin errors are at most ``closure`` or once the
    iteration counter exceeds ``max_iter``.
    """
    # Target margins, floored at 0.01 and held as float64.
    target_out = np.maximum(Oi, 0.01).astype(float)
    target_in = np.maximum(Dj, 0.01).astype(float)

    weights = np.maximum(pij, 0.01)

    iteration = 0
    err_out = 1.0
    err_in = 1.0

    # IPF procedure
    while (err_out > closure or err_in > closure) and iteration <= max_iter:
        # Scale rows to the out-margins, then columns to the in-margins.
        weights = weights * (target_out / weights.sum(axis=1))[:, np.newaxis]
        weights = weights * (target_in / weights.sum(axis=0))[np.newaxis, :]

        # Largest relative deviation from each margin after this sweep.
        err_out = np.max(np.abs(1 - (weights.sum(axis=1) / target_out)))
        err_in = np.max(np.abs(1 - (weights.sum(axis=0) / target_in)))

        iteration += 1

    # Generate final matrix using UM
    return _UM(weights, Oi)
687
+
688
+
689
def _Multinomial_i(n, weights, sum_val):
    """
    Sample ``n`` indices with probabilities proportional to ``weights``.

    Parameters
    ----------
    n : int
        Number of draws (converted with ``int``).
    weights : array-like
        Non-negative weights.
    sum_val : float
        Scale applied to the uniform draws; callers pass ``sum(weights)``.

    Returns
    -------
    np.ndarray
        Integer array of ``n`` indices into ``weights`` (empty when ``n <= 0``,
        ``sum_val <= 0`` or ``weights`` is empty). Every returned index is a
        valid position and never lies beyond the last positive weight.
    """
    n = int(n)
    if n <= 0 or sum_val <= 0:
        return np.array([], dtype=int)

    random_vals = np.random.random(n) * sum_val
    cumulative_weights = np.cumsum(weights)
    if cumulative_weights.size == 0:
        return np.array([], dtype=int)

    # First position at which the running sum reaches its final value, i.e.
    # the last entry with a positive weight.
    last_valid = np.searchsorted(cumulative_weights, cumulative_weights[-1], side='left')

    # side='right' maps a draw u to the first index whose cumulative weight is
    # strictly greater than u, so zero-weight entries are never selected (not
    # even for u == 0). The clamp covers draws at or above the cumulative total,
    # which can happen when sum_val and the running sum differ by rounding.
    random_index = np.searchsorted(cumulative_weights, random_vals, side='right')
    return np.minimum(random_index, last_valid)
700
+
701
+
702
def _Multinomial_ij(n, weights, sum_rows):
    """
    Sample ``n`` (row, column) cells with probabilities proportional to
    the matrix ``weights``.

    Parameters
    ----------
    n : int
        Number of draws (converted with ``int``).
    weights : array-like
        Non-negative weight matrix.
    sum_rows : array-like
        Row sums of ``weights``.

    Returns
    -------
    np.ndarray
        Integer array of shape (n, 2) holding row and column indices; shape
        (0, 2) when ``n <= 0`` or the weights sum to zero. Indices are always
        valid positions and never lie beyond the last positive weight.
    """
    # Integer dtype even when empty: callers use the result as indices.
    no_draws = np.empty((0, 2), dtype=int)

    n = int(n)
    if n <= 0:
        return no_draws

    weights = np.asarray(weights)
    sum_rows = np.asarray(sum_rows)
    sumt = np.sum(sum_rows)
    if sumt <= 0:
        return no_draws

    # One uniform draw per trip selects the row through the cumulative row
    # sums; what is left of the draw inside that row selects the column.
    random_vals = np.random.random(n) * sumt
    cumsum_rows = np.cumsum(sum_rows)

    # side='right' never lands on a zero-weight entry; the clamp keeps draws at
    # or above the cumulative total (rounding) on the last positive-weight row.
    last_row = np.searchsorted(cumsum_rows, cumsum_rows[-1], side='left')
    row_indices = np.minimum(
        np.searchsorted(cumsum_rows, random_vals, side='right'), last_row
    )

    # Cumulative weight of all rows before the selected one (0 for row 0; the
    # wrapped index -1 is discarded by the condition).
    prev_cumsum = np.where(row_indices > 0, cumsum_rows[row_indices - 1], 0.0)
    remaining_vals = random_vals - prev_cumsum

    # Select columns, handling all draws that fell in the same row together so
    # each row's cumulative sum is computed only once.
    col_indices = np.zeros(n, dtype=int)
    for row in np.unique(row_indices):
        in_row = row_indices == row
        cumsum_cols = np.cumsum(weights[row])
        last_col = np.searchsorted(cumsum_cols, cumsum_cols[-1], side='left')
        col_indices[in_row] = np.minimum(
            np.searchsorted(cumsum_cols, remaining_vals[in_row], side='right'),
            last_col,
        )

    return np.column_stack((row_indices, col_indices))
732
+
733
+
734
def _pkstwo(x, tol=0):
    """
    CDF of the limiting distribution of the two-sided Kolmogorov-Smirnov
    statistic (Kolmogorov distribution).

    Parameters
    ----------
    x : float or 1-D array-like
        Scaled statistic(s), i.e. ``sqrt(n_effective) * d``.
    tol : float, default 0
        Series truncation tolerance. With 0 the series are summed until
        further terms no longer change the result in floating point.

    Returns
    -------
    float or np.ndarray
        Probabilities in [0, 1]; NaN where ``x`` is NaN and 0 where ``x <= 0``.
        A scalar is returned when there is a single input value.
    """
    if np.isscalar(x):
        x = np.array([x])
    else:
        x = np.asarray(x)

    p = np.zeros_like(x, dtype=float)
    p[np.isnan(x)] = np.nan

    for i in np.where(~np.isnan(x) & (x > 0))[0]:
        p[i] = _kolmogorov_cdf(float(x[i]), tol)

    if len(p) == 1:
        return p[0]
    return p


def _kolmogorov_cdf(x, tol):
    """Evaluate the Kolmogorov distribution CDF at one positive value ``x``."""
    if x < 1e-10:
        return 0.0

    if x < 1.0:
        # Theta-function form, which converges quickly for small x:
        #   K(x) = sqrt(2*pi) / x * sum_{k odd} exp(-k^2 * pi^2 / (8 * x^2))
        z = -(np.pi ** 2) / (8.0 * x * x)
        log_x = np.log(x)
        total = 0.0
        k = 1
        while True:
            term = np.exp(k * k * z - log_x)
            updated = total + term
            converged = term <= tol or updated == total
            total = updated
            if converged:
                break
            k += 2
        return min(1.0, np.sqrt(2.0 * np.pi) * total)

    # Alternating series, which converges quickly for large x:
    #   K(x) = 1 - 2 * sum_{k >= 1} (-1)^(k-1) * exp(-2 * k^2 * x^2)
    z = -2.0 * x * x
    sign = -1.0
    k = 1
    previous = 0.0
    current = 1.0
    while abs(previous - current) > tol:
        previous = current
        current += 2.0 * sign * np.exp(z * k * k)
        sign = -sign
        k += 1
    return current
763
+
764
+
765
def _ks_weighted(data1, data2=None, wei1=None, wei2=None, alternative='two-sided'):
    """
    Kolmogorov-Smirnov statistic between two weighted samples.

    ``data1``/``wei1`` and ``data2``/``wei2`` are the values and weights of the
    two samples; ``data2`` defaults to ``data1``. Returns ``(d, prob)`` where
    ``d`` is the statistic for the requested ``alternative`` ('two-sided',
    'less' or 'greater') and ``prob`` is the p-value, which is NaN unless the
    alternative is 'two-sided' and both effective sample sizes are positive.
    Raises ``ValueError`` for any other ``alternative``.
    """
    shared_sample = data2 is None or data2 is data1

    # Sort both samples by value, carrying the weights along.
    order1 = np.argsort(data1)
    sorted1 = data1[order1]
    if shared_sample:
        order2, sorted2 = order1, sorted1
    else:
        order2 = np.argsort(data2)
        sorted2 = data2[order2]
    w1_sorted = wei1[order1]
    w2_sorted = wei2[order2]

    # Weighted empirical CDFs with a leading 0 for "before the first value".
    steps1 = np.hstack([0, np.cumsum(w1_sorted) / np.sum(w1_sorted)])
    steps2 = np.hstack([0, np.cumsum(w2_sorted) / np.sum(w2_sorted)])

    if np.array_equal(sorted1, sorted2):
        # Same support: compare the CDFs position by position.
        cdf1 = steps1[1:]
        cdf2 = steps2[1:]
    else:
        # Different supports: evaluate both CDFs on the pooled values.
        pooled = np.concatenate([sorted1, sorted2])
        cdf1 = steps1[np.searchsorted(sorted1, pooled, side='right')]
        cdf2 = steps2[np.searchsorted(sorted2, pooled, side='right')]

    # KS statistic
    if alternative == 'two-sided':
        d = np.abs(cdf1 - cdf2).max()
    elif alternative == 'less':
        d = (cdf2 - cdf1).max()
    elif alternative == 'greater':
        d = (cdf1 - cdf2).max()
    else:
        raise ValueError("alternative must be one of 'two-sided', 'less', or 'greater'")

    # Effective sample sizes: (sum w)^2 / sum(w^2), or 0 without any weight.
    sq1 = np.sum(wei1**2)
    sq2 = np.sum(wei2**2)
    n1_effective = np.sum(wei1)**2 / sq1 if sq1 > 0 else 0
    n2_effective = np.sum(wei2)**2 / sq2 if sq2 > 0 else 0

    n_effective = 0
    if n1_effective > 0 and n2_effective > 0:
        n_effective = (n1_effective * n2_effective) / (n1_effective + n2_effective)

    # p-value
    prob = np.nan
    if alternative == 'two-sided' and n_effective > 0:
        prob = 1 - _pkstwo(np.sqrt(n_effective) * d)

    return d, prob