econcomplex 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- econcomplex/__init__.py +220 -0
- econcomplex/complexity/__init__.py +23 -0
- econcomplex/complexity/eci_pci.py +131 -0
- econcomplex/complexity/eigenvector.py +115 -0
- econcomplex/complexity/fitness.py +130 -0
- econcomplex/complexity/reflections.py +173 -0
- econcomplex/complexity/subnational.py +82 -0
- econcomplex/core/__init__.py +23 -0
- econcomplex/core/diversity.py +125 -0
- econcomplex/core/preprocess.py +83 -0
- econcomplex/core/rca.py +161 -0
- econcomplex/core/utils.py +137 -0
- econcomplex/dynamics/__init__.py +10 -0
- econcomplex/dynamics/entry_exit.py +248 -0
- econcomplex/dynamics/growth.py +146 -0
- econcomplex/inequality/__init__.py +11 -0
- econcomplex/inequality/concentration.py +148 -0
- econcomplex/inequality/gini.py +164 -0
- econcomplex/optimization/__init__.py +46 -0
- econcomplex/optimization/diffusion.py +379 -0
- econcomplex/optimization/growth_target.py +170 -0
- econcomplex/optimization/portfolio.py +178 -0
- econcomplex/optimization/steppingstone.py +267 -0
- econcomplex/outlook/__init__.py +6 -0
- econcomplex/outlook/coi_cog.py +168 -0
- econcomplex/patents/__init__.py +7 -0
- econcomplex/patents/recombination.py +135 -0
- econcomplex/pipeline.py +255 -0
- econcomplex/productivity/__init__.py +8 -0
- econcomplex/productivity/prody.py +218 -0
- econcomplex/relatedness/__init__.py +25 -0
- econcomplex/relatedness/cooccurrence.py +173 -0
- econcomplex/relatedness/cross_space.py +142 -0
- econcomplex/relatedness/density.py +232 -0
- econcomplex/relatedness/proximity.py +214 -0
- econcomplex/specialization/__init__.py +17 -0
- econcomplex/specialization/location_quotient.py +163 -0
- econcomplex/specialization/similarity.py +68 -0
- econcomplex-1.0.0.dist-info/METADATA +223 -0
- econcomplex-1.0.0.dist-info/RECORD +43 -0
- econcomplex-1.0.0.dist-info/WHEEL +5 -0
- econcomplex-1.0.0.dist-info/licenses/LICENSE +22 -0
- econcomplex-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Gini coefficient, locational Gini, and Hoover-Gini.
|
|
3
|
+
|
|
4
|
+
References
|
|
5
|
+
----------
|
|
6
|
+
Krugman (1991); Hoover & Giarratani (1985); Ellison & Glaeser (1997).
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
import pandas as pd
|
|
11
|
+
from typing import Optional, Union
|
|
12
|
+
|
|
13
|
+
from ..core.utils import validate_matrix, safe_divide
|
|
14
|
+
|
|
15
|
+
# NumPy 2.0 added np.trapezoid and deprecated np.trapz (slated for removal); prefer the new name, fall back on NumPy < 2.0
|
|
16
|
+
try:
|
|
17
|
+
from numpy import trapezoid as _trapezoid
|
|
18
|
+
except ImportError: # NumPy < 2.0
|
|
19
|
+
from numpy import trapz as _trapezoid
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def gini(
    x: Union[np.ndarray, pd.Series, pd.DataFrame],
) -> Union[float, pd.Series]:
    """
    Standard Gini coefficient.

    Computed with the rank-weighted closed form on ascending-sorted values,

        G = (2 * sum_i i * x_(i) - (n + 1) * sum_i x_i) / (n * sum_i x_i)

    which is algebraically the same as the mean-absolute-difference form
    (sum_{i,j} |x_i - x_j|) / (2 * n * sum_i x_i).

    Negative entries are clipped to zero before ranking.

    Parameters
    ----------
    x : 1-D array-like, Series or DataFrame
        If DataFrame, the coefficient is computed separately per column.

    Returns
    -------
    float (for 1-D input) or pd.Series (for DataFrame input).
    Empty input and input whose clipped values sum to zero give 0.0.
    """
    def _coefficient(values):
        ranked = np.sort(np.maximum(values, 0))
        size = len(ranked)
        if size == 0:
            return 0.0
        mass = ranked.sum()
        if mass == 0:
            return 0.0
        # Weight each sorted value by its 1-based rank.
        positions = np.arange(1, size + 1)
        weighted = 2 * np.sum(positions * ranked) - (size + 1) * mass
        return weighted / (size * mass)

    if isinstance(x, pd.DataFrame):
        return x.apply(lambda column: _coefficient(column.values), axis=0)

    data = x.values if isinstance(x, pd.Series) else np.array(x, dtype=float)
    return _coefficient(data)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def locational_gini(
    mat: Union[np.ndarray, pd.DataFrame],
) -> Union[pd.Series, np.ndarray]:
    """
    Krugman Locational Gini per activity (column).

    For each activity, regions are ranked by the ratio of their share of
    that activity to their share of the grand total; the result is the
    area between the resulting Lorenz curve and the 45-degree line
    (0.5 minus the trapezoidal area under the curve).
    Theoretical maximum = 0.5 (full concentration).

    An activity whose column sums to zero has no Lorenz curve and is
    reported as 0.0, the same convention `hoover_gini` uses, instead of
    being scored as maximally concentrated.

    Parameters
    ----------
    mat : array-like (R x C)
        Value matrix (rows = regions, columns = activities).

    Returns
    -------
    pd.Series indexed by activity when `mat` is a DataFrame, otherwise a
    1-D ndarray of length C.
    """
    is_df = isinstance(mat, pd.DataFrame)
    col_index = mat.columns if is_df else None

    arr = validate_matrix(mat)
    total = arr.sum()

    col_sums = arr.sum(axis=0)  # C
    row_sums = arr.sum(axis=1)  # R

    # Share of region r in activity c (R x C).
    s_rc = safe_divide(arr, col_sums[None, :])
    # Share of region r in the grand total (R); guard the 0/0 case.
    if total == 0:
        s_r = np.zeros_like(row_sums, dtype=float)
    else:
        s_r = row_sums / total

    results = []
    for c in range(arr.shape[1]):
        if col_sums[c] == 0:
            # Empty activity: concentration is undefined, report 0.0.
            results.append(0.0)
            continue
        shares = s_rc[:, c]
        # Ascending ratio order makes the Lorenz curve convex.
        order = np.argsort(safe_divide(shares, s_r))
        p = np.concatenate([[0], np.cumsum(s_r[order])])
        nu = np.concatenate([[0], np.cumsum(shares[order])])
        # Area under the Lorenz curve (trapezoidal rule).
        area = _trapezoid(nu, p)
        results.append(0.5 - area)

    result = np.array(results)
    if is_df:
        return pd.Series(result, index=col_index, name="locational_gini")
    return result
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def hoover_gini(
    mat: Union[np.ndarray, pd.DataFrame],
    pop: Optional[Union[np.ndarray, pd.Series]] = None,
) -> Union[pd.Series, np.ndarray]:
    """
    Hoover-Gini: Gini using population as horizontal axis (Hoover curve).

    Ranks regions by (industry-employment-share / population-share) ratio,
    then returns 1 - 2 * (trapezoidal area under the resulting curve).

    Parameters
    ----------
    mat : array-like (R x C)
        Value matrix (rows = regions, columns = activities).
    pop : 1-D array-like (length R), optional
        Population vector, taken positionally (a Series index is not used
        for alignment). If None, uses row sums (total employment).

    Returns
    -------
    pd.Series indexed by activity when `mat` is a DataFrame, otherwise a
    1-D ndarray of length C. Activities whose column sums to zero get 0.0;
    if the population sums to zero the remaining activities get NaN.

    Raises
    ------
    ValueError
        If `pop` does not contain exactly one value per row of `mat`.
    """
    is_df = isinstance(mat, pd.DataFrame)
    col_index = mat.columns if is_df else None

    arr = validate_matrix(mat)
    n_regions = arr.shape[0]

    if pop is None:
        pop_arr = arr.sum(axis=1)
    else:
        # Flatten so that column vectors are accepted, then check length;
        # a mismatch would otherwise broadcast or fail obscurely below.
        pop_arr = np.array(pop, dtype=float).ravel()
        if pop_arr.shape[0] != n_regions:
            raise ValueError(
                f"pop has {pop_arr.shape[0]} entries but mat has "
                f"{n_regions} rows."
            )

    total_pop = pop_arr.sum()
    col_sums = arr.sum(axis=0)  # C

    # Population share per region; without any population there is no
    # baseline to compare against.
    has_baseline = total_pop != 0
    if has_baseline:
        p_r = pop_arr / total_pop
    else:
        p_r = np.zeros(n_regions, dtype=float)

    results = []
    for c in range(arr.shape[1]):
        if col_sums[c] == 0:
            results.append(0.0)
            continue
        if not has_baseline:
            results.append(np.nan)
            continue
        s_rc = arr[:, c] / col_sums[c]  # share of region r in activity c
        ratio = safe_divide(s_rc, p_r)
        # A region with activity but zero population has an infinite ratio
        # and must come last; do not depend on safe_divide's fill value.
        ratio = np.where((p_r == 0) & (s_rc > 0), np.inf, ratio)
        order = np.argsort(ratio)
        p = np.concatenate([[0], np.cumsum(p_r[order])])
        nu = np.concatenate([[0], np.cumsum(s_rc[order])])
        area = _trapezoid(nu, p)
        results.append(1 - 2 * area)

    result = np.array(results)
    if is_df:
        return pd.Series(result, index=col_index, name="hoover_gini")
    return result
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ECI Optimization (Stojkoski & Hidalgo 2026, Research Policy 55, 105454).
|
|
3
|
+
|
|
4
|
+
Target-oriented optimization layer for strategic diversification:
|
|
5
|
+
calibrate a stepping-stone forecast model, compute the effort required
|
|
6
|
+
for new specializations, and select minimal-effort portfolios that reach
|
|
7
|
+
an ECI (or growth) target.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from .steppingstone import (
|
|
11
|
+
calibrate_steppingstone,
|
|
12
|
+
effort_matrix,
|
|
13
|
+
forecast_specialization,
|
|
14
|
+
)
|
|
15
|
+
from .portfolio import eci_optimization
|
|
16
|
+
from .growth_target import (
|
|
17
|
+
calibrate_growth_model,
|
|
18
|
+
expected_growth,
|
|
19
|
+
eci_target_for_growth,
|
|
20
|
+
)
|
|
21
|
+
from .diffusion import (
|
|
22
|
+
proximity_network,
|
|
23
|
+
activation_probabilities,
|
|
24
|
+
calibrate_contagion,
|
|
25
|
+
diversification_strategy,
|
|
26
|
+
expected_diversification_time,
|
|
27
|
+
compare_strategies,
|
|
28
|
+
optimize_sequence,
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
# Public API of the optimization subpackage, grouped by source module.
__all__ = [
    # .steppingstone
    "calibrate_steppingstone",
    "effort_matrix",
    "forecast_specialization",
    # .portfolio
    "eci_optimization",
    # .growth_target
    "calibrate_growth_model",
    "expected_growth",
    "eci_target_for_growth",
    # .diffusion
    "proximity_network",
    "activation_probabilities",
    "calibrate_contagion",
    "diversification_strategy",
    "expected_diversification_time",
    "compare_strategies",
    "optimize_sequence",
]
|
|
@@ -0,0 +1,379 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Strategic diffusion on networks of related activities.
|
|
3
|
+
|
|
4
|
+
Implements the complex-contagion model of Alshamsi, Pinheiro & Hidalgo
|
|
5
|
+
(2018): the probability that a location activates a new activity grows as
|
|
6
|
+
a power of the fraction of related activities already present,
|
|
7
|
+
|
|
8
|
+
p_i = B * (sum_j a_ij M_j / k_i) ** alpha (eq. 1)
|
|
9
|
+
|
|
10
|
+
A diversification strategy is an ordered sequence of activation targets;
|
|
11
|
+
the expected waiting time of a target is 1/p_i given the current active
|
|
12
|
+
set. The paper shows that the greedy strategy (always target the most
|
|
13
|
+
related activity) is suboptimal: minimal total diversification time
|
|
14
|
+
requires targeting highly connected hubs during a narrow, relatively
|
|
15
|
+
early window ("suboptimality of relatedness").
|
|
16
|
+
|
|
17
|
+
References
|
|
18
|
+
----------
|
|
19
|
+
Alshamsi, Pinheiro & Hidalgo (2018) "Optimal diversification strategies
|
|
20
|
+
in the networks of related products and of related research areas",
|
|
21
|
+
Nature Communications 9, 1328.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
import numpy as np
|
|
25
|
+
import pandas as pd
|
|
26
|
+
from typing import Dict, Optional, Union
|
|
27
|
+
|
|
28
|
+
from ..core.utils import safe_divide, pivot_to_matrix
|
|
29
|
+
from ..core.rca import mcp as compute_mcp
|
|
30
|
+
from ..relatedness.proximity import proximity as compute_proximity
|
|
31
|
+
|
|
32
|
+
STRATEGIES = ("greedy", "majority", "high_degree", "low_degree", "random")
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def proximity_network(
    mat: pd.DataFrame,
    phi: Optional[pd.DataFrame] = None,
    phi_threshold: float = 0.55,
    method: str = "max",
) -> pd.DataFrame:
    """
    Binary network of related activities: a_pp' = 1 if phi_pp' >= threshold.

    The default threshold (0.55) follows the product-space literature
    (Hidalgo et al. 2007).

    Parameters
    ----------
    mat : pd.DataFrame
        Value matrix; only used (together with `method`) to compute the
        proximity when `phi` is not supplied.
    phi : pd.DataFrame, optional
        Pre-computed proximity matrix; pass it to use another proximity
        measure.
    phi_threshold : float
        Minimum proximity for two activities to be linked.
    method : str
        Forwarded to the proximity computation when `phi` is None.

    Returns
    -------
    C x C binary adjacency DataFrame with zero diagonal, labelled like the
    proximity matrix.
    """
    proximity = phi
    if proximity is None:
        proximity = compute_proximity(
            mat, method=method, compute="product"
        )["product"]

    # 1.0 where the proximity reaches the threshold, 0.0 elsewhere.
    links = np.where(proximity.values >= phi_threshold, 1.0, 0.0)
    # No self-links.
    np.fill_diagonal(links, 0.0)
    return pd.DataFrame(
        links, index=proximity.index, columns=proximity.columns
    )
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def activation_probabilities(
    adjacency: pd.DataFrame,
    active: Union[pd.Series, np.ndarray],
    B: float = 1.0,
    alpha: float = 1.0,
) -> pd.Series:
    """
    Activation probability of every activity under eq. 1.

    p_i = B * (active neighbors of i / degree of i) ** alpha, with the
    probability of already-active activities forced to 0.

    Parameters
    ----------
    adjacency : pd.DataFrame
        Activity network; row sums are used as degrees.
    active : array-like
        Activity state vector, read positionally (non-zero = active).
    B, alpha : float
        Scale and exponent of the contagion model.

    Returns
    -------
    pd.Series named 'activation_probability', indexed like `adjacency`.
    """
    links = adjacency.values
    state = np.asarray(active, dtype=float)

    degree = links.sum(axis=1)
    active_neighbors = links @ state
    related_share = safe_divide(active_neighbors, degree)

    prob = B * related_share ** alpha
    # Activities that are already present cannot be activated again.
    already_active = state.astype(bool)
    prob[already_active] = 0.0

    return pd.Series(
        prob, index=adjacency.index, name="activation_probability"
    )
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def calibrate_contagion(
    df: pd.DataFrame,
    loc: str,
    act: str,
    val: str,
    time: str,
    adjacency: Optional[pd.DataFrame] = None,
    phi_threshold: float = 0.55,
    threshold: float = 1.0,
    presence_test: str = "rca",
    n_bins: int = 20,
) -> Dict:
    """
    Empirically calibrate B and alpha of the contagion model (eq. 1) from
    a panel: the probability that a location enters an activity between
    consecutive periods is regressed (log-log, on binned data) against
    the fraction of related activities already present.

    Alshamsi et al. (2018) report p ~ 0.16 x^1.03 for the product space
    and p ~ 0.74 x^1.09 for the research space.

    Parameters
    ----------
    df : pd.DataFrame
        Long-format panel.
    loc, act, val, time : str
        Column names.
    adjacency : pd.DataFrame, optional
        Binary activity network. Computed from the first period via
        `proximity_network` if omitted.
    phi_threshold : float
        Proximity threshold when computing the network internally.
    threshold : float
        RCA binarization threshold for presences.
    presence_test : str
        Passed to `mcp` ('rca' default; use 'manual' when `val` is
        already a binary presence indicator).
    n_bins : int
        Number of equal-width bins of the related fraction on (0, 1].
        Observations with a related fraction of exactly 0 fall outside
        every bin, so they are counted in 'n_observations' / 'n_events'
        but do not appear in 'bins'.

    Returns
    -------
    dict with 'B', 'alpha', 'n_events', 'n_observations', and 'bins'
    (DataFrame with the binned fractions and empirical probabilities,
    useful for plotting the fit).

    Raises
    ------
    ValueError
        If the panel has fewer than 2 periods, or fewer than 2 bins have
        both a positive related fraction and a positive entry probability.
    """
    periods = sorted(df[time].unique())
    if len(periods) < 2:
        raise ValueError("Need at least 2 periods to observe entries.")

    mats = {t: pivot_to_matrix(df[df[time] == t], loc, act, val)
            for t in periods}
    if adjacency is None:
        adjacency = proximity_network(mats[periods[0]],
                                      phi_threshold=phi_threshold)
    acts = adjacency.index
    adj = adjacency.values
    k = adj.sum(axis=1)

    # Presence matrix of each period on the network's activity set.
    # Every interior period is used twice (as end and as start of a
    # transition), so compute it once.
    presence = {
        t: compute_mcp(mats[t].reindex(columns=acts, fill_value=0.0),
                       presence_test=presence_test, rca_threshold=threshold)
        for t in periods
    }

    xs, ys = [], []
    for t0, t1 in zip(periods[:-1], periods[1:]):
        m0 = presence[t0]
        # Follow the locations observed at t0; absent ones count as 0 at t1.
        m1 = presence[t1].reindex(index=m0.index, fill_value=0.0)
        # Active neighbours of activity j are sum_i a_ji * M_i, i.e. row j
        # of the adjacency. Using adj.T keeps the count consistent with
        # the row-sum degree k_j (and with `activation_probabilities`) even
        # if a user-supplied adjacency is not symmetric.
        frac = safe_divide(m0.values @ adj.T, k[None, :])
        inactive = m0.values == 0
        xs.append(frac[inactive])
        ys.append(m1.values[inactive])

    x = np.concatenate(xs)
    y = np.concatenate(ys)

    edges = np.linspace(0.0, 1.0, n_bins + 1)
    centers, probs = [], []
    for lo, hi in zip(edges[:-1], edges[1:]):
        sel = (x > lo) & (x <= hi)
        if sel.any():
            centers.append(x[sel].mean())
            probs.append(y[sel].mean())
    bins = pd.DataFrame({"related_fraction": centers, "entry_probability": probs})

    # log-log fit needs strictly positive coordinates.
    fit = bins[(bins["related_fraction"] > 0) & (bins["entry_probability"] > 0)]
    if len(fit) < 2:
        raise ValueError("Not enough non-empty bins to fit the power law.")
    beta, intercept = np.polyfit(np.log(fit["related_fraction"]),
                                 np.log(fit["entry_probability"]), 1)

    return {
        "B": float(np.exp(intercept)),
        "alpha": float(beta),
        "n_events": int(y.sum()),
        "n_observations": int(len(y)),
        "bins": bins,
    }
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def _pick_target(strategy, p, k, n_active_neighbors, rng):
    """
    Choose the next activation target among candidates (entries with p > 0).

    Returns None when there is no candidate; otherwise the index selected
    by `strategy`: highest p ('greedy'), most active neighbors
    ('majority'), largest / smallest k ('high_degree' / 'low_degree'), or
    a draw from `rng` ('random'). Any other name raises ValueError.
    """
    candidates = np.flatnonzero(p > 0)
    if not candidates.size:
        return None

    if strategy == "random":
        return rng.choice(candidates)

    if strategy == "low_degree":
        position = np.argmin(k[candidates])
    elif strategy == "high_degree":
        position = np.argmax(k[candidates])
    elif strategy == "majority":
        position = np.argmax(n_active_neighbors[candidates])
    elif strategy == "greedy":
        position = np.argmax(p[candidates])
    else:
        raise ValueError(f"strategy must be one of {STRATEGIES}.")
    return candidates[position]
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def diversification_strategy(
    adjacency: pd.DataFrame,
    active0: Union[pd.Series, np.ndarray],
    B: float = 1.0,
    alpha: float = 1.0,
    strategy: str = "greedy",
    seed: Optional[int] = None,
) -> pd.DataFrame:
    """
    Generate a diversification sequence with one of the heuristic
    strategies of Alshamsi et al. (2018) and evaluate its expected times.

    At each step the strategy picks one potentially active target
    (an inactive activity with positive activation probability); the
    expected waiting time is 1/p_i (eq. 1) given the current active set,
    after which the target becomes active. Activities that never obtain a
    positive probability are not activated.

    Strategies: 'greedy' (highest activation probability — the most
    related), 'majority' (most active neighbors in absolute number),
    'high_degree', 'low_degree', and 'random'.

    Parameters
    ----------
    adjacency : pd.DataFrame
        Activity network; row sums are used as degrees.
    active0 : array-like
        Initial state, one entry per activity, read positionally
        (non-zero = active). It is copied, not modified.
    B, alpha : float
        Scale and exponent of the contagion model.
    strategy : str
        One of `STRATEGIES`.
    seed : int, optional
        Seed of the random generator (only consumed by 'random').

    Returns
    -------
    pd.DataFrame, one row per activated target in order, with columns
    [activity, expected_time, cumulative_time, degree, related_fraction].

    Raises
    ------
    ValueError
        If `strategy` is unknown or `active0` does not have one entry per
        activity.
    """
    # Validate up front: the per-step picker only rejects an unknown name
    # when candidates exist, so a typo could otherwise pass silently and
    # return an empty frame.
    if strategy not in STRATEGIES:
        raise ValueError(f"strategy must be one of {STRATEGIES}.")

    rng = np.random.default_rng(seed)
    adj = adjacency.values
    k = adj.sum(axis=1)
    active = np.asarray(active0, dtype=float).copy()
    if active.shape != k.shape:
        raise ValueError(
            "active0 must have exactly one entry per activity in adjacency."
        )

    rows = []
    total = 0.0
    # Each pass activates one new activity, so the loop ends after at most
    # one pass per activity.
    while True:
        n_act = adj @ active
        frac = safe_divide(n_act, k)
        p = B * frac ** alpha
        p[active.astype(bool)] = 0.0
        i = _pick_target(strategy, p, k, n_act, rng)
        if i is None:
            break
        wait = 1.0 / p[i]
        total += wait
        active[i] = 1.0
        rows.append({
            "activity": adjacency.index[i],
            "expected_time": wait,
            "cumulative_time": total,
            "degree": k[i],
            "related_fraction": frac[i],
        })

    return pd.DataFrame(
        rows, columns=["activity", "expected_time", "cumulative_time",
                       "degree", "related_fraction"],
    )
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def expected_diversification_time(
    adjacency: pd.DataFrame,
    active0: Union[pd.Series, np.ndarray],
    sequence,
    B: float = 1.0,
    alpha: float = 1.0,
) -> float:
    """
    Expected total time of a fixed sequence of targets (activity labels).

    Targets are activated in the given order; each contributes
    1 / (B * (active neighbors / degree) ** alpha) evaluated on the state
    reached so far. `active0` is copied, not modified.

    Returns inf as soon as a target has no active neighbor (or zero
    degree) when it is reached.

    Raises
    ------
    ValueError
        If `sequence` contains a label missing from `adjacency.index`.
    """
    links = adjacency.values
    degree = links.sum(axis=1)

    positions = adjacency.index.get_indexer(pd.Index(sequence))
    if np.any(positions < 0):
        raise ValueError("sequence contains activities absent from the network.")

    state = np.asarray(active0, dtype=float).copy()
    elapsed = 0.0
    for node in positions:
        active_neighbors = links[node] @ state
        reachable = active_neighbors != 0 and degree[node] != 0
        if not reachable:
            return np.inf
        share = active_neighbors / degree[node]
        elapsed += 1.0 / (B * share ** alpha)
        state[node] = 1.0
    return elapsed
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def compare_strategies(
    adjacency: pd.DataFrame,
    active0: Union[pd.Series, np.ndarray],
    B: float = 1.0,
    alpha: float = 1.0,
    n_random: int = 10,
    seed: int = 0,
) -> pd.DataFrame:
    """
    Expected total diversification time of each heuristic strategy
    ('random' averaged over `n_random` runs seeded seed, seed + 1, ...).

    Parameters
    ----------
    adjacency, active0, B, alpha
        Forwarded to `diversification_strategy`.
    n_random : int
        Number of runs of the 'random' strategy. With no runs (<= 0) the
        'random' row reports total_time = NaN and n_activated = 0.
    seed : int
        Base seed for the random runs.

    Returns
    -------
    pd.DataFrame indexed by strategy with columns
    [total_time, n_activated]. total_time is NaN when a strategy
    activated nothing; for 'random', n_activated is the truncated mean
    run length.
    """
    rows = {}
    for s in STRATEGIES:
        if s == "random":
            runs = [diversification_strategy(adjacency, active0, B, alpha,
                                             strategy=s, seed=seed + r)
                    for r in range(n_random)]
            # Only runs that activated something have a final time.
            totals = [run["cumulative_time"].iloc[-1] for run in runs if len(run)]
            # Guard the empty cases: the mean of an empty list is NaN (with
            # a warning) and int(NaN) raises.
            rows[s] = {
                "total_time": float(np.mean(totals)) if totals else np.nan,
                "n_activated": (int(np.mean([len(run) for run in runs]))
                                if runs else 0),
            }
        else:
            run = diversification_strategy(adjacency, active0, B, alpha, strategy=s)
            rows[s] = {
                "total_time": float(run["cumulative_time"].iloc[-1]) if len(run) else np.nan,
                "n_activated": len(run),
            }
    return pd.DataFrame.from_dict(rows, orient="index")
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
def optimize_sequence(
    adjacency: pd.DataFrame,
    active0: Union[pd.Series, np.ndarray],
    B: float = 1.0,
    alpha: float = 1.0,
    n_iter: int = 2000,
    seed: int = 0,
) -> Dict:
    """
    Approximate the optimal diversification sequence by simulated
    annealing over target orderings, starting from the greedy sequence.

    Each iteration proposes swapping two positions of the current
    ordering. A cheaper ordering is always accepted; a costlier one is
    accepted with probability exp(-(increase) / temperature). The
    temperature starts at 5% of the greedy cost and is multiplied by a
    constant factor after every evaluated proposal, the factor being
    chosen so that `n_iter` multiplications reach 0.01% of the greedy
    cost. Draws where both positions coincide are skipped without cooling.

    The best ordering seen is tracked separately and initialised with the
    greedy sequence, so the result is never worse than greedy. Sequences
    with fewer than two targets are returned unchanged.

    Returns
    -------
    dict with 'sequence' (list of activity labels), 'total_time',
    'greedy_time', and 'improvement' (greedy_time - total_time).
    """
    generator = np.random.default_rng(seed)
    baseline = diversification_strategy(adjacency, active0, B, alpha, "greedy")
    greedy_order = list(baseline["activity"])
    length = len(greedy_order)

    # Nothing to reorder with zero or one target.
    if length < 2:
        if length:
            only_time = float(baseline["cumulative_time"].iloc[-1])
        else:
            only_time = 0.0
        return {"sequence": greedy_order, "total_time": only_time,
                "greedy_time": only_time, "improvement": 0.0}

    def evaluate(candidate):
        return expected_diversification_time(
            adjacency, active0, candidate, B, alpha
        )

    state = list(greedy_order)
    state_cost = evaluate(state)
    greedy_cost = state_cost
    champion, champion_cost = list(state), state_cost

    start_temp = 0.05 * state_cost
    end_temp = 1e-4 * state_cost
    decay = (end_temp / start_temp) ** (1.0 / max(n_iter, 1))
    temperature = start_temp

    for _ in range(n_iter):
        a, b = generator.integers(0, length, 2)
        if a != b:
            trial = list(state)
            trial[a], trial[b] = trial[b], trial[a]
            trial_cost = evaluate(trial)

            accept = trial_cost < state_cost
            if not accept:
                # Metropolis rule; an infeasible (inf) trial gives exp(-inf) = 0.
                accept = generator.random() < np.exp(
                    -(trial_cost - state_cost) / temperature
                )
            if accept:
                state, state_cost = trial, trial_cost
                if trial_cost < champion_cost:
                    champion, champion_cost = trial, trial_cost
            temperature *= decay

    return {
        "sequence": champion,
        "total_time": float(champion_cost),
        "greedy_time": float(greedy_cost),
        "improvement": float(greedy_cost - champion_cost),
    }
|