econcomplex 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- econcomplex/__init__.py +220 -0
- econcomplex/complexity/__init__.py +23 -0
- econcomplex/complexity/eci_pci.py +131 -0
- econcomplex/complexity/eigenvector.py +115 -0
- econcomplex/complexity/fitness.py +130 -0
- econcomplex/complexity/reflections.py +173 -0
- econcomplex/complexity/subnational.py +82 -0
- econcomplex/core/__init__.py +23 -0
- econcomplex/core/diversity.py +125 -0
- econcomplex/core/preprocess.py +83 -0
- econcomplex/core/rca.py +161 -0
- econcomplex/core/utils.py +137 -0
- econcomplex/dynamics/__init__.py +10 -0
- econcomplex/dynamics/entry_exit.py +248 -0
- econcomplex/dynamics/growth.py +146 -0
- econcomplex/inequality/__init__.py +11 -0
- econcomplex/inequality/concentration.py +148 -0
- econcomplex/inequality/gini.py +164 -0
- econcomplex/optimization/__init__.py +46 -0
- econcomplex/optimization/diffusion.py +379 -0
- econcomplex/optimization/growth_target.py +170 -0
- econcomplex/optimization/portfolio.py +178 -0
- econcomplex/optimization/steppingstone.py +267 -0
- econcomplex/outlook/__init__.py +6 -0
- econcomplex/outlook/coi_cog.py +168 -0
- econcomplex/patents/__init__.py +7 -0
- econcomplex/patents/recombination.py +135 -0
- econcomplex/pipeline.py +255 -0
- econcomplex/productivity/__init__.py +8 -0
- econcomplex/productivity/prody.py +218 -0
- econcomplex/relatedness/__init__.py +25 -0
- econcomplex/relatedness/cooccurrence.py +173 -0
- econcomplex/relatedness/cross_space.py +142 -0
- econcomplex/relatedness/density.py +232 -0
- econcomplex/relatedness/proximity.py +214 -0
- econcomplex/specialization/__init__.py +17 -0
- econcomplex/specialization/location_quotient.py +163 -0
- econcomplex/specialization/similarity.py +68 -0
- econcomplex-1.0.0.dist-info/METADATA +223 -0
- econcomplex-1.0.0.dist-info/RECORD +43 -0
- econcomplex-1.0.0.dist-info/WHEEL +5 -0
- econcomplex-1.0.0.dist-info/licenses/LICENSE +22 -0
- econcomplex-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Productivity levels of products and regions.
|
|
3
|
+
|
|
4
|
+
References
|
|
5
|
+
----------
|
|
6
|
+
Hausmann, Hwang & Rodrik (2007) "What you export matters".
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
import pandas as pd
|
|
11
|
+
from typing import Union
|
|
12
|
+
|
|
13
|
+
from ..core.utils import validate_matrix, safe_divide
|
|
14
|
+
from ..core.rca import rca as compute_rca
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def prody(
    mat: Union[np.ndarray, pd.DataFrame],
    gdp: Union[np.ndarray, pd.Series],
    use_rca: bool = True,
) -> Union[pd.Series, np.ndarray]:
    """
    PRODY: Income / productivity level of each activity.

    PRODY_c = sum_r (LQ_{rc} * GDP_r) / sum_r LQ_{rc}

    Weighted average of regional GDP, weighted by each region's
    Revealed Comparative Advantage in activity c.

    Parameters
    ----------
    mat : array-like (R x C)
        Value matrix.
    gdp : array-like (length R)
        GDP (or income) per region. Must be one-dimensional and in the
        same row order as ``mat``: values are matched by position, any
        pandas index on ``gdp`` is ignored.
    use_rca : bool
        Compute RCA internally (True) or use mat as LQ (False).

    Returns
    -------
    pd.Series indexed by activity (named "prody") when ``mat`` is a
    DataFrame; otherwise the unlabelled result of the division, one
    value per column of ``mat``.

    Raises
    ------
    ValueError
        If ``gdp`` is not one-dimensional, or if its length differs from
        the number of rows in ``mat``.
    """
    is_df = isinstance(mat, pd.DataFrame)
    col_index = mat.columns if is_df else None

    arr = validate_matrix(mat)
    gdp_arr = np.array(gdp, dtype=float)

    # A 2-D gdp (e.g. an (R, 1) column or a single-column DataFrame) would
    # pass a bare len() check; gdp_arr[:, None] would then broadcast against
    # the R x C weights into an (R, R, C) array and silently produce a
    # wrong-shaped result. Reject it up front instead.
    if gdp_arr.ndim != 1:
        raise ValueError("gdp must be one-dimensional (one value per row of mat).")
    if gdp_arr.shape[0] != arr.shape[0]:
        raise ValueError("Length of gdp must equal number of rows in mat.")

    if use_rca:
        lq = compute_rca(arr)
        # compute_rca may hand back a DataFrame; work on the raw values.
        if isinstance(lq, pd.DataFrame):
            lq = lq.values
    else:
        # Caller states that mat already holds LQ / RCA weights.
        lq = arr

    numerator = (lq * gdp_arr[:, None]).sum(axis=0)  # C
    denominator = lq.sum(axis=0)  # C
    result = safe_divide(numerator, denominator)

    if is_df:
        return pd.Series(result, index=col_index, name="prody")
    return result
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def expy(
    mat: Union[np.ndarray, pd.DataFrame],
    gdp: Union[np.ndarray, pd.Series],
    use_rca: bool = True,
) -> Union[pd.Series, np.ndarray]:
    """
    EXPY: Income level associated with a region's activity portfolio.

    EXPY_r = sum_c (s_{rc} * PRODY_c)
    where s_{rc} = x_{rc} / X_r  (share of activity c in region r)

    PRODY_c is obtained from `prody` on the same matrix and GDP vector.

    Parameters
    ----------
    mat : array-like (R x C)
        Value matrix.
    gdp : array-like (length R)
        GDP (or income) per region.
    use_rca : bool
        Forwarded to `prody`: whether RCA is computed for the PRODY step.

    Returns
    -------
    pd.Series indexed by region (named "expy") when ``mat`` is a
    DataFrame; otherwise the unlabelled per-region values.
    """
    values = validate_matrix(mat)
    income = np.array(gdp, dtype=float)

    # Income level of every activity; strip labels if prody returned a Series.
    activity_income = prody(values, income, use_rca=use_rca)
    if isinstance(activity_income, pd.Series):
        activity_income = activity_income.values

    # Each region's portfolio expressed as shares of its own row total.
    region_totals = values.sum(axis=1, keepdims=True)
    portfolio_shares = safe_divide(values, region_totals)

    weighted = portfolio_shares * activity_income[None, :]
    result = weighted.sum(axis=1)

    if isinstance(mat, pd.DataFrame):
        return pd.Series(result, index=mat.index, name="expy")
    return result
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def product_gini_index(
    mat: Union[np.ndarray, pd.DataFrame],
    gini_vec: Union[np.ndarray, pd.Series],
    threshold: float = 1.0,
) -> Union[pd.Series, np.ndarray]:
    """
    Product Gini Index (PGI): inequality embedded in a product.

    PGI_c = sum_r (M_{rc} * s_{rc} * Gini_r) / sum_r (M_{rc} * s_{rc})

    Weighted average of regional Gini coefficients, weighted by
    specialization (RCA) shares. M_{rc} is 1 where RCA_{rc} >= threshold
    and 0 otherwise; s_{rc} is the share of activity c in region r's
    row total.

    Parameters
    ----------
    mat : array-like (R x C)
        Value matrix.
    gini_vec : array-like (length R)
        Gini coefficient per region. Must be one-dimensional and in the
        same row order as ``mat``: values are matched by position, any
        pandas index on ``gini_vec`` is ignored.
    threshold : float
        RCA binarization threshold.

    Returns
    -------
    pd.Series indexed by activity (named "pgi") when ``mat`` is a
    DataFrame; otherwise the unlabelled result of the division, one
    value per column of ``mat``.

    Raises
    ------
    ValueError
        If ``gini_vec`` is not one-dimensional, or if its length differs
        from the number of rows in ``mat``.
    """
    is_df = isinstance(mat, pd.DataFrame)
    col_index = mat.columns if is_df else None

    arr = validate_matrix(mat)
    gini_arr = np.array(gini_vec, dtype=float)

    # A 2-D gini_vec (e.g. an (R, 1) column) would pass a bare len() check;
    # gini_arr[:, None] would then broadcast against the R x C weights into
    # an (R, R, C) array and silently produce a wrong-shaped result.
    if gini_arr.ndim != 1:
        raise ValueError(
            "gini_vec must be one-dimensional (one value per row of mat)."
        )
    if gini_arr.shape[0] != arr.shape[0]:
        raise ValueError("Length of gini_vec must equal number of rows in mat.")

    rca_mat = compute_rca(arr)
    # compute_rca may hand back a DataFrame; work on the raw values.
    if isinstance(rca_mat, pd.DataFrame):
        rca_mat = rca_mat.values

    m = (rca_mat >= threshold).astype(float)  # 0/1 specialization mask
    row_sums = arr.sum(axis=1, keepdims=True)
    shares = safe_divide(arr, row_sums)

    weights = m * shares  # R x C
    numerator = (weights * gini_arr[:, None]).sum(axis=0)
    denominator = weights.sum(axis=0)
    result = safe_divide(numerator, denominator)

    if is_df:
        return pd.Series(result, index=col_index, name="pgi")
    return result
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def product_emissions_index(
    mat: Union[np.ndarray, pd.DataFrame],
    emissions: Union[np.ndarray, pd.Series],
    threshold: float = 1.0,
) -> Union[pd.Series, np.ndarray]:
    """
    Product Emissions Intensity Index (PEII).

    Same formula as PGI but substitutes emissions intensity for Gini:
    PEII_c = sum_r (M_{rc} * s_{rc} * emissions_r) / sum_r (M_{rc} * s_{rc})

    M_{rc} is 1 where RCA_{rc} >= threshold and 0 otherwise; s_{rc} is the
    share of activity c in region r's row total.

    Parameters
    ----------
    mat : array-like (R x C)
        Value matrix.
    emissions : array-like (length R)
        Emissions intensity per region. Must be one-dimensional and in
        the same row order as ``mat``: values are matched by position,
        any pandas index on ``emissions`` is ignored.
    threshold : float
        RCA binarization threshold.

    Returns
    -------
    pd.Series indexed by activity (named "peii") when ``mat`` is a
    DataFrame; otherwise the unlabelled result of the division, one
    value per column of ``mat``.

    Raises
    ------
    ValueError
        If ``emissions`` is not one-dimensional, or if its length differs
        from the number of rows in ``mat``.
    """
    is_df = isinstance(mat, pd.DataFrame)
    col_index = mat.columns if is_df else None

    arr = validate_matrix(mat)
    em_arr = np.array(emissions, dtype=float)

    # A 2-D emissions input (e.g. an (R, 1) column) would pass a bare len()
    # check; em_arr[:, None] would then broadcast against the R x C weights
    # into an (R, R, C) array and silently produce a wrong-shaped result.
    if em_arr.ndim != 1:
        raise ValueError(
            "emissions must be one-dimensional (one value per row of mat)."
        )
    if em_arr.shape[0] != arr.shape[0]:
        raise ValueError("Length of emissions must equal number of rows in mat.")

    rca_mat = compute_rca(arr)
    # compute_rca may hand back a DataFrame; work on the raw values.
    if isinstance(rca_mat, pd.DataFrame):
        rca_mat = rca_mat.values

    m = (rca_mat >= threshold).astype(float)  # 0/1 specialization mask
    row_sums = arr.sum(axis=1, keepdims=True)
    shares = safe_divide(arr, row_sums)

    weights = m * shares  # R x C
    numerator = (weights * em_arr[:, None]).sum(axis=0)
    denominator = weights.sum(axis=0)
    result = safe_divide(numerator, denominator)

    if is_df:
        return pd.Series(result, index=col_index, name="peii")
    return result
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
# Short aliases matching the documented API.
# Both names are bound to the same function objects as the long names, so
# calling pgi(...) / peii(...) is identical to calling the full function.
pgi = product_gini_index
peii = product_emissions_index
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from .proximity import proximity, continuous_proximity
|
|
2
|
+
from .density import (
|
|
3
|
+
relatedness_density,
|
|
4
|
+
distance,
|
|
5
|
+
relatedness_density_internal,
|
|
6
|
+
relatedness_density_external,
|
|
7
|
+
relative_relatedness,
|
|
8
|
+
)
|
|
9
|
+
from .cooccurrence import co_occurrence, relatedness_index, z_score_novelty
|
|
10
|
+
from .cross_space import cross_proximity, cross_relatedness
|
|
11
|
+
|
|
12
|
+
# Names exported by `from <package> import *`; this list mirrors the names
# imported above from the proximity, density, cooccurrence and cross_space
# submodules.
__all__ = [
    "proximity",
    "continuous_proximity",
    "relatedness_density",
    "distance",
    "relatedness_density_internal",
    "relatedness_density_external",
    "relative_relatedness",
    "co_occurrence",
    "relatedness_index",
    "z_score_novelty",
    "cross_proximity",
    "cross_relatedness",
]
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Co-occurrence matrices and statistical novelty (z-score).
|
|
3
|
+
|
|
4
|
+
References
|
|
5
|
+
----------
|
|
6
|
+
Steijn (2017) probability index; Fleming & Sorenson (2001) for patents.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
import pandas as pd
|
|
11
|
+
from typing import Literal, Union
|
|
12
|
+
|
|
13
|
+
from ..core.utils import validate_matrix, safe_divide
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def co_occurrence(
    mat: Union[np.ndarray, pd.DataFrame],
    diagonal: bool = False,
) -> Union[pd.DataFrame, np.ndarray]:
    """
    Activity-by-activity co-occurrence counts.

    For a regions/events x activities matrix M the result is M^T M, i.e.

        Cooc_{cc'} = sum_r M_{rc} * M_{rc'}

    Parameters
    ----------
    mat : array-like (R x C)
        Binary (or value) incidence matrix.
    diagonal : bool
        Keep the diagonal (an activity paired with itself) when True.
        With the default False the diagonal is overwritten with 0.

    Returns
    -------
    C x C co-occurrence matrix; a DataFrame labelled by the columns of
    ``mat`` on both axes when ``mat`` is a DataFrame, else an array.
    """
    incidence = validate_matrix(mat)
    pair_counts = incidence.T @ incidence

    if not diagonal:
        # Self-pairs are not co-occurrences; blank them in place.
        np.fill_diagonal(pair_counts, 0.0)

    if not isinstance(mat, pd.DataFrame):
        return pair_counts

    labels = mat.columns
    return pd.DataFrame(pair_counts, index=labels, columns=labels)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def relatedness_index(
    mat: Union[np.ndarray, pd.DataFrame],
    method: Literal["probability", "association", "cosine", "jaccard"] = "cosine",
    diagonal: bool = False,
) -> Union[pd.DataFrame, np.ndarray]:
    """
    Normalized pairwise relatedness between activities.

    The raw co-occurrence C = M^T M is normalized using S_i, the diagonal
    of C (occurrences of activity i), and T, the number of rows of ``mat``.

    Methods
    -------
    "probability" (Steijn 2017):
        SM_{ij} = C_{ij} / (((S_i/T)(S_j/(T-S_i)) + (S_j/T)(S_i/(T-S_j))) * T/2)

    "association" (association strength):
        SA_{ij} = (C_{ij}/T) / ((S_i/T)(S_j/T))

    "cosine":
        SC_{ij} = C_{ij} / sqrt(S_i * S_j)

    "jaccard":
        SJ_{ij} = C_{ij} / (S_i + S_j - C_{ij})

    Parameters
    ----------
    mat : array-like (R x C)
        Binary incidence matrix.
    method : str
        Normalization method (one of the four listed above).
    diagonal : bool
        Keep the diagonal of the result when True. With the default
        False the diagonal is overwritten with 0.

    Returns
    -------
    C x C normalized relatedness matrix; a DataFrame labelled by the
    columns of ``mat`` when ``mat`` is a DataFrame, else an array.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    labels = mat.columns if isinstance(mat, pd.DataFrame) else None

    incidence = validate_matrix(mat)
    pair_counts = incidence.T @ incidence  # C x C
    total = incidence.shape[0]  # T
    occurrences = pair_counts.diagonal().copy()  # S_i

    if method == "probability":
        occ = occurrences.astype(float)
        occ_i = occ[:, None]
        occ_j = occ[None, :]
        # expected = ((S_i/T)(S_j/(T-S_i)) + (S_j/T)(S_i/(T-S_j))) * T/2
        first = safe_divide(occ_i * occ_j, total * (total - occ_i))
        second = safe_divide(occ_j * occ_i, total * (total - occ_j))
        expected = (first + second) * total / 2
        scores = safe_divide(pair_counts, expected)
    elif method == "association":
        freq = occurrences / total
        independent = freq[:, None] * freq[None, :]
        scores = safe_divide(pair_counts / total, independent)
    elif method == "jaccard":
        union = occurrences[:, None] + occurrences[None, :] - pair_counts
        scores = safe_divide(pair_counts, union)
    elif method == "cosine":
        norm = np.sqrt(occurrences[:, None] * occurrences[None, :])
        scores = safe_divide(pair_counts, norm)
    else:
        raise ValueError("method must be 'probability', 'association', 'cosine', or 'jaccard'.")

    if not diagonal:
        np.fill_diagonal(scores, 0.0)

    if labels is None:
        return scores
    return pd.DataFrame(scores, index=labels, columns=labels)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def z_score_novelty(
    incidence: Union[np.ndarray, pd.DataFrame],
) -> Union[pd.DataFrame, np.ndarray]:
    """
    Z-score of technological novelty (atypicality of co-occurrence).

    Compares the observed co-occurrence of every pair of technologies
    with the count expected by chance:

        z_{ij} = (C_{ij} - mu_{ij}) / sigma_{ij}
    where
        mu_{ij} = n_i * n_j / P
        sigma_{ij} = sqrt(mu_{ij} * (1 - n_i/P) * (P - n_j)/(P-1))

    with n_i the column sum for technology i and P the number of rows.

    Parameters
    ----------
    incidence : array-like (P x T)
        Patent × technology (or event × activity) incidence matrix.

    Returns
    -------
    T x T z-score matrix with the diagonal set to 0; a DataFrame labelled
    by the columns of ``incidence`` when it is a DataFrame, else an array.
    """
    labels = incidence.columns if isinstance(incidence, pd.DataFrame) else None

    events = validate_matrix(incidence)
    n_events = events.shape[0]  # P

    per_tech = events.sum(axis=0)  # n_i, shape (T,)
    observed = events.T @ events  # T x T

    n_i = per_tech[:, None]
    n_j = per_tech[None, :]

    expected = (n_i * n_j) / n_events  # mu_{ij}

    partial = expected * (1 - n_i / n_events)
    variance = partial * safe_divide(n_events - n_j, n_events - 1)
    # Clip tiny negative values before the square root.
    spread = np.sqrt(np.maximum(variance, 0))

    z = safe_divide(observed - expected, spread)
    np.fill_diagonal(z, 0.0)

    if labels is None:
        return z
    return pd.DataFrame(z, index=labels, columns=labels)
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Cross-space proximity and relatedness between two different activity spaces.
|
|
3
|
+
|
|
4
|
+
References
|
|
5
|
+
----------
|
|
6
|
+
Catalan et al. (2020) "Cross-space proximity and relatedness".
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
import pandas as pd
|
|
11
|
+
from typing import Union
|
|
12
|
+
|
|
13
|
+
from ..core.utils import validate_matrix, safe_divide, binarize
|
|
14
|
+
from ..core.rca import rca as compute_rca
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def cross_proximity(
    mat_a: Union[np.ndarray, pd.DataFrame],
    mat_b: Union[np.ndarray, pd.DataFrame],
    use_rca: bool = True,
    threshold: float = 1.0,
) -> Union[pd.DataFrame, np.ndarray]:
    """
    Proximity between activities of space A and activities of space B.

    X_phi_{ij} = min(C_{ij}/U_j^B, C_{ij}/U_i^A)
    where C_{ij} = (M_A^T * M_B)_{ij} is the co-presence count and
    U_i^A / U_j^B are the column sums of the binarized matrices.

    Parameters
    ----------
    mat_a : array-like (R x A)
        Region × activity-A matrix.
    mat_b : array-like (R x B)
        Region × activity-B matrix.
    use_rca : bool
        Compute RCA before binarizing for both matrices. When False the
        matrices themselves are binarized at ``threshold``.
    threshold : float
        Binarization threshold.

    Returns
    -------
    A x B cross-proximity matrix. A DataFrame (index = columns of
    ``mat_a``, columns = columns of ``mat_b``) only when both inputs are
    DataFrames; otherwise an array.

    Raises
    ------
    ValueError
        If the two matrices do not have the same number of rows.
    """
    both_labelled = isinstance(mat_a, pd.DataFrame) and isinstance(mat_b, pd.DataFrame)

    values_a = validate_matrix(mat_a)
    values_b = validate_matrix(mat_b)

    if values_a.shape[0] != values_b.shape[0]:
        raise ValueError("mat_a and mat_b must have the same number of rows (locations).")

    if use_rca:
        presence_a = binarize(compute_rca(values_a), threshold)
        presence_b = binarize(compute_rca(values_b), threshold)
    else:
        presence_a = binarize(values_a, threshold)
        presence_b = binarize(values_b, threshold)

    # (A x R) @ (R x B) -> A x B co-presence counts
    co_presence = presence_a.T @ presence_b

    ubiquity_a = presence_a.sum(axis=0)  # length A
    ubiquity_b = presence_b.sum(axis=0)  # length B

    # min(C/U_b, C/U_a) is the same as C / max(U_a, U_b)
    larger_ubiquity = np.maximum(ubiquity_a[:, None], ubiquity_b[None, :])
    result = safe_divide(co_presence, larger_ubiquity)

    if both_labelled:
        return pd.DataFrame(result, index=mat_a.columns, columns=mat_b.columns)
    return result
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def cross_relatedness(
    mat_a: Union[np.ndarray, pd.DataFrame],
    x_phi: Union[np.ndarray, pd.DataFrame],
    use_rca: bool = True,
    threshold: float = 1.0,
) -> Union[pd.DataFrame, np.ndarray]:
    """
    Cross-space relatedness density.

    x_density_{rb} = (M_A * X_phi)_{rb} / sum_a X_phi_{ab}

    For each region r and each activity b of space B: the share of the
    total proximity to b (summed over space-A activities) that comes
    from the space-A activities present in the region.

    Parameters
    ----------
    mat_a : array-like (R x A)
        Region × activity-A value matrix.
    x_phi : array-like (A x B)
        Cross-proximity matrix from `cross_proximity`. Its rows are
        matched to the columns of ``mat_a`` by position.
    use_rca : bool
        Compute RCA before binarizing mat_a. When False ``mat_a`` itself
        is binarized at ``threshold``.
    threshold : float
        Binarization threshold.

    Returns
    -------
    R x B cross-relatedness matrix (columns = activities of space B).
    A DataFrame indexed like ``mat_a`` when ``mat_a`` is a DataFrame
    (column labels taken from ``x_phi`` if that is a DataFrame too);
    otherwise an array.

    Raises
    ------
    ValueError
        If the number of rows of ``x_phi`` differs from the number of
        columns of ``mat_a``.
    """
    phi_is_df = isinstance(x_phi, pd.DataFrame)
    space_b_labels = x_phi.columns if phi_is_df else None

    values_a = validate_matrix(mat_a)
    if phi_is_df:
        phi = x_phi.values
    else:
        phi = np.array(x_phi, dtype=float)

    n_activities = values_a.shape[1]
    n_phi_rows = phi.shape[0]
    if n_activities != n_phi_rows:
        raise ValueError(
            "x_phi must have one row per activity of mat_a "
            f"(mat_a has {n_activities} activities, x_phi has "
            f"{n_phi_rows} rows)."
        )

    source = compute_rca(values_a) if use_rca else values_a
    presence_a = binarize(source, threshold)

    # R x B: proximity mass reachable from the activities each region holds
    reachable = presence_a @ phi

    # 1 x B: total proximity mass per space-B activity (column sums of x_phi)
    total_per_b = phi.sum(axis=0, keepdims=True)

    result = safe_divide(reachable, total_per_b)

    if isinstance(mat_a, pd.DataFrame):
        return pd.DataFrame(result, index=mat_a.index, columns=space_b_labels)
    return result
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
# Short alias matching the documented API.
# Bound to the same function object, so cross_space_proximity(...) behaves
# exactly like cross_proximity(...).
cross_space_proximity = cross_proximity
|