econcomplex 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- econcomplex/__init__.py +220 -0
- econcomplex/complexity/__init__.py +23 -0
- econcomplex/complexity/eci_pci.py +131 -0
- econcomplex/complexity/eigenvector.py +115 -0
- econcomplex/complexity/fitness.py +130 -0
- econcomplex/complexity/reflections.py +173 -0
- econcomplex/complexity/subnational.py +82 -0
- econcomplex/core/__init__.py +23 -0
- econcomplex/core/diversity.py +125 -0
- econcomplex/core/preprocess.py +83 -0
- econcomplex/core/rca.py +161 -0
- econcomplex/core/utils.py +137 -0
- econcomplex/dynamics/__init__.py +10 -0
- econcomplex/dynamics/entry_exit.py +248 -0
- econcomplex/dynamics/growth.py +146 -0
- econcomplex/inequality/__init__.py +11 -0
- econcomplex/inequality/concentration.py +148 -0
- econcomplex/inequality/gini.py +164 -0
- econcomplex/optimization/__init__.py +46 -0
- econcomplex/optimization/diffusion.py +379 -0
- econcomplex/optimization/growth_target.py +170 -0
- econcomplex/optimization/portfolio.py +178 -0
- econcomplex/optimization/steppingstone.py +267 -0
- econcomplex/outlook/__init__.py +6 -0
- econcomplex/outlook/coi_cog.py +168 -0
- econcomplex/patents/__init__.py +7 -0
- econcomplex/patents/recombination.py +135 -0
- econcomplex/pipeline.py +255 -0
- econcomplex/productivity/__init__.py +8 -0
- econcomplex/productivity/prody.py +218 -0
- econcomplex/relatedness/__init__.py +25 -0
- econcomplex/relatedness/cooccurrence.py +173 -0
- econcomplex/relatedness/cross_space.py +142 -0
- econcomplex/relatedness/density.py +232 -0
- econcomplex/relatedness/proximity.py +214 -0
- econcomplex/specialization/__init__.py +17 -0
- econcomplex/specialization/location_quotient.py +163 -0
- econcomplex/specialization/similarity.py +68 -0
- econcomplex-1.0.0.dist-info/METADATA +223 -0
- econcomplex-1.0.0.dist-info/RECORD +43 -0
- econcomplex-1.0.0.dist-info/WHEEL +5 -0
- econcomplex-1.0.0.dist-info/licenses/LICENSE +22 -0
- econcomplex-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Method of Reflections (MOR) for Economic Complexity.
|
|
3
|
+
|
|
4
|
+
References
|
|
5
|
+
----------
|
|
6
|
+
Hidalgo & Hausmann (2009) "The Building Blocks of Economic Complexity".
|
|
7
|
+
Balland & Rigby (2017) for regional adaptation.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
import pandas as pd
|
|
12
|
+
from typing import Optional, Tuple, Union
|
|
13
|
+
|
|
14
|
+
from ..core.utils import validate_matrix, safe_divide, normalize_zscore, binarize
|
|
15
|
+
from ..core.rca import rca as compute_rca
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _binary_presence(mat, use_rca: bool, threshold: float):
    """
    Build the binary presence matrix shared by the MoR variants.

    Parameters
    ----------
    mat : array-like (R x C)
        Input matrix; passed through `validate_matrix`.
    use_rca : bool
        If True, binarize the RCA of the values instead of the values.
    threshold : float
        Threshold handed to `binarize`.

    Returns
    -------
    (m, is_df, row_index, col_index): the binarized matrix, whether `mat`
    was a DataFrame, and its row/column labels (both None otherwise).
    """
    if isinstance(mat, pd.DataFrame):
        is_df, row_index, col_index = True, mat.index, mat.columns
    else:
        is_df, row_index, col_index = False, None, None

    values = validate_matrix(mat)
    # Presence is judged either on RCA or directly on the supplied values.
    source = compute_rca(values) if use_rca else values
    return binarize(source, threshold), is_df, row_index, col_index
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _zscore_change(new, old) -> float:
    """Largest absolute change between two z-scored vectors, ignoring an overall sign flip."""
    z_new = normalize_zscore(new)
    z_old = normalize_zscore(old)
    return min(np.max(np.abs(z_new - z_old)), np.max(np.abs(z_new + z_old)))


def _reflect(m: np.ndarray, iterations: int, tol: Optional[float] = None):
    """
    Iterate the Method of Reflections from the binary matrix `m`:
        k_{r,n} = (M   * k_{c,n-1}) / k_{r,0}
        k_{c,n} = (M^T * k_{r,n-1}) / k_{c,0}

    Parameters
    ----------
    m : np.ndarray (R x C)
        Binary presence matrix.
    iterations : int
        Maximum number of reflection steps.
    tol : float, optional
        If given, stop early once the z-scored row and column reflections
        both change by less than `tol` between two steps. The comparison
        ignores an overall sign flip: consecutive reflections alternate
        between diversity-like and ubiquity-like quantities, so their
        z-scores can settle on vectors that differ only by sign, and a
        plain difference would then never fall below `tol`.

    Returns
    -------
    (kc, kp, kc0, kp0): the last row/column reflections and the initial
    row sums (diversity) and column sums (ubiquity).
    """
    kc0 = m.sum(axis=1)  # diversity
    kp0 = m.sum(axis=0)  # ubiquity
    kc = kc0.copy()
    kp = kp0.copy()
    for _ in range(iterations):
        kc_new = safe_divide(m @ kp, kc0)
        kp_new = safe_divide(m.T @ kc, kp0)
        # Decide on convergence against the previous step before overwriting it.
        converged = (
            tol is not None
            and _zscore_change(kc_new, kc) < tol
            and _zscore_change(kp_new, kp) < tol
        )
        kc, kp = kc_new, kp_new
        if converged:
            break
    return kc, kp, kc0, kp0
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def method_of_reflections(
    mat: Union[np.ndarray, pd.DataFrame],
    use_rca: bool = True,
    threshold: float = 1.0,
    iterations: int = 20,
    return_both: bool = True,
    tol: Optional[float] = 1e-10,
) -> Union[
    pd.Series,
    np.ndarray,
    Tuple[pd.Series, pd.Series],
    Tuple[np.ndarray, np.ndarray],
]:
    """
    Method of Reflections (iterative) to compute ECI and PCI.

    Starting from the binary Mcp matrix:
        k_{r,0} = diversity (row sums)
        k_{c,0} = ubiquity  (col sums)
        k_{r,n} = (M   * k_{c,n-1}) / k_{r,0}
        k_{c,n} = (M^T * k_{r,n-1}) / k_{c,0}
    Final values are z-score normalized.

    Parameters
    ----------
    mat : array-like (R x C)
        Value or pre-binarized matrix.
    use_rca : bool
        Compute RCA internally before binarizing.
    threshold : float
        Binarization threshold.
    iterations : int
        Number of reflection steps (default 20).
    return_both : bool
        If True (default), return (ECI, PCI); if False, return ECI only.
    tol : float, optional
        Convergence tolerance forwarded to the reflection loop, which may
        stop before `iterations` steps. Pass None to always run all
        `iterations`.

    Returns
    -------
    (eci, pci) as pd.Series (or ndarrays if input is ndarray), or `eci`
    alone when `return_both` is False.
    """
    m, is_df, row_index, col_index = _binary_presence(mat, use_rca, threshold)
    kc, kp, kc0, kp0 = _reflect(m, iterations, tol=tol)

    eci = normalize_zscore(kc)
    pci = normalize_zscore(kp)

    # Sign orientation, as in the eigenvector method: ECI correlates
    # positively with diversity, PCI negatively with ubiquity. The std
    # checks skip the correlation when either vector is constant.
    if np.std(eci) > 0 and np.std(kc0) > 0 and np.corrcoef(eci, kc0)[0, 1] < 0:
        eci = -eci
    if np.std(pci) > 0 and np.std(kp0) > 0 and np.corrcoef(pci, kp0)[0, 1] > 0:
        pci = -pci

    if is_df:
        eci = pd.Series(eci, index=row_index, name="eci")
        pci = pd.Series(pci, index=col_index, name="pci")

    # Honour the documented contract: ECI only when return_both is False.
    if not return_both:
        return eci
    return eci, pci
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def mor_regions(
    mat: Union[np.ndarray, pd.DataFrame],
    use_rca: bool = True,
    threshold: float = 1.0,
    steps: int = 20,
) -> Union[pd.Series, np.ndarray]:
    """
    Method of Reflections, region/country side only.

    Runs `steps` reflection steps (no early stopping) and returns the
    row-side reflection:

        step 0  = raw diversity
        step 1  = avg ubiquity of industries present in each region
        step 2+ = higher-order reflections

    For steps > 1 the values are min-max rescaled to 0-100 (all zeros
    when every region has the same value). A DataFrame input yields a
    Series named ``mor_region_step{steps}``; otherwise an ndarray.
    """
    presence, is_df, row_index, _ = _binary_presence(mat, use_rca, threshold)
    region_k = _reflect(presence, steps)[0]

    if steps > 1:
        lo, hi = region_k.min(), region_k.max()
        if hi != lo:
            scores = (region_k - lo) / (hi - lo) * 100
        else:
            scores = np.zeros_like(region_k)
    else:
        # Steps 0 and 1 are returned in their natural units.
        scores = region_k

    if not is_df:
        return scores
    return pd.Series(scores, index=row_index, name=f"mor_region_step{steps}")
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def mor_activities(
    mat: Union[np.ndarray, pd.DataFrame],
    use_rca: bool = True,
    threshold: float = 1.0,
    steps: int = 19,
) -> Union[pd.Series, np.ndarray]:
    """
    Method of Reflections, activity/technology side only.

    Runs `steps` reflection steps (no early stopping) and returns the
    column-side reflection:

        step 0  = raw ubiquity
        step 1+ = higher-order reflections

    For steps > 1 the values are min-max rescaled to 0-100 (all zeros
    when every activity has the same value). A DataFrame input yields a
    Series named ``mor_activity_step{steps}``; otherwise an ndarray.
    """
    presence, is_df, _, col_index = _binary_presence(mat, use_rca, threshold)
    activity_k = _reflect(presence, steps)[1]

    if steps > 1:
        lo, hi = activity_k.min(), activity_k.max()
        if hi != lo:
            scores = (activity_k - lo) / (hi - lo) * 100
        else:
            scores = np.zeros_like(activity_k)
    else:
        # Steps 0 and 1 are returned in their natural units.
        scores = activity_k

    if not is_df:
        return scores
    return pd.Series(scores, index=col_index, name=f"mor_activity_step{steps}")
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Subnational / external ECI using externally supplied PCI.
|
|
3
|
+
|
|
4
|
+
References
|
|
5
|
+
----------
|
|
6
|
+
Balland & Rigby (2017); Mealy et al. (2019).
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
import pandas as pd
|
|
11
|
+
from typing import Tuple, Union
|
|
12
|
+
|
|
13
|
+
from ..core.utils import validate_matrix, binarize, normalize_zscore
|
|
14
|
+
from ..core.rca import rca as compute_rca
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def subnational_eci(
    mat: Union[np.ndarray, pd.DataFrame],
    pci_external: Union[np.ndarray, pd.Series],
    use_rca: bool = True,
    threshold: float = 1.0,
    standardize: bool = True,
) -> Tuple[Union[pd.Series, np.ndarray], Union[pd.Series, np.ndarray]]:
    """
    Subnational ECI using an externally supplied PCI vector.

    Each region's ECI is the mean PCI of the activities present in it:
        ECI_r = mean(PCI_c for c where M_{rc} = 1)
    Regions with no present activity get a raw ECI of 0.

    Useful when comparing subnational units to a global product space,
    or when the dataset is too small to compute reliable eigenvectors.

    Parameters
    ----------
    mat : array-like (R x C)
        Regional value matrix.
    pci_external : array-like (length C)
        External PCI vector (e.g., global product complexity index).
        When `mat` is a DataFrame and this is a Series whose (unique)
        labels are a permutation of `mat.columns`, it is aligned to the
        column order by label; otherwise it is used positionally.
    use_rca : bool
        Compute RCA before binarizing.
    threshold : float
        Binarization threshold.
    standardize : bool
        If True, z-score normalize the resulting ECI.

    Returns
    -------
    (eci, pci_external) as pd.Series or ndarrays.

    Raises
    ------
    ValueError
        If the length of `pci_external` differs from the number of
        columns in `mat`.
    """
    is_df = isinstance(mat, pd.DataFrame)
    row_index = mat.index if is_df else None
    col_index = mat.columns if is_df else None

    arr = validate_matrix(mat)
    m = binarize(compute_rca(arr) if use_rca else arr, threshold)

    # Align a labelled PCI with the matrix columns so that a differently
    # ordered Series does not silently pair the wrong PCI with an activity.
    pci_series = pci_external
    if (
        is_df
        and isinstance(pci_series, pd.Series)
        and len(pci_series) == len(col_index)
        and not pci_series.index.equals(col_index)
        and col_index.is_unique
        and pci_series.index.is_unique
        and col_index.isin(pci_series.index).all()
    ):
        pci_series = pci_series.reindex(col_index)

    pci_arr = np.array(pci_series, dtype=float)
    if len(pci_arr) != m.shape[1]:
        raise ValueError("Length of pci_external must match number of columns in mat.")

    row_sums = np.asarray(m.sum(axis=1))
    pci_totals = np.asarray(m @ pci_arr, dtype=float)
    # Masked divide: rows without any presence keep the 0.0 fill value
    # and never trigger a 0/0 division (and its RuntimeWarning).
    eci_raw = np.divide(
        pci_totals,
        row_sums,
        out=np.zeros_like(pci_totals, dtype=float),
        where=row_sums > 0,
    )

    if standardize:
        eci_raw = normalize_zscore(eci_raw)

    if is_df:
        pci_out = (
            pci_series
            if isinstance(pci_series, pd.Series)
            else pd.Series(pci_arr, index=col_index, name="pci")
        )
        return pd.Series(eci_raw, index=row_index, name="eci_external"), pci_out
    return eci_raw, pci_arr
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from .rca import rca, rpop, mcp
|
|
2
|
+
from .diversity import diversity, ubiquity, normalized_ubiquity
|
|
3
|
+
from .utils import (
|
|
4
|
+
pivot_to_matrix,
|
|
5
|
+
melt_matrix,
|
|
6
|
+
binarize,
|
|
7
|
+
normalize_zscore,
|
|
8
|
+
normalize_01,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"rca",
|
|
13
|
+
"rpop",
|
|
14
|
+
"mcp",
|
|
15
|
+
"diversity",
|
|
16
|
+
"ubiquity",
|
|
17
|
+
"normalized_ubiquity",
|
|
18
|
+
"pivot_to_matrix",
|
|
19
|
+
"melt_matrix",
|
|
20
|
+
"binarize",
|
|
21
|
+
"normalize_zscore",
|
|
22
|
+
"normalize_01",
|
|
23
|
+
]
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Diversity and ubiquity indicators.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
from typing import Union
|
|
8
|
+
|
|
9
|
+
from .utils import validate_matrix, safe_divide
|
|
10
|
+
from .rca import rca as compute_rca
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def diversity(
    mat: Union[np.ndarray, pd.DataFrame],
    use_rca: bool = True,
    threshold: float = 1.0,
) -> pd.Series:
    """
    Diversity: per-region count of activities at or above `threshold`.

    Parameters
    ----------
    mat : array-like (R x C)
        Value matrix or pre-computed binary Mcp matrix.
    use_rca : bool
        If True, the count is taken on RCA values; otherwise on `mat`
        itself.
    threshold : float
        A cell counts as present when its value is >= threshold.

    Returns
    -------
    pd.Series indexed by region for DataFrame input; a plain ndarray of
    row counts otherwise.
    """
    is_df = isinstance(mat, pd.DataFrame)
    region_labels = mat.index if is_df else None

    values = validate_matrix(mat)
    scores = compute_rca(values) if use_rca else values
    # Row-wise number of cells meeting the threshold.
    counts = (scores >= threshold).astype(float).sum(axis=1)

    if not is_df:
        return counts
    return pd.Series(counts, index=region_labels, name="diversity")
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def ubiquity(
    mat: Union[np.ndarray, pd.DataFrame],
    use_rca: bool = True,
    threshold: float = 1.0,
) -> pd.Series:
    """
    Ubiquity: per-activity count of regions at or above `threshold`.

    Parameters
    ----------
    mat : array-like (R x C)
        Value matrix or pre-computed binary Mcp matrix.
    use_rca : bool
        If True, the count is taken on RCA values; otherwise on `mat`
        itself.
    threshold : float
        A cell counts as present when its value is >= threshold.

    Returns
    -------
    pd.Series indexed by activity for DataFrame input; a plain ndarray of
    column counts otherwise.
    """
    is_df = isinstance(mat, pd.DataFrame)
    activity_labels = mat.columns if is_df else None

    values = validate_matrix(mat)
    scores = compute_rca(values) if use_rca else values
    # Column-wise number of cells meeting the threshold.
    counts = (scores >= threshold).astype(float).sum(axis=0)

    if not is_df:
        return counts
    return pd.Series(counts, index=activity_labels, name="ubiquity")
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def normalized_ubiquity(
    mat: Union[np.ndarray, pd.DataFrame],
    threshold: float = 1.0,
) -> pd.Series:
    """
    Normalized ubiquity: an activity's share of total value divided by
    its share of total ubiquity.

        norm_ubiquity_c = (sum_r x_{rc} / sum_{rc} x_{rc}) / (U_c / sum_c U_c)

    Ubiquity is always counted on RCA values binarized at `threshold`.

    Returns
    -------
    pd.Series indexed by activity for DataFrame input; ndarray otherwise.
    """
    is_df = isinstance(mat, pd.DataFrame)
    activity_labels = mat.columns if is_df else None

    values = validate_matrix(mat)
    grand_total = values.sum()

    ub = ubiquity(values, use_rca=True, threshold=threshold)
    ub_arr = ub.values if isinstance(ub, pd.Series) else ub

    # Numerator: share of overall value; denominator: share of ubiquity.
    value_share = values.sum(axis=0) / grand_total
    ubiquity_share = safe_divide(ub_arr, ub_arr.sum())
    ratio = safe_divide(value_share, ubiquity_share)

    if not is_df:
        return ratio
    return pd.Series(ratio, index=activity_labels, name="norm_ubiquity")
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Pre-processing utilities for sparse location-activity matrices.
|
|
3
|
+
|
|
4
|
+
References
|
|
5
|
+
----------
|
|
6
|
+
Hidalgo & Hausmann (2009); OEC Atlas methodology.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
import pandas as pd
|
|
11
|
+
from typing import Union
|
|
12
|
+
|
|
13
|
+
from .utils import validate_matrix, binarize
|
|
14
|
+
from .rca import rca as compute_rca
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _drop_empty(frame: pd.DataFrame) -> pd.DataFrame:
    """Drop rows and columns whose sum is not positive, selecting by position."""
    row_mask = (frame.sum(axis=1) > 0).to_numpy()
    col_mask = (frame.sum(axis=0) > 0).to_numpy()
    return frame.iloc[row_mask, col_mask]


def trim_core(
    mat: Union[np.ndarray, pd.DataFrame],
    dmin: int = 1,
    umin: int = 1,
    use_rca: bool = True,
    threshold: float = 1.0,
    max_iter: int = 100,
) -> Union[np.ndarray, pd.DataFrame]:
    """
    Iteratively prune a value matrix to its (dmin, umin)-core.

    Drops locations (rows) with diversity < dmin and activities (columns)
    with ubiquity < umin, where diversity/ubiquity are counted on the
    binary presence matrix (RCA binarized at `threshold` by default).
    Because removing a row or column changes the RCA of the remaining
    cells, the presence matrix is recomputed after each pass until the
    matrix is stable or `max_iter` passes have run.

    With the defaults (dmin=1, umin=1) only degenerate units are removed:
    locations with no activity above the RCA threshold and activities
    with no location above it. For very sparse networks (e.g. subnational
    trade data) the literature computes complexity on the well-connected
    core, typically dmin=2, umin=2 (Hidalgo & Hausmann 2009; OEC Atlas).

    Parameters
    ----------
    mat : array-like (R x C)
        Value matrix.
    dmin : int
        Minimum diversity for a location to be kept (default 1).
    umin : int
        Minimum ubiquity for an activity to be kept (default 1).
    use_rca : bool
        Compute RCA before binarizing (default True).
    threshold : float
        Binarization threshold (default 1.0).
    max_iter : int
        Safety cap on pruning passes.

    Returns
    -------
    The surviving sub-matrix, same type as the input. With a DataFrame
    the original labels are preserved, so dropped units can be recovered
    by reindexing (`result.reindex(mat.index)`).
    """
    is_df = isinstance(mat, pd.DataFrame)
    work = mat.astype(float) if is_df else pd.DataFrame(validate_matrix(mat))

    # All-zero rows/columns first (always degenerate, whatever the core)
    work = _drop_empty(work)

    for _ in range(max_iter):
        if work.shape[0] == 0 or work.shape[1] == 0:
            break
        values = work.values
        m = binarize(compute_rca(values) if use_rca else values, threshold)
        keep_r = np.asarray(m.sum(axis=1) >= dmin, dtype=bool)
        keep_c = np.asarray(m.sum(axis=0) >= umin, dtype=bool)
        if keep_r.all() and keep_c.all():
            break
        # Positional masks: label-based selection would repeat rows or
        # columns whenever the index contains duplicate labels.
        work = _drop_empty(work.iloc[keep_r, keep_c])

    return work if is_df else work.values
|
econcomplex/core/rca.py
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Revealed Comparative Advantage (RCA) / Balassa Index and variants.
|
|
3
|
+
|
|
4
|
+
References
|
|
5
|
+
----------
|
|
6
|
+
Balassa (1965); Hausmann et al. (2007); Balland & Rigby (2017).
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
import pandas as pd
|
|
11
|
+
from typing import Optional, Union
|
|
12
|
+
|
|
13
|
+
from .utils import validate_matrix, safe_divide, binarize
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def rca(
    mat: Union[np.ndarray, pd.DataFrame],
    binary: bool = False,
    threshold: float = 1.0,
) -> Union[np.ndarray, pd.DataFrame]:
    """
    Revealed Comparative Advantage (Balassa Index).

        RCA_{rc} = (x_{rc} / sum_c x_{rc}) / (sum_r x_{rc} / sum_{rc} x_{rc})

    Parameters
    ----------
    mat : array-like (R x C)
        Region x industry (or country x product) value matrix.
    binary : bool
        If True, return the matrix binarized at `threshold`.
    threshold : float
        Binarization threshold (default 1.0).

    Returns
    -------
    Same type as input (DataFrame or ndarray).

    Raises
    ------
    ValueError
        If the matrix sums to zero.
    """
    is_df = isinstance(mat, pd.DataFrame)
    labels = (mat.index, mat.columns) if is_df else None

    values = validate_matrix(mat)
    grand_total = values.sum()
    if grand_total == 0:
        raise ValueError("Matrix sum is zero; cannot compute RCA.")

    region_totals = values.sum(axis=1, keepdims=True)    # R x 1
    activity_totals = values.sum(axis=0, keepdims=True)  # 1 x C

    # Share of activity c within region r, relative to its overall share.
    within_region_share = safe_divide(values, region_totals)
    overall_share = activity_totals / grand_total
    balassa = safe_divide(within_region_share, overall_share)

    if binary:
        balassa = binarize(balassa, threshold)

    if not is_df:
        return balassa
    return pd.DataFrame(balassa, index=labels[0], columns=labels[1])
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def rpop(
    mat: Union[np.ndarray, pd.DataFrame],
    pop: Union[np.ndarray, pd.Series],
    binary: bool = False,
    threshold: float = 1.0,
) -> Union[np.ndarray, pd.DataFrame]:
    """
    Population-normalized RCA (RPOP).

        RPOP_{rc} = (x_{rc} / pop_r) / (sum_r x_{rc} / sum_r pop_r)

    Parameters
    ----------
    mat : array-like (R x C)
        Region x industry value matrix.
    pop : 1-D array-like (length R)
        Population vector for each region; used positionally, in the row
        order of `mat`.
    binary : bool
        If True, binarize at `threshold`.
    threshold : float
        Binarization threshold (default 1.0).

    Returns
    -------
    Same type as input (DataFrame or ndarray).

    Raises
    ------
    ValueError
        If `pop` does not have one entry per row of `mat`, or if the
        total population is zero.
    """
    is_df = isinstance(mat, pd.DataFrame)
    index = mat.index if is_df else None
    columns = mat.columns if is_df else None

    arr = validate_matrix(mat)
    pop_arr = np.array(pop, dtype=float).reshape(-1, 1)  # R x 1

    if len(pop_arr) != arr.shape[0]:
        raise ValueError("Length of pop must equal number of rows in mat.")

    world_pop = pop_arr.sum()
    # Mirror the zero-total guard in `rca`: the reference share below is
    # undefined when the total population is zero.
    if world_pop == 0:
        raise ValueError("Total population is zero; cannot compute RPOP.")

    col_sums = arr.sum(axis=0, keepdims=True)  # 1 x C

    share_rc = safe_divide(arr, pop_arr)  # x_{rc} / pop_r
    share_c = col_sums / world_pop        # X_c / world_pop

    result = safe_divide(share_rc, share_c)

    if binary:
        result = binarize(result, threshold)

    if is_df:
        return pd.DataFrame(result, index=index, columns=columns)
    return result
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def mcp(
    mat: Union[np.ndarray, pd.DataFrame],
    presence_test: str = "rca",
    rca_threshold: float = 1.0,
    rpop_threshold: float = 1.0,
    pop: Optional[Union[np.ndarray, pd.Series]] = None,
) -> Union[np.ndarray, pd.DataFrame]:
    """
    Binary presence matrix (Mcp).

    Parameters
    ----------
    mat : array-like (R x C)
        Raw value matrix.
    presence_test : str
        One of 'rca', 'rpop', 'both' (present if either the RCA or the
        RPOP test passes) or 'manual' (treat `mat` as already binary; it
        is binarized at 0.5).
    rca_threshold : float
        RCA binarization threshold.
    rpop_threshold : float
        RPOP binarization threshold.
    pop : array-like (length R), optional
        Required when presence_test is 'rpop' or 'both'.

    Returns
    -------
    DataFrame with the labels of `mat` for DataFrame input; otherwise
    whatever the binarization step produced.

    Raises
    ------
    ValueError
        For an unknown `presence_test`, or when `pop` is missing for
        'rpop' / 'both'.
    """
    if presence_test not in ("manual", "rca", "rpop", "both"):
        raise ValueError(
            "presence_test must be one of 'rca', 'rpop', 'both', 'manual'."
        )

    pop_required = {
        "rpop": "pop is required for presence_test='rpop'.",
        "both": "pop is required for presence_test='both'.",
    }
    if presence_test in pop_required and pop is None:
        raise ValueError(pop_required[presence_test])

    if presence_test == "manual":
        presence = binarize(mat, 0.5)
    else:
        by_rca = None
        by_rpop = None
        if presence_test in ("rca", "both"):
            by_rca = binarize(rca(mat), rca_threshold)
        if presence_test in ("rpop", "both"):
            by_rpop = binarize(rpop(mat, pop), rpop_threshold)

        if by_rca is None:
            presence = by_rpop
        elif by_rpop is None:
            presence = by_rca
        else:
            # Union of the two tests: present when either one fires.
            presence = ((by_rca + by_rpop) > 0).astype(float)

    if not isinstance(mat, pd.DataFrame):
        return presence
    return pd.DataFrame(presence, index=mat.index, columns=mat.columns)
|