econcomplex 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- econcomplex/__init__.py +220 -0
- econcomplex/complexity/__init__.py +23 -0
- econcomplex/complexity/eci_pci.py +131 -0
- econcomplex/complexity/eigenvector.py +115 -0
- econcomplex/complexity/fitness.py +130 -0
- econcomplex/complexity/reflections.py +173 -0
- econcomplex/complexity/subnational.py +82 -0
- econcomplex/core/__init__.py +23 -0
- econcomplex/core/diversity.py +125 -0
- econcomplex/core/preprocess.py +83 -0
- econcomplex/core/rca.py +161 -0
- econcomplex/core/utils.py +137 -0
- econcomplex/dynamics/__init__.py +10 -0
- econcomplex/dynamics/entry_exit.py +248 -0
- econcomplex/dynamics/growth.py +146 -0
- econcomplex/inequality/__init__.py +11 -0
- econcomplex/inequality/concentration.py +148 -0
- econcomplex/inequality/gini.py +164 -0
- econcomplex/optimization/__init__.py +46 -0
- econcomplex/optimization/diffusion.py +379 -0
- econcomplex/optimization/growth_target.py +170 -0
- econcomplex/optimization/portfolio.py +178 -0
- econcomplex/optimization/steppingstone.py +267 -0
- econcomplex/outlook/__init__.py +6 -0
- econcomplex/outlook/coi_cog.py +168 -0
- econcomplex/patents/__init__.py +7 -0
- econcomplex/patents/recombination.py +135 -0
- econcomplex/pipeline.py +255 -0
- econcomplex/productivity/__init__.py +8 -0
- econcomplex/productivity/prody.py +218 -0
- econcomplex/relatedness/__init__.py +25 -0
- econcomplex/relatedness/cooccurrence.py +173 -0
- econcomplex/relatedness/cross_space.py +142 -0
- econcomplex/relatedness/density.py +232 -0
- econcomplex/relatedness/proximity.py +214 -0
- econcomplex/specialization/__init__.py +17 -0
- econcomplex/specialization/location_quotient.py +163 -0
- econcomplex/specialization/similarity.py +68 -0
- econcomplex-1.0.0.dist-info/METADATA +223 -0
- econcomplex-1.0.0.dist-info/RECORD +43 -0
- econcomplex-1.0.0.dist-info/WHEEL +5 -0
- econcomplex-1.0.0.dist-info/licenses/LICENSE +22 -0
- econcomplex-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Utility functions for matrix/dataframe handling.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
from typing import Union
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def pivot_to_matrix(
    df: pd.DataFrame,
    index: str,
    columns: str,
    values: str,
    fill_value: float = 0.0,
) -> pd.DataFrame:
    """Reshape a long-format table into a wide ``index`` x ``columns`` matrix.

    Parameters
    ----------
    df : pd.DataFrame
        Long-format data.
    index, columns, values : str
        Column names used for the row labels, the column labels and the
        cell values of the resulting matrix.
    fill_value : float
        Value used for (index, columns) pairs that do not occur in ``df``.

    Returns
    -------
    pd.DataFrame
        Wide matrix; duplicate (index, columns) pairs are summed.
    """
    wide = pd.pivot_table(
        df,
        values=values,
        index=index,
        columns=columns,
        aggfunc="sum",
        fill_value=fill_value,
    )
    return wide
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def melt_matrix(
    mat: pd.DataFrame,
    index_name: str = "location",
    columns_name: str = "activity",
    values_name: str = "value",
) -> pd.DataFrame:
    """Unpivot a wide matrix into a long-format DataFrame.

    Parameters
    ----------
    mat : pd.DataFrame
        Wide matrix; the input is copied, not modified.
    index_name : str
        Name given to the row-label column, used only when the index of
        ``mat`` has no name. A pre-existing index name takes precedence.
    columns_name : str
        Name of the output column holding the former column labels.
    values_name : str
        Name of the output column holding the cell values.

    Returns
    -------
    pd.DataFrame
        One row per (row label, column label) pair.
    """
    wide = mat.copy()

    # reset_index() needs a named index to produce a usable id column.
    id_column = wide.index.name
    if id_column is None:
        id_column = index_name
        wide.index.name = id_column

    # Drop the columns' axis name so it does not leak into the output.
    wide.columns.name = None

    flat = wide.reset_index()
    long_table = flat.melt(
        id_vars=id_column,
        var_name=columns_name,
        value_name=values_name,
    )
    return long_table
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def validate_matrix(mat: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
    """Coerce the input to a 2-D float numpy array.

    Parameters
    ----------
    mat : np.ndarray, pd.DataFrame or other array-like
        Matrix-like input.

    Returns
    -------
    np.ndarray
        Float copy of the input values.

    Raises
    ------
    ValueError
        If the converted array is not 2-dimensional.
    """
    if isinstance(mat, (pd.DataFrame, np.ndarray)):
        raw = mat.values if isinstance(mat, pd.DataFrame) else mat
        matrix = raw.astype(float)
    else:
        # Generic array-likes (e.g. nested lists) go through np.array.
        matrix = np.array(mat, dtype=float)

    if matrix.ndim == 2:
        return matrix
    raise ValueError("Input must be a 2-D matrix.")
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def binarize(mat: Union[np.ndarray, pd.DataFrame], threshold: float = 1.0) -> np.ndarray:
    """Threshold a matrix into a 0/1 float array.

    Parameters
    ----------
    mat : array-like (R x C)
        Input matrix; validated with ``validate_matrix``.
    threshold : float
        Cells with a value greater than or equal to this become 1.

    Returns
    -------
    np.ndarray
        Float array of 1.0 where ``mat >= threshold`` and 0.0 elsewhere.
    """
    values = validate_matrix(mat)
    at_or_above = values >= threshold
    return np.where(at_or_above, 1.0, 0.0)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def safe_divide(numerator: np.ndarray, denominator: np.ndarray) -> np.ndarray:
    """Element-wise division that yields 0 wherever the denominator is 0.

    Parameters
    ----------
    numerator, denominator : array-like
        Operands; they are broadcast against each other. Plain Python
        scalars are accepted as well.

    Returns
    -------
    np.ndarray
        ``numerator / denominator`` where ``denominator != 0`` and 0.0
        elsewhere (0-d array for scalar inputs).
    """
    # Coerce first: with plain Python scalars ``1.0 / 0.0`` would raise
    # ZeroDivisionError before np.where ever gets to mask the result.
    num = np.asarray(numerator)
    den = np.asarray(denominator)

    # The division is evaluated everywhere and masked afterwards, so the
    # divide-by-zero / invalid warnings are expected and silenced.
    with np.errstate(divide="ignore", invalid="ignore"):
        return np.where(den != 0, num / den, 0.0)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def normalize_zscore(vec: np.ndarray) -> np.ndarray:
    """Z-score normalize a 1-D vector.

    Parameters
    ----------
    vec : np.ndarray
        Input vector.

    Returns
    -------
    np.ndarray
        ``(vec - mean) / std`` using the population standard deviation
        (``np.std`` default). A vector with zero standard deviation maps
        to an all-zero floating-point vector.
    """
    std = np.std(vec)
    if std == 0:
        # Return floating-point zeros: an integer input would otherwise
        # produce integer zeros here while every other path yields floats.
        arr = np.asarray(vec)
        is_inexact = np.issubdtype(arr.dtype, np.inexact)
        return np.zeros(arr.shape, dtype=arr.dtype if is_inexact else np.float64)
    return (vec - np.mean(vec)) / std
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def normalize_01(vec: np.ndarray) -> np.ndarray:
    """Min-max normalize a 1-D vector to [0, 1].

    Parameters
    ----------
    vec : np.ndarray
        Input vector.

    Returns
    -------
    np.ndarray
        ``(vec - min) / (max - min)``. A constant vector maps to an
        all-zero floating-point vector.
    """
    low = vec.min()
    span = vec.max() - low
    if span == 0:
        # Return floating-point zeros: an integer input would otherwise
        # produce integer zeros here while every other path yields floats.
        arr = np.asarray(vec)
        is_inexact = np.issubdtype(arr.dtype, np.inexact)
        return np.zeros(arr.shape, dtype=arr.dtype if is_inexact else np.float64)
    return (vec - low) / span
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def make_sample_data(
    n_locs: int = 50,
    n_acts: int = 30,
    seed: int = 42,
    loc_col: str = "loc",
    act_col: str = "act",
    val_col: str = "val",
) -> pd.DataFrame:
    """
    Build a synthetic long-format location-activity dataset.

    The generated matrix is nested (roughly triangular): locations with
    high capability are present in many activities, including demanding
    ones, while low-capability locations are mostly present in activities
    with a low requirement.

    Parameters
    ----------
    n_locs : int
        Number of locations (rows).
    n_acts : int
        Number of activities (columns).
    seed : int
        Seed for ``np.random.default_rng``; makes the output reproducible.
    loc_col, act_col, val_col : str
        Column names of the returned DataFrame.

    Returns
    -------
    pd.DataFrame
        Long format with columns [loc_col, act_col, val_col], restricted
        to rows whose value is strictly positive.
    """
    generator = np.random.default_rng(seed)

    # NOTE: the order of the random draws below determines the output for
    # a given seed; keep it as capability, requirement, presence, amounts.
    capability = np.sort(generator.uniform(0.1, 1.0, n_locs))[::-1]  # descending
    requirement = np.sort(generator.uniform(0.0, 0.9, n_acts))  # ascending

    # Logistic presence probability in the capability-requirement gap.
    gap = capability[:, None] - requirement[None, :]
    presence_prob = 1 / (1 + np.exp(-10 * gap))
    present = generator.random((n_locs, n_acts)) < presence_prob

    amounts = generator.lognormal(mean=3.0, sigma=1.0, size=(n_locs, n_acts))
    values = present * amounts

    loc_labels = [f"L{i + 1:03d}" for i in range(n_locs)]
    act_labels = [f"A{j + 1:03d}" for j in range(n_acts)]
    wide = pd.DataFrame(values, index=loc_labels, columns=act_labels)

    long_table = melt_matrix(wide, loc_col, act_col, val_col)
    positive = long_table[val_col] > 0
    return long_table[positive].reset_index(drop=True)
|
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Industry entry and exit tracking across time periods.
|
|
3
|
+
|
|
4
|
+
References
|
|
5
|
+
----------
|
|
6
|
+
Balland & Rigby (2017); EconGeo (R package).
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
import pandas as pd
|
|
11
|
+
from typing import List, Union
|
|
12
|
+
|
|
13
|
+
from ..core.utils import validate_matrix, binarize, melt_matrix, pivot_to_matrix
|
|
14
|
+
from ..core.rca import rca as compute_rca
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _panel_to_matrices(
    df: pd.DataFrame, loc: str, act: str, val: str, time: str
) -> List[pd.DataFrame]:
    """Split a long-format panel into one matrix per period, all aligned.

    Each period is pivoted separately, then every matrix is reindexed to
    the union of row and column labels seen in any period (missing cells
    are filled with 0.0). Matrices are returned in sorted period order.

    Raises
    ------
    ValueError
        If ``df[time]`` contains fewer than 2 distinct periods.
    """
    periods = sorted(df[time].unique())
    if len(periods) < 2:
        raise ValueError("Need at least 2 time periods.")

    snapshots = []
    all_rows = None
    all_cols = None
    for period in periods:
        wide = pivot_to_matrix(df[df[time] == period], loc, act, val)
        snapshots.append(wide)
        # Accumulate the label universe across periods.
        all_rows = wide.index if all_rows is None else all_rows.union(wide.index)
        all_cols = wide.columns if all_cols is None else all_cols.union(wide.columns)

    aligned = []
    for wide in snapshots:
        aligned.append(wide.reindex(index=all_rows, columns=all_cols, fill_value=0.0))
    return aligned
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def entry(
    mats: List[Union[np.ndarray, pd.DataFrame]],
    use_rca: bool = True,
    threshold: float = 1.0,
) -> Union[pd.DataFrame, np.ndarray]:
    """
    Industry entry matrix: 1 when a (region, activity) transitions from
    absent (M=0) to present (M=1) between consecutive time periods.

    Matrices are compared positionally (cell by cell); DataFrame labels of
    later periods are not used for alignment, so callers must pass
    matrices that share the same row/column order.

    Parameters
    ----------
    mats : list of array-like (R x C)
        Ordered list of value matrices (at least 2), all the same shape.
    use_rca : bool
        Pass each matrix through ``core.rca.rca`` before binarizing.
    threshold : float
        Binarization threshold (present when value >= threshold).

    Returns
    -------
    R x C entry matrix (1 = entry event occurred in at least one period).
    A DataFrame labelled like ``mats[0]`` when ``mats[0]`` is a DataFrame,
    otherwise an ndarray.

    Raises
    ------
    ValueError
        If fewer than 2 matrices are given or their shapes differ.
    """
    if len(mats) < 2:
        raise ValueError("Need at least 2 matrices to detect entry.")

    first = mats[0]
    is_df = isinstance(first, pd.DataFrame)

    def _bin(m):
        arr = validate_matrix(m)
        if use_rca:
            return binarize(compute_rca(arr), threshold)
        return binarize(arr, threshold)

    binary = [_bin(m) for m in mats]

    # Without this check a (1, C) or (R, 1) matrix would silently
    # broadcast against the others and fabricate transitions.
    expected_shape = binary[0].shape
    if any(b.shape != expected_shape for b in binary[1:]):
        raise ValueError("All matrices must have the same shape.")

    result = np.zeros_like(binary[0])
    for prev, curr in zip(binary[:-1], binary[1:]):
        entered = ((prev == 0) & (curr == 1)).astype(float)
        # Element-wise maximum == logical OR across period transitions.
        result = np.maximum(result, entered)

    if is_df:
        return pd.DataFrame(result, index=first.index, columns=first.columns)
    return result
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def exit(
    mats: List[Union[np.ndarray, pd.DataFrame]],
    use_rca: bool = True,
    threshold: float = 1.0,
) -> Union[pd.DataFrame, np.ndarray]:
    """
    Industry exit matrix: 1 when a (region, activity) transitions from
    present (M=1) to absent (M=0) between consecutive time periods.

    Matrices are compared positionally (cell by cell); DataFrame labels of
    later periods are not used for alignment, so callers must pass
    matrices that share the same row/column order.

    Parameters
    ----------
    mats : list of array-like (R x C)
        Ordered list of value matrices (at least 2), all the same shape.
    use_rca : bool
        Pass each matrix through ``core.rca.rca`` before binarizing.
    threshold : float
        Binarization threshold (present when value >= threshold).

    Returns
    -------
    R x C exit matrix (1 = exit event occurred in at least one period).
    A DataFrame labelled like ``mats[0]`` when ``mats[0]`` is a DataFrame,
    otherwise an ndarray.

    Raises
    ------
    ValueError
        If fewer than 2 matrices are given or their shapes differ.
    """
    if len(mats) < 2:
        raise ValueError("Need at least 2 matrices to detect exit.")

    first = mats[0]
    is_df = isinstance(first, pd.DataFrame)

    def _bin(m):
        arr = validate_matrix(m)
        if use_rca:
            return binarize(compute_rca(arr), threshold)
        return binarize(arr, threshold)

    binary = [_bin(m) for m in mats]

    # Without this check a (1, C) or (R, 1) matrix would silently
    # broadcast against the others and fabricate transitions.
    expected_shape = binary[0].shape
    if any(b.shape != expected_shape for b in binary[1:]):
        raise ValueError("All matrices must have the same shape.")

    result = np.zeros_like(binary[0])
    for prev, curr in zip(binary[:-1], binary[1:]):
        exited = ((prev == 1) & (curr == 0)).astype(float)
        # Element-wise maximum == logical OR across period transitions.
        result = np.maximum(result, exited)

    if is_df:
        return pd.DataFrame(result, index=first.index, columns=first.columns)
    return result
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def entry_exit_summary(
    mats: List[Union[np.ndarray, pd.DataFrame]],
    use_rca: bool = True,
    threshold: float = 1.0,
) -> pd.DataFrame:
    """
    Summary of entry and exit events per (region, activity) pair.

    Returns a long-format DataFrame with columns:
    - location, activity, n_entries, n_exits, net_change

    Only pairs with at least one entry or exit event are listed, in
    row-major order of the input matrix. When no event occurred the result
    is an empty DataFrame that still carries the five columns above.

    Matrices are compared positionally; labels are taken from ``mats[0]``
    when it is a DataFrame, otherwise integer positions are used.

    Parameters
    ----------
    mats : list of array-like (R x C)
        Ordered list of value matrices (at least 2), all the same shape.
    use_rca : bool
        Pass each matrix through ``core.rca.rca`` before binarizing.
    threshold : float
        Binarization threshold (present when value >= threshold).

    Raises
    ------
    ValueError
        If fewer than 2 matrices are given or their shapes differ.
    """
    if len(mats) < 2:
        raise ValueError("Need at least 2 matrices.")

    def _bin(m):
        arr = validate_matrix(m)
        if use_rca:
            return binarize(compute_rca(arr), threshold)
        return binarize(arr, threshold)

    binary = [_bin(m) for m in mats]

    # Take the shape from the validated arrays so that plain nested lists
    # (which have no ``.shape``) are supported like in entry()/exit().
    n_r, n_c = binary[0].shape
    if any(b.shape != (n_r, n_c) for b in binary[1:]):
        raise ValueError("All matrices must have the same shape.")

    first = mats[0]
    if isinstance(first, pd.DataFrame):
        row_labels = np.asarray(first.index)
        col_labels = np.asarray(first.columns)
    else:
        row_labels = np.arange(n_r)
        col_labels = np.arange(n_c)

    n_entries = np.zeros((n_r, n_c), dtype=np.int64)
    n_exits = np.zeros((n_r, n_c), dtype=np.int64)
    for prev, curr in zip(binary[:-1], binary[1:]):
        n_entries += (prev == 0) & (curr == 1)
        n_exits += (prev == 1) & (curr == 0)

    # np.nonzero yields indices in row-major order, i.e. the same order a
    # nested "for row: for column:" loop would visit the cells.
    ii, jj = np.nonzero((n_entries > 0) | (n_exits > 0))
    entries = n_entries[ii, jj]
    exits = n_exits[ii, jj]

    return pd.DataFrame(
        {
            "location": row_labels[ii],
            "activity": col_labels[jj],
            "n_entries": entries,
            "n_exits": exits,
            "net_change": entries - exits,
        },
        columns=["location", "activity", "n_entries", "n_exits", "net_change"],
    )
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def entry_tracking(
    df: pd.DataFrame,
    loc: str,
    act: str,
    val: str,
    time: str,
    use_rca: bool = True,
    threshold: float = 1.0,
) -> pd.DataFrame:
    """
    Panel-data front end for `entry`, taking and returning long format.

    The panel is split into one aligned matrix per time period, `entry`
    is applied to that sequence, and the resulting matrix is melted back.

    Parameters
    ----------
    df : pd.DataFrame
        Long-format data.
    loc, act, val, time : str
        Column names for location, activity, value, and time period.
    use_rca : bool
        Compute RCA before binarizing.
    threshold : float
        Binarization threshold.

    Returns
    -------
    pd.DataFrame with columns [loc, act, 'entry'] (entry = 1 when the
    pair entered in at least one period transition).
    """
    period_matrices = _panel_to_matrices(df, loc, act, val, time)
    entered = entry(period_matrices, use_rca=use_rca, threshold=threshold)
    long_table = melt_matrix(
        entered,
        index_name=loc,
        columns_name=act,
        values_name="entry",
    )
    return long_table
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def exit_tracking(
    df: pd.DataFrame,
    loc: str,
    act: str,
    val: str,
    time: str,
    use_rca: bool = True,
    threshold: float = 1.0,
) -> pd.DataFrame:
    """
    Panel-data front end for `exit`, taking and returning long format.

    The panel is split into one aligned matrix per time period, `exit`
    is applied to that sequence, and the resulting matrix is melted back.

    Parameters
    ----------
    df : pd.DataFrame
        Long-format data.
    loc, act, val, time : str
        Column names for location, activity, value, and time period.
    use_rca : bool
        Compute RCA before binarizing.
    threshold : float
        Binarization threshold.

    Returns
    -------
    pd.DataFrame with columns [loc, act, 'exit'] (exit = 1 when the
    pair exited in at least one period transition).
    """
    period_matrices = _panel_to_matrices(df, loc, act, val, time)
    exited = exit(period_matrices, use_rca=use_rca, threshold=threshold)
    long_table = melt_matrix(
        exited,
        index_name=loc,
        columns_name=act,
        values_name="exit",
    )
    return long_table
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Regional and industry growth rates.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
from typing import Union
|
|
8
|
+
|
|
9
|
+
from ..core.utils import validate_matrix, safe_divide, pivot_to_matrix
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def growth_rate(
    mat1: Union[np.ndarray, pd.DataFrame],
    mat2: Union[np.ndarray, pd.DataFrame],
    axis: int = 0,
    pct: bool = True,
) -> Union[pd.Series, np.ndarray]:
    """
    Growth rate between two time periods.

    g = (sum_t2 - sum_t1) / sum_t1  [* 100 if pct]

    Units whose period-1 total is zero get a growth of 0 (the convention
    of ``safe_divide``), not NaN or infinity.

    Parameters
    ----------
    mat1 : array-like (R x C)
        Matrix for time period 1.
    mat2 : array-like (R x C)
        Matrix for time period 2 (same shape as mat1). The two matrices
        are compared positionally; labels of ``mat2`` are not used.
    axis : int
        0 = aggregate over columns (regional growth), rows returned.
        1 = aggregate over rows (industry growth), columns returned.
    pct : bool
        If True, multiply by 100 to return percentage.

    Returns
    -------
    pd.Series (labelled from ``mat1``) when ``mat1`` is a DataFrame,
    otherwise an ndarray.

    Raises
    ------
    ValueError
        If ``axis`` is not 0 or 1, or if the matrices differ in shape.
    """
    # Any other value would either fail deep inside numpy or, for axis=2,
    # silently sum over the wrong axis and mislabel the result.
    if axis not in (0, 1):
        raise ValueError("axis must be 0 (regions) or 1 (industries).")

    is_df = isinstance(mat1, pd.DataFrame)

    arr1 = validate_matrix(mat1)
    arr2 = validate_matrix(mat2)

    if arr1.shape != arr2.shape:
        raise ValueError("mat1 and mat2 must have the same shape.")

    # Summing over the *opposite* numpy axis leaves one total per unit of
    # the requested axis (axis=0 -> one total per row).
    sum_axis = 1 - axis
    sum1 = arr1.sum(axis=sum_axis)
    sum2 = arr2.sum(axis=sum_axis)

    g = safe_divide(sum2 - sum1, sum1)
    if pct:
        g = g * 100

    if is_df:
        index = mat1.index if axis == 0 else mat1.columns
        name = "growth_region" if axis == 0 else "growth_industry"
        return pd.Series(g, index=index, name=name)
    return g
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def growth_matrix(
    mat1: Union[np.ndarray, pd.DataFrame],
    mat2: Union[np.ndarray, pd.DataFrame],
    pct: bool = True,
) -> Union[pd.DataFrame, np.ndarray]:
    """
    Cell-by-cell growth rate between two matrices.

    g_{rc} = (x_{rc,t2} - x_{rc,t1}) / x_{rc,t1}

    Cells whose period-1 value is zero get a growth of 0 (the convention
    of ``safe_divide``).

    Parameters
    ----------
    mat1, mat2 : array-like (R x C)
        Matrices for period 1 and period 2; must have the same shape.
    pct : bool
        Multiply by 100.

    Returns
    -------
    R x C growth rate matrix; a DataFrame labelled like ``mat1`` when
    ``mat1`` is a DataFrame, otherwise an ndarray.

    Raises
    ------
    ValueError
        If the two matrices differ in shape.
    """
    base = validate_matrix(mat1)
    later = validate_matrix(mat2)
    if base.shape != later.shape:
        raise ValueError("mat1 and mat2 must have the same shape.")

    growth = safe_divide(later - base, base)
    if pct:
        growth = growth * 100

    if not isinstance(mat1, pd.DataFrame):
        return growth
    return pd.DataFrame(growth, index=mat1.index, columns=mat1.columns)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def growth_rates(
    df: pd.DataFrame,
    loc: str,
    act: str,
    val: str,
    time: str,
    axis: int = 0,
    pct: bool = True,
) -> pd.DataFrame:
    """
    Long-format wrapper around `growth_rate` for panel data.

    Computes the growth rate between each pair of consecutive time periods.
    For every pair the two period matrices are aligned on the union of
    their locations and activities, missing cells counting as 0.

    Parameters
    ----------
    df : pd.DataFrame
        Long-format data.
    loc, act, val, time : str
        Column names for location, activity, value, and time period.
    axis : int
        0 = growth per location, 1 = growth per activity.
    pct : bool
        If True, return percentage.

    Returns
    -------
    pd.DataFrame with columns [unit, time, 'growth'], where unit is `loc`
    (axis=0) or `act` (axis=1) and `time` marks the end of each interval.

    Raises
    ------
    ValueError
        If fewer than 2 time periods are present or ``axis`` is not 0 or 1.
    """
    periods = sorted(df[time].unique())
    if len(periods) < 2:
        raise ValueError("Need at least 2 time periods to compute growth.")
    if axis not in (0, 1):
        raise ValueError("axis must be 0 (per location) or 1 (per activity).")

    unit_col = loc if axis == 0 else act
    out = []

    # Pivot each period exactly once: the matrix of period t is the "end"
    # of one interval and the "start" of the next. The un-reindexed pivot
    # is carried forward so each interval is aligned only on the labels of
    # its own two periods.
    prev = pivot_to_matrix(df[df[time] == periods[0]], loc, act, val)
    for t2 in periods[1:]:
        curr = pivot_to_matrix(df[df[time] == t2], loc, act, val)
        rows = prev.index.union(curr.index)
        cols = prev.columns.union(curr.columns)
        m1 = prev.reindex(index=rows, columns=cols, fill_value=0.0)
        m2 = curr.reindex(index=rows, columns=cols, fill_value=0.0)
        g = growth_rate(m1, m2, axis=axis, pct=pct)
        out.append(pd.DataFrame({unit_col: g.index, time: t2, "growth": g.values}))
        prev = curr

    return pd.concat(out, ignore_index=True)
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Concentration and dispersion measures.
|
|
3
|
+
|
|
4
|
+
References
|
|
5
|
+
----------
|
|
6
|
+
Herfindahl-Hirschman (1945/1950); Shannon (1948); Hoover (1936).
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
import pandas as pd
|
|
11
|
+
from typing import Optional, Union
|
|
12
|
+
|
|
13
|
+
from ..core.utils import validate_matrix, safe_divide
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def herfindahl(
    mat: Union[np.ndarray, pd.DataFrame],
    normalize: bool = True,
) -> Union[pd.Series, np.ndarray]:
    """
    Herfindahl-Hirschman Index (HHI) per region.

    HHI_r = sum_c (x_{rc} / X_r)^2

    Parameters
    ----------
    mat : array-like (R x C)
        Value matrix.
    normalize : bool
        If True, normalize to [0, 1] using (HHI - 1/C) / (1 - 1/C).
        With fewer than 2 columns the normalized index is 0.

    Returns
    -------
    pd.Series indexed by region when ``mat`` is a DataFrame, otherwise an
    ndarray of length R. Regions whose row total is zero get 0 in both
    the raw and the normalized variant.
    """
    is_df = isinstance(mat, pd.DataFrame)
    row_index = mat.index if is_df else None

    arr = validate_matrix(mat)
    row_sums = arr.sum(axis=1, keepdims=True)
    shares = safe_divide(arr, row_sums)
    index_values = (shares ** 2).sum(axis=1)

    if normalize:
        n_cols = arr.shape[1]
        if n_cols > 1:
            min_hhi = 1.0 / n_cols
            index_values = (index_values - min_hhi) / (1.0 - min_hhi)
        else:
            # 1 - 1/C is zero (or C is zero): the normalization is
            # undefined, fall back to 0 instead of dividing by zero.
            index_values = np.zeros_like(index_values)
        # An empty region has a raw HHI of 0, which the formula above
        # would turn into the out-of-range value -1/(C-1); pin it to 0.
        index_values = np.where(row_sums[:, 0] == 0, 0.0, index_values)

    if is_df:
        return pd.Series(index_values, index=row_index, name="herfindahl")
    return index_values
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def shannon_entropy(
    mat: Union[np.ndarray, pd.DataFrame],
    base: float = 2.0,
) -> Union[pd.Series, np.ndarray]:
    """
    Shannon entropy of each region's activity shares.

    H_r = -sum_c (s_{rc} * log_base(s_{rc}))
    where s_{rc} = x_{rc} / X_r

    Cells with a non-positive share contribute 0 to the sum. A higher
    entropy means a more diversified regional economy.

    Parameters
    ----------
    mat : array-like (R x C)
        Value matrix.
    base : float
        Logarithm base (default 2 = bits).

    Returns
    -------
    pd.Series indexed by region when ``mat`` is a DataFrame, otherwise an
    ndarray of length R.
    """
    values = validate_matrix(mat)
    totals = values.sum(axis=1, keepdims=True)
    shares = safe_divide(values, totals)

    # log() is evaluated on every cell and masked afterwards, hence the
    # silenced warnings for log(0) and log of negative shares.
    with np.errstate(divide="ignore", invalid="ignore"):
        logs = np.log(shares) / np.log(base)
        log_shares = np.where(shares > 0, logs, 0.0)

    weighted = shares * log_shares
    result = -weighted.sum(axis=1)

    if not isinstance(mat, pd.DataFrame):
        return result
    return pd.Series(result, index=mat.index, name="shannon_entropy")
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def hoover_index(
    mat: Union[np.ndarray, pd.DataFrame],
    pop: Optional[Union[np.ndarray, pd.Series]] = None,
) -> Union[pd.Series, np.ndarray]:
    """
    Hoover Index (Robin Hood Index) per activity.

    H_c = (1/2) * sum_r |E_r/E_total - A_r/A_total| * 100

    where E_r = employment of region r in activity c,
    A_r = total employment / population of region r.

    Parameters
    ----------
    mat : array-like (R x C)
        Value matrix.
    pop : array-like (length R), optional
        Population/reference vector, matched to the rows of ``mat`` by
        position. If None, uses row sums.

    Returns
    -------
    pd.Series indexed by activity when ``mat`` is a DataFrame, otherwise
    an ndarray of length C (values 0–100). Activities whose column total
    is zero get 0. If the reference vector sums to zero the reference
    shares are undefined and the affected values are NaN.

    Raises
    ------
    ValueError
        If ``pop`` is given and is not a 1-D vector of length R.
    """
    is_df = isinstance(mat, pd.DataFrame)
    col_index = mat.columns if is_df else None

    arr = validate_matrix(mat)
    n_rows, n_cols = arr.shape

    if pop is None:
        pop_arr = arr.sum(axis=1)
    else:
        pop_arr = np.array(pop, dtype=float)
        # A wrongly shaped reference (e.g. (R, 1) or length 1) would
        # broadcast silently and yield meaningless numbers.
        if pop_arr.shape != (n_rows,):
            raise ValueError(
                "pop must be a 1-D vector with one entry per row of mat."
            )

    col_sums = arr.sum(axis=0)
    active = col_sums != 0  # activities with a non-zero total

    result = np.zeros(n_cols)
    if active.any():
        # The reference shares do not depend on the activity: compute once.
        share_a = pop_arr / pop_arr.sum()
        share_e = arr[:, active] / col_sums[active]
        deviation = np.abs(share_e - share_a[:, None])
        result[active] = 0.5 * deviation.sum(axis=0) * 100

    if is_df:
        return pd.Series(result, index=col_index, name="hoover_index")
    return result
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
# Short alias matching the documented API: `hhi` is bound to the same function object as `herfindahl`.
|
|
148
|
+
hhi = herfindahl
|