econcomplex 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. econcomplex/__init__.py +220 -0
  2. econcomplex/complexity/__init__.py +23 -0
  3. econcomplex/complexity/eci_pci.py +131 -0
  4. econcomplex/complexity/eigenvector.py +115 -0
  5. econcomplex/complexity/fitness.py +130 -0
  6. econcomplex/complexity/reflections.py +173 -0
  7. econcomplex/complexity/subnational.py +82 -0
  8. econcomplex/core/__init__.py +23 -0
  9. econcomplex/core/diversity.py +125 -0
  10. econcomplex/core/preprocess.py +83 -0
  11. econcomplex/core/rca.py +161 -0
  12. econcomplex/core/utils.py +137 -0
  13. econcomplex/dynamics/__init__.py +10 -0
  14. econcomplex/dynamics/entry_exit.py +248 -0
  15. econcomplex/dynamics/growth.py +146 -0
  16. econcomplex/inequality/__init__.py +11 -0
  17. econcomplex/inequality/concentration.py +148 -0
  18. econcomplex/inequality/gini.py +164 -0
  19. econcomplex/optimization/__init__.py +46 -0
  20. econcomplex/optimization/diffusion.py +379 -0
  21. econcomplex/optimization/growth_target.py +170 -0
  22. econcomplex/optimization/portfolio.py +178 -0
  23. econcomplex/optimization/steppingstone.py +267 -0
  24. econcomplex/outlook/__init__.py +6 -0
  25. econcomplex/outlook/coi_cog.py +168 -0
  26. econcomplex/patents/__init__.py +7 -0
  27. econcomplex/patents/recombination.py +135 -0
  28. econcomplex/pipeline.py +255 -0
  29. econcomplex/productivity/__init__.py +8 -0
  30. econcomplex/productivity/prody.py +218 -0
  31. econcomplex/relatedness/__init__.py +25 -0
  32. econcomplex/relatedness/cooccurrence.py +173 -0
  33. econcomplex/relatedness/cross_space.py +142 -0
  34. econcomplex/relatedness/density.py +232 -0
  35. econcomplex/relatedness/proximity.py +214 -0
  36. econcomplex/specialization/__init__.py +17 -0
  37. econcomplex/specialization/location_quotient.py +163 -0
  38. econcomplex/specialization/similarity.py +68 -0
  39. econcomplex-1.0.0.dist-info/METADATA +223 -0
  40. econcomplex-1.0.0.dist-info/RECORD +43 -0
  41. econcomplex-1.0.0.dist-info/WHEEL +5 -0
  42. econcomplex-1.0.0.dist-info/licenses/LICENSE +22 -0
  43. econcomplex-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,137 @@
1
+ """
2
+ Utility functions for matrix/dataframe handling.
3
+ """
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ from typing import Union
8
+
9
+
10
def pivot_to_matrix(
    df: pd.DataFrame,
    index: str,
    columns: str,
    values: str,
    fill_value: float = 0.0,
) -> pd.DataFrame:
    """
    Reshape a long-format DataFrame into a wide (index x columns) matrix.

    Duplicate (index, columns) pairs are summed; pairs that never occur
    are filled with ``fill_value``.

    Parameters
    ----------
    df : pd.DataFrame
        Long-format data.
    index, columns, values : str
        Column names supplying the row labels, column labels, and cell values.
    fill_value : float
        Value used for missing (index, columns) combinations.

    Returns
    -------
    pd.DataFrame
        Wide matrix.
    """
    wide = pd.pivot_table(
        df,
        values=values,
        index=index,
        columns=columns,
        aggfunc="sum",
        fill_value=fill_value,
    )
    return wide
21
+
22
+
23
def melt_matrix(
    mat: pd.DataFrame,
    index_name: str = "location",
    columns_name: str = "activity",
    values_name: str = "value",
) -> pd.DataFrame:
    """
    Unpivot a wide matrix into a long-format DataFrame.

    Parameters
    ----------
    mat : pd.DataFrame
        Wide matrix; the input is copied, not modified.
    index_name : str
        Name given to the row-label column, used only when the matrix
        index has no name of its own (an existing index name wins).
    columns_name : str
        Name of the column holding the former column labels.
    values_name : str
        Name of the column holding the cell values.

    Returns
    -------
    pd.DataFrame
        One row per (row label, column label) cell.
    """
    wide = mat.copy()
    # Drop the columns-axis label so it does not leak into the melted frame.
    wide.columns.name = None
    # reset_index() needs a named index to produce a usable id column.
    if wide.index.name is None:
        wide.index.name = index_name
    id_column = wide.index.name

    flat = wide.reset_index()
    return flat.melt(
        id_vars=id_column,
        var_name=columns_name,
        value_name=values_name,
    )
41
+
42
+
43
def validate_matrix(mat: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
    """
    Coerce the input to a 2-D float numpy array.

    Parameters
    ----------
    mat : np.ndarray, pd.DataFrame, or nested sequence
        Matrix-like input.

    Returns
    -------
    np.ndarray
        A new float array holding the values of ``mat``.

    Raises
    ------
    ValueError
        If the coerced array is not 2-dimensional.
    """
    # A DataFrame contributes only its values; labels are discarded.
    raw = mat.values if isinstance(mat, pd.DataFrame) else mat
    if isinstance(raw, np.ndarray):
        out = raw.astype(float)
    else:
        out = np.array(raw, dtype=float)

    if out.ndim == 2:
        return out
    raise ValueError("Input must be a 2-D matrix.")
54
+
55
+
56
def binarize(mat: Union[np.ndarray, pd.DataFrame], threshold: float = 1.0) -> np.ndarray:
    """
    Threshold a matrix into a 0/1 float array.

    Parameters
    ----------
    mat : array-like (R x C)
        Value matrix; validated with ``validate_matrix``.
    threshold : float
        Cells with value >= threshold become 1.0, all others 0.0.

    Returns
    -------
    np.ndarray
        Float array of zeros and ones.
    """
    values = validate_matrix(mat)
    at_or_above = values >= threshold
    return at_or_above.astype(float)
60
+
61
+
62
def safe_divide(numerator: np.ndarray, denominator: np.ndarray) -> np.ndarray:
    """
    Element-wise division that yields 0 wherever the denominator is 0.

    Inputs are coerced with ``np.asarray`` so that plain Python scalars and
    sequences follow the same rule as arrays (a scalar ``x / 0`` would
    otherwise raise ``ZeroDivisionError`` instead of returning 0).

    Parameters
    ----------
    numerator, denominator : array-like
        Operands; they must be broadcastable against each other.

    Returns
    -------
    np.ndarray
        ``numerator / denominator`` with 0.0 at positions where the
        denominator equals 0 (a 0-d array for scalar inputs).
    """
    num = np.asarray(numerator)
    den = np.asarray(denominator)
    # The division is evaluated everywhere and then masked, so silence the
    # warnings produced at the positions that are about to be discarded.
    with np.errstate(divide="ignore", invalid="ignore"):
        return np.where(den != 0, num / den, 0.0)
67
+
68
+
69
def normalize_zscore(vec: np.ndarray) -> np.ndarray:
    """
    Standardize a 1-D vector to zero mean and unit (population) std.

    Returns an all-zero array shaped like ``vec`` when the standard
    deviation is exactly 0 (constant vector).
    """
    spread = np.std(vec)
    if spread != 0:
        centered = vec - np.mean(vec)
        return centered / spread
    return np.zeros_like(vec)
75
+
76
+
77
def normalize_01(vec: np.ndarray) -> np.ndarray:
    """
    Rescale a 1-D vector linearly so its minimum maps to 0 and maximum to 1.

    Returns an all-zero array shaped like ``vec`` when every element is
    equal (zero range).
    """
    lowest = vec.min()
    span = vec.max() - lowest
    if span == 0:
        return np.zeros_like(vec)
    return (vec - lowest) / span
83
+
84
+
85
def make_sample_data(
    n_locs: int = 50,
    n_acts: int = 30,
    seed: int = 42,
    loc_col: str = "loc",
    act_col: str = "act",
    val_col: str = "val",
) -> pd.DataFrame:
    """
    Build a synthetic long-format location-activity dataset.

    The generated matrix is nested (roughly triangular): locations with
    high capability are present in many activities, including demanding
    ones, whereas low-capability locations are mostly present only in
    low-requirement activities.

    Parameters
    ----------
    n_locs : int
        Number of locations (matrix rows).
    n_acts : int
        Number of activities (matrix columns).
    seed : int
        Seed for the random generator, for reproducibility.
    loc_col, act_col, val_col : str
        Column names used in the returned DataFrame.

    Returns
    -------
    pd.DataFrame
        Long format with columns [loc_col, act_col, val_col]; only cells
        with a strictly positive value are kept.
    """
    gen = np.random.default_rng(seed)

    # Latent capability per location (descending) and requirement per
    # activity (ascending). Draw order matters for reproducibility.
    loc_capability = np.sort(gen.uniform(0.1, 1.0, n_locs))[::-1]
    act_requirement = np.sort(gen.uniform(0.0, 0.9, n_acts))

    # Logistic presence probability: rises as capability exceeds requirement.
    surplus = loc_capability[:, None] - act_requirement[None, :]
    prob_present = 1 / (1 + np.exp(-10 * surplus))
    is_present = gen.random((n_locs, n_acts)) < prob_present

    # Log-normal magnitudes, zeroed where the activity is absent.
    magnitudes = gen.lognormal(mean=3.0, sigma=1.0, size=(n_locs, n_acts))
    cell_values = is_present * magnitudes

    loc_labels = [f"L{i + 1:03d}" for i in range(n_locs)]
    act_labels = [f"A{j + 1:03d}" for j in range(n_acts)]
    wide = pd.DataFrame(cell_values, index=loc_labels, columns=act_labels)

    tidy = melt_matrix(wide, loc_col, act_col, val_col)
    positive = tidy[val_col] > 0
    return tidy[positive].reset_index(drop=True)
@@ -0,0 +1,10 @@
1
+ from .growth import growth_rate, growth_matrix
2
+ from .entry_exit import entry, exit, entry_exit_summary
3
+
4
+ __all__ = [
5
+ "growth_rate",
6
+ "growth_matrix",
7
+ "entry",
8
+ "exit",
9
+ "entry_exit_summary",
10
+ ]
@@ -0,0 +1,248 @@
1
+ """
2
+ Industry entry and exit tracking across time periods.
3
+
4
+ References
5
+ ----------
6
+ Balland & Rigby (2017); EconGeo (R package).
7
+ """
8
+
9
+ import numpy as np
10
+ import pandas as pd
11
+ from typing import List, Union
12
+
13
+ from ..core.utils import validate_matrix, binarize, melt_matrix, pivot_to_matrix
14
+ from ..core.rca import rca as compute_rca
15
+
16
+
17
def _panel_to_matrices(
    df: pd.DataFrame, loc: str, act: str, val: str, time: str
) -> List[pd.DataFrame]:
    """
    Split a long-format panel into one wide matrix per time period.

    Periods are processed in sorted order. Every returned matrix is
    reindexed to the union of all row and column labels seen in any
    period, with missing cells filled with 0.0.

    Raises
    ------
    ValueError
        If the panel contains fewer than 2 distinct time periods.
    """
    periods = sorted(df[time].unique())
    if len(periods) < 2:
        raise ValueError("Need at least 2 time periods.")

    per_period = []
    all_rows = None
    all_cols = None
    for period in periods:
        wide = pivot_to_matrix(df[df[time] == period], loc, act, val)
        per_period.append(wide)
        # Accumulate the label universe across periods.
        all_rows = wide.index if all_rows is None else all_rows.union(wide.index)
        all_cols = wide.columns if all_cols is None else all_cols.union(wide.columns)

    aligned = []
    for wide in per_period:
        aligned.append(wide.reindex(index=all_rows, columns=all_cols, fill_value=0.0))
    return aligned
31
+
32
+
33
def entry(
    mats: List[Union[np.ndarray, pd.DataFrame]],
    use_rca: bool = True,
    threshold: float = 1.0,
) -> Union[pd.DataFrame, np.ndarray]:
    """
    Industry entry matrix: 1 when a (region, activity) transitions from
    absent (M=0) to present (M=1) between consecutive time periods.

    Matrices are compared positionally; DataFrame labels of later
    matrices are not used for alignment, so callers must pass matrices
    that share the same row/column order.

    Parameters
    ----------
    mats : list of array-like (R x C)
        Ordered list of value matrices (at least 2), all of the same shape.
    use_rca : bool
        Compute RCA before binarizing.
    threshold : float
        Binarization threshold.

    Returns
    -------
    R x C entry matrix (1 = entry event occurred in at least one period).
    A DataFrame labelled like ``mats[0]`` when that is a DataFrame,
    otherwise an ndarray.

    Raises
    ------
    ValueError
        If fewer than 2 matrices are given or their shapes differ.
    """
    if len(mats) < 2:
        raise ValueError("Need at least 2 matrices to detect entry.")

    first = mats[0]
    is_df = isinstance(first, pd.DataFrame)

    def _presence(m):
        arr = validate_matrix(m)
        if use_rca:
            arr = compute_rca(arr)
        return binarize(arr, threshold)

    binary = [_presence(m) for m in mats]

    # Without this check a 1 x C or R x 1 matrix would silently broadcast
    # against an R x C one and produce meaningless transitions.
    shape = binary[0].shape
    if any(b.shape != shape for b in binary[1:]):
        raise ValueError("All matrices must have the same shape.")

    result = np.zeros_like(binary[0])
    for prev, curr in zip(binary[:-1], binary[1:]):
        entered = ((prev == 0) & (curr == 1)).astype(float)
        # Element-wise max acts as a logical OR over all transitions.
        result = np.maximum(result, entered)

    if is_df:
        return pd.DataFrame(result, index=first.index, columns=first.columns)
    return result
78
+
79
+
80
def exit(
    mats: List[Union[np.ndarray, pd.DataFrame]],
    use_rca: bool = True,
    threshold: float = 1.0,
) -> Union[pd.DataFrame, np.ndarray]:
    """
    Industry exit matrix: 1 when a (region, activity) transitions from
    present (M=1) to absent (M=0) between consecutive time periods.

    Matrices are compared positionally; DataFrame labels of later
    matrices are not used for alignment, so callers must pass matrices
    that share the same row/column order.

    Parameters
    ----------
    mats : list of array-like (R x C)
        Ordered list of value matrices (at least 2), all of the same shape.
    use_rca : bool
        Compute RCA before binarizing.
    threshold : float
        Binarization threshold.

    Returns
    -------
    R x C exit matrix (1 = exit event occurred in at least one period).
    A DataFrame labelled like ``mats[0]`` when that is a DataFrame,
    otherwise an ndarray.

    Raises
    ------
    ValueError
        If fewer than 2 matrices are given or their shapes differ.
    """
    if len(mats) < 2:
        raise ValueError("Need at least 2 matrices to detect exit.")

    first = mats[0]
    is_df = isinstance(first, pd.DataFrame)

    def _presence(m):
        arr = validate_matrix(m)
        if use_rca:
            arr = compute_rca(arr)
        return binarize(arr, threshold)

    binary = [_presence(m) for m in mats]

    # Without this check a 1 x C or R x 1 matrix would silently broadcast
    # against an R x C one and produce meaningless transitions.
    shape = binary[0].shape
    if any(b.shape != shape for b in binary[1:]):
        raise ValueError("All matrices must have the same shape.")

    result = np.zeros_like(binary[0])
    for prev, curr in zip(binary[:-1], binary[1:]):
        exited = ((prev == 1) & (curr == 0)).astype(float)
        # Element-wise max acts as a logical OR over all transitions.
        result = np.maximum(result, exited)

    if is_df:
        return pd.DataFrame(result, index=first.index, columns=first.columns)
    return result
125
+
126
+
127
def entry_exit_summary(
    mats: List[Union[np.ndarray, pd.DataFrame]],
    use_rca: bool = True,
    threshold: float = 1.0,
) -> pd.DataFrame:
    """
    Summary of entry and exit events per (region, activity) pair.

    Returns a long-format DataFrame with columns:
    - location, activity, n_entries, n_exits, net_change

    Only pairs with at least one entry or exit are listed, in row-major
    order. The five columns are always present, even when no event
    occurred and the frame is empty.

    Parameters
    ----------
    mats : list of array-like (R x C)
        Ordered list of value matrices (at least 2), all of the same shape.
        Labels come from ``mats[0]`` when it is a DataFrame; otherwise
        positional integers are used.
    use_rca : bool
        Compute RCA before binarizing.
    threshold : float
        Binarization threshold.

    Raises
    ------
    ValueError
        If fewer than 2 matrices are given or their shapes differ.
    """
    if len(mats) < 2:
        raise ValueError("Need at least 2 matrices.")

    first = mats[0]
    is_df = isinstance(first, pd.DataFrame)

    def _presence(m):
        arr = validate_matrix(m)
        if use_rca:
            arr = compute_rca(arr)
        return binarize(arr, threshold)

    binary = [_presence(m) for m in mats]
    shape = binary[0].shape
    if any(b.shape != shape for b in binary[1:]):
        raise ValueError("All matrices must have the same shape.")
    n_r, n_c = shape

    # Take labels from the validated shape so nested-list input works too.
    row_labels = list(first.index) if is_df else list(range(n_r))
    col_labels = list(first.columns) if is_df else list(range(n_c))

    n_entries = np.zeros(shape, dtype=np.int64)
    n_exits = np.zeros(shape, dtype=np.int64)
    for prev, curr in zip(binary[:-1], binary[1:]):
        n_entries += (prev == 0) & (curr == 1)
        n_exits += (prev == 1) & (curr == 0)

    # argwhere visits only cells with events, in the same row-major order
    # as a nested loop over rows then columns.
    eventful = np.argwhere((n_entries > 0) | (n_exits > 0))
    records = [
        {
            "location": row_labels[i],
            "activity": col_labels[j],
            "n_entries": int(n_entries[i, j]),
            "n_exits": int(n_exits[i, j]),
            "net_change": int(n_entries[i, j] - n_exits[i, j]),
        }
        for i, j in eventful
    ]

    # Explicit columns keep the documented schema when there are no events.
    return pd.DataFrame(
        records,
        columns=["location", "activity", "n_entries", "n_exits", "net_change"],
    )
183
+
184
+
185
def entry_tracking(
    df: pd.DataFrame,
    loc: str,
    act: str,
    val: str,
    time: str,
    use_rca: bool = True,
    threshold: float = 1.0,
) -> pd.DataFrame:
    """
    Detect entry events directly from long-format panel data.

    The panel is split into one aligned matrix per time period, passed to
    `entry`, and the resulting matrix is melted back to long format.

    Parameters
    ----------
    df : pd.DataFrame
        Long-format data.
    loc, act, val, time : str
        Column names for location, activity, value, and time period.
    use_rca : bool
        Compute RCA before binarizing.
    threshold : float
        Binarization threshold.

    Returns
    -------
    pd.DataFrame with columns [loc, act, 'entry'] (entry = 1 when the
    pair entered in at least one period transition).
    """
    period_matrices = _panel_to_matrices(df, loc, act, val, time)
    entered = entry(period_matrices, use_rca=use_rca, threshold=threshold)
    long_form = melt_matrix(entered, loc, act, "entry")
    return long_form
216
+
217
+
218
def exit_tracking(
    df: pd.DataFrame,
    loc: str,
    act: str,
    val: str,
    time: str,
    use_rca: bool = True,
    threshold: float = 1.0,
) -> pd.DataFrame:
    """
    Detect exit events directly from long-format panel data.

    The panel is split into one aligned matrix per time period, passed to
    `exit`, and the resulting matrix is melted back to long format.

    Parameters
    ----------
    df : pd.DataFrame
        Long-format data.
    loc, act, val, time : str
        Column names for location, activity, value, and time period.
    use_rca : bool
        Compute RCA before binarizing.
    threshold : float
        Binarization threshold.

    Returns
    -------
    pd.DataFrame with columns [loc, act, 'exit'] (exit = 1 when the
    pair exited in at least one period transition).
    """
    period_matrices = _panel_to_matrices(df, loc, act, val, time)
    exited = exit(period_matrices, use_rca=use_rca, threshold=threshold)
    long_form = melt_matrix(exited, loc, act, "exit")
    return long_form
@@ -0,0 +1,146 @@
1
+ """
2
+ Regional and industry growth rates.
3
+ """
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ from typing import Union
8
+
9
+ from ..core.utils import validate_matrix, safe_divide, pivot_to_matrix
10
+
11
+
12
def growth_rate(
    mat1: Union[np.ndarray, pd.DataFrame],
    mat2: Union[np.ndarray, pd.DataFrame],
    axis: int = 0,
    pct: bool = True,
) -> Union[pd.Series, np.ndarray]:
    """
    Growth rate between two time periods.

        g = (sum_t2 - sum_t1) / sum_t1   [* 100 if pct]

    Units whose period-1 total is 0 get a growth of 0 (the convention of
    ``safe_divide``), not infinity or NaN.

    Parameters
    ----------
    mat1 : array-like (R x C)
        Matrix for time period 1.
    mat2 : array-like (R x C)
        Matrix for time period 2 (same shape as mat1). Compared
        positionally; its labels are not used for alignment.
    axis : int
        0 = aggregate over columns (regional growth), rows returned.
        1 = aggregate over rows (industry growth), columns returned.
    pct : bool
        If True, multiply by 100 to return percentage.

    Returns
    -------
    pd.Series (labelled from mat1) when mat1 is a DataFrame, else ndarray.

    Raises
    ------
    ValueError
        If ``axis`` is not 0 or 1, or the two matrices differ in shape.
    """
    # Any other value would either fail obscurely or (e.g. axis=2, which
    # maps to numpy axis -1) aggregate one way and label the other.
    if axis not in (0, 1):
        raise ValueError("axis must be 0 (regions) or 1 (industries).")

    is_df = isinstance(mat1, pd.DataFrame)

    arr1 = validate_matrix(mat1)
    arr2 = validate_matrix(mat2)

    if arr1.shape != arr2.shape:
        raise ValueError("mat1 and mat2 must have the same shape.")

    # Sum along the opposite numpy axis: per-row totals for axis=0,
    # per-column totals for axis=1.
    sum1 = arr1.sum(axis=1 - axis)
    sum2 = arr2.sum(axis=1 - axis)

    g = safe_divide(sum2 - sum1, sum1)
    if pct:
        g = g * 100

    if is_df:
        index = mat1.index if axis == 0 else mat1.columns
        name = "growth_region" if axis == 0 else "growth_industry"
        return pd.Series(g, index=index, name=name)
    return g
59
+
60
+
61
def growth_matrix(
    mat1: Union[np.ndarray, pd.DataFrame],
    mat2: Union[np.ndarray, pd.DataFrame],
    pct: bool = True,
) -> Union[pd.DataFrame, np.ndarray]:
    """
    Cell-by-cell growth rate between two matrices.

        g_{rc} = (x_{rc,t2} - x_{rc,t1}) / x_{rc,t1}

    Cells whose period-1 value is 0 get a growth of 0 (``safe_divide``).

    Parameters
    ----------
    mat1, mat2 : array-like (R x C)
        Matrices for period 1 and period 2; shapes must match.
    pct : bool
        Multiply by 100.

    Returns
    -------
    R x C growth rate matrix; a DataFrame labelled like mat1 when mat1
    is a DataFrame, otherwise an ndarray.

    Raises
    ------
    ValueError
        If the two matrices differ in shape.
    """
    before = validate_matrix(mat1)
    after = validate_matrix(mat2)
    if before.shape != after.shape:
        raise ValueError("mat1 and mat2 must have the same shape.")

    change = safe_divide(after - before, before)
    if pct:
        change = change * 100

    if not isinstance(mat1, pd.DataFrame):
        return change
    return pd.DataFrame(change, index=mat1.index, columns=mat1.columns)
98
+
99
+
100
def growth_rates(
    df: pd.DataFrame,
    loc: str,
    act: str,
    val: str,
    time: str,
    axis: int = 0,
    pct: bool = True,
) -> pd.DataFrame:
    """
    Growth rates between consecutive periods of a long-format panel.

    For each pair of adjacent (sorted) periods the two period matrices
    are aligned on the union of their labels (missing cells = 0.0) and
    passed to `growth_rate`.

    Parameters
    ----------
    df : pd.DataFrame
        Long-format data.
    loc, act, val, time : str
        Column names for location, activity, value, and time period.
    axis : int
        0 = growth per location, 1 = growth per activity.
    pct : bool
        If True, return percentage.

    Returns
    -------
    pd.DataFrame with columns [unit, time, 'growth'], where unit is `loc`
    (axis=0) or `act` (axis=1) and `time` marks the end of each interval.

    Raises
    ------
    ValueError
        If the panel contains fewer than 2 distinct time periods.
    """
    periods = sorted(df[time].unique())
    if len(periods) < 2:
        raise ValueError("Need at least 2 time periods to compute growth.")

    unit_col = act if axis != 0 else loc
    frames = []
    for pos in range(1, len(periods)):
        start, end = periods[pos - 1], periods[pos]
        before = pivot_to_matrix(df[df[time] == start], loc, act, val)
        after = pivot_to_matrix(df[df[time] == end], loc, act, val)

        # Align only this pair of periods on their shared label universe.
        all_rows = before.index.union(after.index)
        all_cols = before.columns.union(after.columns)
        before = before.reindex(index=all_rows, columns=all_cols, fill_value=0.0)
        after = after.reindex(index=all_rows, columns=all_cols, fill_value=0.0)

        rates = growth_rate(before, after, axis=axis, pct=pct)
        frames.append(
            pd.DataFrame({unit_col: rates.index, time: end, "growth": rates.values})
        )

    return pd.concat(frames, ignore_index=True)
@@ -0,0 +1,11 @@
1
+ from .gini import gini, locational_gini, hoover_gini
2
+ from .concentration import herfindahl, shannon_entropy, hoover_index
3
+
4
+ __all__ = [
5
+ "gini",
6
+ "locational_gini",
7
+ "hoover_gini",
8
+ "herfindahl",
9
+ "shannon_entropy",
10
+ "hoover_index",
11
+ ]
@@ -0,0 +1,148 @@
1
+ """
2
+ Concentration and dispersion measures.
3
+
4
+ References
5
+ ----------
6
+ Herfindahl-Hirschman (1945/1950); Shannon (1948); Hoover (1936).
7
+ """
8
+
9
+ import numpy as np
10
+ import pandas as pd
11
+ from typing import Optional, Union
12
+
13
+ from ..core.utils import validate_matrix, safe_divide
14
+
15
+
16
def herfindahl(
    mat: Union[np.ndarray, pd.DataFrame],
    normalize: bool = True,
) -> Union[pd.Series, np.ndarray]:
    """
    Herfindahl-Hirschman Index (HHI) per region.

        HHI_r = sum_c (x_{rc} / X_r)^2

    Parameters
    ----------
    mat : array-like (R x C)
        Value matrix.
    normalize : bool
        If True, normalize to [0, 1] using (HHI - 1/C) / (1 - 1/C).
        With a single activity (C = 1) the normalized value is 0.

    Returns
    -------
    pd.Series indexed by region when `mat` is a DataFrame, else ndarray.
    Regions whose row total is 0 have no defined shares and are reported
    as 0, with or without normalization.
    """
    is_df = isinstance(mat, pd.DataFrame)
    row_index = mat.index if is_df else None

    arr = validate_matrix(mat)
    row_sums = arr.sum(axis=1, keepdims=True)
    shares = safe_divide(arr, row_sums)
    index_values = (shares ** 2).sum(axis=1)

    n_acts = arr.shape[1]
    # Skip normalization for a zero-column matrix (1/C is undefined).
    if normalize and n_acts > 0:
        min_hhi = 1.0 / n_acts
        index_values = safe_divide(index_values - min_hhi, 1.0 - min_hhi)
        # An all-zero row has raw HHI 0; rescaling would push it to
        # -1/(C-1), outside [0, 1], so pin such rows back to 0.
        index_values = np.where(row_sums[:, 0] != 0, index_values, 0.0)

    if is_df:
        return pd.Series(index_values, index=row_index, name="herfindahl")
    return index_values
52
+
53
+
54
def shannon_entropy(
    mat: Union[np.ndarray, pd.DataFrame],
    base: float = 2.0,
) -> Union[pd.Series, np.ndarray]:
    """
    Shannon entropy of each region's activity shares.

        H_r = -sum_c (s_{rc} * log_base(s_{rc})),  s_{rc} = x_{rc} / X_r

    Cells with a non-positive share contribute 0 to the sum. A higher
    entropy indicates a more diversified regional economy.

    Parameters
    ----------
    mat : array-like (R x C)
        Value matrix.
    base : float
        Logarithm base (default 2 = bits).

    Returns
    -------
    pd.Series indexed by region when `mat` is a DataFrame, else ndarray.
    """
    values = validate_matrix(mat)
    totals = values.sum(axis=1, keepdims=True)
    shares = safe_divide(values, totals)

    # log(0) is evaluated and then masked, so silence its warnings.
    with np.errstate(divide="ignore", invalid="ignore"):
        scaled_logs = np.log(shares) / np.log(base)
        log_shares = np.where(shares > 0, scaled_logs, 0.0)

    entropy = -(shares * log_shares).sum(axis=1)

    if not isinstance(mat, pd.DataFrame):
        return entropy
    return pd.Series(entropy, index=mat.index, name="shannon_entropy")
92
+
93
+
94
def hoover_index(
    mat: Union[np.ndarray, pd.DataFrame],
    pop: Optional[Union[np.ndarray, pd.Series]] = None,
) -> Union[pd.Series, np.ndarray]:
    """
    Hoover Index (Robin Hood Index) per activity.

        H_c = (1/2) * sum_r |E_r/E_total - A_r/A_total| * 100

    where E_r = value of region r in activity c,
          A_r = reference size (e.g. population) of region r.

    Parameters
    ----------
    mat : array-like (R x C)
        Value matrix.
    pop : array-like (length R), optional
        Population/reference vector, matched to the rows of `mat` by
        position (labels of a Series are ignored). If None, uses row sums.

    Returns
    -------
    pd.Series indexed by activity when `mat` is a DataFrame, else ndarray
    (values 0-100). Activities whose column total is 0 are reported as 0.

    Raises
    ------
    ValueError
        If `pop` is not a 1-D vector with one entry per row of `mat`.
    """
    is_df = isinstance(mat, pd.DataFrame)
    col_index = mat.columns if is_df else None

    arr = validate_matrix(mat)
    n_regions, n_acts = arr.shape

    if pop is None:
        pop_arr = arr.sum(axis=1)
    else:
        pop_arr = np.array(pop, dtype=float)
        # A scalar or (R, 1) reference would otherwise broadcast silently
        # and yield wrong indices.
        if pop_arr.shape != (n_regions,):
            raise ValueError(
                "pop must be a 1-D vector with one entry per row of mat."
            )

    col_sums = arr.sum(axis=0)
    active = col_sums != 0  # activities with a non-zero total

    result = np.zeros(n_acts)
    if active.any():
        # Reference shares are identical for every activity: compute once.
        share_a = pop_arr / pop_arr.sum()
        share_e = arr[:, active] / col_sums[active]
        result[active] = 0.5 * np.abs(share_e - share_a[:, None]).sum(axis=0) * 100

    if is_df:
        return pd.Series(result, index=col_index, name="hoover_index")
    return result
145
+
146
+
147
+ # Short alias matching the documented API
148
+ hhi = herfindahl