econcomplex 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. econcomplex/__init__.py +220 -0
  2. econcomplex/complexity/__init__.py +23 -0
  3. econcomplex/complexity/eci_pci.py +131 -0
  4. econcomplex/complexity/eigenvector.py +115 -0
  5. econcomplex/complexity/fitness.py +130 -0
  6. econcomplex/complexity/reflections.py +173 -0
  7. econcomplex/complexity/subnational.py +82 -0
  8. econcomplex/core/__init__.py +23 -0
  9. econcomplex/core/diversity.py +125 -0
  10. econcomplex/core/preprocess.py +83 -0
  11. econcomplex/core/rca.py +161 -0
  12. econcomplex/core/utils.py +137 -0
  13. econcomplex/dynamics/__init__.py +10 -0
  14. econcomplex/dynamics/entry_exit.py +248 -0
  15. econcomplex/dynamics/growth.py +146 -0
  16. econcomplex/inequality/__init__.py +11 -0
  17. econcomplex/inequality/concentration.py +148 -0
  18. econcomplex/inequality/gini.py +164 -0
  19. econcomplex/optimization/__init__.py +46 -0
  20. econcomplex/optimization/diffusion.py +379 -0
  21. econcomplex/optimization/growth_target.py +170 -0
  22. econcomplex/optimization/portfolio.py +178 -0
  23. econcomplex/optimization/steppingstone.py +267 -0
  24. econcomplex/outlook/__init__.py +6 -0
  25. econcomplex/outlook/coi_cog.py +168 -0
  26. econcomplex/patents/__init__.py +7 -0
  27. econcomplex/patents/recombination.py +135 -0
  28. econcomplex/pipeline.py +255 -0
  29. econcomplex/productivity/__init__.py +8 -0
  30. econcomplex/productivity/prody.py +218 -0
  31. econcomplex/relatedness/__init__.py +25 -0
  32. econcomplex/relatedness/cooccurrence.py +173 -0
  33. econcomplex/relatedness/cross_space.py +142 -0
  34. econcomplex/relatedness/density.py +232 -0
  35. econcomplex/relatedness/proximity.py +214 -0
  36. econcomplex/specialization/__init__.py +17 -0
  37. econcomplex/specialization/location_quotient.py +163 -0
  38. econcomplex/specialization/similarity.py +68 -0
  39. econcomplex-1.0.0.dist-info/METADATA +223 -0
  40. econcomplex-1.0.0.dist-info/RECORD +43 -0
  41. econcomplex-1.0.0.dist-info/WHEEL +5 -0
  42. econcomplex-1.0.0.dist-info/licenses/LICENSE +22 -0
  43. econcomplex-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,173 @@
1
+ """
2
+ Method of Reflections (MOR) for Economic Complexity.
3
+
4
+ References
5
+ ----------
6
+ Hidalgo & Hausmann (2009) "The Building Blocks of Economic Complexity".
7
+ Balland & Rigby (2017) for regional adaptation.
8
+ """
9
+
10
+ import numpy as np
11
+ import pandas as pd
12
+ from typing import Optional, Tuple, Union
13
+
14
+ from ..core.utils import validate_matrix, safe_divide, normalize_zscore, binarize
15
+ from ..core.rca import rca as compute_rca
16
+
17
+
18
def _binary_presence(mat, use_rca: bool, threshold: float):
    """
    Build the binary presence matrix shared by the MoR variants.

    Parameters
    ----------
    mat : array-like (R x C)
        Input matrix; DataFrame labels are captured before conversion.
    use_rca : bool
        If True, `compute_rca` is applied before binarizing.
    threshold : float
        Threshold forwarded to `binarize`.

    Returns
    -------
    (m, is_df, row_index, col_index)
        The binarized matrix, whether the input was a DataFrame, and the
        DataFrame's index / columns (both None for non-DataFrame input).
    """
    from_frame = isinstance(mat, pd.DataFrame)
    # Labels must be read before validate_matrix() converts the input.
    row_labels, col_labels = (mat.index, mat.columns) if from_frame else (None, None)

    values = validate_matrix(mat)
    source = compute_rca(values) if use_rca else values
    presence = binarize(source, threshold)
    return presence, from_frame, row_labels, col_labels
29
+
30
+
31
def _reflect(m: np.ndarray, iterations: int, tol: Optional[float] = None):
    """
    Run the Method of Reflections recursion on the binary matrix `m`.

    Each step updates both sides from the previous step's values:
        k_{r,n} = (M   * k_{c,n-1}) / k_{r,0}
        k_{c,n} = (M^T * k_{r,n-1}) / k_{c,0}

    Parameters
    ----------
    m : np.ndarray (R x C)
        Binary presence matrix.
    iterations : int
        Maximum number of reflection steps; 0 returns the step-0 vectors.
    tol : float, optional
        When given, stop after the first step whose z-scored row and
        column reflections both moved by less than `tol` (max absolute
        change). When None, all `iterations` steps are run.

    Returns
    -------
    (kc, kp, kc0, kp0)
        Final row-side and column-side reflections, followed by the
        step-0 row sums (diversity) and column sums (ubiquity).
    """
    diversity0 = m.sum(axis=1)
    ubiquity0 = m.sum(axis=0)
    k_rows, k_cols = diversity0.copy(), ubiquity0.copy()
    track_convergence = tol is not None

    for _ in range(iterations):
        next_rows = safe_divide(m @ k_cols, diversity0)
        next_cols = safe_divide(m.T @ k_rows, ubiquity0)

        converged = False
        if track_convergence:
            # Compare z-scored vectors: the step is judged on the shape
            # of the reflections rather than on their raw level.
            shift_rows = np.max(
                np.abs(normalize_zscore(next_rows) - normalize_zscore(k_rows))
            )
            shift_cols = np.max(
                np.abs(normalize_zscore(next_cols) - normalize_zscore(k_cols))
            )
            converged = shift_rows < tol and shift_cols < tol

        # The new step is always adopted, even on the converging step.
        k_rows, k_cols = next_rows, next_cols
        if converged:
            break

    return k_rows, k_cols, diversity0, ubiquity0
57
+
58
+
59
def method_of_reflections(
    mat: Union[np.ndarray, pd.DataFrame],
    use_rca: bool = True,
    threshold: float = 1.0,
    iterations: int = 20,
    return_both: bool = True,
    tol: Optional[float] = 1e-10,
) -> Union[
    Tuple[pd.Series, pd.Series],
    Tuple[np.ndarray, np.ndarray],
    pd.Series,
    np.ndarray,
]:
    """
    Method of Reflections (iterative) to compute ECI and PCI.

    Starting from the binary Mcp matrix:
        k_{r,0} = diversity (row sums)
        k_{c,0} = ubiquity (col sums)
        k_{r,n} = (M * k_{c,n-1}) / k_{r,0}
        k_{c,n} = (M^T * k_{r,n-1}) / k_{c,0}
    The final reflections are passed through `normalize_zscore` and then
    sign-oriented (see below).

    Parameters
    ----------
    mat : array-like (R x C)
        Value or pre-binarized matrix.
    use_rca : bool
        Compute RCA internally before binarizing.
    threshold : float
        Binarization threshold.
    iterations : int
        Maximum number of reflection steps (default 20).
    return_both : bool
        If True (default), return (ECI, PCI); if False, return ECI only.
    tol : float, optional
        Convergence tolerance: stop early when the z-scored reflections
        change by less than `tol` between iterations. Pass None to always
        run all `iterations`.

    Returns
    -------
    (eci, pci) when `return_both` is True, otherwise `eci` alone.
    Values are pd.Series labelled with the input's index / columns when
    `mat` is a DataFrame, and ndarrays otherwise.
    """
    m, is_df, row_index, col_index = _binary_presence(mat, use_rca, threshold)
    kc, kp, kc0, kp0 = _reflect(m, iterations, tol=tol)

    eci = normalize_zscore(kc)
    # Sign orientation: ECI should correlate positively with diversity.
    # The std guards avoid calling corrcoef on a constant vector.
    if np.std(eci) > 0 and np.std(kc0) > 0 and np.corrcoef(eci, kc0)[0, 1] < 0:
        eci = -eci
    if is_df:
        eci = pd.Series(eci, index=row_index, name="eci")

    # Honour the documented contract: ECI only when return_both is False.
    if not return_both:
        return eci

    pci = normalize_zscore(kp)
    # Sign orientation: PCI should correlate negatively with ubiquity.
    if np.std(pci) > 0 and np.std(kp0) > 0 and np.corrcoef(pci, kp0)[0, 1] > 0:
        pci = -pci
    if is_df:
        pci = pd.Series(pci, index=col_index, name="pci")

    return eci, pci
118
+
119
+
120
def mor_regions(
    mat: Union[np.ndarray, pd.DataFrame],
    use_rca: bool = True,
    threshold: float = 1.0,
    steps: int = 20,
) -> Union[pd.Series, np.ndarray]:
    """
    Method of Reflections — region/country side only.

    Runs `steps` reflection steps without early stopping and returns the
    row-side values at that step.

        step 0  = raw diversity
        step 1  = avg ubiquity of industries present in each region
        step 2+ = higher-order reflections, min-max rescaled to 0–100

    Parameters
    ----------
    mat : array-like (R x C)
        Value or pre-binarized matrix.
    use_rca : bool
        Compute RCA internally before binarizing.
    threshold : float
        Binarization threshold.
    steps : int
        Reflection step to report (default 20). No upper bound is
        enforced by this function.

    Returns
    -------
    pd.Series named ``mor_region_step{steps}`` for DataFrame input,
    otherwise an ndarray.
    """
    presence, from_frame, region_labels, _ = _binary_presence(
        mat, use_rca, threshold
    )
    region_k = _reflect(presence, steps)[0]

    if steps > 1:
        lowest = region_k.min()
        highest = region_k.max()
        if highest != lowest:
            scaled = (region_k - lowest) / (highest - lowest) * 100
        else:
            # All regions tie: min-max scaling is undefined, report zeros.
            scaled = np.zeros_like(region_k)
    else:
        # Steps 0 and 1 are returned on their raw scale.
        scaled = region_k

    if not from_frame:
        return scaled
    return pd.Series(scaled, index=region_labels, name=f"mor_region_step{steps}")
148
+
149
+
150
def mor_activities(
    mat: Union[np.ndarray, pd.DataFrame],
    use_rca: bool = True,
    threshold: float = 1.0,
    steps: int = 19,
) -> Union[pd.Series, np.ndarray]:
    """
    Method of Reflections — activity/technology side only.

    Runs `steps` reflection steps without early stopping and returns the
    column-side values at that step.

        step 0  = raw ubiquity
        step 1+ = higher-order reflections; min-max rescaled to 0–100
                  for steps > 1

    Parameters
    ----------
    mat : array-like (R x C)
        Value or pre-binarized matrix.
    use_rca : bool
        Compute RCA internally before binarizing.
    threshold : float
        Binarization threshold.
    steps : int
        Reflection step to report (default 19).
        NOTE(review): the default differs from mor_regions (20) —
        presumably intentional; confirm against the reference method.

    Returns
    -------
    pd.Series named ``mor_activity_step{steps}`` for DataFrame input,
    otherwise an ndarray.
    """
    presence, from_frame, _, activity_labels = _binary_presence(
        mat, use_rca, threshold
    )
    activity_k = _reflect(presence, steps)[1]

    if steps > 1:
        lowest = activity_k.min()
        highest = activity_k.max()
        if highest != lowest:
            scaled = (activity_k - lowest) / (highest - lowest) * 100
        else:
            # All activities tie: min-max scaling is undefined, report zeros.
            scaled = np.zeros_like(activity_k)
    else:
        # Steps 0 and 1 are returned on their raw scale.
        scaled = activity_k

    if not from_frame:
        return scaled
    return pd.Series(
        scaled, index=activity_labels, name=f"mor_activity_step{steps}"
    )
@@ -0,0 +1,82 @@
1
+ """
2
+ Subnational / external ECI using externally supplied PCI.
3
+
4
+ References
5
+ ----------
6
+ Balland & Rigby (2017); Mealy et al. (2019).
7
+ """
8
+
9
+ import numpy as np
10
+ import pandas as pd
11
+ from typing import Tuple, Union
12
+
13
+ from ..core.utils import validate_matrix, binarize, normalize_zscore
14
+ from ..core.rca import rca as compute_rca
15
+
16
+
17
def subnational_eci(
    mat: Union[np.ndarray, pd.DataFrame],
    pci_external: Union[np.ndarray, pd.Series],
    use_rca: bool = True,
    threshold: float = 1.0,
    standardize: bool = True,
) -> Tuple[Union[pd.Series, np.ndarray], Union[pd.Series, np.ndarray]]:
    """
    Subnational ECI using an externally supplied PCI vector.

    Each region's ECI is the mean PCI of the activities present in it:
        ECI_r = mean(PCI_c for c where M_{rc} = 1)

    Useful when comparing subnational units to a global product space,
    or when the dataset is too small to compute reliable eigenvectors.

    Parameters
    ----------
    mat : array-like (R x C)
        Regional value matrix.
    pci_external : array-like (length C)
        External PCI vector (e.g., global product complexity index).
        When `mat` is a DataFrame and this is a pd.Series with a unique
        index that contains every column label of `mat`, it is aligned
        to `mat.columns` by label; otherwise it is used positionally.
    use_rca : bool
        Compute RCA before binarizing.
    threshold : float
        Binarization threshold.
    standardize : bool
        If True, z-score normalize the resulting ECI.

    Returns
    -------
    (eci, pci_external) as pd.Series (DataFrame input) or ndarrays.

    Raises
    ------
    ValueError
        If `pci_external` is not one-dimensional or its length differs
        from the number of columns in `mat`.

    Notes
    -----
    Regions with no present activity get a raw ECI of 0.0 (before any
    standardization). NaN entries in `pci_external` propagate to every
    region's ECI because 0 * NaN is NaN in the matrix product.
    """
    is_df = isinstance(mat, pd.DataFrame)
    row_index = mat.index if is_df else None
    col_index = mat.columns if is_df else None

    arr = validate_matrix(mat)
    m = binarize(compute_rca(arr) if use_rca else arr, threshold)

    pci_arr = np.array(pci_external, dtype=float)
    if pci_arr.ndim != 1:
        raise ValueError("pci_external must be a one-dimensional vector.")
    if len(pci_arr) != m.shape[1]:
        raise ValueError("Length of pci_external must match number of columns in mat.")

    pci_series = pci_external if isinstance(pci_external, pd.Series) else None
    if (
        is_df
        and pci_series is not None
        and pci_series.index.is_unique
        and col_index.isin(pci_series.index).all()
    ):
        # Labels identify the activities: align by label so a differently
        # ordered PCI Series is not silently matched to the wrong columns.
        pci_series = pci_series.reindex(col_index)
        pci_arr = pci_series.to_numpy(dtype=float)

    row_sums = np.asarray(m.sum(axis=1), dtype=float)
    weighted = np.asarray(m @ pci_arr, dtype=float)
    # Divide only where the region has at least one present activity;
    # the remaining entries keep the 0.0 fill and no 0/0 warning is raised.
    eci_raw = np.zeros(weighted.shape, dtype=float)
    np.divide(weighted, row_sums, out=eci_raw, where=row_sums > 0)

    if standardize:
        eci_raw = normalize_zscore(eci_raw)

    if is_df:
        pci_out = (
            pci_series
            if pci_series is not None
            else pd.Series(pci_arr, index=col_index, name="pci")
        )
        return pd.Series(eci_raw, index=row_index, name="eci_external"), pci_out
    return eci_raw, pci_arr
@@ -0,0 +1,23 @@
1
+ from .rca import rca, rpop, mcp
2
+ from .diversity import diversity, ubiquity, normalized_ubiquity
3
+ from .utils import (
4
+ pivot_to_matrix,
5
+ melt_matrix,
6
+ binarize,
7
+ normalize_zscore,
8
+ normalize_01,
9
+ )
10
+
11
+ __all__ = [
12
+ "rca",
13
+ "rpop",
14
+ "mcp",
15
+ "diversity",
16
+ "ubiquity",
17
+ "normalized_ubiquity",
18
+ "pivot_to_matrix",
19
+ "melt_matrix",
20
+ "binarize",
21
+ "normalize_zscore",
22
+ "normalize_01",
23
+ ]
@@ -0,0 +1,125 @@
1
+ """
2
+ Diversity and ubiquity indicators.
3
+ """
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ from typing import Union
8
+
9
+ from .utils import validate_matrix, safe_divide
10
+ from .rca import rca as compute_rca
11
+
12
+
13
def diversity(
    mat: Union[np.ndarray, pd.DataFrame],
    use_rca: bool = True,
    threshold: float = 1.0,
) -> Union[pd.Series, np.ndarray]:
    """
    Diversity: number of activities in which each region has RCA >= threshold.

    Parameters
    ----------
    mat : array-like (R x C)
        Value matrix or pre-computed binary Mcp matrix.
    use_rca : bool
        If True, compute RCA internally before binarizing; if False,
        `mat` itself is compared against `threshold`.
    threshold : float
        Binarization threshold (inclusive: values >= threshold count).

    Returns
    -------
    pd.Series indexed by region (named "diversity") for DataFrame input;
    a 1-D ndarray of length R otherwise.
    """
    is_df = isinstance(mat, pd.DataFrame)
    index = mat.index if is_df else None

    arr = validate_matrix(mat)
    scores = compute_rca(arr) if use_rca else arr

    # Row sums of the 0/1 presence matrix = activities present per region.
    result = (scores >= threshold).astype(float).sum(axis=1)

    if is_df:
        return pd.Series(result, index=index, name="diversity")
    return result
50
+
51
+
52
def ubiquity(
    mat: Union[np.ndarray, pd.DataFrame],
    use_rca: bool = True,
    threshold: float = 1.0,
) -> Union[pd.Series, np.ndarray]:
    """
    Ubiquity: number of regions where each activity has RCA >= threshold.

    Parameters
    ----------
    mat : array-like (R x C)
        Value matrix or pre-computed binary Mcp matrix.
    use_rca : bool
        If True, compute RCA internally before binarizing; if False,
        `mat` itself is compared against `threshold`.
    threshold : float
        Binarization threshold (inclusive: values >= threshold count).

    Returns
    -------
    pd.Series indexed by activity (named "ubiquity") for DataFrame input;
    a 1-D ndarray of length C otherwise.
    """
    is_df = isinstance(mat, pd.DataFrame)
    columns = mat.columns if is_df else None

    arr = validate_matrix(mat)
    scores = compute_rca(arr) if use_rca else arr

    # Column sums of the 0/1 presence matrix = regions present per activity.
    result = (scores >= threshold).astype(float).sum(axis=0)

    if is_df:
        return pd.Series(result, index=columns, name="ubiquity")
    return result
89
+
90
+
91
def normalized_ubiquity(
    mat: Union[np.ndarray, pd.DataFrame],
    threshold: float = 1.0,
) -> Union[pd.Series, np.ndarray]:
    """
    Normalized ubiquity: ratio of activity's share of total value to
    its share of ubiquity.

        norm_ubiquity_c = (sum_r x_{rc} / sum_{rc} x_{rc}) / (U_c / sum_c U_c)

    Ubiquity U_c is always computed from RCA (RCA >= threshold).

    Parameters
    ----------
    mat : array-like (R x C)
        Value matrix.
    threshold : float
        RCA binarization threshold used for the ubiquity count.

    Returns
    -------
    pd.Series indexed by activity (named "norm_ubiquity") for DataFrame
    input; a 1-D ndarray of length C otherwise.
    """
    is_df = isinstance(mat, pd.DataFrame)
    columns = mat.columns if is_df else None

    arr = validate_matrix(mat)
    total = arr.sum()

    # ubiquity() receives the validated ndarray, so it returns an ndarray;
    # no Series unwrapping is needed here.
    ub_arr = ubiquity(arr, use_rca=True, threshold=threshold)

    value_share = arr.sum(axis=0) / total
    ubiquity_share = safe_divide(ub_arr, ub_arr.sum())

    result = safe_divide(value_share, ubiquity_share)

    if is_df:
        return pd.Series(result, index=columns, name="norm_ubiquity")
    return result
@@ -0,0 +1,83 @@
1
+ """
2
+ Pre-processing utilities for sparse location-activity matrices.
3
+
4
+ References
5
+ ----------
6
+ Hidalgo & Hausmann (2009); OEC Atlas methodology.
7
+ """
8
+
9
+ import numpy as np
10
+ import pandas as pd
11
+ from typing import Union
12
+
13
+ from .utils import validate_matrix, binarize
14
+ from .rca import rca as compute_rca
15
+
16
+
17
def trim_core(
    mat: Union[np.ndarray, pd.DataFrame],
    dmin: int = 1,
    umin: int = 1,
    use_rca: bool = True,
    threshold: float = 1.0,
    max_iter: int = 100,
) -> Union[np.ndarray, pd.DataFrame]:
    """
    Iteratively prune a value matrix to its (dmin, umin)-core.

    Drops locations (rows) with diversity < dmin and activities (columns)
    with ubiquity < umin, where diversity/ubiquity are counted on the
    binary presence matrix (RCA >= threshold by default). Because removing
    a row or column changes the RCA of the remaining cells, the presence
    matrix is recomputed after each pass until the matrix is stable.

    With the defaults (dmin=1, umin=1) only degenerate units are removed:
    locations with no activity above the RCA threshold and activities
    with no location above it. For very sparse networks the literature
    computes complexity on the well-connected core, typically dmin=2,
    umin=2 (Hidalgo & Hausmann 2009; OEC Atlas).

    Parameters
    ----------
    mat : array-like (R x C)
        Value matrix.
    dmin : int
        Minimum diversity for a location to be kept (default 1).
    umin : int
        Minimum ubiquity for an activity to be kept (default 1).
    use_rca : bool
        Compute RCA before binarizing (default True).
    threshold : float
        Binarization threshold (default 1.0).
    max_iter : int
        Safety cap on pruning passes; 0 only strips all-zero rows/columns.

    Returns
    -------
    The surviving sub-matrix, same type as the input. With a DataFrame
    the original labels are preserved, so dropped units can be recovered
    by reindexing (`result.reindex(mat.index)`).
    """
    is_df = isinstance(mat, pd.DataFrame)
    work = mat.astype(float) if is_df else pd.DataFrame(validate_matrix(mat))

    def _drop_empty(frame: pd.DataFrame) -> pd.DataFrame:
        # Positional boolean masks (not labels) so duplicated row/column
        # labels cannot re-select or multiply entries.
        nonzero_rows = (frame.sum(axis=1) > 0).to_numpy()
        nonzero_cols = (frame.sum(axis=0) > 0).to_numpy()
        return frame.loc[nonzero_rows, nonzero_cols]

    # All-zero rows/columns first (always degenerate, whatever the core)
    work = _drop_empty(work)

    for _ in range(max_iter):
        if work.shape[0] == 0 or work.shape[1] == 0:
            break
        source = compute_rca(work.values) if use_rca else work.values
        m = binarize(source, threshold)
        keep_r = np.asarray(m.sum(axis=1) >= dmin)
        keep_c = np.asarray(m.sum(axis=0) >= umin)
        if keep_r.all() and keep_c.all():
            break
        # Select survivors by position: label-based lookup would bring
        # back every row/column sharing a label with a survivor.
        work = work.loc[keep_r, keep_c]
        work = _drop_empty(work)

    return work if is_df else work.values
@@ -0,0 +1,161 @@
1
+ """
2
+ Revealed Comparative Advantage (RCA) / Balassa Index and variants.
3
+
4
+ References
5
+ ----------
6
+ Balassa (1965); Hausmann et al. (2007); Balland & Rigby (2017).
7
+ """
8
+
9
+ import numpy as np
10
+ import pandas as pd
11
+ from typing import Optional, Union
12
+
13
+ from .utils import validate_matrix, safe_divide, binarize
14
+
15
+
16
def rca(
    mat: Union[np.ndarray, pd.DataFrame],
    binary: bool = False,
    threshold: float = 1.0,
) -> Union[np.ndarray, pd.DataFrame]:
    """
    Revealed Comparative Advantage (Balassa Index).

        RCA_{rc} = (x_{rc} / sum_c x_{rc}) / (sum_r x_{rc} / sum_{rc} x_{rc})

    Parameters
    ----------
    mat : array-like (R x C)
        Region × industry (or country × product) value matrix.
    binary : bool
        If True, the result is passed through `binarize` at `threshold`.
    threshold : float
        Binarization threshold (default 1.0).

    Returns
    -------
    Same type as input (DataFrame or ndarray).

    Raises
    ------
    ValueError
        If the matrix sums to zero.
    """
    labelled = isinstance(mat, pd.DataFrame)
    labels = (mat.index, mat.columns) if labelled else None

    values = validate_matrix(mat)
    grand_total = values.sum()
    if grand_total == 0:
        raise ValueError("Matrix sum is zero; cannot compute RCA.")

    # Numerator: share of each activity within its own region (R x C).
    within_region = safe_divide(values, values.sum(axis=1, keepdims=True))
    # Denominator: share of each activity in the overall total (1 x C).
    overall = values.sum(axis=0, keepdims=True) / grand_total

    balassa = safe_divide(within_region, overall)
    if binary:
        balassa = binarize(balassa, threshold)

    if not labelled:
        return balassa
    return pd.DataFrame(balassa, index=labels[0], columns=labels[1])
62
+
63
+
64
def rpop(
    mat: Union[np.ndarray, pd.DataFrame],
    pop: Union[np.ndarray, pd.Series],
    binary: bool = False,
    threshold: float = 1.0,
) -> Union[np.ndarray, pd.DataFrame]:
    """
    Population-normalized RCA (RPOP).

        RPOP_{rc} = (x_{rc} / pop_r) / (sum_r x_{rc} / sum_r pop_r)

    Parameters
    ----------
    mat : array-like (R x C)
        Region × industry value matrix.
    pop : 1-D array-like (length R)
        Population vector for each region, matched to rows by position.
    binary : bool
        If True, binarize at `threshold`.
    threshold : float
        Binarization threshold (default 1.0).

    Returns
    -------
    Same type as input (DataFrame or ndarray).

    Raises
    ------
    ValueError
        If the population sums to zero, or if the length of `pop` differs
        from the number of rows in `mat`.
    """
    is_df = isinstance(mat, pd.DataFrame)
    index = mat.index if is_df else None
    columns = mat.columns if is_df else None

    pop_arr = np.array(pop, dtype=float).reshape(-1, 1)  # R × 1
    world_pop = pop_arr.sum()
    # Mirror rca(): a zero denominator is an input error, not something
    # to divide through.
    if world_pop == 0:
        raise ValueError("Population sum is zero; cannot compute RPOP.")

    arr = validate_matrix(mat)
    if len(pop_arr) != arr.shape[0]:
        raise ValueError("Length of pop must equal number of rows in mat.")

    col_sums = arr.sum(axis=0, keepdims=True)  # 1 × C

    share_rc = safe_divide(arr, pop_arr)  # x_{rc} / pop_r
    share_c = col_sums / world_pop  # X_c / world_pop

    result = safe_divide(share_rc, share_c)

    if binary:
        result = binarize(result, threshold)

    if is_df:
        return pd.DataFrame(result, index=index, columns=columns)
    return result
110
+
111
+
112
def mcp(
    mat: Union[np.ndarray, pd.DataFrame],
    presence_test: str = "rca",
    rca_threshold: float = 1.0,
    rpop_threshold: float = 1.0,
    pop: Optional[Union[np.ndarray, pd.Series]] = None,
) -> Union[np.ndarray, pd.DataFrame]:
    """
    Binary presence matrix (Mcp).

    Parameters
    ----------
    mat : array-like (R x C)
        Raw value matrix.
    presence_test : str
        One of 'rca', 'rpop', 'both' (union of rca and rpop), 'manual'
        (treat mat as already binary; it is binarized at 0.5).
    rca_threshold : float
        RCA binarization threshold.
    rpop_threshold : float
        RPOP binarization threshold.
    pop : array-like (length R), optional
        Required when presence_test is 'rpop' or 'both'.

    Returns
    -------
    pd.DataFrame with the input's labels for DataFrame input; otherwise
    whatever the binarization step returns.

    Raises
    ------
    ValueError
        If `presence_test` is not a recognised mode, or `pop` is missing
        for a mode that needs it.
    """
    # Reject bad arguments up front, before any computation.
    if presence_test not in ("manual", "rca", "rpop", "both"):
        raise ValueError(
            "presence_test must be one of 'rca', 'rpop', 'both', 'manual'."
        )
    if presence_test in ("rpop", "both") and pop is None:
        raise ValueError(f"pop is required for presence_test='{presence_test}'.")

    if presence_test == "manual":
        presence = binarize(mat, 0.5)
    elif presence_test == "rca":
        presence = binarize(rca(mat), rca_threshold)
    elif presence_test == "rpop":
        presence = binarize(rpop(mat, pop), rpop_threshold)
    else:
        # 'both': present if either test flags the cell.
        by_rca = binarize(rca(mat), rca_threshold)
        by_rpop = binarize(rpop(mat, pop), rpop_threshold)
        presence = ((by_rca + by_rpop) > 0).astype(float)

    if not isinstance(mat, pd.DataFrame):
        return presence
    return pd.DataFrame(presence, index=mat.index, columns=mat.columns)