py-geodetector 0.1.3__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- geodetector/geodetector.py +319 -0
- py_geodetector-0.2.1.dist-info/METADATA +95 -0
- py_geodetector-0.2.1.dist-info/RECORD +6 -0
- {py_geodetector-0.1.3.dist-info → py_geodetector-0.2.1.dist-info}/WHEEL +1 -1
- py_geodetector/geodetector.py +0 -225
- py_geodetector-0.1.3.dist-info/METADATA +0 -66
- py_geodetector-0.1.3.dist-info/RECORD +0 -6
- {py_geodetector → geodetector}/__init__.py +0 -0
- {py_geodetector → geodetector}/example_data/disease.csv +0 -0
|
@@ -0,0 +1,319 @@
|
|
|
1
|
+
import warnings
|
|
2
|
+
import numpy as np
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from typing import Sequence, Union, Optional, Tuple, Dict
|
|
5
|
+
import matplotlib.pyplot as plt
|
|
6
|
+
from scipy.stats import f, levene, ncf, ttest_ind
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from pandas.api.types import is_integer_dtype, is_string_dtype, is_object_dtype
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def load_example_data() -> pd.DataFrame:
    """Read the bundled example disease dataset into a DataFrame.

    The CSV is looked up at ``example_data/disease.csv`` inside the directory
    that contains this module.
    """
    csv_path = Path(__file__).parent.joinpath("example_data", "disease.csv")
    return pd.read_csv(csv_path)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class GeoDetector:
    """
    GeoDetector class for spatial statistics.

    Wraps a DataFrame that holds one numerical dependent variable and a set of
    discrete factor columns, and exposes the factor, interaction, ecological
    and risk detectors together with a heatmap of the interaction results.

    References:
        Wang, J. F., Li, X. H., Christakos, G., Liao, Y. L., Zhang, T., Gu, X., & Zheng, X. Y. (2010).
        Geographical detectors-based health risk assessment and its application in the neural tube defects study of the Heshun Region, China.
        International Journal of Geographical Information Science, 24(1), 107-127.
    """

    def __init__(self, df: pd.DataFrame, y: str, factors: Optional[Sequence[str]] = None, alpha: float = 0.05):
        """
        Initialize the GeoDetector instance.

        Args:
            df (pd.DataFrame): The dataset containing both dependent variable and factors.
            y (str): The column name of the dependent variable (numerical).
            factors (Sequence[str], optional): A list of column names for the factors (categorical/stratified).
                A single column name given as a plain string is treated as a one-element list.
                If None, automatically detects discrete columns as factors.
            alpha (float, optional): The significance level for hypothesis testing. Defaults to 0.05.

        Raises:
            ValueError: If ``y`` or a factor is not a column of ``df``, if ``y`` is listed
                as a factor, if a factor is not discrete, or if ``df`` contains nulls.
        """
        self.df = df
        self.y = y
        self.alpha = alpha

        if factors is None:
            # Automatically detect factors: must be discrete (int, str, or object) and not y
            self.factors = [col for col in df.columns if col != y and self._is_discrete(col)]
        elif isinstance(factors, str):
            # list("type") would explode the name into single characters.
            self.factors = [factors]
        else:
            self.factors = list(factors)

        self._check_data(df, y, self.factors)

    def _is_discrete(self, factor: str) -> bool:
        """Check if a factor column has an integer, string or object dtype."""
        dtype = self.df[factor].dtype
        return is_integer_dtype(dtype) or is_string_dtype(dtype) or is_object_dtype(dtype)

    def _check_discrete_factors(self, factors: Sequence[str]):
        """Raise ValueError for the first factor whose column is not discrete."""
        for factor in factors:
            if not self._is_discrete(factor):
                raise ValueError(f"Factor '{factor}' must be a discrete type (int, str, or object). "
                                 f"Current type: {self.df[factor].dtype}. Please discretize it first.")

    def _check_data(self, df: pd.DataFrame, y: str, factors: Sequence[str]):
        """Check data validity: column presence, y/factor overlap, dtypes and nulls."""
        if y not in df.columns:
            raise ValueError(f"Y variable [{y}] is not in data")

        for factor in factors:
            if factor not in df.columns:
                raise ValueError(f"Factor [{factor}] is not in data")

            if y == factor:
                raise ValueError(f"Y variable [{y}] should not be in Factor variables.")

        # Check column data types for provided factors
        self._check_discrete_factors(factors)

        # Nulls anywhere in the frame are rejected, not only in y / factor columns.
        if df.isnull().values.any():
            raise ValueError("Data contains NULL values")

    @classmethod
    def _cal_ssw(cls, df: pd.DataFrame, y: str, factor: Union[str, list], extra_factor: Optional[str] = None) -> Tuple[float, float, float]:
        """
        Calculate the Within Sum of Squares (SSW) and other statistics for the q-statistic using vectorization.

        Args:
            df (pd.DataFrame): Data holding ``y`` and the grouping columns.
            y (str): Column name of the dependent variable.
            factor (str or list): Column name(s) defining the strata.
            extra_factor (str, optional): Additional column to intersect the strata with.

        Returns:
            Tuple[float, float, float]: ``(SSW, sum of squared stratum means,
            sum of sqrt(stratum size) * stratum mean)``.
        """
        group_cols = [factor] if isinstance(factor, str) else list(factor)
        # A factor intersected with itself defines the same strata; group by it once.
        if extra_factor and extra_factor not in group_cols:
            group_cols.append(extra_factor)

        strata = df.groupby(group_cols)[y].agg(['var', 'mean', 'count'])
        # Single-observation strata have an undefined sample variance; they add 0 to SSW.
        within_var = strata['var'].fillna(0)

        strataVarSum = ((strata['count'] - 1) * within_var).sum()
        lamda_1st_sum = (strata['mean'] ** 2).sum()
        lamda_2nd_sum = (np.sqrt(strata['count']) * strata['mean']).sum()

        return strataVarSum, lamda_1st_sum, lamda_2nd_sum

    @classmethod
    def _cal_q(cls, df: pd.DataFrame, y: str, factor: str, extra_factor: Optional[str] = None) -> Tuple[float, float, float]:
        """
        Calculate q-statistic.

        Returns:
            Tuple[float, float, float]: ``(q, lamda_1st_sum, lamda_2nd_sum)`` where
            ``q = 1 - SSW / SST`` and the two sums are passed through from ``_cal_ssw``.
        """
        strataVarSum, lamda_1st_sum, lamda_2nd_sum = cls._cal_ssw(df, y, factor, extra_factor)
        total_var = (df.shape[0] - 1) * df[y].var(ddof=1)
        q = 1 - strataVarSum / total_var
        return q, lamda_1st_sum, lamda_2nd_sum

    def factor_detector(self, factors: Optional[Union[str, Sequence[str]]] = None) -> Union[pd.DataFrame, Tuple[float, float]]:
        """
        Factor detector: detects the spatial stratification heterogeneity of Y.

        Args:
            factors (str or list, optional): Factors to detect. If None, use all factors.
                If a single string, returns (q, p).

        Returns:
            pd.DataFrame or Tuple[float, float]: A frame with rows ``"q statistic"`` and
            ``"p value"`` and one column per factor, or ``(q, p)`` for a single string.
        """
        target_factors = factors if factors is not None else self.factors
        if isinstance(target_factors, str):
            target_factors = [target_factors]

        # Check if factors are discrete
        self._check_discrete_factors(target_factors)

        res_df = pd.DataFrame(index=["q statistic", "p value"], columns=target_factors, dtype="float64")
        n_popu = self.df.shape[0]
        y_var = self.df[self.y].var(ddof=1)

        for factor in target_factors:
            n_stra = self.df[factor].nunique()
            q, lamda_1st_sum, lamda_2nd_sum = self._cal_q(self.df, self.y, factor)

            # p value is the survival function of a non-central F distribution at f_val.
            nc_param = (lamda_1st_sum - np.square(lamda_2nd_sum) / n_popu) / y_var
            f_val = (n_popu - n_stra) * q / ((n_stra - 1) * (1 - q))
            p_val = ncf.sf(f_val, n_stra - 1, n_popu - n_stra, nc=nc_param)

            res_df.loc["q statistic", factor] = q
            res_df.loc["p value", factor] = p_val

        if isinstance(factors, str):
            return res_df.iloc[0, 0], res_df.iloc[1, 0]
        return res_df

    @staticmethod
    def _classify_interaction(i_q, q1, q2):
        """
        Label the interaction of two factors from their q values.

        Args:
            i_q: q of the intersected strata.
            q1, q2: q of each factor on its own.

        Returns:
            One of the five relationship strings, or NaN when no rule applies
            (``i_q`` equal to the larger single-factor q, or a NaN input).
        """
        if i_q <= q1 and i_q <= q2:
            return "Weaken, nonlinear"
        if q1 < i_q < q2 or q2 < i_q < q1:
            return "Weaken, uni-"
        # Tolerant comparison: the q values are computed floats, so an exact
        # `==` would almost never recognise the additive case.
        if np.isclose(i_q, q1 + q2, rtol=1e-9, atol=1e-12):
            return "Independent"
        if i_q > max(q1, q2):
            return "Enhance, nonlinear" if i_q > (q1 + q2) else "Enhance, bi-"
        return np.nan

    @staticmethod
    def _interaction_relationship(df: pd.DataFrame) -> pd.DataFrame:
        """
        Determine the type of interaction relationship.

        Args:
            df (pd.DataFrame): Square frame whose diagonal holds single-factor q values
                and whose lower triangle holds interaction q values.

        Returns:
            pd.DataFrame: Same labels as ``df``; the lower triangle holds the
            relationship label of each pair, every other cell stays NaN.
        """
        out_df = pd.DataFrame(index=df.index, columns=df.columns)
        names = df.index
        for i, f1 in enumerate(names):
            for f2 in names[i + 1:]:
                # Every pair gets a freshly computed label, so nothing can leak
                # over from the previously processed pair.
                out_df.loc[f2, f1] = GeoDetector._classify_interaction(
                    df.loc[f2, f1], df.loc[f1, f1], df.loc[f2, f2]
                )
        return out_df

    def interaction_detector(self, factor1: Optional[str] = None, factor2: Optional[str] = None, relationship: bool = False, factors: Optional[Sequence[str]] = None):
        """
        Interaction detector.

        Args:
            factor1, factor2 (str, optional): If both provided, returns interaction q for just this pair.
            relationship (bool): If True, returns relationship type.
            factors (Sequence[str], optional): Custom list of factors for full matrix calculation.

        Returns:
            Pairwise mode: ``q``, or ``(q, relationship label)`` when ``relationship`` is True.
            Matrix mode: a lower-triangular frame of q values (diagonal = single-factor q),
            or ``(q frame, relationship frame)`` when ``relationship`` is True.

        Raises:
            ValueError: If only one of ``factor1``/``factor2`` is given, or both are the same.
        """
        # If any specific factor is provided, BOTH must be provided
        if factor1 or factor2:
            if not (factor1 and factor2):
                raise ValueError("Both factor1 and factor2 must be provided for pairwise interaction detection.")

            if factor1 == factor2:
                raise ValueError("factor1 and factor2 must be different for interaction detection.")
            self._check_discrete_factors([factor1, factor2])

            q, _, _ = self._cal_q(self.df, self.y, factor1, factor2)
            if not relationship:
                return q

            q1, _, _ = self._cal_q(self.df, self.y, factor1)
            q2, _, _ = self._cal_q(self.df, self.y, factor2)
            return q, self._classify_interaction(q, q1, q2)

        # Full matrix calculation
        target_factors = factors if factors is not None else self.factors
        self._check_discrete_factors(target_factors)

        inter_df = pd.DataFrame(index=target_factors, columns=target_factors, dtype="float64")
        for i, f1 in enumerate(target_factors):
            for j in range(i + 1):
                f2 = target_factors[j]
                q, _, _ = self._cal_q(self.df, self.y, f1, f2)
                inter_df.loc[f1, f2] = q

        if relationship:
            return inter_df, self._interaction_relationship(inter_df)
        return inter_df

    def _ecological_flag(self, ssw1, dfn, ssw2, dfd) -> str:
        """Return 'Y' if the SSW ratio lies below the alpha quantile of F(dfn, dfd), else 'N'."""
        fval = (dfn * (dfd - 1) * ssw1) / (dfd * (dfn - 1) * ssw2)
        return 'Y' if fval < f.ppf(self.alpha, dfn, dfd) else 'N'

    def ecological_detector(self, factor1: Optional[str] = None, factor2: Optional[str] = None, factors: Optional[Sequence[str]] = None) -> Union[pd.DataFrame, str]:
        """
        Ecological detector.

        Args:
            factor1, factor2 (str, optional): If both provided, tests just this pair.
            factors (Sequence[str], optional): Custom list of factors for the full matrix.

        Returns:
            str or pd.DataFrame: ``'Y'``/``'N'`` for a pair, or a frame whose lower
            triangle holds ``'Y'``/``'N'`` (row factor tested against column factor)
            and whose remaining cells are NaN.

        Raises:
            ValueError: If only one of ``factor1``/``factor2`` is given, or both are the same.
        """
        # If any specific factor is provided, BOTH must be provided
        if factor1 or factor2:
            if not (factor1 and factor2):
                raise ValueError("Both factor1 and factor2 must be provided for pairwise ecological detection.")

            if factor1 == factor2:
                raise ValueError("factor1 and factor2 must be different for ecological detection.")
            self._check_discrete_factors([factor1, factor2])

            ssw1, _, _ = self._cal_ssw(self.df, self.y, factor1)
            ssw2, _, _ = self._cal_ssw(self.df, self.y, factor2)
            dfn = self.df[factor1].count() - 1
            dfd = self.df[factor2].count() - 1
            return self._ecological_flag(ssw1, dfn, ssw2, dfd)

        target_factors = factors if factors is not None else self.factors
        self._check_discrete_factors(target_factors)

        # SSW and degrees of freedom depend on one factor only: compute them once
        # per factor instead of once per pair.
        ssw = {}
        dof = {}
        for name in target_factors:
            ssw[name] = self._cal_ssw(self.df, self.y, name)[0]
            dof[name] = self.df[name].count() - 1

        eco_df = pd.DataFrame(index=target_factors, columns=target_factors, dtype="object")
        for i, f1 in enumerate(target_factors):
            for j in range(i):
                f2 = target_factors[j]
                eco_df.loc[f1, f2] = self._ecological_flag(ssw[f1], dof[f1], ssw[f2], dof[f2])

        return eco_df

    def risk_detector(self, factor: Optional[str] = None) -> Union[Dict, Dict[str, Dict]]:
        """
        Risk detector.

        For each factor, computes the mean of Y per stratum and a pairwise t-test
        between strata (Welch's variant when Levene's test rejects equal variances).

        Args:
            factor (str, optional): Single factor to analyse. If None, use all factors.

        Returns:
            dict: ``{"risk": stratum means, "ttest_stra": frame}`` for a single factor,
            or a dict of such dicts keyed by factor name. In ``ttest_stra`` the lower
            triangle holds True/False (means differ at level ``alpha``); the diagonal
            and upper triangle are NaN because those pairs are not tested.
        """
        target_factors = [factor] if factor else self.factors
        self._check_discrete_factors(target_factors)

        risk_result = {}
        for f_name in target_factors:
            risk_mean = self.df.groupby(f_name)[self.y].mean()
            strata = np.sort(self.df[f_name].unique())
            # Extract each stratum's observations once, not once per pair.
            samples = [self.df.loc[self.df[f_name] == s, self.y].values for s in strata]
            # object dtype keeps untested cells as NaN; a bool frame cannot hold a
            # missing marker, so those cells would read as test outcomes.
            t_test_strata = pd.DataFrame(index=strata, columns=strata, dtype=object)

            for i in range(len(strata)):
                for j in range(i + 1, len(strata)):
                    _, p_levene = levene(samples[i], samples[j])
                    equal_var = p_levene >= self.alpha
                    _, p_ttest = ttest_ind(samples[i], samples[j], equal_var=equal_var)
                    t_test_strata.loc[strata[j], strata[i]] = bool(p_ttest <= self.alpha)

            risk_result[f_name] = {"risk": risk_mean, "ttest_stra": t_test_strata}

        return risk_result[factor] if factor else risk_result

    def _plot_text_labels(self, ax, interaction_df, ecological_df, value_fontsize=10):
        """Write each non-NaN q value into its heatmap cell, in red where the ecological cell is 'Y'."""
        n_rows, n_cols = interaction_df.shape
        for i in range(n_rows):
            for j in range(n_cols):
                val = interaction_df.iloc[i, j]
                if pd.isna(val):
                    continue
                mark = f"{val:.2f}"
                # Use ecological_df to determine color if available
                color = "k"
                if ecological_df is not None and i < ecological_df.shape[0] and j < ecological_df.shape[1]:
                    if ecological_df.iloc[i, j] == 'Y':
                        color = "r"
                ax.text(j, i, mark, ha="center", va="center", color=color, fontsize=value_fontsize)

    def plot(self, factors: Optional[Sequence[str]] = None, tick_fontsize=10, value_fontsize=10, colorbar_fontsize=10, show=True):
        """
        Plot interaction and ecological detector results.

        Args:
            factors (Sequence[str], optional): Factors to include in the plot. Defaults to self.factors.
            tick_fontsize: Font size of the axis tick labels.
            value_fontsize: Font size of the q values written into the cells.
            colorbar_fontsize: Font size of the colorbar tick labels.
            show (bool): If True, call ``plt.show()`` before returning.

        Returns:
            The matplotlib Axes holding the heatmap.
        """
        target_factors = factors if factors is not None else self.factors
        self._check_discrete_factors(target_factors)

        inter_df = self.interaction_detector(factors=target_factors)
        eco_df = self.ecological_detector(factors=target_factors)

        fig, ax = plt.subplots(constrained_layout=True)
        im = ax.imshow(inter_df.values, cmap="YlGnBu", vmin=0, vmax=1)
        self._plot_text_labels(ax, inter_df, eco_df, value_fontsize=value_fontsize)

        ax.set_xticks(np.arange(len(target_factors)))
        ax.set_yticks(np.arange(len(target_factors)))
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)

        ax.set_xticklabels(target_factors, fontsize=tick_fontsize)
        ax.set_yticklabels(target_factors, rotation=45, fontsize=tick_fontsize, va='top')

        cbar = fig.colorbar(im, ax=ax, shrink=0.95, pad=0.02, aspect=25, extend="both")
        cbar.ax.tick_params(labelsize=colorbar_fontsize)

        if show:
            plt.show()
        return ax
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: py_geodetector
|
|
3
|
+
Version: 0.2.1
|
|
4
|
+
Summary: A simple Python package for the geodetector
|
|
5
|
+
Project-URL: Homepage, https://github.com/djw-easy/GeoDetector
|
|
6
|
+
Project-URL: Bug Tracker, https://github.com/djw-easy/GeoDetector/issues
|
|
7
|
+
Author-email: djw <djweasy@163.com>
|
|
8
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Requires-Python: >=3.7
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
|
|
14
|
+
# py-geodetector
|
|
15
|
+
|
|
16
|
+
A simple and efficient Python package for the **Geographical Detector** (GeoDetector).
|
|
17
|
+
|
|
18
|
+
## Features
|
|
19
|
+
|
|
20
|
+
- **Four Detectors**: Factor, Interaction, Risk, and Ecological detectors.
|
|
21
|
+
- **High Performance**: Vectorized calculations using Pandas and NumPy for large-scale datasets.
|
|
22
|
+
- **Flexible API**: Supports both batch processing and interactive exploratory analysis.
|
|
23
|
+
- **Auto Detection**: Automatically identifies discrete/categorical variables as factors.
|
|
24
|
+
- **Visualization**: Built-in heatmap for interaction results with statistical significance markers.
|
|
25
|
+
|
|
26
|
+
## Install
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install py-geodetector
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Usage
|
|
33
|
+
|
|
34
|
+
### 1. Data Preparation
|
|
35
|
+
- **Format**: pandas DataFrame.
|
|
36
|
+
- **Y (Dependent Variable)**: Numerical/Continuous.
|
|
37
|
+
- **X (Independent Variable)**: **Categorical/Discrete**. If your $X$ is continuous, you must discretize it first (e.g., using `pd.qcut` or Jenks natural breaks).
|
|
38
|
+
|
|
39
|
+
### 2. Quick Start
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
from geodetector import load_example_data, GeoDetector
|
|
43
|
+
|
|
44
|
+
# Load example disease dataset
|
|
45
|
+
df = load_example_data()
|
|
46
|
+
|
|
47
|
+
# Initialize: Automatically detects discrete columns as factors if 'factors' is not provided
|
|
48
|
+
gd = GeoDetector(df, y='incidence')
|
|
49
|
+
print(f"Detected factors: {gd.factors}")
|
|
50
|
+
|
|
51
|
+
# 1. Factor Detector
|
|
52
|
+
# Batch detection for all factors
|
|
53
|
+
factor_df = gd.factor_detector()
|
|
54
|
+
# Single factor detection: returns (q_value, p_value)
|
|
55
|
+
q, p = gd.factor_detector('type')
|
|
56
|
+
|
|
57
|
+
# 2. Interaction Detector
|
|
58
|
+
# Full matrix calculation
|
|
59
|
+
interaction_df = gd.interaction_detector()
|
|
60
|
+
# Pairwise detection with relationship description
|
|
61
|
+
q_inter, relationship = gd.interaction_detector('type', 'region', relationship=True)
|
|
62
|
+
|
|
63
|
+
# 3. Ecological Detector
|
|
64
|
+
# Determine whether the impacts of two factors are significantly different
|
|
65
|
+
eco_df = gd.ecological_detector()
|
|
66
|
+
|
|
67
|
+
# 4. Risk Detector
|
|
68
|
+
# Compare average Y between sub-groups of a factor
|
|
69
|
+
risk_result = gd.risk_detector('type')
|
|
70
|
+
print(risk_result['risk']) # Mean values for each stratum
|
|
71
|
+
|
|
72
|
+
# 5. Visualization
|
|
73
|
+
# Plot interaction heatmap (red markers indicate significant ecological difference)
|
|
74
|
+
gd.plot(factors=['type', 'region', 'level'])
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### 3. Visualization Result
|
|
78
|
+
The `plot()` method generates a heatmap of the interaction $q$-statistics. Red text indicates that the ecological detector shows a significant difference ($p < 0.05$) between those factors.
|
|
79
|
+
|
|
80
|
+

|
|
81
|
+
|
|
82
|
+
## References
|
|
83
|
+
|
|
84
|
+
```bibtex
|
|
85
|
+
@article{wang2010geographical,
|
|
86
|
+
title={Geographical detectors-based health risk assessment and its application in the neural tube defects study of the Heshun Region, China},
|
|
87
|
+
author={Wang, Jin-Feng and Li, Xin-Hu and Christakos, George and Liao, Yi-Lan and Zhang, Tin and Gu, Xue and Zheng, Xiao-Ying},
|
|
88
|
+
journal={International Journal of Geographical Information Science},
|
|
89
|
+
volume={24},
|
|
90
|
+
number={1},
|
|
91
|
+
pages={107-127},
|
|
92
|
+
year={2010},
|
|
93
|
+
publisher={Taylor \& Francis}
|
|
94
|
+
}
|
|
95
|
+
```
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
geodetector/__init__.py,sha256=bhyJq1ipXNRhG-zPbF_5YUbJ_sana5rJ-FG-ogBNh_U,57
|
|
2
|
+
geodetector/geodetector.py,sha256=9A9rBgTX7Gc4u6I4UzGmP8fYucljJBYQNygT5LDvGes,14917
|
|
3
|
+
geodetector/example_data/disease.csv,sha256=sodkE21Xw-eSlGWxLE1by7hSHQk2FjNxb4ncWdyWLmE,2231
|
|
4
|
+
py_geodetector-0.2.1.dist-info/METADATA,sha256=UYQvydiokR95zpjj_tLxzAC7QuIrXKJw-_HISfUfkT4,3320
|
|
5
|
+
py_geodetector-0.2.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
6
|
+
py_geodetector-0.2.1.dist-info/RECORD,,
|
py_geodetector/geodetector.py
DELETED
|
@@ -1,225 +0,0 @@
|
|
|
1
|
-
import warnings
|
|
2
|
-
import numpy as np
|
|
3
|
-
import pandas as pd
|
|
4
|
-
from typing import Sequence
|
|
5
|
-
import matplotlib.pyplot as plt
|
|
6
|
-
from scipy.stats import f, levene, ncf, ttest_ind
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
from pathlib import Path
|
|
10
|
-
def load_example_data():
    """Read the bundled example disease dataset into a DataFrame.

    The CSV is looked up at ``example_data/disease.csv`` inside the directory
    that contains this module.
    """
    csv_path = Path(__file__).parent.joinpath("example_data", "disease.csv")
    return pd.read_csv(csv_path)
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
def _plot_value(ax, interaction_df, ecological_df, value_fontsize=10):
|
|
17
|
-
length = len(interaction_df.index)
|
|
18
|
-
for i in range(length):
|
|
19
|
-
for j in range(length):
|
|
20
|
-
if not pd.isna(interaction_df.iloc[i, j]):
|
|
21
|
-
num = str(round(interaction_df.iloc[i, j], 2))
|
|
22
|
-
mark = num[-2:] if 3 == len(num) else num[-3:]
|
|
23
|
-
if 'Y'==ecological_df.iloc[i, j]:
|
|
24
|
-
ax.text(j, i, mark, ha="center", va="center", color="r", fontsize=value_fontsize)
|
|
25
|
-
else:
|
|
26
|
-
ax.text(j, i, mark, ha="center", va="center", color="k", fontsize=value_fontsize)
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
class GeoDetector(object):
    """
    Geographical detector over a DataFrame with a numerical dependent variable
    and discrete factor columns.

    Results of the factor, interaction and ecological detectors are cached on
    the instance (``factor_df``, ``interaction_df``, ``interaction_relationship_df``,
    ``ecological_df``) and reused by ``plot``.
    """

    def __init__(self, df: pd.DataFrame, y: str, factors: Sequence[str], alpha=0.05):
        """
        Args:
            df: Dataset containing the dependent variable and the factors.
            y: Column name of the dependent variable.
            factors: Column names of the factors.
            alpha: Significance level used by the ecological and risk detectors.

        Raises:
            ValueError: If a factor or ``y`` is not a column of ``df``, if ``y`` is
                listed as a factor, or if ``df`` contains nulls.
        """
        self.df = df
        self.y = y
        self.factors = factors
        self.alpha = alpha
        self._check_data(df, y, factors)
        self.factor_df, self.interaction_df, self.ecological_df = None, None, None
        # Defined up front so the attribute exists even if the relationship
        # matrix is never requested.
        self.interaction_relationship_df = None

    def _check_data(self, df, y, factors):
        """Validate column presence, factor dtypes (warning only), y/factor overlap and nulls."""
        for factor in factors:
            if factor not in df.columns:
                raise ValueError('Factor [{}] is not in data'.format(factor))

        for factor in factors:
            # Check the column's data type
            if df[factor].dtype not in ['int64', 'int32', 'int16', 'int8',
                                        'uint64', 'uint32', 'uint16', 'uint8',
                                        'object', 'string']:
                # Warn when the data type is neither an integer nor a string type
                warnings.warn(f"Factor '{factor}' is not of type 'int' or 'str'.")

        if y not in df.columns:
            raise ValueError('Y variable [{}] is not in data'.format(y))

        for factor in factors:
            if y == factor:
                raise ValueError("Y variable should not in Factor variables. ")

        has_null = df.isnull().values.any()
        if has_null:
            raise ValueError("data hava some objects with value NULL")

    @classmethod
    def _cal_ssw(cls, df: pd.DataFrame, y, factor, extra_factor=None):
        """
        Compute the within-strata sum of squares and the two sums used for the
        non-centrality parameter.

        Args:
            df: Data holding ``y`` and the grouping columns.
            y: Column name of the dependent variable.
            factor: Column name defining the strata.
            extra_factor: Optional second column to intersect the strata with.

        Returns:
            tuple: ``(SSW, sum of squared stratum means,
            sum of sqrt(stratum size) * stratum mean)``.
        """
        group_cols = [factor]
        # A factor intersected with itself defines the same strata; group by it once.
        if extra_factor is not None and extra_factor != factor:
            group_cols.append(extra_factor)

        strata = df.groupby(group_cols)[y].agg(['var', 'mean', 'count'])
        # Single-observation strata have an undefined sample variance; they add 0 to SSW.
        within_var = strata['var'].fillna(0)

        strataVarSum = ((strata['count'] - 1) * within_var).sum()
        lamda_1st_sum = (strata['mean'] ** 2).sum()
        lamda_2nd_sum = (np.sqrt(strata['count']) * strata['mean']).sum()
        return strataVarSum, lamda_1st_sum, lamda_2nd_sum

    @classmethod
    def _cal_q(cls, df, y, factor, extra_factor=None):
        """Return ``(q, lamda_1st_sum, lamda_2nd_sum)`` with ``q = 1 - SSW / SST``."""
        strataVarSum, lamda_1st_sum, lamda_2nd_sum = cls._cal_ssw(df, y, factor, extra_factor)
        TotalVar = (df.shape[0] - 1) * df[y].var(ddof=1)
        q = 1 - strataVarSum / TotalVar
        return q, lamda_1st_sum, lamda_2nd_sum

    def factor_dector(self):
        """
        Factor detector: q statistic and p value of every factor.

        Returns:
            pd.DataFrame: Rows ``"q statistic"`` and ``"p value"``, one column per
            factor. Also stored as ``self.factor_df``.
        """
        # float64 so the computed values are stored without loss of precision.
        self.factor_df = pd.DataFrame(index=["q statistic", "p value"], columns=self.factors, dtype="float64")
        N_var = self.df[self.y].var(ddof=1)
        N_popu = self.df.shape[0]
        for factor in self.factors:
            N_stra = self.df[factor].unique().shape[0]
            q, lamda_1st_sum, lamda_2nd_sum = self._cal_q(self.df, self.y, factor)

            # non-centrality parameter
            lamda = (lamda_1st_sum - np.square(lamda_2nd_sum) / N_popu) / N_var
            # F value
            F_value = (N_popu - N_stra) * q / ((N_stra - 1) * (1 - q))
            # p value from the non-central F distribution
            p_value = ncf.sf(F_value, N_stra - 1, N_popu - N_stra, nc=lamda)

            self.factor_df.loc["q statistic", factor] = q
            self.factor_df.loc["p value", factor] = p_value
        return self.factor_df

    # Correctly spelled alias; the original name is kept for existing callers.
    factor_detector = factor_dector

    @staticmethod
    def _classify_interaction(i_q, q1, q2):
        """
        Label the interaction of two factors from their q values.

        Returns one of the five relationship strings, or NaN when no rule
        applies (``i_q`` equal to the larger single-factor q, or a NaN input).
        """
        if i_q <= q1 and i_q <= q2:
            return "Weaken, nonlinear"
        if min(q1, q2) < i_q < max(q1, q2):
            return "Weaken, uni-"
        # Tolerant comparison: the q values are computed floats. Checked before
        # the "Enhance" rule so the additive case is not overwritten by it.
        if np.isclose(i_q, q1 + q2, rtol=1e-9, atol=1e-12):
            return "Independent"
        if i_q > max(q1, q2):
            return "Enhance, nonlinear" if i_q > (q1 + q2) else "Enhance, bi-"
        return np.nan

    @classmethod
    def _interaction_relationship(cls, df):
        """
        Build the relationship matrix from an interaction q matrix.

        Args:
            df: Square frame with single-factor q on the diagonal and interaction
                q in the lower triangle.

        Returns:
            pd.DataFrame: Same labels as ``df``; the lower triangle holds the
            relationship label of each pair, every other cell stays NaN.
        """
        out_df = pd.DataFrame(index=df.index, columns=df.columns)
        length = len(df.index)
        for i in range(length):
            for j in range(i + 1, length):
                factor1, factor2 = df.index[i], df.index[j]
                out_df.loc[factor2, factor1] = cls._classify_interaction(
                    df.loc[factor2, factor1], df.loc[factor1, factor1], df.loc[factor2, factor2]
                )
        return out_df

    def interaction_detector(self, relationship=False):
        """
        Interaction detector: q of every pair of intersected factors.

        Args:
            relationship: If True, also return the relationship matrix.

        Returns:
            pd.DataFrame, or ``(q frame, relationship frame)`` when ``relationship``
            is True. The q frame is lower-triangular with single-factor q on the
            diagonal and is stored as ``self.interaction_df``.
        """
        self.interaction_df = pd.DataFrame(index=self.factors, columns=self.factors, dtype="float64")
        for i, f1 in enumerate(self.factors):
            for j in range(i + 1):
                f2 = self.factors[j]
                q, _, _ = self._cal_q(self.df, self.y, f1, f2)
                self.interaction_df.loc[f1, f2] = q

        if relationship:
            self.interaction_relationship_df = self._interaction_relationship(self.interaction_df)
            return self.interaction_df, self.interaction_relationship_df
        return self.interaction_df

    def ecological_detector(self):
        """
        Ecological detector: compares the SSW of every pair of factors with an F test.

        Returns:
            pd.DataFrame: Lower triangle holds ``'Y'`` when the SSW ratio (row factor
            over column factor) lies below the alpha quantile of F(dfn, dfd), else
            ``'N'``; remaining cells are NaN. Stored as ``self.ecological_df``.
        """
        # object dtype: the cells hold the strings 'Y' / 'N'.
        self.ecological_df = pd.DataFrame(index=self.factors, columns=self.factors, dtype="object")

        # SSW and degrees of freedom depend on one factor only: compute once each.
        ssw = {}
        dof = {}
        for name in self.factors:
            ssw[name] = self._cal_ssw(self.df, self.y, name)[0]
            dof[name] = self.df[name].notna().sum() - 1

        for i in range(1, len(self.factors)):
            f1 = self.factors[i]
            dfn = dof[f1]
            for j in range(0, i):
                f2 = self.factors[j]
                dfd = dof[f2]
                fval = (dfn * (dfd - 1) * ssw[f1]) / (dfd * (dfn - 1) * ssw[f2])
                # Numerator and denominator degrees of freedom, in that order.
                flag = 'Y' if fval < f.ppf(self.alpha, dfn, dfd) else 'N'
                self.ecological_df.loc[f1, f2] = flag
        return self.ecological_df

    def risk_detector(self):
        """
        Compares the difference of average values between sub-groups

        Returns:
            dict: For each factor, ``{"risk": stratum means, "ttest_stra": frame}``.
            The lower triangle of ``ttest_stra`` holds True/False (means differ at
            level ``alpha``); the diagonal and upper triangle are NaN.

        Reference:
            https://github.com/gsnrguo/QGIS-Geographical-detector/blob/main/gd_core/geodetector.py
        """
        risk_result = dict()
        for factor in self.factors:
            risk_name = self.df.groupby(factor)[self.y].mean()
            strata = np.sort(self.df[factor].unique())
            # Extract each stratum's observations once, not once per pair.
            samples = [np.array(self.df.loc[self.df[factor] == s, [self.y]]).reshape(-1) for s in strata]
            # object dtype holds both the boolean outcomes and NaN for untested cells.
            t_test_strata = pd.DataFrame(index=strata, columns=strata, dtype=object)
            for i in range(len(strata) - 1):
                for j in range(i + 1, len(strata)):
                    # hypothesis testing of variance homogeneity
                    levene_result = levene(samples[i], samples[j])
                    if levene_result.pvalue < self.alpha:
                        # variance non-homogeneous
                        ttest_result = ttest_ind(samples[i], samples[j], equal_var=False)
                    else:
                        ttest_result = ttest_ind(samples[i], samples[j])

                    t_test_strata.iloc[j, i] = bool(ttest_result.pvalue <= self.alpha)

            risk_factor = dict(risk=risk_name, ttest_stra=t_test_strata)
            risk_result[factor] = risk_factor
        return risk_result

    def plot(self, tick_fontsize=10, value_fontsize=10, colorbar_fontsize=10, show=True):
        """
        Draw the interaction q matrix as a heatmap, with values in red where the
        ecological detector reports 'Y'.

        The interaction and ecological detectors are run first if their cached
        results are missing.

        Args:
            tick_fontsize: Font size of the axis tick labels.
            value_fontsize: Font size of the values written into the cells.
            colorbar_fontsize: Font size of the colorbar tick labels.
            show: If True, call ``plt.show()`` before returning.

        Returns:
            The matplotlib Axes holding the heatmap.
        """
        if self.interaction_df is None:
            self.interaction_detector()
        if self.ecological_df is None:
            self.ecological_detector()

        fig, ax = plt.subplots(constrained_layout=True)

        im = ax.imshow(self.interaction_df.values, cmap="YlGnBu", vmin=0, vmax=1)
        _plot_value(ax, self.interaction_df, self.ecological_df, value_fontsize=value_fontsize)

        ax.set_xticks(np.arange(len(self.factors)))
        ax.set_yticks(np.arange(len(self.factors)))
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)

        ax.set_xticklabels(self.factors, fontsize=tick_fontsize)
        ax.set_yticklabels(self.factors, rotation=45, fontsize=tick_fontsize)
        ax.tick_params(axis='y', pad=0.1)

        colorbar = fig.colorbar(im, ax=ax, shrink=0.9, pad=0.01, aspect=25, extend="both")
        colorbar.ax.tick_params(labelsize=colorbar_fontsize)

        if show:
            plt.show()
        return ax
|
|
@@ -1,66 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.3
|
|
2
|
-
Name: py_geodetector
|
|
3
|
-
Version: 0.1.3
|
|
4
|
-
Summary: A simple Python package for the geodetector
|
|
5
|
-
Project-URL: Homepage, https://github.com/djw-easy/GeoDetector
|
|
6
|
-
Project-URL: Bug Tracker, https://github.com/djw-easy/GeoDetector/issues
|
|
7
|
-
Author-email: djw <djweasy@163.com>
|
|
8
|
-
Classifier: License :: OSI Approved :: MIT License
|
|
9
|
-
Classifier: Operating System :: OS Independent
|
|
10
|
-
Classifier: Programming Language :: Python :: 3
|
|
11
|
-
Requires-Python: >=3.7
|
|
12
|
-
Description-Content-Type: text/markdown
|
|
13
|
-
|
|
14
|
-
# A simple Python package for the geodetector
|
|
15
|
-
|
|
16
|
-
# Install
|
|
17
|
-
|
|
18
|
-
```
|
|
19
|
-
pip install py-geodetector
|
|
20
|
-
```
|
|
21
|
-
|
|
22
|
-
# Usage
|
|
23
|
-
|
|
24
|
-
A quick example of geodetector usage is given in the ./example.ipynb.
|
|
25
|
-
|
|
26
|
-
```python
|
|
27
|
-
from py_geodetector import load_example_data, GeoDetector
|
|
28
|
-
|
|
29
|
-
# load example data
|
|
30
|
-
df = load_example_data()
|
|
31
|
-
|
|
32
|
-
gd = GeoDetector(df, y='incidence', factors=['type', 'region', 'level'])
|
|
33
|
-
# factor detect
|
|
34
|
-
factor_df = gd.factor_dector()
|
|
35
|
-
|
|
36
|
-
# interaction detect
|
|
37
|
-
interaction_df = gd.interaction_detector()
|
|
38
|
-
# or you can generate the interaction relationship at the same time
|
|
39
|
-
interaction_df, interaction_relationship_df = gd.interaction_detector(relationship=True)
|
|
40
|
-
|
|
41
|
-
# ecological detect
|
|
42
|
-
ecological_df = gd.ecological_detector()
|
|
43
|
-
|
|
44
|
-
# risk detect
|
|
45
|
-
risk_result = gd.risk_detector()
|
|
46
|
-
|
|
47
|
-
# plot
|
|
48
|
-
# use a heatmap to visualize the interaction detection result,
|
|
49
|
-
# red text means that the ecological detection results show a significant difference
|
|
50
|
-
gd.plot(value_fontsize=14, tick_fontsize=16, colorbar_fontsize=14);
|
|
51
|
-
```
|
|
52
|
-
|
|
53
|
-
# Reference
|
|
54
|
-
|
|
55
|
-
```
|
|
56
|
-
@article{wang2010geographical,
|
|
57
|
-
title={Geographical detectors-based health risk assessment and its application in the neural tube defects study of the Heshun Region, China},
|
|
58
|
-
author={Wang, Jin-Feng and Li, Xin-Hu and Christakos, George and Liao, Yi-Lan and Zhang, Tin and Gu, Xue and Zheng, Xiao-Ying},
|
|
59
|
-
journal={International Journal of Geographical Information Science},
|
|
60
|
-
volume={24},
|
|
61
|
-
number={1},
|
|
62
|
-
pages={107-127},
|
|
63
|
-
year={2010},
|
|
64
|
-
publisher={Taylor \& Francis}
|
|
65
|
-
}
|
|
66
|
-
```
|
|
@@ -1,6 +0,0 @@
|
|
|
1
|
-
py_geodetector/__init__.py,sha256=bhyJq1ipXNRhG-zPbF_5YUbJ_sana5rJ-FG-ogBNh_U,57
|
|
2
|
-
py_geodetector/geodetector.py,sha256=AmIt70OpMEaNQxKAvI-QiFwjfs-Sa-nb89OD12gxiIQ,9911
|
|
3
|
-
py_geodetector/example_data/disease.csv,sha256=sodkE21Xw-eSlGWxLE1by7hSHQk2FjNxb4ncWdyWLmE,2231
|
|
4
|
-
py_geodetector-0.1.3.dist-info/METADATA,sha256=Ngr3o6q8DpBQq5LriVXchSKvTp_zHly3YgkgZR14XOQ,1901
|
|
5
|
-
py_geodetector-0.1.3.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
6
|
-
py_geodetector-0.1.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|