py-geodetector 0.1.3__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,319 @@
1
+ import warnings
2
+ import numpy as np
3
+ import pandas as pd
4
+ from typing import Sequence, Union, Optional, Tuple, Dict
5
+ import matplotlib.pyplot as plt
6
+ from scipy.stats import f, levene, ncf, ttest_ind
7
+ from pathlib import Path
8
+ from pandas.api.types import is_integer_dtype, is_string_dtype, is_object_dtype
9
+
10
+
11
def load_example_data() -> pd.DataFrame:
    """Read the bundled ``example_data/disease.csv`` file into a DataFrame.

    The CSV is located relative to the directory containing this module.
    """
    csv_path = Path(__file__).parent.joinpath("example_data", "disease.csv")
    return pd.read_csv(csv_path)
16
+
17
+
18
class GeoDetector:
    """
    GeoDetector class for spatial statistics.

    Wraps a DataFrame holding one numerical dependent variable ``y`` and a set
    of discrete (stratified) factor columns, and exposes the four detectors
    (factor, interaction, ecological, risk) plus a heatmap plot.

    References:
        Wang, J. F., Li, X. H., Christakos, G., Liao, Y. L., Zhang, T., Gu, X., & Zheng, X. Y. (2010).
        Geographical detectors-based health risk assessment and its application in the neural tube defects study of the Heshun Region, China.
        International Journal of Geographical Information Science, 24(1), 107-127.
    """

    def __init__(self, df: pd.DataFrame, y: str, factors: Optional[Sequence[str]] = None, alpha: float = 0.05):
        """
        Initialize the GeoDetector instance.

        Args:
            df (pd.DataFrame): The dataset containing both dependent variable and factors.
            y (str): The column name of the dependent variable (numerical).
            factors (Sequence[str], optional): A list of column names for the factors (categorical/stratified).
                If None, every column other than ``y`` whose dtype is integer,
                string or object is used as a factor.
            alpha (float, optional): The significance level for hypothesis testing. Defaults to 0.05.

        Raises:
            ValueError: If ``y`` or a factor is missing from ``df``, if ``y`` is
                listed as a factor, if a factor is not discrete, or if ``df``
                contains any null value.
        """
        self.df = df
        self.y = y
        self.alpha = alpha

        if factors is None:
            # Auto-detection: keep the column order of the DataFrame.
            self.factors = [col for col in df.columns if col != y and self._is_discrete(col)]
        else:
            self.factors = list(factors)

        self._check_data(df, y, self.factors)

    def _is_discrete(self, factor: str) -> bool:
        """Return True when the column's dtype is integer, string or object."""
        dtype = self.df[factor].dtype
        return is_integer_dtype(dtype) or is_string_dtype(dtype) or is_object_dtype(dtype)

    def _check_discrete_factors(self, factors: Sequence[str]):
        """Raise ValueError for the first factor whose dtype is not discrete."""
        for factor in factors:
            if not self._is_discrete(factor):
                raise ValueError(f"Factor '{factor}' must be a discrete type (int, str, or object). "
                                 f"Current type: {self.df[factor].dtype}. Please discretize it first.")

    def _check_data(self, df: pd.DataFrame, y: str, factors: Sequence[str]):
        """Validate column presence, factor dtypes and absence of nulls.

        Raises:
            ValueError: On the first violated condition.
        """
        if y not in df.columns:
            raise ValueError(f"Y variable [{y}] is not in data")

        for factor in factors:
            if factor not in df.columns:
                raise ValueError(f"Factor [{factor}] is not in data")

            if y == factor:
                raise ValueError(f"Y variable [{y}] should not be in Factor variables.")

        # Check column data types for provided factors
        self._check_discrete_factors(factors)

        # The null check covers the whole DataFrame, not only the used columns.
        if df.isnull().values.any():
            raise ValueError("Data contains NULL values")

    @classmethod
    def _cal_ssw(cls, df: pd.DataFrame, y: str, factor: Union[str, list], extra_factor: Optional[str] = None) -> Tuple[float, float, float]:
        """
        Calculate the within-strata sum of squares (SSW) and the two sums used
        for the non-centrality parameter, using a single groupby aggregation.

        Args:
            df: Data containing ``y`` and the grouping columns.
            y: Name of the dependent variable column.
            factor: One column name, or a list of column names, to stratify by.
            extra_factor: Optional additional column to stratify by (the strata
                are then the combinations of all grouping columns).

        Returns:
            Tuple ``(ssw, sum of squared stratum means,
            sum of sqrt(stratum size) * stratum mean)``.
        """
        group_cols = [factor] if isinstance(factor, str) else list(factor)
        # Skip a key that is already present: grouping by the same column twice
        # yields the same strata, so the duplicate only adds work.
        if extra_factor and extra_factor not in group_cols:
            group_cols.append(extra_factor)

        agg_df = df.groupby(group_cols)[y].agg(['var', 'mean', 'count'])
        # A single-observation stratum has an undefined sample variance (NaN);
        # it contributes nothing to the within sum of squares.
        agg_df['var'] = agg_df['var'].fillna(0)

        strataVarSum = ((agg_df['count'] - 1) * agg_df['var']).sum()
        lamda_1st_sum = (agg_df['mean'] ** 2).sum()
        lamda_2nd_sum = (np.sqrt(agg_df['count']) * agg_df['mean']).sum()

        return strataVarSum, lamda_1st_sum, lamda_2nd_sum

    @classmethod
    def _cal_q(cls, df: pd.DataFrame, y: str, factor: str, extra_factor: Optional[str] = None) -> Tuple[float, float, float]:
        """Calculate the q-statistic ``1 - SSW / SST`` for the given stratification.

        Returns:
            Tuple ``(q, lamda_1st_sum, lamda_2nd_sum)``; the last two are passed
            through unchanged from ``_cal_ssw``.
        """
        strataVarSum, lamda_1st_sum, lamda_2nd_sum = cls._cal_ssw(df, y, factor, extra_factor)
        total_var = (df.shape[0] - 1) * df[y].var(ddof=1)
        q = 1 - strataVarSum / total_var
        return q, lamda_1st_sum, lamda_2nd_sum

    def factor_detector(self, factors: Optional[Union[str, Sequence[str]]] = None) -> Union[pd.DataFrame, Tuple[float, float]]:
        """
        Factor detector: detects the spatial stratification heterogeneity of Y.

        Args:
            factors (str or list, optional): Factors to detect. If None, use all factors.
                If a single string, returns (q, p).

        Returns:
            A DataFrame with rows ``"q statistic"`` and ``"p value"`` and one
            column per factor, or the tuple ``(q, p)`` when ``factors`` is a
            single string. The p value is the survival function of the
            non-central F distribution (``scipy.stats.ncf``) at the F value
            derived from q.

        Raises:
            ValueError: If a requested factor is not discrete.
        """
        target_factors = factors if factors is not None else self.factors
        if isinstance(target_factors, str):
            target_factors = [target_factors]

        # Check if factors are discrete
        self._check_discrete_factors(target_factors)

        res_df = pd.DataFrame(index=["q statistic", "p value"], columns=target_factors, dtype="float64")
        n_popu = self.df.shape[0]
        y_var = self.df[self.y].var(ddof=1)

        for factor in target_factors:
            n_stra = self.df[factor].nunique()
            q, lamda_1st_sum, lamda_2nd_sum = self._cal_q(self.df, self.y, factor)

            nc_param = (lamda_1st_sum - np.square(lamda_2nd_sum) / n_popu) / y_var
            f_val = (n_popu - n_stra) * q / ((n_stra - 1) * (1 - q))
            p_val = ncf.sf(f_val, n_stra - 1, n_popu - n_stra, nc=nc_param)

            res_df.loc["q statistic", factor] = q
            res_df.loc["p value", factor] = p_val

        if isinstance(factors, str):
            return res_df.iloc[0, 0], res_df.iloc[1, 0]
        return res_df

    @staticmethod
    def _classify_interaction(i_q, q1, q2):
        """Map an interaction q and the two single-factor q values to a label.

        Returns NaN when any input is NaN. The branch order reproduces the
        labels of the previous if/elif chain for every case it handled; the one
        case it left unassigned (interaction q equal to the larger single q
        while above the smaller one) is labelled "Weaken, uni-".
        """
        if pd.isna(i_q) or pd.isna(q1) or pd.isna(q2):
            return np.nan

        low, high = min(q1, q2), max(q1, q2)
        total = q1 + q2

        # Checked first because it overrides every other label.
        if i_q > total:
            return "Enhance, nonlinear"
        if i_q <= low:
            return "Weaken, nonlinear"
        if low < i_q < high:
            return "Weaken, uni-"
        if i_q == total:
            return "Independent"
        if i_q > high:
            return "Enhance, bi-"
        # Remaining case: i_q == high (and i_q > low). The interaction adds
        # nothing over the stronger factor, so it is not an enhancement.
        return "Weaken, uni-"

    @staticmethod
    def _interaction_relationship(df: pd.DataFrame) -> pd.DataFrame:
        """Determine the type of interaction relationship.

        Args:
            df: Square frame indexed and columned by factor name, holding the
                single-factor q on the diagonal and the interaction q in the
                lower triangle (``df.loc[f2, f1]`` for ``f2`` after ``f1``).

        Returns:
            Frame of the same shape whose lower triangle holds the relationship
            label; all other cells (and pairs with NaN inputs) stay NaN.
        """
        out_df = pd.DataFrame(index=df.index, columns=df.columns)
        factors = df.index
        for i, f1 in enumerate(factors):
            for j in range(i + 1, len(factors)):
                f2 = factors[j]
                # The label is computed afresh for every pair so that a result
                # can never leak from the previous pair.
                rel = GeoDetector._classify_interaction(
                    df.loc[f2, f1], df.loc[f1, f1], df.loc[f2, f2]
                )
                if isinstance(rel, str):
                    out_df.loc[f2, f1] = rel
        return out_df

    def interaction_detector(self, factor1: Optional[str] = None, factor2: Optional[str] = None, relationship: bool = False, factors: Optional[Sequence[str]] = None):
        """
        Interaction detector.

        Args:
            factor1, factor2 (str, optional): If both provided, returns interaction q for just this pair.
            relationship (bool): If True, returns relationship type.
            factors (Sequence[str], optional): Custom list of factors for full matrix calculation.

        Returns:
            Pairwise mode: ``q``, or ``(q, relationship_label)`` when
            ``relationship`` is True.
            Matrix mode: a lower-triangular DataFrame of q values (diagonal =
            single-factor q), or ``(q_frame, relationship_frame)`` when
            ``relationship`` is True.

        Raises:
            ValueError: If only one of ``factor1``/``factor2`` is given, if they
                are equal, or if a factor is not discrete.
        """
        # If any specific factor is provided, BOTH must be provided
        if factor1 or factor2:
            if not (factor1 and factor2):
                raise ValueError("Both factor1 and factor2 must be provided for pairwise interaction detection.")

            if factor1 == factor2:
                raise ValueError("factor1 and factor2 must be different for interaction detection.")
            self._check_discrete_factors([factor1, factor2])

            q, _, _ = self._cal_q(self.df, self.y, factor1, factor2)
            if not relationship:
                return q

            q1, _, _ = self._cal_q(self.df, self.y, factor1)
            q2, _, _ = self._cal_q(self.df, self.y, factor2)
            # Build the 2x2 lower-triangular layout _interaction_relationship expects.
            temp_df = pd.DataFrame({factor1: [q1, q], factor2: [np.nan, q2]}, index=[factor1, factor2])
            rel_df = self._interaction_relationship(temp_df)
            return q, rel_df.loc[factor2, factor1]

        # Full matrix calculation
        target_factors = factors if factors is not None else self.factors
        self._check_discrete_factors(target_factors)

        inter_df = pd.DataFrame(index=target_factors, columns=target_factors, dtype="float64")
        for i, f1 in enumerate(target_factors):
            for j in range(i + 1):
                f2 = target_factors[j]
                q, _, _ = self._cal_q(self.df, self.y, f1, f2)
                inter_df.loc[f1, f2] = q

        if relationship:
            return inter_df, self._interaction_relationship(inter_df)
        return inter_df

    def _ecological_flag(self, ssw1, dfn, ssw2, dfd) -> str:
        """Return 'Y' when the SSW ratio is below the lower ``alpha`` F quantile, else 'N'."""
        fval = (dfn * (dfd - 1) * ssw1) / (dfd * (dfn - 1) * ssw2)
        return 'Y' if fval < f.ppf(self.alpha, dfn, dfd) else 'N'

    def ecological_detector(self, factor1: Optional[str] = None, factor2: Optional[str] = None, factors: Optional[Sequence[str]] = None) -> Union[pd.DataFrame, str]:
        """Ecological detector.

        Compares the within-strata sums of squares of two factors with an F
        ratio and flags the pair 'Y' when the ratio falls below
        ``scipy.stats.f.ppf(alpha, dfn, dfd)``, otherwise 'N'.

        Args:
            factor1, factor2 (str, optional): If both provided, test just this
                pair (``factor1`` in the numerator) and return the flag.
            factors (Sequence[str], optional): Factors for the full matrix.
                Defaults to ``self.factors``.

        Returns:
            'Y' or 'N' in pairwise mode; otherwise an object-dtype DataFrame
            whose strict lower triangle holds the flags (row factor in the
            numerator) and whose remaining cells are NaN.

        Raises:
            ValueError: If only one of ``factor1``/``factor2`` is given, if they
                are equal, or if a factor is not discrete.
        """
        # If any specific factor is provided, BOTH must be provided
        if factor1 or factor2:
            if not (factor1 and factor2):
                raise ValueError("Both factor1 and factor2 must be provided for pairwise ecological detection.")

            if factor1 == factor2:
                raise ValueError("factor1 and factor2 must be different for ecological detection.")
            self._check_discrete_factors([factor1, factor2])

            ssw1, _, _ = self._cal_ssw(self.df, self.y, factor1)
            ssw2, _, _ = self._cal_ssw(self.df, self.y, factor2)
            dfn = self.df[factor1].count() - 1
            dfd = self.df[factor2].count() - 1
            return self._ecological_flag(ssw1, dfn, ssw2, dfd)

        target_factors = factors if factors is not None else self.factors
        self._check_discrete_factors(target_factors)

        # SSW and degrees of freedom depend on one factor only, so compute them
        # once per factor instead of once per pair.
        per_factor = {
            name: (self._cal_ssw(self.df, self.y, name)[0], self.df[name].count() - 1)
            for name in target_factors
        }

        eco_df = pd.DataFrame(index=target_factors, columns=target_factors, dtype="object")
        for i, f1 in enumerate(target_factors):
            ssw1, dfn = per_factor[f1]
            for j in range(i):
                f2 = target_factors[j]
                ssw2, dfd = per_factor[f2]
                eco_df.loc[f1, f2] = self._ecological_flag(ssw1, dfn, ssw2, dfd)

        return eco_df

    def risk_detector(self, factor: Optional[str] = None) -> Union[Dict, Dict[str, Dict]]:
        """Risk detector.

        For each factor, computes the mean of Y per stratum and tests every
        pair of strata with a t-test (Welch's variant when Levene's test
        rejects equal variances at ``alpha``).

        Args:
            factor (str, optional): A single factor to analyse. If omitted, all
                of ``self.factors`` are analysed.

        Returns:
            ``{"risk": Series of stratum means, "ttest_stra": bool DataFrame}``
            for a single factor, or a dict of such dicts keyed by factor name.
            ``ttest_stra`` is symmetric; a cell is True when the two strata
            differ significantly (p <= alpha) and the diagonal is False.

        Raises:
            ValueError: If a factor is not discrete.
        """
        target_factors = [factor] if factor else self.factors
        self._check_discrete_factors(target_factors)

        risk_result = {}
        for f_name in target_factors:
            risk_mean = self.df.groupby(f_name)[self.y].mean()
            strata = np.sort(self.df[f_name].unique())
            # Start from an explicit all-False frame: a data-less frame with
            # dtype=bool leaves the untested cells to pandas' fill behaviour,
            # which can read as spurious "significant" results.
            t_test_strata = pd.DataFrame(False, index=strata, columns=strata, dtype=bool)
            # Extract each stratum's sample once rather than once per pair.
            samples = [self.df.loc[self.df[f_name] == s, self.y].values for s in strata]

            for i in range(len(strata)):
                for j in range(i + 1, len(strata)):
                    y_i, y_j = samples[i], samples[j]

                    _, p_levene = levene(y_i, y_j)
                    equal_var = p_levene >= self.alpha
                    _, p_ttest = ttest_ind(y_i, y_j, equal_var=equal_var)
                    significant = bool(p_ttest <= self.alpha)
                    t_test_strata.loc[strata[j], strata[i]] = significant
                    t_test_strata.loc[strata[i], strata[j]] = significant

            risk_result[f_name] = {"risk": risk_mean, "ttest_stra": t_test_strata}

        return risk_result[factor] if factor else risk_result

    def _plot_text_labels(self, ax, interaction_df, ecological_df, value_fontsize=10):
        """Write each non-NaN interaction value into its heatmap cell.

        The text is red when the matching cell of ``ecological_df`` is 'Y' and
        black otherwise.
        """
        for i, row_idx in enumerate(interaction_df.index):
            for j, col_idx in enumerate(interaction_df.columns):
                val = interaction_df.iloc[i, j]
                if not pd.isna(val):
                    mark = f"{val:.2f}"
                    # Use ecological_df to determine color if available
                    color = "k"
                    if ecological_df is not None and i < ecological_df.shape[0] and j < ecological_df.shape[1]:
                        if ecological_df.iloc[i, j] == 'Y':
                            color = "r"
                    ax.text(j, i, mark, ha="center", va="center", color=color, fontsize=value_fontsize)

    def plot(self, factors: Optional[Sequence[str]] = None, tick_fontsize=10, value_fontsize=10, colorbar_fontsize=10, show=True):
        """
        Plot interaction and ecological detector results.

        Args:
            factors (Sequence[str], optional): Factors to include in the plot. Defaults to self.factors.
            tick_fontsize: Font size of the axis tick labels.
            value_fontsize: Font size of the q values written in the cells.
            colorbar_fontsize: Font size of the colorbar tick labels.
            show (bool): If True, call ``plt.show()`` before returning.

        Returns:
            The matplotlib Axes holding the heatmap.

        Raises:
            ValueError: If a factor is not discrete.
        """
        target_factors = factors if factors is not None else self.factors
        self._check_discrete_factors(target_factors)

        inter_df = self.interaction_detector(factors=target_factors)
        eco_df = self.ecological_detector(factors=target_factors)

        fig, ax = plt.subplots(constrained_layout=True)
        im = ax.imshow(inter_df.values, cmap="YlGnBu", vmin=0, vmax=1)
        self._plot_text_labels(ax, inter_df, eco_df, value_fontsize=value_fontsize)

        ax.set_xticks(np.arange(len(target_factors)))
        ax.set_yticks(np.arange(len(target_factors)))
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)

        ax.set_xticklabels(target_factors, fontsize=tick_fontsize)
        ax.set_yticklabels(target_factors, rotation=45, fontsize=tick_fontsize, va='top')

        cbar = fig.colorbar(im, ax=ax, shrink=0.95, pad=0.02, aspect=25, extend="both")
        cbar.ax.tick_params(labelsize=colorbar_fontsize)

        if show:
            plt.show()
        return ax
@@ -0,0 +1,95 @@
1
+ Metadata-Version: 2.4
2
+ Name: py_geodetector
3
+ Version: 0.2.1
4
+ Summary: A simple Python package for the geodetector
5
+ Project-URL: Homepage, https://github.com/djw-easy/GeoDetector
6
+ Project-URL: Bug Tracker, https://github.com/djw-easy/GeoDetector/issues
7
+ Author-email: djw <djweasy@163.com>
8
+ Classifier: License :: OSI Approved :: MIT License
9
+ Classifier: Operating System :: OS Independent
10
+ Classifier: Programming Language :: Python :: 3
11
+ Requires-Python: >=3.7
12
+ Description-Content-Type: text/markdown
13
+
14
+ # py-geodetector
15
+
16
+ A simple and efficient Python package for the **Geographical Detector** (GeoDetector).
17
+
18
+ ## Features
19
+
20
+ - **Four Detectors**: Factor, Interaction, Risk, and Ecological detectors.
21
+ - **High Performance**: Vectorized calculations using Pandas and NumPy for large-scale datasets.
22
+ - **Flexible API**: Supports both batch processing and interactive exploratory analysis.
23
+ - **Auto Detection**: Automatically identifies discrete/categorical variables as factors.
24
+ - **Visualization**: Built-in heatmap for interaction results with statistical significance markers.
25
+
26
+ ## Install
27
+
28
+ ```bash
29
+ pip install py-geodetector
30
+ ```
31
+
32
+ ## Usage
33
+
34
+ ### 1. Data Preparation
35
+ - **Format**: pandas DataFrame.
36
+ - **Y (Dependent Variable)**: Numerical/Continuous.
37
+ - **X (Independent Variable)**: **Categorical/Discrete**. If your $X$ is continuous, you must discretize it first (e.g., using `pd.qcut` or Jenks natural breaks).
38
+
39
+ ### 2. Quick Start
40
+
41
+ ```python
42
+ from geodetector import load_example_data, GeoDetector
43
+
44
+ # Load example disease dataset
45
+ df = load_example_data()
46
+
47
+ # Initialize: Automatically detects discrete columns as factors if 'factors' is not provided
48
+ gd = GeoDetector(df, y='incidence')
49
+ print(f"Detected factors: {gd.factors}")
50
+
51
+ # 1. Factor Detector
52
+ # Batch detection for all factors
53
+ factor_df = gd.factor_detector()
54
+ # Single factor detection: returns (q_value, p_value)
55
+ q, p = gd.factor_detector('type')
56
+
57
+ # 2. Interaction Detector
58
+ # Full matrix calculation
59
+ interaction_df = gd.interaction_detector()
60
+ # Pairwise detection with relationship description
61
+ q_inter, relationship = gd.interaction_detector('type', 'region', relationship=True)
62
+
63
+ # 3. Ecological Detector
64
+ # Determine whether the impacts of two factors are significantly different
65
+ eco_df = gd.ecological_detector()
66
+
67
+ # 4. Risk Detector
68
+ # Compare average Y between sub-groups of a factor
69
+ risk_result = gd.risk_detector('type')
70
+ print(risk_result['risk']) # Mean values for each stratum
71
+
72
+ # 5. Visualization
73
+ # Plot interaction heatmap (red values indicate a significant ecological difference)
74
+ gd.plot(factors=['type', 'region', 'level'])
75
+ ```
76
+
77
+ ### 3. Visualization Result
78
+ The `plot()` method generates a heatmap of the interaction $q$-statistics. Red text indicates that the ecological detector found a significant difference between those two factors at the configured significance level `alpha` (0.05 by default).
79
+
80
+ ![](./example_result.png)
81
+
82
+ ## References
83
+
84
+ ```bibtex
85
+ @article{wang2010geographical,
86
+ title={Geographical detectors-based health risk assessment and its application in the neural tube defects study of the Heshun Region, China},
87
+ author={Wang, Jin-Feng and Li, Xin-Hu and Christakos, George and Liao, Yi-Lan and Zhang, Tin and Gu, Xue and Zheng, Xiao-Ying},
88
+ journal={International Journal of Geographical Information Science},
89
+ volume={24},
90
+ number={1},
91
+ pages={107-127},
92
+ year={2010},
93
+ publisher={Taylor \& Francis}
94
+ }
95
+ ```
@@ -0,0 +1,6 @@
1
+ geodetector/__init__.py,sha256=bhyJq1ipXNRhG-zPbF_5YUbJ_sana5rJ-FG-ogBNh_U,57
2
+ geodetector/geodetector.py,sha256=9A9rBgTX7Gc4u6I4UzGmP8fYucljJBYQNygT5LDvGes,14917
3
+ geodetector/example_data/disease.csv,sha256=sodkE21Xw-eSlGWxLE1by7hSHQk2FjNxb4ncWdyWLmE,2231
4
+ py_geodetector-0.2.1.dist-info/METADATA,sha256=UYQvydiokR95zpjj_tLxzAC7QuIrXKJw-_HISfUfkT4,3320
5
+ py_geodetector-0.2.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
6
+ py_geodetector-0.2.1.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.25.0
2
+ Generator: hatchling 1.28.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
@@ -1,225 +0,0 @@
1
- import warnings
2
- import numpy as np
3
- import pandas as pd
4
- from typing import Sequence
5
- import matplotlib.pyplot as plt
6
- from scipy.stats import f, levene, ncf, ttest_ind
7
-
8
-
9
- from pathlib import Path
10
- def load_example_data():
11
- file_path = Path(__file__).parent / "example_data" / "disease.csv"
12
- df = pd.read_csv(file_path)
13
- return df
14
-
15
-
16
- def _plot_value(ax, interaction_df, ecological_df, value_fontsize=10):
17
- length = len(interaction_df.index)
18
- for i in range(length):
19
- for j in range(length):
20
- if not pd.isna(interaction_df.iloc[i, j]):
21
- num = str(round(interaction_df.iloc[i, j], 2))
22
- mark = num[-2:] if 3 == len(num) else num[-3:]
23
- if 'Y'==ecological_df.iloc[i, j]:
24
- ax.text(j, i, mark, ha="center", va="center", color="r", fontsize=value_fontsize)
25
- else:
26
- ax.text(j, i, mark, ha="center", va="center", color="k", fontsize=value_fontsize)
27
-
28
-
29
- class GeoDetector(object):
30
- def __init__(self, df: pd.DataFrame, y: str, factors: Sequence[str], alpha=0.05):
31
- self.df = df
32
- self.y = y
33
- self.factors = factors
34
- self.alpha = alpha
35
- self._check_data(df, y, factors)
36
- self.factor_df, self.interaction_df, self.ecological_df = None, None, None
37
-
38
- def _check_data(self, df, y, factors):
39
- for factor in factors:
40
- if not factor in df.columns:
41
- raise ValueError('Factor [{}] is not in data')
42
-
43
- for factor in factors:
44
- # 检查列的数据类型
45
- if df[factor].dtype not in ['int64', 'int32', 'int16', 'int8',
46
- 'uint64', 'uint32', 'uint16', 'uint8',
47
- 'object', 'string']:
48
- # 如果数据类型不是整型或字符型,发出警告
49
- warnings.warn(f"Factor '{factor}' is not of type 'int' or 'str'.")
50
-
51
- if y not in df.columns:
52
- raise ValueError('Factor [{}] is not in data')
53
-
54
- for factor in factors:
55
- if y==factor:
56
- raise ValueError("Y variable should not in Factor variables. ")
57
-
58
- has_null = df.isnull().values.any()
59
- if has_null:
60
- raise ValueError("data hava some objects with value NULL")
61
-
62
- @classmethod
63
- def _cal_ssw(self, df: pd.DataFrame, y, factor, extra_factor=None):
64
- def cal_ssw(df: pd.DataFrame, y):
65
- length = df.shape[0]
66
- if length==1:
67
- strataVar = 0
68
- lamda_1st = np.square(df[y].values[0])
69
- lamda_2nd = df[y].values[0]
70
- else:
71
- strataVar = (length-1) * df[y].var(ddof=1)
72
-
73
- lamda_1st = np.square(df[y].values.mean())
74
- lamda_2nd = np.sqrt(length) * df[y].values.mean()
75
- return strataVar, lamda_1st, lamda_2nd
76
- if extra_factor==None:
77
- df2 = df[[y, factor]].groupby(factor).apply(cal_ssw, y=y)
78
- else:
79
- df2 = df[[y]+list(set([factor, extra_factor]))].groupby([factor, extra_factor]).apply(cal_ssw, y=y)
80
- df2 = df2.apply(pd.Series)
81
- df2 = df2.sum()
82
- strataVarSum, lamda_1st_sum, lamda_2nd_sum = df2.values
83
- return strataVarSum, lamda_1st_sum, lamda_2nd_sum
84
-
85
- @classmethod
86
- def _cal_q(self, df, y, factor, extra_factor=None):
87
- strataVarSum, lamda_1st_sum, lamda_2nd_sum = self._cal_ssw(df, y, factor, extra_factor)
88
- TotalVar = (df.shape[0]-1)*df[y].var(ddof=1)
89
- q = 1 - strataVarSum/TotalVar
90
- return q, lamda_1st_sum, lamda_2nd_sum
91
-
92
- def factor_dector(self):
93
- self.factor_df = pd.DataFrame(index=["q statistic", "p value"], columns=self.factors, dtype="float32")
94
- N_var = self.df[self.y].var(ddof=1)
95
- N_popu = self.df.shape[0]
96
- for factor in self.factors:
97
- N_stra = self.df[factor].unique().shape[0]
98
- q, lamda_1st_sum, lamda_2nd_sum = self._cal_q(self.df, self.y, factor)
99
-
100
- #lamda value
101
- lamda = (lamda_1st_sum - np.square(lamda_2nd_sum) / N_popu) / N_var
102
- # F value
103
- F_value = (N_popu - N_stra)* q / ((N_stra - 1)* (1 - q))
104
- #p value
105
- p_value = ncf.sf(F_value, N_stra - 1, N_popu - N_stra, nc=lamda)
106
-
107
- self.factor_df.loc["q statistic", factor] = q
108
- self.factor_df.loc["p value", factor] = p_value
109
- return self.factor_df
110
-
111
- @classmethod
112
- def _interaction_relationship(self, df):
113
- out_df = pd.DataFrame(index=df.index, columns=df.columns)
114
- length = len(df.index)
115
- for i in range(length):
116
- for j in range(i+1, length):
117
- factor1, factor2 = df.index[i], df.index[j]
118
- i_q = df.loc[factor2, factor1]
119
- q1 = df.loc[factor1, factor1]
120
- q2 = df.loc[factor2, factor2]
121
-
122
- if (i_q <= q1 and i_q <= q2):
123
- outputRls = "Weaken, nonlinear"
124
- if (i_q < max(q1, q2) and i_q > min(q1, q2)):
125
- outputRls = "Weaken, uni-"
126
- if (i_q == (q1 + q2)):
127
- outputRls = "Independent"
128
- if (i_q > max(q1, q2)):
129
- outputRls = "Enhance, bi-"
130
- if (i_q > (q1 + q2)):
131
- outputRls = "Enhance, nonlinear"
132
-
133
- out_df.loc[factor2, factor1] = outputRls
134
- return out_df
135
-
136
- def interaction_detector(self, relationship=False):
137
- self.interaction_df = pd.DataFrame(index=self.factors, columns=self.factors, dtype="float32")
138
- length = len(self.factors)
139
- for i in range(0, length):
140
- for j in range(0, i+1):
141
- q, _, _ = self._cal_q(self.df, self.y, self.factors[i], self.factors[j])
142
- self.interaction_df.loc[self.factors[i], self.factors[j]] = q
143
-
144
- if relationship:
145
- self.interaction_relationship_df = self._interaction_relationship(self.interaction_df)
146
- return self.interaction_df, self.interaction_relationship_df
147
- return self.interaction_df
148
-
149
- def ecological_detector(self):
150
- self.ecological_df = pd.DataFrame(index=self.factors, columns=self.factors, dtype="float32")
151
- length = len(self.factors)
152
- for i in range(1, length):
153
- ssw1, _, _ = self._cal_ssw(self.df, self.y, self.factors[i])
154
- dfn = self.df[self.factors[i]].notna().sum()-1
155
- for j in range(0, i):
156
- ssw2, _, _ = self._cal_ssw(self.df, self.y, self.factors[j])
157
- dfd = self.df[self.factors[j]].notna().sum()-1
158
- fval = (dfn*(dfd-1)*ssw1)/(dfd*(dfn-1)*ssw2)
159
- if fval<f.ppf(self.alpha, dfn, dfn):
160
- self.ecological_df.loc[self.factors[i], self.factors[j]] = 'Y'
161
- else:
162
- self.ecological_df.loc[self.factors[i], self.factors[j]] = 'N'
163
- return self.ecological_df
164
-
165
- def risk_detector(self):
166
- """
167
- Compares the difference of average values between sub-groups
168
- Reference:
169
- https://github.com/gsnrguo/QGIS-Geographical-detector/blob/main/gd_core/geodetector.py
170
- """
171
- risk_result = dict()
172
- for factor in self.factors:
173
- risk_name = self.df.groupby(factor)[self.y].mean()
174
- strata = np.sort(self.df[factor].unique())
175
- t_test = np.empty((len(strata), len(strata)))
176
- t_test.fill(np.nan)
177
- t_test_strata = pd.DataFrame(t_test, index=strata, columns=strata)
178
- for i in range(len(strata) - 1):
179
- for j in range(i + 1, len(strata)):
180
- y_i = self.df.loc[self.df[factor] == strata[i], [self.y]]
181
- y_j = self.df.loc[self.df[factor] == strata[j], [self.y]]
182
- y_i = np.array(y_i).reshape(-1)
183
- y_j = np.array(y_j).reshape(-1)
184
- # hypothesis testing of variance homogeneity
185
- levene_result = levene(y_i, y_j)
186
- if levene_result.pvalue < self.alpha:
187
- # variance non-homogeneous
188
- ttest_result = ttest_ind(y_i, y_j, equal_var=False)
189
- else:
190
- ttest_result = ttest_ind(y_i, y_j)
191
-
192
- t_test_strata.iloc[j, i] = ttest_result.pvalue <= self.alpha
193
-
194
- risk_factor = dict(risk=risk_name, ttest_stra=t_test_strata)
195
- risk_result[factor] = risk_factor
196
- return risk_result
197
-
198
- def plot(self, tick_fontsize=10, value_fontsize=10, colorbar_fontsize=10, show=True):
199
- if isinstance(self.interaction_df, type(None)):
200
- self.interaction_detector()
201
- if isinstance(self.ecological_df, type(None)):
202
- self.ecological_detector()
203
-
204
- fig, ax = plt.subplots(constrained_layout=True)
205
-
206
- im = ax.imshow(self.interaction_df.values, cmap="YlGnBu", vmin=0, vmax=1)
207
- _plot_value(ax, self.interaction_df, self.ecological_df, value_fontsize=value_fontsize)
208
-
209
- ax.set_xticks(np.arange(len(self.factors)))
210
- ax.set_yticks(np.arange(len(self.factors)))
211
- ax.spines['top'].set_visible(False)
212
- ax.spines['right'].set_visible(False)
213
-
214
- ax.set_xticklabels(self.factors, fontsize=tick_fontsize)
215
- ax.set_yticklabels(self.factors, rotation=45, fontsize=tick_fontsize)
216
- ax.tick_params(axis='y', pad=0.1)
217
-
218
- colorbar = fig.colorbar(im, ax=ax, shrink=0.9, pad=0.01, aspect=25, extend="both")
219
- colorbar.ax.tick_params(labelsize=colorbar_fontsize)
220
-
221
- if show:
222
- plt.show()
223
- return ax
224
- else:
225
- return ax
@@ -1,66 +0,0 @@
1
- Metadata-Version: 2.3
2
- Name: py_geodetector
3
- Version: 0.1.3
4
- Summary: A simple Python package for the geodetector
5
- Project-URL: Homepage, https://github.com/djw-easy/GeoDetector
6
- Project-URL: Bug Tracker, https://github.com/djw-easy/GeoDetector/issues
7
- Author-email: djw <djweasy@163.com>
8
- Classifier: License :: OSI Approved :: MIT License
9
- Classifier: Operating System :: OS Independent
10
- Classifier: Programming Language :: Python :: 3
11
- Requires-Python: >=3.7
12
- Description-Content-Type: text/markdown
13
-
14
- # A simple Python package for the geodetector
15
-
16
- # Install
17
-
18
- ```
19
- pip install py-geodetector
20
- ```
21
-
22
- # Usage
23
-
24
- A quick example of geodetector usage is given in the ./example.ipynb.
25
-
26
- ```python
27
- from py_geodetector import load_example_data, GeoDetector
28
-
29
- # load example data
30
- df = load_example_data()
31
-
32
- gd = GeoDetector(df)
33
- # factor detect
34
- factor_df = gd.factor_dector()
35
-
36
- # interaction detect
37
- interaction_df = gd.interaction_detector()
38
- # or you can generate the interaction relationship as the same time
39
- interaction_df, interaction_relationship_df = gd.interaction_detector(relationship=True)
40
-
41
- # ecological detect
42
- ecological_df = gd.ecological_detector()
43
-
44
- # risk detect
45
- risk_result = gd.risk_detector()
46
-
47
- # plot
48
- # use a heatmap visualize the interaction detect result,
49
- # red text means that the ecological detection results show a significant difference
50
- gd.plot(value_fontsize=14, tick_fontsize=16, colorbar_fontsize=14);
51
- ```
52
-
53
- # Reference
54
-
55
- ```
56
- @article{wang2010geographical,
57
- title={Geographical detectors-based health risk assessment and its application in the neural tube defects study of the Heshun Region, China},
58
- author={Wang, Jin-Feng and Li, Xin-Hu and Christakos, George and Liao, Yi-Lan and Zhang, Tin and Gu, Xue and Zheng, Xiao-Ying},
59
- journal={International Journal of Geographical Information Science},
60
- volume={24},
61
- number={1},
62
- pages={107-127},
63
- year={2010},
64
- publisher={Taylor \& Francis}
65
- }
66
- ```
@@ -1,6 +0,0 @@
1
- py_geodetector/__init__.py,sha256=bhyJq1ipXNRhG-zPbF_5YUbJ_sana5rJ-FG-ogBNh_U,57
2
- py_geodetector/geodetector.py,sha256=AmIt70OpMEaNQxKAvI-QiFwjfs-Sa-nb89OD12gxiIQ,9911
3
- py_geodetector/example_data/disease.csv,sha256=sodkE21Xw-eSlGWxLE1by7hSHQk2FjNxb4ncWdyWLmE,2231
4
- py_geodetector-0.1.3.dist-info/METADATA,sha256=Ngr3o6q8DpBQq5LriVXchSKvTp_zHly3YgkgZR14XOQ,1901
5
- py_geodetector-0.1.3.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
6
- py_geodetector-0.1.3.dist-info/RECORD,,
File without changes
File without changes