econmethods 0.0.9__tar.gz → 1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: econmethods
3
- Version: 0.0.9
3
+ Version: 1
4
4
  Summary: A python package implementing various econometrical tests and estimators
5
5
  Home-page: https://github.com/NaturionBG/econmethods
6
6
  Author: NaturionBG
@@ -0,0 +1,592 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ import statsmodels.api as sm
4
+ from itertools import product
5
+ import scipy.stats as sc
6
+ from math import floor
7
+ import importlib.resources as resources
8
+ from statsmodels.regression.mixed_linear_model import MixedLM
9
+
10
def read_critical_values(sheet: str) -> pd.DataFrame:
    '''
    Load one sheet of the bundled CADF critical-value workbook.

    PARAMETERS:
    - *sheet*: name of the worksheet inside "CADF_Crit_Values.xlsx".

    RETURNS:
    A DataFrame whose index is taken from the first column of the sheet.
    '''
    resource = resources.files("econmethods") / "CADF_Crit_Values.xlsx"
    # files() returns a Traversable, which is not guaranteed to be a real
    # filesystem path (e.g. when the package is imported from a zip archive).
    # as_file() hands back a concrete path for the duration of the read.
    with resources.as_file(resource) as xlsx_path:
        return pd.read_excel(xlsx_path, sheet_name=sheet, index_col=0)
14
+
15
class CipsTest:
    '''
    Cross-sectionally augmented Dickey-Fuller (CIPS) panel unit-root test.

    The hypotheses are as follows:
    - *H0*: The target variable is I(1)
    - *H1*: The target variable is I(0)

    PARAMETERS:
    - *df*: a pandas DataFrame with columns in this exact order:
        0 - spatial units (countries, companies, regions, ...)
        1 - temporal column (years, months, quarters, ...)
        2 - target variable. The natural log of this column is taken, and a
            TypeError is raised if the logged column contains NaN.
      The caller's DataFrame is not modified.
    - *T*: temporal window, used to pick the critical value.
    - *N*: spatial window, used to pick the critical value.
    - *trend*: whether the ADF regression contains a linear trend.
    - *poly_trend*: polynomial order of the trend in the target. When *trend*
      is True and this is greater than 1, the target is detrended per unit
      and the test is then run without a trend term.
    - *intercept*: whether the ADF regression contains an intercept.
    - *n_lags*: maximum number of lags. Clamped to [1, floor(T/5)]; the lag
      with the lowest mean AIC across units is selected.
    - *level*: significance level in percent. Only 5 and 1 are allowed.

    RAISES:
    - TypeError if the (logged) target contains NaN.
    - ValueError for an unsupported level, poly_trend < 1, or a trend without
      an intercept (no critical-value table is loaded for that case).

    RETURNS:
    via instance.verdict() -> None: prints the verdict of the test.
    '''

    def __init__(self, df: pd.DataFrame, T: int, N: int, trend: bool = False, poly_trend: int = 1, intercept: bool = False, n_lags: int = 2, level: int = 5) -> None:
        CipsTest.__build_tables()
        self.__L = 1
        self.__T = T
        self.__N = N
        self.__trend = trend
        self.__C = intercept
        # Keep the lag order within [1, floor(T/5)].
        self.__n_lags = max(1, min(n_lags, floor(T / 5)))
        self.__alpha = level / 100
        self.__poly = poly_trend
        # rename() returns a new frame, so the caller's object stays untouched.
        self.__df = df.rename(columns={df.columns[0]: 'SpUnit', df.columns[1]: 'time', df.columns[2]: 'target'})
        self.__df['target'] = np.log(self.__df['target'])
        if self.__trend and self.__poly > 1:
            # A higher-order trend is removed up front; the ADF regressions
            # are then run without a trend term.
            self.__df = self.detrend()
            self.__trend = False
        self.verify()
        self.__table = self.get_table()
        self.__CADF_Crit = self.get_critical_value()
        self.__CADF = self.estimate()

    def verify(self) -> None:
        '''Validate the target column, the significance level and the trend order.'''
        if self.__df.target.isnull().sum() > 0:
            raise TypeError('Values in Target must NOT be NaN!')
        if self.__alpha != 0.01 and self.__alpha != 0.05:
            raise ValueError('The Significance Level must be either 1 or 5!')
        if self.__poly < 1:
            raise ValueError('The Polynomial Power Cannot be lesser than 1!')

    @classmethod
    def __build_tables(cls) -> None:
        '''Load the six critical-value sheets once and cache them on the class.'''
        if getattr(cls, 'TC_5P', None) is not None:
            # Already loaded by an earlier instance; avoid re-reading the workbook.
            return
        cls.NTNC_1P = read_critical_values('NTNC_1P')
        cls.NTNC_5P = read_critical_values('NTNC_5P')
        cls.NTC_1P = read_critical_values('NTC_1P')
        cls.NTC_5P = read_critical_values('NTC_5P')
        cls.TC_1P = read_critical_values('TC_1P')
        cls.TC_5P = read_critical_values('TC_5P')

    def get_table(self) -> pd.DataFrame:
        '''
        Return the critical-value table matching trend / intercept / level.

        Raises ValueError for a trend without an intercept, because no table
        is loaded for that specification.
        '''
        if self.__trend and not self.__C:
            raise ValueError('No critical values are available for a model with a trend but no intercept; set intercept=True.')
        if self.__trend:
            prefix = 'TC'
        elif self.__C:
            prefix = 'NTC'
        else:
            prefix = 'NTNC'
        suffix = '1P' if self.__alpha == 0.01 else '5P'
        return getattr(CipsTest, f'{prefix}_{suffix}')

    def get_critical_value(self) -> float:
        '''
        Return the tabulated value whose (row, column) labels are closest to (T, N).

        Rows are matched against T and columns against N using Euclidean
        distance; ties resolve to the first cell in row-major order.
        '''
        table = self.__table
        rows = np.asarray(table.index, dtype=float)
        cols = np.asarray(table.columns, dtype=float)
        # Squared distance is enough: the square root does not change the argmin.
        dist = (rows[:, None] - self.__T) ** 2 + (cols[None, :] - self.__N) ** 2
        i, j = np.unravel_index(np.argmin(dist), dist.shape)
        return table.iloc[i, j]

    def detrend(self) -> pd.DataFrame:
        '''
        Remove a polynomial time trend of order poly_trend from each unit.

        For every spatial unit the target is regressed on a constant and
        t, t^2, ..., t^poly; the target is replaced by the residual.
        Returns the first three columns of the stacked per-unit frames.
        '''
        pieces = []
        for unit in self.__df.SpUnit.unique():
            # Work on a copy so the in-place edits never touch self.__df.
            subdf = self.__df[self.__df.SpUnit == unit].copy()
            steps = np.linspace(1, len(subdf), len(subdf))
            for i in range(1, self.__poly + 1):
                subdf.insert(3, f't^{i}', steps ** i)
            exog = sm.add_constant(subdf.iloc[:, 3:])
            fitted = sm.OLS(subdf['target'], exog).fit().predict(exog)
            subdf['target'] = subdf['target'] - fitted
            pieces.append(subdf.iloc[:, :3])
        return pd.concat(pieces, axis=0)

    def build_regressions(self, lags: int) -> list[pd.DataFrame]:
        '''
        Build one CADF regression frame per spatial unit.

        Column 0 of every frame is the dependent variable (target_diff); the
        remaining columns are the regressors: an optional trend, the lagged
        level, the lagged cross-sectional mean, its difference, and *lags*
        lagged differences of both. The first n_lags + 1 rows are dropped so
        that every candidate lag order is estimated on the same sample.
        '''
        shifts = list(range(1, lags + 1))
        # Cross-sectional mean per period. It is assigned positionally below,
        # so every unit is assumed to cover all periods in the same order.
        cs_avg = self.__df.groupby(['time'])['target'].mean().values
        base = ['target_diff', 'target_1', 'cs_avg_1', 'cs_avg_diff']
        if self.__trend:
            base.insert(1, 't')
        additional = []
        for i in shifts:
            additional.append(f'target_diff_{i}')
            additional.append(f'cs_avg_diff_{i}')

        frames = []
        for unit in self.__df.SpUnit.unique():
            subdf = self.__df[self.__df.SpUnit == unit].copy()
            if self.__trend:
                subdf.insert(2, 't', np.linspace(1, len(subdf), len(subdf)))
            # shift() with a list of periods yields columns suffixed _1, _2, ...
            subdf = pd.concat([subdf, subdf.target.shift(periods=shifts)], axis=1)
            subdf['cs_avg'] = cs_avg
            subdf = pd.concat([subdf, subdf.cs_avg.shift(periods=shifts)], axis=1)
            subdf['target_diff'] = subdf.target - subdf.target_1
            subdf['cs_avg_diff'] = subdf.cs_avg - subdf.cs_avg_1
            subdf = pd.concat([subdf, subdf.cs_avg_diff.shift(periods=shifts)], axis=1)
            subdf = pd.concat([subdf, subdf.target_diff.shift(periods=shifts)], axis=1)
            frames.append(subdf.loc[:, base + additional].iloc[self.__n_lags + 1:, :])
        return frames

    def estimate(self) -> float:
        '''
        Return the mean CADF t-statistic for the AIC-optimal lag order.

        For each lag order the per-unit regressions are fitted; the order
        with the lowest mean AIC wins and is stored as the selected lag.
        '''
        best_stat = None
        best_aic = np.inf
        for lag in range(1, self.__n_lags + 1):
            t_stats = []
            aics = []
            for frame in self.build_regressions(lag):
                exog = frame.iloc[:, 1:]
                if self.__C:
                    exog = sm.add_constant(exog)
                res = sm.OLS(frame.iloc[:, 0], exog).fit()
                t_stats.append(res.tvalues['target_1'])
                aics.append(res.aic)
            mean_aic = np.array(aics).mean()
            if mean_aic < best_aic:
                # Always average: a list must never leak out as the statistic,
                # even when all unit t-values coincide (e.g. a single unit).
                best_stat = np.array(t_stats).mean()
                best_aic = mean_aic
                self.__L = lag
        return best_stat

    def verdict(self) -> None:
        '''Print the test statistic, the critical value and the conclusion.'''
        if self.__CADF < self.__CADF_Crit:
            print(f'{self.__CADF} < {self.__CADF_Crit}\n Your target variable is I(0) according to the CIPS test\n Significance level : {self.__alpha*100}%. \n Selected lag amount: {self.__L}')
        else:
            print(f'{self.__CADF} > {self.__CADF_Crit}\n Your target variable is I(1) according to the CIPS test\n significance level: {self.__alpha*100}%. \n Selected lag amount: {self.__L}')

    def __del__(self) -> None:
        pass
185
+
186
+
187
+
188
+ class HausmanOneWay:
189
+ '''
190
+ Implementation of the Hausman procedure to test whether it is more feasible to used random effects against fixed effects.
191
+ -
192
+ The Hypotheses are as follows
193
+ - *H0*: Cov(a_i, x_{it}) = 0
194
+ - *H1*: Cov(a_i, x_{it}) != 0\n
195
+ PARAMETERS:
196
+ --
197
+ -----
198
+ - *data*: A pandas DataFrame. Make sure that all items are introduced in this exact order by column index:\n
199
+ Make sure no columns contain Nan Values!
200
+ 0 - your spatial unit. The data must be homogenous, e.g. only contries, regions, etc.
201
+ 1 - your temporal window per panel. The data must be homogenous, e.g. only years, months, etc.
202
+ 2 - Your target variable.
203
+ 3+ - your exogenous variables.
204
+ - *level*: the statistical test significance level.
205
+ ---------
206
+ METHODOLOGY:
207
+ ------
208
+ - The Test prefers an MLE estimator for RE (Random Effects),
209
+ which can be disrupted if the difference between FE and RE is considerably small.
210
+ The test will opt for MLE estimation if sigma_u computed for GLS is greater than 0.
211
+ ------
212
+ RETURNS:
213
+ -
214
+ - Via instance.verdict() prints the verdict of the Hausman test according to the given parameters.
215
+ '''
216
+ def __init__(self, data: pd.DataFrame, level: int = 5) -> None:
217
+ self.__df = data
218
+ self.__exog = len(data.columns[3:])
219
+ self.__l =[]
220
+ for i in range(1, self.__exog+1):
221
+ self.__l.append(f'x{i}')
222
+ self.__df.columns = ['SpUnit', 'time', 'target'] + self.__l
223
+ self.__alpha = level/100
224
+ self.__RE = 'GLS'
225
+ self.__T = len(self.__df.time.unique())
226
+ self.__N = len(self.__df.SpUnit.unique())
227
+
228
+
229
+ def build_FE(self) -> pd.DataFrame:
230
+ fe = self.__df.copy(deep=True)
231
+ for i, unit in enumerate(self.__df.SpUnit.unique()[1:]):
232
+ fe.loc[:, f'd{i}'] = np.where(fe.SpUnit == unit, 1, 0)
233
+ return fe.iloc[:, 2:]
234
+
235
+ def build_GLS(self, w_err: float) -> pd.DataFrame | None:
236
+ re = self.__df.copy(deep=True)
237
+ sigma2 = np.sum((sm.OLS(re.iloc[:, 2], sm.add_constant(re.iloc[:, 3:])).fit()).resid**2) / (self.__N*self.__T - self.__exog)
238
+ sigma_u = sigma2 - w_err
239
+ if sigma_u <= 0:
240
+ sigma_u = 0
241
+ elif sigma_u > 0:
242
+ self.__RE = 'MLE'
243
+ return None
244
+ sig = np.full((self.__T, self.__T), sigma_u)
245
+ np.fill_diagonal(sig, sigma2)
246
+ matrix = np.kron(np.eye(self.__N), sig)
247
+ return matrix
248
+
249
+
250
+ def estimate(self) -> float:
251
+ fe_df = self.build_FE()
252
+ Chi = 0
253
+ fe_res = sm.OLS(fe_df.iloc[:, 0], sm.add_constant(fe_df.iloc[:, 1:])).fit()
254
+ matrix = self.build_GLS(np.sum(fe_res.resid**2) / (self.__N*(self.__T-1) - self.__exog))
255
+ if matrix is not None:
256
+ re_res = sm.GLS(self.__df.iloc[:, 2], sm.add_constant(self.__df.iloc[:, 3:]), matrix).fit()
257
+ else:
258
+ re_res = sm.MixedLM(self.__df['target'], sm.add_constant(self.__df[self.__l]), groups=self.__df.SpUnit).fit(reml = True, maxiter=100_00)
259
+ for b_fe, b_re, var_fe, var_re in zip(fe_res.params[1:], re_res.params[1:], fe_res.bse[1:]**2, re_res.bse[1:]**2):
260
+ Chi += (b_fe - b_re)**2 / (var_fe - var_re)
261
+
262
+ return Chi
263
+
264
+ def verdict(self) -> None:
265
+ Chi = self.estimate()
266
+ ch2 = sc.chi2(self.__exog)
267
+ p = ch2.sf(Chi)
268
+ if p < self.__alpha:
269
+ print(f'P-Value: {p} < Alpha: {self.__alpha}. \n According to the Hausman test, you should use the FE (Fixed Effects) model. \n RE estimator: {self.__RE}')
270
+ else:
271
+ print(f'P-Value: {p} > Alpha: {self.__alpha}. \n According to the Hausman test, you should use the RE (Random Effects) model. \n RE estimator: {self.__RE}')
272
+
273
+ def __del__(self) -> None:
274
+ pass
275
+
276
+
277
+
278
+ class FECM:
279
+ '''
280
+ The implementation of a first-order ECM (Error Correction Model) estimation for panel data.
281
+ ----
282
+ ----
283
+ PARAMETERS:
284
+ ----
285
+ - *df*: a Pandas DataFrame containing panel data. Make sure your data is structured in this exact order (by column index):\n
286
+ 0 - Spatial units. The data must be homogenous, e.g. only cities, contries, regions, etc.\n
287
+ 1 - Temporal units. The data must be homogenous, e.g. only years, months, quarters, etc. \n
288
+ 2 - Your target/endogenous variable. The data must not contain NaN values.\n
289
+ 3+ - Your exogenous variables. The data must not contain NaN values.\n
290
+ - *effects*: Specify what effects your long-run model must have. The class will estimate both the short-run & long-run models.
291
+ Enter one of the following keywords: "fix" | "rand". "rand" by default.
292
+ - *trend*: Specify a trend of which order your target variable is. 0 by default - the data contains no trend.
293
+ - *n_lags*: Specify the maximum amount of lags to test on.
294
+ - *method*: Specify the method of ECM estimation. \n
295
+ Currently implemented methods:\n
296
+ - MG (Mean Group) - Choose this method of you believe all your coefficients may be simply heterogenous.
297
+ - CCEMG (Common Correlated Effects Mean Group) - Choose this one if you also believe that there is valid cross-sectional dependence in the data.
298
+ - CCEP (Common Correlated Effects Pooled) - Choose this one either if your data is lacking temporally or if you believe in the homogeneity of regressors.\n
299
+ Choose between the following keywords: ["MG", "CCEMG", "CCEP"]
300
+ - If CCE- is chosen, the mean target variable for forecasting the differences will be predicted using an AR(d) proccess, where d will be chosen automatically between 1 and n_lags.
301
+ - *coint*: Specify which exogenous variables will be included in the long-run model (A.K.A. are conitegrated with the TARGET variable).
302
+ The rest will be included only in the short-run model.\n
303
+ Enter: a string containing 1 single variable / a list of strings in the following format.\n
304
+ (enumeration from left to right column-wise in your DataFrame):
305
+ - coint = "x1"
306
+ - coint = ["x1", "x3", ...]\n
307
+ Defaults to "x1"
308
+ - *include_x_diffs*: Specify whether the model should include the differences of exogenous variables. Defaults to True.
309
+ - *intercept*: Specify whether the ECM model should have an intercept. Defaults to True.
310
+ - *stat_vars*: a DataFrame of the same format as "df" - includes variables that will not be differenced and included into the ECM in their raw form. Ensure these variables are I(0). Defaults to None.
311
+ ----
312
+ RETURNS:
313
+ --
314
+ A python dictionary (dict) containing:
315
+ - Long-run estimation results | key = "lr_res"
316
+ - ECM (short-run) estimation results | key = "sr_res"\n
317
+ If a CCE- method is chosen:
318
+ - The AR(d) estimation results to forecast the cross-sectional mean | key = "ar"
319
+ '''
320
+ def __init__(self, df: pd.DataFrame, effects: str = 'rand', trend: int = 0, n_lags: int = 1, method: str = 'MG', coint: str | list[str] = 'x1', include_x_diffs: bool = True, intercept: bool = True, stat_vars: pd.DataFrame|None = None) -> None:
321
+ self.__df = df
322
+ self.__eff = effects.lower()
323
+ self.__t = trend
324
+ self.__C = intercept
325
+ self.__lag = n_lags
326
+ self.__method = method.lower()
327
+ self.__exog = len(df.columns[3:])
328
+ self.__l =[]
329
+ self.__stat_vars = stat_vars
330
+ self.__mean_names = ['target_avg']
331
+ self.__x_difs = include_x_diffs
332
+ for i in range(1, self.__exog+1):
333
+ self.__l.append(f'x{i}')
334
+ self.__mean_names.append(f'x{i}_avg')
335
+ self.__df.columns = ['SpUnit', 'time', 'target'] + self.__l
336
+ if isinstance(coint, list):
337
+ self.__lr_df = self.__df.copy(deep=True).loc[:, ['SpUnit', 'time', 'target', *coint]]
338
+ elif isinstance(coint, str):
339
+ self.__lr_df = self.__df.copy(deep=True).loc[:, ['SpUnit', 'time', 'target', coint]]
340
+ else:
341
+ raise TypeError('An invalid Type has been passed into the COINT parameter')
342
+ if self.__t > 0:
343
+ self.__lr_df = self.add_trend()
344
+ self.__N = len(self.__df.SpUnit.unique())
345
+ self.__T = len(self.__df.time.unique())
346
+ if self.__lag > self.__T**(1/3):
347
+ self.__lag = floor(self.__T**(1/3))
348
+ if self.__lag < 1:
349
+ self.__lag = 1
350
+ self.__verify()
351
+ self.__means = self.build_means()
352
+ if self.__method == 'ccemg' or method == 'ccep':
353
+ self.__ar = self.select_ar()
354
+ self.__lr = self.__estimate_lr()
355
+ self.__sr = self.build_sr()
356
+
357
+ def __verify(self) -> None:
358
+ if self.__eff not in ['fix', 'rand']:
359
+ raise ValueError('Non-Valid panel effects type entered!')
360
+ if self.__t < 0:
361
+ raise ValueError('The Trend order cannot be lower than 0!')
362
+ if self.__method not in ['mg', 'ccemg', 'ccep']:
363
+ raise NotImplementedError('Either the estimation method has not been implemented yet, or it is invalid!')
364
+
365
+ def add_trend(self) -> pd.DataFrame:
366
+ lst = []
367
+ for unit in self.__df.SpUnit.unique():
368
+ subdf = self.__lr_df[self.__lr_df.SpUnit == unit].copy(deep=True)
369
+ for i in range(1, self.__t+1):
370
+ subdf.loc[:, f't^{i}'] = np.linspace(1, len(self.__df.time.unique()), len(self.__df.time.unique()))**i
371
+ lst.append(subdf)
372
+ return pd.concat(lst)
373
+
374
+ def build_means(self) -> pd.DataFrame:
375
+ mn = self.__df.copy(deep=True)
376
+ mn = mn.set_index('time')
377
+ means = mn.groupby('time')[['target'] + self.__l].mean()
378
+ means.columns = self.__mean_names
379
+ means = pd.concat([means, means.shift([1])], axis=1)
380
+ for var in means.columns[:2]:
381
+ if 'target' not in var:
382
+ if self.__x_difs:
383
+ means[f'{var}_diff'] = means[var] - means[f'{var}_1']
384
+ means = means.drop(columns=[f'{var}_1'])
385
+ else:
386
+ means[f'{var}_diff'] = means[var] - means[f'{var}_1']
387
+ means = means.drop(columns=[f'{var}_1'])
388
+ return means
389
+
390
+ def build_GLS(self, w_err: float) -> pd.DataFrame:
391
+ re = self.__lr_df.copy(deep=True)
392
+ sigma2 = np.sum((sm.OLS(re.iloc[:, 2], sm.add_constant(re.iloc[:, 3:])).fit()).resid**2) / (self.__N*self.__T - self.__exog)
393
+ sigma_u = sigma2 - w_err
394
+ if sigma_u <= 0:
395
+ sigma_u = 0
396
+ sig = np.full((self.__T, self.__T), sigma_u)
397
+ np.fill_diagonal(sig, sigma2)
398
+ matrix = np.kron(np.eye(self.__N), sig)
399
+ return matrix
400
+
401
+ def build_FE(self) -> pd.DataFrame:
402
+ lr = self.__lr_df.copy(deep=True)
403
+ for i, unit in enumerate(lr.SpUnit.unique()[1:], start=1):
404
+ lr[f'd{i}'] = np.where(lr.SpUnit == unit, 1, 0)
405
+ return lr
406
+
407
+ def __estimate_lr(self) -> pd.DataFrame:
408
+ if self.__eff == 'fix':
409
+ lr_fe = self.build_FE()
410
+ res_lr = sm.OLS(lr_fe.loc[:, 'target'], sm.add_constant(lr_fe.iloc[:, 3:])).fit()
411
+ return res_lr
412
+ else:
413
+ lr_fe = self.build_FE()
414
+ resid = np.sum(sm.OLS(lr_fe.loc[:, 'target'], sm.add_constant(lr_fe.iloc[:, 3:])).fit().resid**2) / (self.__N*(self.__T - 1) - self.__exog)
415
+ lr_re_matrix = self.build_GLS(resid)
416
+ return sm.GLS(self.__lr_df.loc[:, 'target'], sm.add_constant(self.__lr_df.iloc[:, 3:]), lr_re_matrix).fit()
417
+
418
+ def select_ar(self) -> Any:
419
+ current_d = self.__lag+1
420
+ while current_d >= 1:
421
+ frame = pd.DataFrame(self.__means.target_avg)
422
+ temp = []
423
+ for lag in range(1, current_d+1):
424
+ frame.loc[:, f'y_avg{lag}'] = frame['target_avg'].shift(lag)
425
+ temp.append(f'y_avg{lag}')
426
+ frame = frame.dropna()
427
+ part_res = sm.OLS(frame.target_avg, frame[temp]).fit()
428
+ if part_res.pvalues[temp[-1]] < 0.05:
429
+ break
430
+ else:
431
+ current_d -=1
432
+ print(f'Selected AR lag amount: {current_d}')
433
+ return part_res
434
+
435
+ def get_ccemg_frames(self, max_lag: int) -> list[pd.DataFrame]:
436
+ subdfs = []
437
+ for unit in self.__df.SpUnit.unique():
438
+ subdf = self.__df[self.__df.SpUnit == unit].copy(deep=True)
439
+ if self.__x_difs:
440
+ for var in self.__l:
441
+ subdf[f'{var}_lag1'] = subdf[var].shift(1)
442
+ subdf[f'{var}_diff'] = subdf[var]- subdf[f'{var}_lag1']
443
+ subdf = subdf.drop(columns = [f'{var}_lag1'])
444
+ subdf['target_lag1'] = subdf['target'].shift(1)
445
+ subdf.insert(2, 'target_diff', subdf['target'] - subdf['target_lag1'])
446
+ subdf = subdf.drop(columns = ['target_lag1', *self.__l, 'target'])
447
+ subdf['error'] = subdf.error.shift(1)
448
+ subdf = pd.concat([subdf.reset_index(drop=True), self.__means.reset_index(drop=True)], axis=1)
449
+ if self.__stat_vars is not None:
450
+ stat_subdf = self.__stat_vars[self.__stat_vars.SpUnit == unit].copy(deep=True)
451
+ subdf = pd.concat([subdf.reset_index(drop=True), stat_subdf.reset_index(drop=True)], axis=1)
452
+ subdfs.append(subdf.dropna())
453
+ return subdfs
454
+
455
+ def get_mg_frames(self, max_lag: int) -> list[pd.DataFrame]:
456
+ subdfs = []
457
+ for unit in self.__df.SpUnit.unique():
458
+ subdf = self.__df[self.__df.SpUnit == unit].copy(deep=True)
459
+ if self.__x_difs:
460
+ for var in self.__l:
461
+ subdf[f'{var}_lag1'] = subdf[var].shift(1)
462
+ subdf[f'{var}_diff'] = subdf[var]- subdf[f'{var}_lag1']
463
+ subdf = subdf.drop(columns = [f'{var}_lag1'])
464
+ subdf['target_lag1'] = subdf['target'].shift(1)
465
+ subdf.insert(2, 'target_diff', subdf['target'] - subdf['target_lag1'])
466
+ subdf = subdf.drop(columns = ['target_lag1', *self.__l, 'target'])
467
+ subdf['error'] = subdf.error.shift(1)
468
+ if self.__stat_vars is not None:
469
+ stat_subdf = self.__stat_vars[self.__stat_vars.SpUnit == unit].copy(deep=True)
470
+ subdf = pd.concat([subdf.reset_index(drop=True), stat_subdf.reset_index(drop=True)], axis=1)
471
+ subdfs.append(subdf.dropna())
472
+ return subdfs
473
+
474
+ def build_sr(self) -> pd.DataFrame:
475
+ self.__df = pd.concat([self.__df, pd.Series(self.__lr.resid, name='error')], axis=1)
476
+ est = []
477
+ if self.__method == 'ccemg':
478
+ units = self.get_ccemg_frames(self.__lag)
479
+ for model in units:
480
+ if self.__C:
481
+ est.append(sm.OLS(model['target_diff'], sm.add_constant(model.iloc[:, 3:])).fit())
482
+ else:
483
+ est.append(sm.OLS(model['target_diff'], model.iloc[:, 3:]).fit())
484
+ return est
485
+ elif self.__method == 'mg':
486
+ units = self.get_mg_frames(self.__lag)
487
+ for model in units:
488
+ if self.__C:
489
+ est.append(sm.OLS(model['target_diff'], sm.add_constant(model.iloc[:, 3:])).fit())
490
+ else:
491
+ est.append(sm.OLS(model['target_diff'], model.iloc[:, 3:]).fit())
492
+ return est
493
+ elif self.__method == 'ccep':
494
+ units = self.get_ccemg_frames(self.__lag)
495
+ pool = pd.concat(units, axis=0)
496
+ if self.__C:
497
+ est.append(sm.OLS(pool['target_diff'], sm.add_constant(pool.iloc[:, 3:])).fit())
498
+ else:
499
+ est.append(sm.OLS(pool['target_diff'], pool.iloc[:, 3:]).fit())
500
+ return est
501
+
502
+ def fit(self) -> dict:
503
+ dct = dict()
504
+ if self.__method == 'ccep':
505
+ dct['sr_res'] = self.__sr[0]
506
+ dct['lr_res'] = self.__lr
507
+ dct['ar'] = self.__ar
508
+ elif self.__method == 'ccemg' or self.__method == 'mg':
509
+ dct['lr_res'] = self.__lr
510
+ if self.__method == 'ccemg':
511
+ dct['ar'] = self.__ar
512
+ coefs = []
513
+ F_pvalues = []
514
+ tpvalues = []
515
+ rsq = []
516
+ for result in self.__sr:
517
+ coefs.append(result.params)
518
+ F_pvalues.append(result.f_pvalue)
519
+ tpvalues.append(result.pvalues)
520
+ rsq.append(result.rsquared)
521
+ coef_mean = pd.DataFrame(pd.concat(coefs, axis=1).mean(axis=1), columns=['Mean Group coefs'])
522
+ F_pval_mean = np.array(F_pvalues).mean()
523
+ tpvalues_mean = pd.DataFrame(pd.concat(tpvalues, axis=1).mean(axis=1), columns = ['Mean Group T-pvalues'])
524
+ rsq_mean = np.array(rsq).mean()
525
+ res = {
526
+ 'Rsquared': rsq_mean,
527
+ 'F_Pvalue': F_pval_mean,
528
+ 'coefs': pd.concat([coef_mean, tpvalues_mean], axis=1)
529
+ }
530
+ dct['sr_res'] = res
531
+ return dct
532
+
533
+ def __del__(self) -> None:
534
+ pass
535
+
536
+
537
class CDTwoWay:
    '''
    CD test for cross-sectional dependence in panel regression residuals.

    H0: p_{ij} = 0 (No Significant Cross-Sectional Dependence)
    H1: p_{ij} != 0 (Valid Cross-Sectional Dependence)

    PARAMETERS:
    - *df*: a pandas DataFrame without NaN values, columns in this order:
        0 - spatial units (cities, countries, regions, ...)
        1 - temporal units (years, months, quarters, ...)
        2 - target/endogenous variable
        3+ - exogenous variables
      The caller's DataFrame is not modified; a copy is renamed internally
      to SpUnit, time, target, x1, x2, ...
    - *level*: significance level in percent. Defaults to 5.

    RETURNS:
    - Prints the CD-test result via the "verdict" method.
    '''

    def __init__(self, df: pd.DataFrame, level: int = 5) -> None:
        # Copy first: assigning .columns on the caller's frame would rename
        # the user's own data in place.
        self.__df = df.copy()
        self.__exog = len(self.__df.columns[3:])
        self.__l = [f'x{i}' for i in range(1, self.__exog + 1)]
        self.__alpha = level/100
        self.__df.columns = ['SpUnit', 'time', 'target'] + self.__l
        self.__N = len(self.__df.SpUnit.unique())
        self.__T = len(self.__df.time.unique())

    def __resids(self) -> list:
        '''Return the OLS residual series of each unit's own regression.'''
        resids = []
        for unit in self.__df.SpUnit.unique():
            subdf = self.__df[self.__df.SpUnit == unit].copy(deep=True)
            res = sm.OLS(subdf['target'], sm.add_constant(subdf[self.__l])).fit()
            resids.append(res.resid)
        return resids

    @staticmethod
    def _pairwise_corr_sum(resids: list) -> float:
        '''Sum the Pearson correlations over all unordered pairs of residual series.'''
        # Imported locally: the module only imports `product` from itertools.
        from itertools import combinations
        return np.sum([np.corrcoef(a, b)[1, 0] for a, b in combinations(resids, r=2)])

    def __fit(self) -> float:
        '''Return the sum of pairwise residual correlations across units.'''
        return self._pairwise_corr_sum(self.__resids())

    def verdict(self) -> None:
        '''Compute the CD statistic, its two-sided normal p-value and print the result.'''
        Z = sc.norm()
        # CD = sqrt(2T / (N(N-1))) * sum of pairwise correlations.
        CD = self.__fit() * np.sqrt((2*self.__T)/(self.__N*(self.__N-1)))
        pval = 2*min(Z.sf(CD), Z.cdf(CD))
        if pval < self.__alpha:
            print(f'p-value = {pval} < alpha = {self.__alpha}\n There is Significant Cross-Sectional Dependence in your data according to the CD-test. \n Significance level: {self.__alpha*100}%')
        else:
            print(f'p-value = {pval} > alpha = {self.__alpha} There is No Significant Cross-Sectional Dependence in your data according to the CD-test. \n Significance level: {self.__alpha*100}%')
592
+
@@ -0,0 +1,3 @@
1
+ from .Lib import CipsTest, HausmanOneWay, FECM, CDTwoWay
2
+
3
+ __all__ = ['CipsTest', 'HausmanOneWay', 'FECM', 'CDTwoWay']
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: econmethods
3
- Version: 0.0.9
3
+ Version: 1
4
4
  Summary: A python package implementing various econometrical tests and estimators
5
5
  Home-page: https://github.com/NaturionBG/econmethods
6
6
  Author: NaturionBG
@@ -5,7 +5,7 @@ with open('README.md', 'r') as f:
5
5
 
6
6
  setup(
7
7
  name = 'econmethods',
8
- version = '0.0.9',
8
+ version = 'v1',
9
9
  description='A python package implementing various econometrical tests and estimators',
10
10
  packages = find_packages(),
11
11
  long_description=long_desc,
@@ -1,278 +0,0 @@
1
- import numpy as np
2
- import pandas as pd
3
- import statsmodels.api as sm
4
- from itertools import product
5
- import scipy.stats as sc
6
- from math import floor
7
- import importlib.resources as resources
8
- from statsmodels.regression.mixed_linear_model import MixedLM
9
-
10
- def read_critical_values(sheet: str) -> pd.DataFrame:
11
- xlsx_path = resources.files("econmethods") / "CADF_Crit_Values.xlsx"
12
- df = pd.read_excel(xlsx_path, sheet_name=sheet, index_col=0)
13
- return df
14
-
15
- class CipsTest:
16
- '''
17
- Implementation of the standard Cross-Sectionally Augmented Dickey-Fuller
18
- procedure to test for non-stationarity I(1) in panel data. Works only with a linear trend.
19
- ----
20
- The hypotheses are as follows:
21
- - *H0*: The Target variable is I(1)
22
- - *H1*: The Target variable is I(0)\n
23
- PARAMETERS:
24
- ----------
25
- ------
26
- - *df*: A standart Pandas DataFrame containing panelized data. \n
27
- Ensure that the DataFrame contains the following columns in this exact order: \n
28
- 0 - a column of spatial units. Must contain homogenous data, e.g. only countries, companies, regions, etc. \n
29
- 1 - temporal column. Must contain homogenous data, e.g. only years, months, quarters, etc. \n
30
- 2 - target variable. Must not contain NaN Values. An Error will be raised otherwise. \n
31
-
32
- - *T*: Your Temporal window. Will be used to determine the test critical value.
33
- - *N*: Your Spatial window. Will be used to determine the test critical value.
34
-
35
- - *trend*: State whether your ADF model has a trend or not. Will be used to determine the test critical value.
36
-
37
- - *poly_trend*: State whether your target variable has a ditinct polynomial trend. \n
38
- If a value bigger than 1 is entered, the test will detrend the target variable to get a robust result.
39
-
40
- - *intercept*: State whether ADF model has an intercept or not. Will be used to determine the test critical value.
41
-
42
- - *n_lags*: Determine the maximum amount of lags in the Augmented Dickey-Fuller regression.
43
- the test will choose the best lag amount from 1 to n_lags based on AIC (Akaike Information Criterion)
44
-
45
- - *level*: Value of significance to conduct the test at (in %%). Only 5 and 1% are allowed.
46
- -----------------------------
47
- RETURNS:
48
- --
49
- via instance.verdict() -> None: Prints the verdict of the test based on the parameters set by the user.
50
- '''
51
- def __init__(self, df: pd.DataFrame, T: int, N: int, trend: bool = False, poly_trend: int = 1, intercept: bool = False, n_lags: int = 2, level: int = 5) -> None:
52
- CipsTest.__build_tables()
53
- self.__df = df
54
- self.__L = 1
55
- self.__T = T
56
- self.__N = N
57
- self.__trend = trend
58
- self.__C = intercept
59
- self.__n_lags = n_lags
60
- if self.__n_lags > floor(self.__T/5):
61
- self.__n_lags = floor(self.__T/5)
62
- if self.__n_lags < 1:
63
- self.__n_lags = 1
64
- self.__alpha = level/100
65
- self.__df = self.__df.rename(columns={self.__df.columns[0]:'SpUnit', self.__df.columns[1]:'time', self.__df.columns[2]:'target'})
66
- self.__df.target = np.log(self.__df.target)
67
- self.__poly = poly_trend
68
- if self.__trend:
69
- if self.__poly > 1:
70
- self.__df = self.detrend()
71
- self.__trend = False
72
- self.verify()
73
- self.__table = self.get_table()
74
- self.__CADF_Crit = self.get_critical_value()
75
- self.__CADF = self.estimate()
76
-
77
- def verify(self) -> None:
78
- if self.__df.target.isnull().sum() > 0:
79
- raise TypeError('Values in Target must NOT be NaN!')
80
- if self.__alpha != 0.01 and self.__alpha != 0.05:
81
- raise ValueError('The Significance Level must be either 1 or 5!')
82
- if self.__poly < 1:
83
- raise ValueError('The Polynomial Power Cannot be lesser than 1!')
84
-
85
- @classmethod
86
- def __build_tables(cls) -> None:
87
- cls.NTNC_1P = read_critical_values('NTNC_1P')
88
- cls.NTNC_5P = read_critical_values('NTNC_5P')
89
- cls.NTC_1P = read_critical_values('NTC_1P')
90
- cls.NTC_5P = read_critical_values('NTC_5P')
91
- cls.TC_1P = read_critical_values('TC_1P')
92
- cls.TC_5P = read_critical_values('TC_5P')
93
-
94
- def get_table(self) -> pd.DataFrame:
95
- if not self.__trend and not self.__C:
96
- if self.__alpha == 0.01:
97
- return CipsTest.NTNC_1P
98
- else:
99
- return CipsTest.NTNC_5P
100
- if not self.__trend and self.__C:
101
- if self.__alpha == 0.01:
102
- return CipsTest.NTC_1P
103
- else:
104
- return CipsTest.NTC_5P
105
- if self.__trend and self.__C:
106
- if self.__alpha == 0.01:
107
- return CipsTest.TC_1P
108
- else:
109
- return CipsTest.TC_5P
110
-
111
- def get_critical_value(self) -> float:
112
- dct = {}
113
- for arr in product(self.__table.index, self.__table.index):
114
- lst = np.array(arr)
115
- dt = np.array([self.__T, self.__N])
116
- dct[arr] = np.sqrt(np.sum((dt-lst)**2))
117
- return self.__table.loc[min(dct, key=dct.get)]
118
-
119
- def detrend(self) -> pd.DataFrame:
120
- lst = []
121
- for unit in self.__df.SpUnit.unique():
122
- subdf = self.__df[self.__df.SpUnit == unit]
123
- for i in range(1, self.__poly+1):
124
- subdf.insert(3, f't^{i}', np.linspace(1, len(subdf), len(subdf))**i)
125
- diff = subdf['target'].copy() - sm.OLS(subdf['target'], sm.add_constant(subdf.iloc[:, 3:])).fit().predict(sm.add_constant(subdf.iloc[:, 3:]))
126
- subdf.loc[:, 'target'] = diff
127
- lst.append(subdf.iloc[:, :3])
128
- return pd.concat(lst, axis=0)
129
-
130
- def build_regressions(self, lags: int) -> list[pd.DataFrame]:
131
- lst = []
132
- for unit in self.__df.SpUnit.unique():
133
- subdf = self.__df[self.__df.SpUnit == unit]
134
- if self.__trend:
135
- subdf.insert(2, 't', np.linspace(1, len(subdf), len(subdf)))
136
- subdf = pd.concat([subdf, subdf.target.shift(periods=range(1, lags+1))], axis=1)
137
- subdf['cs_avg'] = self.__df.groupby(['time'])['target'].mean().values
138
- subdf = pd.concat([subdf, subdf.cs_avg.shift(periods=range(1, lags+1))], axis=1)
139
- subdf.insert(3, 'target_diff', subdf.target - subdf.target_1)
140
- subdf['cs_avg_diff'] = subdf.cs_avg - subdf.cs_avg_1
141
- subdf = pd.concat([subdf, subdf.cs_avg_diff.shift(periods=range(1, lags+1))], axis=1)
142
- subdf = pd.concat([subdf, subdf.target_diff.shift(periods=range(1, lags+1))], axis=1)
143
- if self.__trend:
144
- base = ['target_diff', 't', 'target_1', 'cs_avg_1', 'cs_avg_diff']
145
- else:
146
- base = ['target_diff', 'target_1', 'cs_avg_1', 'cs_avg_diff']
147
- additional = []
148
- for i in range(1, lags+1):
149
- additional.append(f'target_diff_{i}')
150
- additional.append(f'cs_avg_diff_{i}')
151
- subdf=(subdf.loc[:, base+additional]).iloc[self.__n_lags+1:, :]
152
- lst.append(subdf)
153
- return lst
154
-
155
- def estimate(self) -> float:
156
- CADF_stat = None
157
- best_aic = np.inf
158
- for l in range(1, self.__n_lags+1):
159
- aic = []
160
- lst1 = self.build_regressions(l)
161
- CADF = []
162
- for frame in lst1:
163
- if self.__C:
164
- res = sm.OLS(frame.iloc[:, 0], sm.add_constant(frame.iloc[:, 1:])).fit()
165
- else:
166
- res = sm.OLS(frame.iloc[:, 0], frame.iloc[:, 1:]).fit()
167
- CADF.append(res.tvalues['target_1'])
168
- aic.append(res.aic)
169
- if len(np.unique(np.array(CADF))) != 1:
170
- CADF = np.array(CADF).mean()
171
- if np.array(aic).mean() < best_aic:
172
- CADF_stat = CADF
173
- best_aic = np.array(aic).mean()
174
- self.__L = l
175
- return CADF_stat
176
-
177
- def verdict(self) -> None:
178
- if self.__CADF < self.__CADF_Crit:
179
- print(f'{self.__CADF} < {self.__CADF_Crit}\n Your target variable is I(0) according to the CIPS test\n Significance level : {self.__alpha*100}%. \n Selected lag amount: {self.__L}')
180
- else:
181
- print(f'{self.__CADF} > {self.__CADF_Crit}\n Your target variable is I(1) according to the CIPS test\n significance level: {self.__alpha*100}%. \n Selected lag amount: {self.__L}')
182
-
183
- def __del__(self) -> None:
184
- pass
185
-
186
-
187
-
188
- class HausmanOneWay:
189
- '''
190
- Implementation of the Hausman procedure to test whether it is more feasible to used random effects against fixed effects.
191
- -
192
- The Hypotheses are as follows
193
- - *H0*: Cov(a_i, x_{it}) = 0
194
- - *H1*: Cov(a_i, x_{it}) != 0\n
195
- PARAMETERS:
196
- --
197
- -----
198
- - *data*: A pandas DataFrame. Make sure that all items are introduced in this exact order by column index:\n
199
- Make sure no columns contain Nan Values!
200
- 0 - your spatial unit. The data must be homogenous, e.g. only contries, regions, etc.
201
- 1 - your temporal window per panel. The data must be homogenous, e.g. only years, months, etc.
202
- 2 - Your target variable.
203
- 3+ - your exogenous variables.
204
- - *level*: the statistical test significance level.
205
- ---------
206
- METHODOLOGY:
207
- ------
208
- - The Test prefers an MLE estimator for RE (Random Effects),
209
- which can be disrupted if the difference between FE and RE is considerably small.
210
- The test will opt for MLE estimation if sigma_u computed for GLS is greater than 0.
211
- ------
212
- RETURNS:
213
- -
214
- - Via instance.verdict() prints the verdict of the Hausman test according to the given parameters.
215
- '''
216
- def __init__(self, data: pd.DataFrame, level: int = 5) -> None:
217
- self.__df = data
218
- self.__exog = len(data.columns[3:])
219
- self.__l =[]
220
- for i in range(1, self.__exog+1):
221
- self.__l.append(f'x{i}')
222
- self.__df.columns = ['SpUnit', 'time', 'target'] + self.__l
223
- self.__alpha = level/100
224
- self.__RE = 'GLS'
225
- self.__T = len(self.__df.time.unique())
226
- self.__N = len(self.__df.SpUnit.unique())
227
-
228
-
229
- def build_FE(self) -> pd.DataFrame:
230
- fe = self.__df.copy(deep=True)
231
- for i, unit in enumerate(self.__df.SpUnit.unique()[1:]):
232
- fe.loc[:, f'd{i}'] = np.where(fe.SpUnit == unit, 1, 0)
233
- return fe.iloc[:, 2:]
234
-
235
- def build_GLS(self, w_err: float) -> pd.DataFrame | None:
236
- re = self.__df.copy(deep=True)
237
- sigma2 = np.sum((sm.OLS(re.iloc[:, 2], sm.add_constant(re.iloc[:, 3:])).fit()).resid**2) / (self.__N*self.__T - self.__exog)
238
- sigma_u = sigma2 - w_err
239
- if sigma_u <= 0:
240
- sigma_u = 0
241
- elif sigma_u > 0:
242
- self.__RE = 'MLE'
243
- return None
244
- sig = np.full((self.__T, self.__T), sigma_u)
245
- np.fill_diagonal(sig, sigma2)
246
- matrix = np.kron(np.eye(self.__N), sig)
247
- return matrix
248
-
249
-
250
- def estimate(self) -> float:
251
- fe_df = self.build_FE()
252
- Chi = 0
253
- fe_res = sm.OLS(fe_df.iloc[:, 0], sm.add_constant(fe_df.iloc[:, 1:])).fit()
254
- matrix = self.build_GLS(np.sum(fe_res.resid**2) / (self.__N*(self.__T-1) - self.__exog))
255
- if matrix is not None:
256
- re_res = sm.GLS(self.__df.iloc[:, 2], sm.add_constant(self.__df.iloc[:, 3:]), matrix).fit()
257
- else:
258
- re_res = sm.MixedLM(self.__df['target'], sm.add_constant(self.__df[self.__l]), groups=self.__df.SpUnit).fit(reml = True, maxiter=100_00)
259
- for b_fe, b_re, var_fe, var_re in zip(fe_res.params[1:], re_res.params[1:], fe_res.bse[1:]**2, re_res.bse[1:]**2):
260
- Chi += (b_fe - b_re)**2 / (var_fe - var_re)
261
-
262
- return Chi
263
-
264
- def verdict(self) -> None:
265
- Chi = self.estimate()
266
- ch2 = sc.chi2(self.__exog)
267
- p = ch2.sf(Chi)
268
- if p < self.__alpha:
269
- print(f'P-Value: {p} < Alpha: {self.__alpha}. \n According to the Hausman test, you should use the FE (Fixed Effects) model. \n RE estimator: {self.__RE}')
270
- else:
271
- print(f'P-Value: {p} > Alpha: {self.__alpha}. \n According to the Hausman test, you should use the RE (Random Effects) model. \n RE estimator: {self.__RE}')
272
-
273
- def __del__(self) -> None:
274
- pass
275
-
276
-
277
-
278
-
@@ -1,3 +0,0 @@
1
- from .Lib import CipsTest, HausmanOneWay
2
-
3
- __all__ = ['CipsTest', 'HausmanOneWay']
File without changes
File without changes