systemgmmkit 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,351 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Any, Literal
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+
9
# Covariance estimators understood by the fixed-effects code in this module.
CovarianceType = Literal["unadjusted", "robust", "clustered"]


@dataclass(frozen=True)
class FixedEffectsSpec:
    """Structured specification for a static panel fixed-effects model.

    The estimator is intended for the user's main fixed-effects models that sit
    beside dynamic-panel Difference/System GMM robustness checks. It is not a
    replacement for System GMM and should not be used to estimate a lagged-
    dependent-variable model without acknowledging Nickell bias in short panels.

    Raises:
        ValueError: If ``dependent`` or ``regressors`` is empty, ``regressors``
            is a bare string, a regressor is listed twice, the dependent
            variable is also listed as a regressor, or ``covariance`` /
            ``cluster`` is not one of the supported labels.
    """

    # Column name of the outcome variable.
    dependent: str
    # Column names of the structural regressors.
    regressors: list[str]
    # Whether entity / time fixed effects are requested.
    entity_effects: bool = True
    time_effects: bool = True
    # Covariance estimator label and, for "clustered", the clustering dimension.
    covariance: CovarianceType = "clustered"
    cluster: Literal["entity", "time"] = "entity"
    # Whether collinear/absorbed columns should be dropped from the design.
    drop_absorbed: bool = True
    # Label used when rendering results.
    name: str = "fixed_effects"

    def __post_init__(self) -> None:
        """Validate the specification as soon as it is constructed."""
        if not self.dependent:
            raise ValueError("dependent cannot be empty.")
        if not self.regressors:
            raise ValueError("regressors cannot be empty.")
        if isinstance(self.regressors, str):
            # A bare string would be unpacked character by character wherever
            # the regressors are splatted into a column list.
            raise ValueError("regressors must be a list of column names, not a single string.")

        seen: set[str] = set()
        repeated: list[str] = []
        for column in self.regressors:
            if column in seen and column not in repeated:
                repeated.append(column)
            seen.add(column)
        if repeated:
            # Duplicate labels yield duplicate design-matrix columns.
            raise ValueError(f"regressors contains duplicates: {repeated}")
        if self.dependent in seen:
            raise ValueError("dependent cannot also be listed as a regressor.")

        if self.covariance not in {"unadjusted", "robust", "clustered"}:
            raise ValueError("covariance must be 'unadjusted', 'robust', or 'clustered'.")
        if self.cluster not in {"entity", "time"}:
            raise ValueError("cluster must be 'entity' or 'time'.")

    @property
    def variables(self) -> set[str]:
        """Return the dependent variable and all regressors as one set of names."""
        return {self.dependent, *self.regressors}
44
+
45
+
46
@dataclass(frozen=True)
class FixedEffectsResult:
    """Minimal, backend-independent fixed-effects result object.

    The per-coefficient fields (``params``, ``std_errors``, ``tstats``,
    ``pvalues``) are combined column-wise by :meth:`summary_frame`, so they are
    expected to share one index.
    """

    spec: FixedEffectsSpec
    nobs: int
    rank: int
    df_resid: int
    params: pd.Series
    std_errors: pd.Series
    tstats: pd.Series
    pvalues: pd.Series
    residuals: pd.Series
    fitted_values: pd.Series
    r2_within: float
    covariance_type: str
    backend: str
    notes: list[str]

    def summary_frame(self) -> pd.DataFrame:
        """Return coefficients, standard errors, t statistics and p-values as one table."""
        return pd.DataFrame(
            {
                "coef": self.params,
                "std_err": self.std_errors,
                "t": self.tstats,
                "p_value": self.pvalues,
            }
        )

    def to_markdown(self, digits: int = 4) -> str:
        """Render the result as a Markdown report.

        Args:
            digits: Number of decimals used for the coefficient table and the
                within R².

        Returns:
            A heading, a bullet list of fit metadata (plus any notes), a blank
            line and the coefficient table, joined with newlines.
        """
        table = self.summary_frame().round(digits)
        lines = [
            f"# Fixed-effects result: {self.spec.name}",
            "",
            f"- Backend: `{self.backend}`",
            f"- Observations: `{self.nobs}`",
            f"- Residual df: `{self.df_resid}`",
            f"- Covariance: `{self.covariance_type}`",
            f"- Within R²: `{self.r2_within:.{digits}f}`",
        ]
        if self.notes:
            lines.append("- Notes:")
            lines.extend(f" - {note}" for note in self.notes)
        lines.append("")
        lines.append(self._table_to_markdown(table))
        return "\n".join(lines)

    @staticmethod
    def _table_to_markdown(table: pd.DataFrame) -> str:
        """Render *table* as a Markdown pipe table.

        ``DataFrame.to_markdown`` needs the optional ``tabulate`` package; when
        it is not installed a plain pipe table is built by hand so reporting
        does not fail on a missing optional dependency.
        """
        try:
            return table.to_markdown()
        except ImportError:
            header = ["", *(str(column) for column in table.columns)]
            divider = ["---"] * len(header)
            body = [
                [str(label), *(str(value) for value in row)]
                for label, row in zip(table.index, table.to_numpy().tolist())
            ]
            return "\n".join(
                "| " + " | ".join(cells) + " |" for cells in (header, divider, *body)
            )
90
+
91
+
92
+ def _require_columns(data: pd.DataFrame, columns: list[str]) -> None:
93
+ missing = [c for c in columns if c not in data.columns]
94
+ if missing:
95
+ raise KeyError(f"Missing required columns: {missing}")
96
+
97
+
98
+ def _build_lsdv_design(
99
+ data: pd.DataFrame,
100
+ *,
101
+ entity: str,
102
+ time: str,
103
+ spec: FixedEffectsSpec,
104
+ ) -> tuple[pd.Series, pd.DataFrame, pd.DataFrame, list[str]]:
105
+ """Build an explicit least-squares dummy-variable design matrix.
106
+
107
+ LSDV is slower than within-transformation for very large panels but is exact
108
+ for both balanced and unbalanced one-/two-way fixed-effects models and keeps
109
+ the implementation auditable.
110
+ """
111
+
112
+ columns = [entity, time, spec.dependent, *spec.regressors]
113
+ _require_columns(data, columns)
114
+ work = data[columns].dropna().copy()
115
+ if work.empty:
116
+ raise ValueError("No complete observations remain after dropping missing values.")
117
+
118
+ y = work[spec.dependent].astype(float)
119
+ X_parts: list[pd.DataFrame] = []
120
+ notes: list[str] = []
121
+
122
+ # Structural regressors first; output is restricted to these coefficients.
123
+ X_reg = work[spec.regressors].astype(float)
124
+ X_parts.append(X_reg)
125
+
126
+ if spec.entity_effects:
127
+ d_entity = pd.get_dummies(
128
+ work[entity].astype("category"), prefix=f"fe_{entity}", drop_first=True, dtype=float
129
+ )
130
+ if d_entity.shape[1] > 0:
131
+ X_parts.append(d_entity)
132
+ notes.append("Entity fixed effects included via LSDV dummies.")
133
+
134
+ if spec.time_effects:
135
+ d_time = pd.get_dummies(
136
+ work[time].astype("category"), prefix=f"fe_{time}", drop_first=True, dtype=float
137
+ )
138
+ if d_time.shape[1] > 0:
139
+ X_parts.append(d_time)
140
+ notes.append("Time fixed effects included via LSDV dummies.")
141
+
142
+ # Constant is included only when no FE are requested. With FE dummies, a
143
+ # constant plus dropped categories yields the same slopes but adds clutter.
144
+ if not spec.entity_effects and not spec.time_effects:
145
+ X_parts.insert(
146
+ 0, pd.DataFrame({"const": np.ones(len(work), dtype=float)}, index=work.index)
147
+ )
148
+
149
+ X = pd.concat(X_parts, axis=1)
150
+
151
+ # Drop exactly collinear columns if requested. This protects against absorbed
152
+ # variables and duplicate regressors without silently changing named slopes.
153
+ if spec.drop_absorbed:
154
+ keep: list[str] = []
155
+ current = np.empty((len(X), 0), dtype=float)
156
+ current_rank = 0
157
+ dropped: list[str] = []
158
+ for col in X.columns:
159
+ candidate = np.column_stack([current, X[col].to_numpy(dtype=float)])
160
+ candidate_rank = int(np.linalg.matrix_rank(candidate))
161
+ if candidate_rank > current_rank:
162
+ keep.append(col)
163
+ current = candidate
164
+ current_rank = candidate_rank
165
+ else:
166
+ dropped.append(col)
167
+ if dropped:
168
+ notes.append(
169
+ f"Dropped absorbed/collinear columns: {', '.join(dropped[:10])}"
170
+ + ("..." if len(dropped) > 10 else "")
171
+ )
172
+ X = X[keep]
173
+
174
+ return y, X, work[[entity, time]], notes
175
+
176
+
177
+ def _safe_inverse_xtx(X: np.ndarray) -> np.ndarray:
178
+ xtx = X.T @ X
179
+ return np.linalg.pinv(xtx)
180
+
181
+
182
+ def _normal_pvalues_from_t(tstats: np.ndarray) -> np.ndarray:
183
+ # scipy is a core dependency; imported lazily to keep module load light.
184
+ from scipy import stats
185
+
186
+ return 2.0 * stats.norm.sf(np.abs(tstats))
187
+
188
+
189
def _covariance_matrix(
    X: np.ndarray,
    residuals: np.ndarray,
    *,
    covariance: CovarianceType,
    clusters: pd.Series | None = None,
) -> np.ndarray:
    """Return the coefficient covariance matrix for a least-squares fit.

    Args:
        X: Design matrix with one row per observation.
        residuals: Residual vector with one entry per row of *X*.
        covariance: ``"unadjusted"`` (residual variance times the inverse Gram
            matrix), ``"robust"`` (heteroskedasticity-robust sandwich scaled by
            ``n / (n - k)``) or ``"clustered"`` (cluster-robust sandwich).
        clusters: Cluster label per observation; required for ``"clustered"``.

    Returns:
        A ``k x k`` covariance matrix, where ``k`` is the number of columns of *X*.

    Raises:
        ValueError: If *covariance* is not a supported label, or if clustered
            covariance is requested without labels, with a label vector whose
            length differs from the number of observations, or with missing
            labels.
    """
    nobs, k = X.shape
    xtx_inv = _safe_inverse_xtx(X)
    # Guard the degrees-of-freedom divisor against saturated designs.
    dof = max(nobs - k, 1)

    if covariance == "unadjusted":
        sigma2 = float((residuals @ residuals) / dof)
        return sigma2 * xtx_inv

    if covariance == "robust":
        meat = X.T @ ((residuals**2)[:, None] * X)
        correction = nobs / dof
        return correction * xtx_inv @ meat @ xtx_inv

    if covariance != "clustered":
        # Previously any other label silently fell through to the clustered path.
        raise ValueError("covariance must be 'unadjusted', 'robust', or 'clustered'.")
    if clusters is None:
        raise ValueError("clusters must be provided when covariance='clustered'.")

    labels = pd.Series(clusters).to_numpy()
    if len(labels) != nobs:
        raise ValueError("clusters must contain exactly one label per observation.")
    codes, uniques = pd.factorize(labels)
    if (codes < 0).any():
        # factorize marks missing labels with -1; they cannot form a cluster.
        raise ValueError("clusters cannot contain missing values.")

    # Sum the score rows x_i * u_i within each cluster in one pass, then form
    # the sum over clusters of the outer products of those per-cluster sums.
    g = len(uniques)
    cluster_scores = np.zeros((g, k), dtype=float)
    np.add.at(cluster_scores, codes, X * residuals[:, None])
    meat = cluster_scores.T @ cluster_scores

    if g <= 1:
        correction = nobs / dof
    else:
        # Small-sample factor G/(G-1) * (n-1)/(n-k).
        correction = (g / (g - 1)) * ((nobs - 1) / dof)
    return correction * xtx_inv @ meat @ xtx_inv
227
+
228
+
229
def run_fixed_effects_native(
    spec: FixedEffectsSpec,
    data: pd.DataFrame,
    *,
    entity: str,
    time: str,
) -> FixedEffectsResult:
    """Estimate a static one-/two-way fixed-effects model using native NumPy.

    The model is fitted by least squares on the LSDV design matrix. It is
    intentionally conservative and transparent. For very large panels, install
    `linearmodels` and use `run_fixed_effects(..., prefer_backend="linearmodels")`.

    Args:
        spec: Model specification.
        data: Panel data containing the entity, time, dependent and regressor columns.
        entity: Name of the entity identifier column.
        time: Name of the time identifier column.

    Returns:
        A ``FixedEffectsResult`` whose coefficient-level fields cover only the
        structural regressors (and ``const`` when present); residuals and fitted
        values are indexed like the rows that entered the estimation.

    Raises:
        ValueError: If no structural regressor remains in the design matrix.
    """

    y, X_df, ids, notes = _build_lsdv_design(data, entity=entity, time=time, spec=spec)

    # Return only structural regressors/constant in summary fields; dummies stay internal.
    reported = [c for c in (["const"] + list(spec.regressors)) if c in X_df.columns]
    if not reported:
        raise ValueError("No structural regressors remain after absorption/collinearity checks.")

    X = X_df.to_numpy(dtype=float)
    yv = y.to_numpy(dtype=float)

    # lstsq already reports the numerical rank, so no second decomposition is needed.
    beta, _, lstsq_rank, _ = np.linalg.lstsq(X, yv, rcond=None)
    fitted = X @ beta
    residuals = yv - fitted
    rank = int(lstsq_rank)
    df_resid = int(max(len(yv) - rank, 0))

    cluster_series = ids[entity] if spec.cluster == "entity" else ids[time]
    cov = _covariance_matrix(
        X,
        residuals,
        covariance=spec.covariance,
        clusters=cluster_series if spec.covariance == "clustered" else None,
    )
    se = np.sqrt(np.maximum(np.diag(cov), 0.0))
    # A zero standard error yields inf/nan statistics rather than a warning.
    with np.errstate(divide="ignore", invalid="ignore"):
        tstats = beta / se
    pvalues = _normal_pvalues_from_t(tstats)

    params = pd.Series(beta, index=X_df.columns, name="coef")
    std_errors = pd.Series(se, index=X_df.columns, name="std_err")
    t_ser = pd.Series(tstats, index=X_df.columns, name="t")
    p_ser = pd.Series(pvalues, index=X_df.columns, name="p_value")

    # Within R²: compare the full model's residual sum of squares with that of
    # the nuisance-only model (constant / fixed-effect dummies, no structural
    # regressors). Unlike subtracting group means by hand, this needs no
    # special-casing of one-way, two-way or unbalanced panels.
    nuisance = X[:, ~X_df.columns.isin(list(spec.regressors))]
    if nuisance.shape[1] > 0:
        gamma, *_ = np.linalg.lstsq(nuisance, yv, rcond=None)
        baseline = yv - nuisance @ gamma
    else:
        baseline = yv
    denom = float(baseline @ baseline)
    r2_within = float(1.0 - (residuals @ residuals) / denom) if denom > 0 else float("nan")

    return FixedEffectsResult(
        spec=spec,
        nobs=int(len(yv)),
        rank=rank,
        df_resid=df_resid,
        params=params.loc[reported],
        std_errors=std_errors.loc[reported],
        tstats=t_ser.loc[reported],
        pvalues=p_ser.loc[reported],
        residuals=pd.Series(residuals, index=y.index, name="residual"),
        fitted_values=pd.Series(fitted, index=y.index, name="fitted"),
        r2_within=r2_within,
        covariance_type=spec.covariance
        if spec.covariance != "clustered"
        else f"clustered-{spec.cluster}",
        backend="native-lsdv",
        notes=notes,
    )
305
+
306
+
307
def run_fixed_effects(
    spec: FixedEffectsSpec,
    data: pd.DataFrame,
    *,
    entity: str,
    time: str,
    prefer_backend: Literal["native", "linearmodels"] = "native",
) -> Any:
    """Fit a fixed-effects model with the requested backend.

    ``prefer_backend="native"`` delegates to ``run_fixed_effects_native``.
    ``prefer_backend="linearmodels"`` fits ``linearmodels.panel.PanelOLS`` on
    the complete rows of the required columns and returns whatever its ``fit``
    method returns.

    Raises:
        ValueError: If ``prefer_backend`` is neither ``"native"`` nor ``"linearmodels"``.
        ImportError: If the linearmodels backend is requested but not installed.
        KeyError: If a required column is missing (linearmodels backend).
    """

    if prefer_backend == "native":
        return run_fixed_effects_native(spec, data, entity=entity, time=time)
    if prefer_backend != "linearmodels":
        raise ValueError("prefer_backend must be 'native' or 'linearmodels'.")

    try:
        from linearmodels.panel import PanelOLS
    except ModuleNotFoundError as exc:
        raise ImportError(
            "linearmodels is required for the linearmodels backend. Install with: "
            "python -m pip install 'systemgmmkit[fe]'"
        ) from exc

    needed = [entity, time, spec.dependent, *spec.regressors]
    _require_columns(data, needed)
    # PanelOLS receives data indexed by (entity, time), sorted by that index.
    panel = data[needed].dropna().set_index([entity, time]).sort_index()

    fit_options: dict[str, Any] = {}
    if spec.covariance == "clustered":
        cov_type = "clustered"
        flag = "cluster_entity" if spec.cluster == "entity" else "cluster_time"
        fit_options[flag] = True
    else:
        cov_type = spec.covariance

    model = PanelOLS(
        panel[spec.dependent],
        panel[spec.regressors],
        entity_effects=spec.entity_effects,
        time_effects=spec.time_effects,
        drop_absorbed=spec.drop_absorbed,
    )
    return model.fit(cov_type=cov_type, **fit_options)
@@ -0,0 +1,100 @@
1
+ """Policy helpers for interpreting GMM parity comparisons.
2
+
3
+ This module separates production backend validation from experimental native-GMM parity.
4
+ Native Difference/System GMM is operational but not certified as pydynpd- or xtabond2-equivalent yet.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from dataclasses import dataclass
10
+
11
# Status labels carried by ``GMMParityDecision.status``.
PASS_PARITY = "PASS_PARITY"
FAIL_PARITY = "FAIL_PARITY"
EXPERIMENTAL_PARITY_PENDING = "EXPERIMENTAL_PARITY_PENDING"
OPERATIONAL_ONLY = "OPERATIONAL_ONLY"


@dataclass(frozen=True)
class GMMParityDecision:
    """Decision object for GMM comparison outcomes."""

    # One of the status labels defined above.
    status: str
    # Whether this outcome should block a release.
    blocks_release: bool
    # Human-readable explanation of the decision.
    message: str


def classify_gmm_parity(
    *,
    estimator: str,
    backend: str,
    comparison_backend: str | None = None,
    execution_passed: bool,
    strict_parity_passed: bool,
) -> GMMParityDecision:
    """Classify GMM parity results without over-claiming native-GMM certification.

    Rules:
    - pydynpd execution failure is release-blocking.
    - pydynpd execution success is production-acceptable even if external strict parity is pending.
    - native GMM execution success with strict parity failure is experimental, not release-blocking.
    - native GMM execution failure is still a release-blocking failure.
    - non-GMM estimators keep strict parity behavior.
    - any other GMM backend is reported as unknown and release-blocking.

    Args:
        estimator: Estimator label; matched case-insensitively. It counts as
            GMM when it contains ``"gmm"`` or equals ``"difference"`` / ``"system"``.
        backend: Backend that produced the result; matched case-insensitively.
        comparison_backend: Accepted for interface compatibility; it does not
            influence the classification.
        execution_passed: Whether the backend ran successfully.
        strict_parity_passed: Whether the strict parity comparison passed.

    Returns:
        The decision for this combination of inputs.
    """

    est = estimator.lower()
    be = backend.lower()

    # "difference_gmm" / "system_gmm" are already covered by the substring test.
    is_gmm = "gmm" in est or est in {"difference", "system"}

    if not is_gmm:
        return GMMParityDecision(
            status=PASS_PARITY if strict_parity_passed else FAIL_PARITY,
            blocks_release=not strict_parity_passed,
            message="Non-GMM model uses strict parity policy.",
        )

    if not execution_passed:
        return GMMParityDecision(
            status=FAIL_PARITY,
            blocks_release=True,
            message=f"{backend} GMM execution failed.",
        )

    if be == "pydynpd":
        return GMMParityDecision(
            status=PASS_PARITY if strict_parity_passed else OPERATIONAL_ONLY,
            blocks_release=False,
            message=(
                "pydynpd is the production GMM backend; strict external parity "
                "remains a certification layer."
            ),
        )

    if be == "native":
        if strict_parity_passed:
            return GMMParityDecision(
                status=PASS_PARITY,
                blocks_release=False,
                message="Native GMM passed the requested strict parity comparison.",
            )

        return GMMParityDecision(
            status=EXPERIMENTAL_PARITY_PENDING,
            blocks_release=False,
            message=(
                "Native GMM executed but strict parity is not certified yet. "
                "Use backend='pydynpd' for production Difference/System GMM."
            ),
        )

    return GMMParityDecision(
        status=FAIL_PARITY,
        blocks_release=True,
        message=f"Unknown GMM backend: {backend}.",
    )