geometrics 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- geometrics/__init__.py +147 -0
- geometrics/_common.py +193 -0
- geometrics/_data_dict.py +369 -0
- geometrics/_geo.py +641 -0
- geometrics/_impacts.py +580 -0
- geometrics/_labels.py +219 -0
- geometrics/_mapping.py +641 -0
- geometrics/_panel.py +179 -0
- geometrics/_roles.py +143 -0
- geometrics/_theme.py +410 -0
- geometrics/_types.py +939 -0
- geometrics/_validation.py +192 -0
- geometrics/clubs.py +1044 -0
- geometrics/convergence.py +1545 -0
- geometrics/data/__init__.py +262 -0
- geometrics/data/_registry.py +82 -0
- geometrics/data/india32_dict.csv +7 -0
- geometrics/data/india520_dict.csv +29 -0
- geometrics/dependence.py +926 -0
- geometrics/distribution_dynamics.py +858 -0
- geometrics/gwr.py +909 -0
- geometrics/maps.py +491 -0
- geometrics/pedagogy/__init__.py +32 -0
- geometrics/pedagogy/_format.py +88 -0
- geometrics/pedagogy/_interpret/__init__.py +63 -0
- geometrics/pedagogy/_interpret/_convergence.py +264 -0
- geometrics/pedagogy/_interpret/_dependence.py +207 -0
- geometrics/pedagogy/_interpret/_dynamics.py +211 -0
- geometrics/pedagogy/_interpret/_gwr.py +161 -0
- geometrics/pedagogy/_interpret/_inequality.py +226 -0
- geometrics/pedagogy/_interpret/_maps.py +70 -0
- geometrics/pedagogy/_interpret/_shared.py +16 -0
- geometrics/pedagogy/_interpret/_spacetime.py +201 -0
- geometrics/pedagogy/_interpret/_spatial_models.py +319 -0
- geometrics/pedagogy/_interpret/_weights.py +86 -0
- geometrics/pedagogy/_mixin.py +44 -0
- geometrics/pedagogy/_registry.py +124 -0
- geometrics/pedagogy/_text/__init__.py +25 -0
- geometrics/pedagogy/_text/convergence.py +201 -0
- geometrics/pedagogy/_text/correlation.py +77 -0
- geometrics/pedagogy/_text/dynamics.py +157 -0
- geometrics/pedagogy/_text/inequality.py +124 -0
- geometrics/pedagogy/_text/models.py +326 -0
- geometrics/pedagogy/_text/spatial.py +263 -0
- geometrics/py.typed +0 -0
- geometrics/regional_inequality.py +985 -0
- geometrics/spacetime.py +665 -0
- geometrics/spatial_models.py +1349 -0
- geometrics/weights.py +578 -0
- geometrics-0.1.0.dist-info/METADATA +159 -0
- geometrics-0.1.0.dist-info/RECORD +53 -0
- geometrics-0.1.0.dist-info/WHEEL +4 -0
- geometrics-0.1.0.dist-info/licenses/LICENSE +21 -0
geometrics/__init__.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
"""geometrics: regional growth, convergence, and inequality on the PySAL stack.
|
|
2
|
+
|
|
3
|
+
geometrics wraps the standard analyses of the regional convergence literature —
|
|
4
|
+
exploratory spatial data analysis, β/σ/club convergence, spatial econometric models,
|
|
5
|
+
distribution dynamics, inequality decomposition, and local (GWR) models — into
|
|
6
|
+
illustrative, easy-to-apply functions built on libpysal, esda, giddy, inequality,
|
|
7
|
+
mapclassify, spreg, and mgwr.
|
|
8
|
+
|
|
9
|
+
Three inputs drive everything: a geometry with only the entity ID (``read_gdf``),
|
|
10
|
+
a long-form panel (``set_panel`` / ``set_labels``), and a data dictionary
|
|
11
|
+
(``df_dict``, inferable with ``build_data_dict``). Every public function returns a
|
|
12
|
+
frozen result dataclass with ``.df``, ``.fig`` and/or ``.gt``, plain-language
|
|
13
|
+
``.interpret()``, and a concept ``.explain()``.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from geometrics import data
|
|
17
|
+
from geometrics._data_dict import build_data_dict
|
|
18
|
+
from geometrics._geo import read_gdf
|
|
19
|
+
from geometrics._labels import resolve_label, set_labels
|
|
20
|
+
from geometrics._panel import resolve_panel, set_panel
|
|
21
|
+
from geometrics._roles import set_roles
|
|
22
|
+
from geometrics._theme import get_palette, set_palette
|
|
23
|
+
from geometrics._types import (
|
|
24
|
+
BetaConvergenceResult,
|
|
25
|
+
ChoroplethMapResult,
|
|
26
|
+
ConnectivityMapResult,
|
|
27
|
+
ConvergenceClubsResult,
|
|
28
|
+
DistributionOverTimeResult,
|
|
29
|
+
GWRResult,
|
|
30
|
+
InequalityOverTimeResult,
|
|
31
|
+
LisaClusterMapResult,
|
|
32
|
+
MarkovTransitionsResult,
|
|
33
|
+
MGWRResult,
|
|
34
|
+
MoranOverTimeResult,
|
|
35
|
+
MoranPlotResult,
|
|
36
|
+
SigmaConvergenceResult,
|
|
37
|
+
SpacetimeHeatmapResult,
|
|
38
|
+
SpatialDiagnosticsResult,
|
|
39
|
+
SpatialMarkovResult,
|
|
40
|
+
SpatialModelResult,
|
|
41
|
+
TheilDecompositionResult,
|
|
42
|
+
WeightsRobustnessResult,
|
|
43
|
+
)
|
|
44
|
+
from geometrics.clubs import analyze_convergence_clubs
|
|
45
|
+
from geometrics.convergence import (
|
|
46
|
+
analyze_beta_convergence,
|
|
47
|
+
analyze_sigma_convergence,
|
|
48
|
+
growth_cross_section,
|
|
49
|
+
)
|
|
50
|
+
from geometrics.dependence import (
|
|
51
|
+
explore_lisa_cluster_map,
|
|
52
|
+
explore_moran_over_time,
|
|
53
|
+
explore_moran_plot,
|
|
54
|
+
)
|
|
55
|
+
from geometrics.distribution_dynamics import (
|
|
56
|
+
analyze_markov_transitions,
|
|
57
|
+
analyze_spatial_markov,
|
|
58
|
+
)
|
|
59
|
+
from geometrics.gwr import analyze_gwr, analyze_mgwr
|
|
60
|
+
from geometrics.maps import explore_choropleth_map
|
|
61
|
+
from geometrics.pedagogy import Explainer, explain, list_topics
|
|
62
|
+
from geometrics.regional_inequality import (
|
|
63
|
+
analyze_inequality_over_time,
|
|
64
|
+
analyze_theil_decomposition,
|
|
65
|
+
)
|
|
66
|
+
from geometrics.spacetime import (
|
|
67
|
+
explore_distribution_over_time,
|
|
68
|
+
explore_spacetime_heatmap,
|
|
69
|
+
)
|
|
70
|
+
from geometrics.spatial_models import (
|
|
71
|
+
analyze_spatial_diagnostics,
|
|
72
|
+
analyze_spatial_model,
|
|
73
|
+
analyze_spatial_model_by_weights,
|
|
74
|
+
)
|
|
75
|
+
from geometrics.weights import explore_connectivity_map, make_weights
|
|
76
|
+
|
|
77
|
+
__version__ = "0.1.0"
|
|
78
|
+
|
|
79
|
+
__all__ = [
|
|
80
|
+
# ===== EXPLORE =====
|
|
81
|
+
# maps
|
|
82
|
+
"explore_choropleth_map",
|
|
83
|
+
# spatial weights
|
|
84
|
+
"explore_connectivity_map",
|
|
85
|
+
# spatial dependence (ESDA)
|
|
86
|
+
"explore_moran_plot",
|
|
87
|
+
"explore_lisa_cluster_map",
|
|
88
|
+
"explore_moran_over_time",
|
|
89
|
+
# space-time dynamics
|
|
90
|
+
"explore_distribution_over_time",
|
|
91
|
+
"explore_spacetime_heatmap",
|
|
92
|
+
# ===== ANALYZE =====
|
|
93
|
+
# convergence
|
|
94
|
+
"analyze_beta_convergence",
|
|
95
|
+
"analyze_sigma_convergence",
|
|
96
|
+
"analyze_convergence_clubs",
|
|
97
|
+
# spatial econometric models (spreg)
|
|
98
|
+
"analyze_spatial_model",
|
|
99
|
+
"analyze_spatial_diagnostics",
|
|
100
|
+
"analyze_spatial_model_by_weights",
|
|
101
|
+
# distribution dynamics (giddy)
|
|
102
|
+
"analyze_markov_transitions",
|
|
103
|
+
"analyze_spatial_markov",
|
|
104
|
+
# regional inequality (PySAL inequality)
|
|
105
|
+
"analyze_inequality_over_time",
|
|
106
|
+
"analyze_theil_decomposition",
|
|
107
|
+
# local models (mgwr)
|
|
108
|
+
"analyze_gwr",
|
|
109
|
+
"analyze_mgwr",
|
|
110
|
+
# ===== UTILITIES =====
|
|
111
|
+
"read_gdf",
|
|
112
|
+
"make_weights",
|
|
113
|
+
"growth_cross_section",
|
|
114
|
+
"set_panel",
|
|
115
|
+
"resolve_panel",
|
|
116
|
+
"set_labels",
|
|
117
|
+
"resolve_label",
|
|
118
|
+
"set_roles",
|
|
119
|
+
"build_data_dict",
|
|
120
|
+
"set_palette",
|
|
121
|
+
"get_palette",
|
|
122
|
+
"explain",
|
|
123
|
+
"list_topics",
|
|
124
|
+
"Explainer",
|
|
125
|
+
# ===== DATA =====
|
|
126
|
+
"data",
|
|
127
|
+
# ===== RESULT TYPES =====
|
|
128
|
+
"ChoroplethMapResult",
|
|
129
|
+
"ConnectivityMapResult",
|
|
130
|
+
"MoranPlotResult",
|
|
131
|
+
"LisaClusterMapResult",
|
|
132
|
+
"MoranOverTimeResult",
|
|
133
|
+
"DistributionOverTimeResult",
|
|
134
|
+
"SpacetimeHeatmapResult",
|
|
135
|
+
"BetaConvergenceResult",
|
|
136
|
+
"SigmaConvergenceResult",
|
|
137
|
+
"ConvergenceClubsResult",
|
|
138
|
+
"SpatialModelResult",
|
|
139
|
+
"SpatialDiagnosticsResult",
|
|
140
|
+
"WeightsRobustnessResult",
|
|
141
|
+
"MarkovTransitionsResult",
|
|
142
|
+
"SpatialMarkovResult",
|
|
143
|
+
"InequalityOverTimeResult",
|
|
144
|
+
"TheilDecompositionResult",
|
|
145
|
+
"GWRResult",
|
|
146
|
+
"MGWRResult",
|
|
147
|
+
]
|
geometrics/_common.py
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
"""Shared low-level helpers used across the analytical modules.
|
|
2
|
+
|
|
3
|
+
These are pure, dependency-light utilities (numeric-aware level sorting, a sample-size
|
|
4
|
+
default opacity, time-axis coercion, the standard error, and an x-axis layout builder) that
|
|
5
|
+
several feature modules need. Centralizing them here keeps the feature modules from
|
|
6
|
+
importing private helpers out of one another.
|
|
7
|
+
|
|
8
|
+
This module imports only :mod:`numpy` / :mod:`pandas`, so it can be imported anywhere without
|
|
9
|
+
risking a cycle.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import re
|
|
15
|
+
from math import log
|
|
16
|
+
|
|
17
|
+
import numpy as np
|
|
18
|
+
import pandas as pd
|
|
19
|
+
from pandas.api import types as pdt
|
|
20
|
+
|
|
21
|
+
__all__ = [
|
|
22
|
+
"sorted_levels",
|
|
23
|
+
"argsort_levels",
|
|
24
|
+
"default_alpha",
|
|
25
|
+
"try_convert_ts_id",
|
|
26
|
+
"se",
|
|
27
|
+
"xaxis",
|
|
28
|
+
"entity_display_map",
|
|
29
|
+
"entity_display_series",
|
|
30
|
+
"lead_columns",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
# Full date strings only (YYYY-MM-DD / YYYY/MM/DD) — bare-year strings like "2013" must fall
|
|
34
|
+
# through to numeric (R's ``as.Date("2013")`` fails).
|
|
35
|
+
_FULL_DATE = re.compile(r"^\d{4}[-/]\d{1,2}[-/]\d{1,2}")
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def sorted_levels(values: pd.Series) -> list[str]:
|
|
39
|
+
"""Return the distinct levels of ``values`` sorted numerically when possible.
|
|
40
|
+
|
|
41
|
+
Group labels like ``"2"`` and ``"10"`` must order as 2 < 10, not lexically.
|
|
42
|
+
"""
|
|
43
|
+
levels = list(dict.fromkeys(values.astype(str)))
|
|
44
|
+
num = pd.to_numeric(pd.Series(levels), errors="coerce")
|
|
45
|
+
if not num.isna().any():
|
|
46
|
+
return [lvl for _, lvl in sorted(zip(num, levels, strict=True))]
|
|
47
|
+
return sorted(levels)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def argsort_levels(index: pd.Index) -> np.ndarray:
    """Compute the stable, numeric-aware argsort of ``index``.

    The labels are stringified; if every one parses as a number the sort keys are the
    parsed numbers (``2`` sorts before ``10``), otherwise the strings themselves are
    the keys and the order is lexical.

    Parameters
    ----------
    index
        The index whose sort order is wanted.

    Returns
    -------
    numpy.ndarray
        Positions that would sort ``index`` (stable sort).
    """
    as_text = index.astype(str)
    as_numbers = pd.to_numeric(pd.Series(as_text), errors="coerce")
    if as_numbers.isna().any():
        sort_keys = as_text.to_numpy()
    else:
        sort_keys = as_numbers.to_numpy()
    order = np.argsort(sort_keys, kind="stable")
    return np.asarray(order)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def default_alpha(n: int) -> float:
    """Pick a default marker opacity from the sample size (ExPanDaR's formula).

    Parameters
    ----------
    n
        Number of observations to draw.

    Returns
    -------
    float
        ``1.0`` for ``n <= 0`` and for samples of up to 100 observations; above that
        the opacity decays as ``1 / (1 + log(n) - log(100))``.
    """
    if n <= 0:
        return 1.0
    # Only the log-distance *above* 100 observations reduces the opacity.
    excess = max(0.0, log(n) - log(100))
    return min(1.0, 1.0 / (1.0 + excess))
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def try_convert_ts_id(s: pd.Series) -> tuple[pd.Series, bool]:
    """Convert a time identifier into the friendliest type for axis ticks.

    The cascade mirrors ExPanDaR's ``try_convert_ts_id``: datetime and (non-boolean)
    numeric series are returned untouched; anything else is stringified and tried, in
    order, as full dates, as numbers, and finally as an ordered categorical.

    Parameters
    ----------
    s
        The time identifier column.

    Returns
    -------
    tuple of (pandas.Series, bool)
        The converted series and a flag that is ``True`` only when the result is an
        ordered categorical (i.e. the axis is discrete).
    """
    already_datetime = pdt.is_datetime64_any_dtype(s)
    plain_numeric = pdt.is_numeric_dtype(s) and not pdt.is_bool_dtype(s)
    if already_datetime or plain_numeric:
        return s, False

    # Everything else goes through the character values, as R does:
    # full date -> numeric -> ordered categorical.
    text = s.astype(str)
    if text.str.match(_FULL_DATE).all():
        try:
            parsed = pd.to_datetime(text)
        except (ValueError, TypeError):
            # Looked like dates but did not parse; continue down the cascade.
            pass
        else:
            return parsed, False

    numbers = pd.to_numeric(text, errors="coerce")
    if numbers.notna().all():
        return pd.Series(numbers.to_numpy(), index=s.index), False

    levels = sorted(s.dropna().astype(str).unique(), key=str)
    ordered_dtype = pd.CategoricalDtype(levels, ordered=True)
    return text.astype(ordered_dtype), True
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def se(s: pd.Series) -> float:
    """Compute the standard error of the mean of ``s``.

    Parameters
    ----------
    s
        The observations; missing values are excluded from the count.

    Returns
    -------
    float
        Sample standard deviation (``ddof=1``) divided by the square root of the number
        of non-missing values, or NaN when there are no non-missing values.
    """
    n_obs = int(s.notna().sum())
    if not n_obs:
        return np.nan
    spread = s.std(ddof=1)
    return float(spread / np.sqrt(n_obs))
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def xaxis(
|
|
108
|
+
time: str, ordered: bool, ts_values: pd.Series, title: str | None = None
|
|
109
|
+
) -> dict:
|
|
110
|
+
"""Build x-axis layout kwargs, fixing category order when discrete.
|
|
111
|
+
|
|
112
|
+
``title`` overrides the axis title (default: the bare ``time`` name).
|
|
113
|
+
"""
|
|
114
|
+
axis: dict = {"title": title if title is not None else time}
|
|
115
|
+
if ordered:
|
|
116
|
+
cats = [str(c) for c in ts_values.cat.categories]
|
|
117
|
+
axis.update(type="category", categoryorder="array", categoryarray=cats)
|
|
118
|
+
return axis
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _is_blank_name(value: object) -> bool:
    """Return ``True`` when ``value`` cannot serve as a display name.

    A value is blank when it is a missing-value marker (``None``, ``pd.NA``,
    ``pd.NaT`` or a floating-point NaN of any width) or when its string form is empty
    or whitespace only.

    Parameters
    ----------
    value
        A scalar taken from a name column.

    Returns
    -------
    bool
        ``True`` for missing or empty/whitespace values, ``False`` otherwise.
    """
    if value is None or value is pd.NA or value is pd.NaT:
        return True
    # ``np.floating`` covers NumPy scalars such as float32 that do not subclass the
    # builtin ``float``; without it their NaN would be stringified to "nan".
    if isinstance(value, (float, np.floating)) and np.isnan(value):
        return True
    return not str(value).strip()
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def entity_display_map(
    df: pd.DataFrame, entity: str, entity_name: str | None
) -> dict[str, str]:
    """Map each entity id (as ``str``) to a ``"Name (id)"`` display string.

    Used by panel figures/tables so a unit shows a readable label (e.g.
    ``"Bolivia (BOL)"``) instead of the bare id. Keys are ``str(id)`` so a lookup
    survives the id being re-typed along the way: look up with
    ``disp.get(str(u), str(u))``.

    The result is an identity map ``{str(id): str(id)}`` when ``entity_name`` is
    ``None``, equals ``entity`` (avoiding ``"X (X)"``), or is not a column of ``df``.
    Otherwise each id uses the first non-blank name found for it in row order; an id
    whose names are all blank or missing displays as the bare ``str(id)``.

    Parameters
    ----------
    df
        The frame holding the ``entity`` (and optionally ``entity_name``) columns.
    entity
        The entity (unit) id column.
    entity_name
        The human-readable name column, expected to be constant within each entity,
        or ``None``.

    Returns
    -------
    dict
        ``{str(id): display_string}`` for every distinct non-missing id in
        ``df[entity]``.
    """
    ids = df[entity].dropna().unique()
    if entity_name is None or entity_name == entity or entity_name not in df.columns:
        return {str(uid): str(uid) for uid in ids}

    # Reduce to the distinct (id, name) pairs, in first-seen order, so the Python loop
    # below runs over a handful of rows rather than the whole panel.
    pairs = df[[entity, entity_name]].dropna(subset=[entity]).drop_duplicates()
    names: dict = {}
    for uid, name in zip(pairs[entity], pairs[entity_name], strict=True):
        # Keep the first usable name; a blank on an entity's first row must not hide
        # a valid name that appears on one of its later rows.
        if uid not in names and not _is_blank_name(name):
            names[uid] = name

    return {
        str(uid): f"{names[uid]} ({uid})" if uid in names else str(uid)
        for uid in ids
    }
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def entity_display_series(
    df: pd.DataFrame, entity: str, entity_name: str | None
) -> pd.Series:
    """Build the per-row ``"Name (id)"`` display labels for ``df``.

    A row-wise wrapper around :func:`entity_display_map`: every value of
    ``df[entity]`` is stringified and looked up in the display map, falling back to the
    stringified id when it has no entry.

    Parameters
    ----------
    df
        The frame holding the ``entity`` (and optionally ``entity_name``) columns.
    entity
        The entity (unit) id column.
    entity_name
        The human-readable name column, or ``None``.

    Returns
    -------
    pandas.Series
        Display labels aligned to ``df.index``.
    """
    labels = entity_display_map(df, entity, entity_name)

    def _label_for(uid: object) -> str:
        key = str(uid)
        return labels.get(key, key)

    return df[entity].map(_label_for)
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def lead_columns(names: list[str], lead: list[str | None]) -> list[str]:
|
|
182
|
+
"""Reorder ``names`` so any of ``lead`` (in order, ignoring ``None``/absent) come first.
|
|
183
|
+
|
|
184
|
+
Stable for the remaining columns. Used to float the declared key variables (main outcome,
|
|
185
|
+
then covariates) to the front of a table or correlation matrix when roles are set; a no-op
|
|
186
|
+
when none of ``lead`` is present (so role-less data keeps its original column order).
|
|
187
|
+
"""
|
|
188
|
+
present = set(names)
|
|
189
|
+
front = list(dict.fromkeys(c for c in lead if c is not None and c in present))
|
|
190
|
+
if not front:
|
|
191
|
+
return list(names)
|
|
192
|
+
front_set = set(front)
|
|
193
|
+
return [*front, *[n for n in names if n not in front_set]]
|
geometrics/_data_dict.py
ADDED
|
@@ -0,0 +1,369 @@
|
|
|
1
|
+
"""Infer a data dictionary (``df_dict``) from a raw DataFrame.
|
|
2
|
+
|
|
3
|
+
The bundled datasets ship a *data dictionary* — a ``df_dict`` frame describing each column's
|
|
4
|
+
human-readable label and its role in the panel (``entity`` / ``time`` / ``factor`` /
|
|
5
|
+
``logical`` / ``numeric``). :func:`build_data_dict` produces a best-guess dictionary for *any*
|
|
6
|
+
DataFrame, so a user who brings only a data file still gets labelled figures and panel-aware
|
|
7
|
+
views — and, in the ``geometrics`` apps, an editable starting point.
|
|
8
|
+
|
|
9
|
+
The result is a plain frame with the same columns the loaders return
|
|
10
|
+
(``var_name`` / ``var_def`` / ``label`` / ``type`` / ``role`` / ``can_be_na``), consumable
|
|
11
|
+
directly by :func:`~geometrics.set_labels`::
|
|
12
|
+
|
|
13
|
+
df = gm.set_labels(df, gm.build_data_dict(df), set_panel=True)
|
|
14
|
+
|
|
15
|
+
The inference is deliberately conservative: column-name hints and dtypes pick the most likely
|
|
16
|
+
roles, but it is only a *guess* — pass ``entity=`` / ``time=`` to pin the panel ids, or edit
|
|
17
|
+
the returned frame.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import re
|
|
23
|
+
from collections.abc import Sequence
|
|
24
|
+
|
|
25
|
+
import pandas as pd
|
|
26
|
+
from pandas.api import types as pdt
|
|
27
|
+
|
|
28
|
+
from geometrics._validation import ensure_dataframe
|
|
29
|
+
|
|
30
|
+
__all__ = ["build_data_dict"]
|
|
31
|
+
|
|
32
|
+
#: The six columns of a data-dictionary frame, in order.
|
|
33
|
+
_COLUMNS = ["var_name", "var_def", "label", "type", "role", "can_be_na"]
|
|
34
|
+
|
|
35
|
+
#: Lower-cased name tokens that hint a column is a cross-sectional (unit) identifier.
|
|
36
|
+
_ENTITY_HINTS = {
    # generic identifiers
    "id", "ids", "code", "unit", "entity",
    # country codes and names
    "iso", "iso2", "iso3", "country", "countries", "nation",
    # sub-national units
    "region", "state", "province", "prov", "municipality", "muni",
    "department", "dept", "district",
    # firms and securities
    "firm", "company", "ticker", "gvkey", "permno", "cusip",
    # people and households
    "individual", "person", "household",
}
|
|
67
|
+
|
|
68
|
+
#: Lower-cased name tokens that hint a column holds a human-readable *entity name* (a label
|
|
69
|
+
#: for the unit, e.g. a country/province name), used to pick the entity-name column.
|
|
70
|
+
_NAME_TOKENS = {
    # explicit "this is a label" words
    "name", "names", "label", "title",
    # countries
    "country", "countries", "nation",
    # sub-national units
    "province", "prov", "region", "state", "district", "municipality", "muni",
    "department", "dept",
    # firms, people, households
    "firm", "company", "person", "household", "individual",
}
|
|
93
|
+
|
|
94
|
+
#: Lower-cased name tokens that hint a column is a *code/id* (the opposite of a readable name).
|
|
95
|
+
_CODE_TOKENS = {
    # generic identifiers
    "id", "ids", "code", "key", "num", "no", "gid",
    # country codes
    "iso", "iso2", "iso3",
    # security / firm identifiers
    "ticker", "gvkey", "permno", "cusip",
}
|
|
111
|
+
|
|
112
|
+
#: Lower-cased name tokens that hint a column is the time identifier.
|
|
113
|
+
_TIME_HINTS = {
    # years
    "year", "yr", "fyear",
    # dates and generic time
    "date", "datadate", "time", "period",
    # sub-annual frequencies and survey waves
    "quarter", "qtr", "month", "week", "wave",
}
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _tokens(name: object) -> set[str]:
    """Break a column name into its lower-cased word tokens.

    The name is stringified, stripped and lower-cased, then split on runs of
    whitespace, ``_``, ``-``, ``.`` and ``/``; empty pieces are discarded.
    """
    pieces = re.split(r"[\s_\-./]+", str(name).strip().lower())
    return {piece for piece in pieces if piece}
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _name_matches(name: object, hints: set[str]) -> bool:
    """Tell whether at least one word token of ``name`` appears in ``hints``."""
    return not _tokens(name).isdisjoint(hints)
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _humanize(name: object) -> str:
    """Derive a title-cased display label from a column name (``gdp_pc`` -> ``Gdp Pc``).

    Runs of whitespace, ``_``, ``-``, ``.`` and ``/`` become single spaces. If nothing
    is left after that (the name was only separators) the stringified name is returned
    unchanged.
    """
    raw = str(name)
    pretty = re.sub(r"[\s_\-./]+", " ", raw).strip().title()
    return pretty if pretty else raw
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def _looks_like_year(s: pd.Series) -> bool:
    """Decide whether ``s`` is an integer-valued column of plausible calendar years.

    The column qualifies when it has at least two distinct non-missing values, is of
    integer dtype (or float dtype holding only whole numbers), and every non-missing
    value lies in the closed range 1500-2200.
    """
    observed = s.dropna()
    if observed.empty or observed.nunique() < 2:
        return False

    if pdt.is_integer_dtype(s):
        whole_numbers = True
    elif pdt.is_float_dtype(s):
        # Floats count only when no value carries a fractional part.
        whole_numbers = bool((observed % 1 == 0).all())
    else:
        whole_numbers = False
    if not whole_numbers:
        return False

    not_too_early = (observed >= 1500).all()
    not_too_late = (observed <= 2200).all()
    return bool(not_too_early and not_too_late)
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def _value_type(s: pd.Series, factor_cutoff: int) -> str:
    """Label a non-id column as ``"logical"``, ``"factor"`` or ``"numeric"``.

    Boolean columns and columns with exactly two distinct non-missing values are
    ``"logical"``. Categorical and object columns are ``"factor"``. A numeric column is
    ``"factor"`` when it has more than one and at most ``factor_cutoff`` distinct
    values, otherwise ``"numeric"``. Any other dtype is ``"factor"``.
    """
    distinct = int(s.dropna().nunique())
    if pdt.is_bool_dtype(s) or distinct == 2:
        return "logical"

    categorical_like = isinstance(s.dtype, pd.CategoricalDtype) or pdt.is_object_dtype(s)
    few_levels = 1 < distinct <= factor_cutoff
    if not categorical_like and pdt.is_numeric_dtype(s) and not few_levels:
        return "numeric"
    return "factor"
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _detect_time(df: pd.DataFrame, cols: list[str]) -> str | None:
    """Pick the most likely time column of ``df``, or ``None`` if nothing qualifies.

    Columns are ranked in four tiers, and the first column (in ``cols`` order) of the
    first non-empty tier wins:

    1. name hints at time *and* the values are datetimes or look like years;
    2. name hints at time;
    3. datetime dtype;
    4. values look like calendar years.
    """
    by_name = [c for c in cols if _name_matches(c, _TIME_HINTS)]
    by_dtype = [c for c in cols if pdt.is_datetime64_any_dtype(df[c])]
    by_values = [c for c in cols if _looks_like_year(df[c])]
    name_and_content = [c for c in by_name if c in by_values or c in by_dtype]

    tiers = (name_and_content, by_name, by_dtype, by_values)
    return next((tier[0] for tier in tiers if tier), None)
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def _detect_entities(
    df: pd.DataFrame, cols: list[str], time_col: str | None
) -> list[str]:
    """Detect the cross-sectional identifier column(s), in column order, or ``[]``.

    Every non-time column whose name contains an entity hint and whose dtype is not
    float is returned. If there is no such column but a time column is known, the
    first column that could act as a key is returned instead: it must be text
    (object or string dtype), categorical or integer, have no missing values, and
    form a unique key together with ``time_col``.

    Parameters
    ----------
    df
        The frame being described.
    cols
        The column names to consider.
    time_col
        The time identifier column, or ``None`` when there is none.

    Returns
    -------
    list of str
        The detected entity column(s); empty when nothing qualifies.
    """
    candidates = [c for c in cols if c != time_col]
    hinted = [
        c
        for c in candidates
        if _name_matches(c, _ENTITY_HINTS) and not pdt.is_float_dtype(df[c])
    ]
    if hinted:
        return hinted
    if time_col is None:
        # Without a time id there is nothing to form a panel key with.
        return []

    # Fall back: a column that forms a key with the time id.
    for c in candidates:
        col = df[c]
        # String-dtype columns are accepted alongside object ones, matching the
        # text test used when looking for the entity-name column.
        keyable = (
            pdt.is_object_dtype(col)
            or pdt.is_string_dtype(col)
            or isinstance(col.dtype, pd.CategoricalDtype)
            or pdt.is_integer_dtype(col)
        )
        if keyable and col.notna().all() and not df.duplicated([c, time_col]).any():
            return [c]
    return []
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def _avg_len(s: pd.Series) -> float:
    """Mean string length of the non-missing values of ``s`` (``0.0`` when there are none).

    Used as a tiebreak when judging how name-like a column is.
    """
    text = s.dropna().astype(str)
    if not len(text):
        return 0.0
    return float(text.str.len().mean())
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def _name_likeness(name: object, s: pd.Series) -> int:
    """Score how name-like (as opposed to code-like) a column is; higher is more readable.

    The score starts at zero, gains 2 when the column name contains a name token,
    loses 2 when it contains a code token, and loses 1 more when the column has an
    integer dtype.
    """
    words = _tokens(name)
    score = 0
    if words & _NAME_TOKENS:
        score += 2
    if words & _CODE_TOKENS:
        score -= 2
    if pdt.is_integer_dtype(s):
        score -= 1
    return score
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def _detect_entity_name(
|
|
229
|
+
df: pd.DataFrame, entities: list[str], time_col: str | None
|
|
230
|
+
) -> str | None:
|
|
231
|
+
"""Detect a human-readable entity-name column, or ``None``.
|
|
232
|
+
|
|
233
|
+
A candidate is a text/categorical column that is **constant within** the primary entity id
|
|
234
|
+
and **~1:1** with it (one label per unit). The most name-like candidate wins, but only when
|
|
235
|
+
it is strictly more name-like than the entity id itself (so an id that is already a name —
|
|
236
|
+
e.g. ``country`` paired with ``iso`` — yields ``None`` rather than a backwards label).
|
|
237
|
+
"""
|
|
238
|
+
if not entities:
|
|
239
|
+
return None
|
|
240
|
+
primary = entities[0]
|
|
241
|
+
n_ent = int(df[primary].dropna().nunique())
|
|
242
|
+
if n_ent == 0:
|
|
243
|
+
return None
|
|
244
|
+
candidates: list[str] = []
|
|
245
|
+
for c in df.columns:
|
|
246
|
+
if c in (primary, time_col):
|
|
247
|
+
continue
|
|
248
|
+
s = df[c]
|
|
249
|
+
if pdt.is_numeric_dtype(s) or pdt.is_bool_dtype(s):
|
|
250
|
+
continue
|
|
251
|
+
if not (
|
|
252
|
+
pdt.is_object_dtype(s)
|
|
253
|
+
or isinstance(s.dtype, pd.CategoricalDtype)
|
|
254
|
+
or pdt.is_string_dtype(s)
|
|
255
|
+
):
|
|
256
|
+
continue
|
|
257
|
+
g = df[[primary, c]].dropna()
|
|
258
|
+
if g.empty or (g.groupby(primary)[c].nunique() > 1).any():
|
|
259
|
+
continue # not constant within the entity
|
|
260
|
+
if g[c].nunique() < 0.95 * n_ent:
|
|
261
|
+
continue # not ~1:1 with the entities
|
|
262
|
+
candidates.append(c)
|
|
263
|
+
if not candidates:
|
|
264
|
+
return None
|
|
265
|
+
key = lambda c: (_name_likeness(c, df[c]), _avg_len(df[c])) # noqa: E731
|
|
266
|
+
best = max(candidates, key=key)
|
|
267
|
+
if key(best) <= (_name_likeness(primary, df[primary]), _avg_len(df[primary])):
|
|
268
|
+
return None
|
|
269
|
+
return best
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def build_data_dict(
    df: pd.DataFrame,
    *,
    entity: str | Sequence[str] | None = None,
    time: str | None = None,
    factor_cutoff: int = 10,
) -> pd.DataFrame:
    """Infer a best-guess data dictionary (``df_dict``) for ``df``.

    One row is produced per column of ``df``, carrying an inferred ``type`` and a
    humanized ``label``, ready to hand to :func:`~geometrics.set_labels`. Types are
    guessed from column-name hints and dtypes:

    - ``entity``: the name hints at a unit id (``country`` / ``iso`` / ``id`` ...) or,
      failing that, the column uniquely keys the rows together with the time id;
    - ``time``: the name hints at time (``year`` / ``date`` ...), the dtype is
      datetime, or the values are integers in the calendar-year range;
    - ``logical``: boolean or two-valued;
    - ``factor``: categorical/object, or numeric with at most ``factor_cutoff``
      distinct values;
    - ``numeric``: everything else.

    A ``role`` is guessed only for the entity-name column: a text column that is
    constant within the entity and ~1:1 with it (e.g. a country name beside an ISO
    code) is tagged ``entity_name``; every other row gets an empty role. The analytical
    roles ``outcome`` / ``covariate`` are never guessed — set them in the dictionary or
    via :func:`~geometrics.set_roles`.

    Parameters
    ----------
    df
        The data frame to describe.
    entity
        Explicit entity (unit) identifier column name(s); when given they replace
        detection and must be columns of ``df``.
    time
        Explicit time identifier column name; when given it replaces detection and
        must be a column of ``df``.
    factor_cutoff
        Numeric columns with at most this many distinct values are typed ``factor``.

    Returns
    -------
    pandas.DataFrame
        A dictionary frame with columns ``var_name``, ``var_def``, ``label``, ``type``,
        ``role`` and ``can_be_na`` (one row per column of ``df``, in column order).

    Raises
    ------
    ValueError
        If an explicitly given ``entity`` or ``time`` column is not in ``df``.

    Examples
    --------
    Build a dictionary for any frame, then attach labels and declare the panel:

    ```python
    import pandas as pd

    import geometrics as gm

    df = pd.DataFrame(
        {
            "region": ["A", "A", "B", "B"],
            "year": [2000, 2001, 2000, 2001],
            "gdp_pc": [1.0, 1.1, 2.0, 2.1],
        }
    )
    ddict = gm.build_data_dict(df)
    df = gm.set_labels(df, ddict, set_panel=True)
    ddict.head()
    ```
    """
    df = ensure_dataframe(df)
    cols = list(df.columns)

    # Normalize ``entity`` to a list and validate every explicitly named column.
    if isinstance(entity, str):
        explicit_entities = [entity]
    else:
        explicit_entities = list(entity or [])
    requested = list(explicit_entities)
    if time is not None:
        requested.append(time)
    for col in requested:
        if col not in cols:
            raise ValueError(f"column {col!r} is not in df")

    # Explicit ids win; otherwise fall back to detection (time first, since entity
    # detection may need the time id to test for a unique key).
    time_col = _detect_time(df, cols) if time is None else time
    entities = explicit_entities or _detect_entities(df, cols, time_col)
    entity_set = set(entities)
    name_col = _detect_entity_name(df, entities, time_col)

    def _type_of(col: str) -> str:
        if col in entity_set:
            return "entity"
        if col == time_col:
            return "time"
        return _value_type(df[col], factor_cutoff)

    typed = [(col, _type_of(col), _humanize(col)) for col in cols]
    rows = [
        {
            "var_name": col,
            "var_def": label,
            "label": label,
            "type": typ,
            "role": "entity_name" if col == name_col else "",
            # Panel ids must be complete; every other column may hold missing values.
            "can_be_na": typ not in ("entity", "time"),
        }
        for col, typ, label in typed
    ]

    out = pd.DataFrame(rows, columns=_COLUMNS)
    out["can_be_na"] = out["can_be_na"].astype(bool)
    return out
|