geometrics 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. geometrics/__init__.py +147 -0
  2. geometrics/_common.py +193 -0
  3. geometrics/_data_dict.py +369 -0
  4. geometrics/_geo.py +641 -0
  5. geometrics/_impacts.py +580 -0
  6. geometrics/_labels.py +219 -0
  7. geometrics/_mapping.py +641 -0
  8. geometrics/_panel.py +179 -0
  9. geometrics/_roles.py +143 -0
  10. geometrics/_theme.py +410 -0
  11. geometrics/_types.py +939 -0
  12. geometrics/_validation.py +192 -0
  13. geometrics/clubs.py +1044 -0
  14. geometrics/convergence.py +1545 -0
  15. geometrics/data/__init__.py +262 -0
  16. geometrics/data/_registry.py +82 -0
  17. geometrics/data/india32_dict.csv +7 -0
  18. geometrics/data/india520_dict.csv +29 -0
  19. geometrics/dependence.py +926 -0
  20. geometrics/distribution_dynamics.py +858 -0
  21. geometrics/gwr.py +909 -0
  22. geometrics/maps.py +491 -0
  23. geometrics/pedagogy/__init__.py +32 -0
  24. geometrics/pedagogy/_format.py +88 -0
  25. geometrics/pedagogy/_interpret/__init__.py +63 -0
  26. geometrics/pedagogy/_interpret/_convergence.py +264 -0
  27. geometrics/pedagogy/_interpret/_dependence.py +207 -0
  28. geometrics/pedagogy/_interpret/_dynamics.py +211 -0
  29. geometrics/pedagogy/_interpret/_gwr.py +161 -0
  30. geometrics/pedagogy/_interpret/_inequality.py +226 -0
  31. geometrics/pedagogy/_interpret/_maps.py +70 -0
  32. geometrics/pedagogy/_interpret/_shared.py +16 -0
  33. geometrics/pedagogy/_interpret/_spacetime.py +201 -0
  34. geometrics/pedagogy/_interpret/_spatial_models.py +319 -0
  35. geometrics/pedagogy/_interpret/_weights.py +86 -0
  36. geometrics/pedagogy/_mixin.py +44 -0
  37. geometrics/pedagogy/_registry.py +124 -0
  38. geometrics/pedagogy/_text/__init__.py +25 -0
  39. geometrics/pedagogy/_text/convergence.py +201 -0
  40. geometrics/pedagogy/_text/correlation.py +77 -0
  41. geometrics/pedagogy/_text/dynamics.py +157 -0
  42. geometrics/pedagogy/_text/inequality.py +124 -0
  43. geometrics/pedagogy/_text/models.py +326 -0
  44. geometrics/pedagogy/_text/spatial.py +263 -0
  45. geometrics/py.typed +0 -0
  46. geometrics/regional_inequality.py +985 -0
  47. geometrics/spacetime.py +665 -0
  48. geometrics/spatial_models.py +1349 -0
  49. geometrics/weights.py +578 -0
  50. geometrics-0.1.0.dist-info/METADATA +159 -0
  51. geometrics-0.1.0.dist-info/RECORD +53 -0
  52. geometrics-0.1.0.dist-info/WHEEL +4 -0
  53. geometrics-0.1.0.dist-info/licenses/LICENSE +21 -0
geometrics/__init__.py ADDED
@@ -0,0 +1,147 @@
1
+ """geometrics: regional growth, convergence, and inequality on the PySAL stack.
2
+
3
+ geometrics wraps the standard analyses of the regional convergence literature —
4
+ exploratory spatial data analysis, β/σ/club convergence, spatial econometric models,
5
+ distribution dynamics, inequality decomposition, and local (GWR) models — into
6
+ illustrative, easy-to-apply functions built on libpysal, esda, giddy, inequality,
7
+ mapclassify, spreg, and mgwr.
8
+
9
+ Three inputs drive everything: a geometry with only the entity ID (``read_gdf``),
10
+ a long-form panel (``set_panel`` / ``set_labels``), and a data dictionary
11
+ (``df_dict``, inferable with ``build_data_dict``). Every public function returns a
12
+ frozen result dataclass with ``.df``, ``.fig`` and/or ``.gt``, plain-language
13
+ ``.interpret()``, and a concept ``.explain()``.
14
+ """
15
+
16
+ from geometrics import data
17
+ from geometrics._data_dict import build_data_dict
18
+ from geometrics._geo import read_gdf
19
+ from geometrics._labels import resolve_label, set_labels
20
+ from geometrics._panel import resolve_panel, set_panel
21
+ from geometrics._roles import set_roles
22
+ from geometrics._theme import get_palette, set_palette
23
+ from geometrics._types import (
24
+ BetaConvergenceResult,
25
+ ChoroplethMapResult,
26
+ ConnectivityMapResult,
27
+ ConvergenceClubsResult,
28
+ DistributionOverTimeResult,
29
+ GWRResult,
30
+ InequalityOverTimeResult,
31
+ LisaClusterMapResult,
32
+ MarkovTransitionsResult,
33
+ MGWRResult,
34
+ MoranOverTimeResult,
35
+ MoranPlotResult,
36
+ SigmaConvergenceResult,
37
+ SpacetimeHeatmapResult,
38
+ SpatialDiagnosticsResult,
39
+ SpatialMarkovResult,
40
+ SpatialModelResult,
41
+ TheilDecompositionResult,
42
+ WeightsRobustnessResult,
43
+ )
44
+ from geometrics.clubs import analyze_convergence_clubs
45
+ from geometrics.convergence import (
46
+ analyze_beta_convergence,
47
+ analyze_sigma_convergence,
48
+ growth_cross_section,
49
+ )
50
+ from geometrics.dependence import (
51
+ explore_lisa_cluster_map,
52
+ explore_moran_over_time,
53
+ explore_moran_plot,
54
+ )
55
+ from geometrics.distribution_dynamics import (
56
+ analyze_markov_transitions,
57
+ analyze_spatial_markov,
58
+ )
59
+ from geometrics.gwr import analyze_gwr, analyze_mgwr
60
+ from geometrics.maps import explore_choropleth_map
61
+ from geometrics.pedagogy import Explainer, explain, list_topics
62
+ from geometrics.regional_inequality import (
63
+ analyze_inequality_over_time,
64
+ analyze_theil_decomposition,
65
+ )
66
+ from geometrics.spacetime import (
67
+ explore_distribution_over_time,
68
+ explore_spacetime_heatmap,
69
+ )
70
+ from geometrics.spatial_models import (
71
+ analyze_spatial_diagnostics,
72
+ analyze_spatial_model,
73
+ analyze_spatial_model_by_weights,
74
+ )
75
+ from geometrics.weights import explore_connectivity_map, make_weights
76
+
77
+ __version__ = "0.1.0"
78
+
79
+ __all__ = [
80
+ # ===== EXPLORE =====
81
+ # maps
82
+ "explore_choropleth_map",
83
+ # spatial weights
84
+ "explore_connectivity_map",
85
+ # spatial dependence (ESDA)
86
+ "explore_moran_plot",
87
+ "explore_lisa_cluster_map",
88
+ "explore_moran_over_time",
89
+ # space-time dynamics
90
+ "explore_distribution_over_time",
91
+ "explore_spacetime_heatmap",
92
+ # ===== ANALYZE =====
93
+ # convergence
94
+ "analyze_beta_convergence",
95
+ "analyze_sigma_convergence",
96
+ "analyze_convergence_clubs",
97
+ # spatial econometric models (spreg)
98
+ "analyze_spatial_model",
99
+ "analyze_spatial_diagnostics",
100
+ "analyze_spatial_model_by_weights",
101
+ # distribution dynamics (giddy)
102
+ "analyze_markov_transitions",
103
+ "analyze_spatial_markov",
104
+ # regional inequality (PySAL inequality)
105
+ "analyze_inequality_over_time",
106
+ "analyze_theil_decomposition",
107
+ # local models (mgwr)
108
+ "analyze_gwr",
109
+ "analyze_mgwr",
110
+ # ===== UTILITIES =====
111
+ "read_gdf",
112
+ "make_weights",
113
+ "growth_cross_section",
114
+ "set_panel",
115
+ "resolve_panel",
116
+ "set_labels",
117
+ "resolve_label",
118
+ "set_roles",
119
+ "build_data_dict",
120
+ "set_palette",
121
+ "get_palette",
122
+ "explain",
123
+ "list_topics",
124
+ "Explainer",
125
+ # ===== DATA =====
126
+ "data",
127
+ # ===== RESULT TYPES =====
128
+ "ChoroplethMapResult",
129
+ "ConnectivityMapResult",
130
+ "MoranPlotResult",
131
+ "LisaClusterMapResult",
132
+ "MoranOverTimeResult",
133
+ "DistributionOverTimeResult",
134
+ "SpacetimeHeatmapResult",
135
+ "BetaConvergenceResult",
136
+ "SigmaConvergenceResult",
137
+ "ConvergenceClubsResult",
138
+ "SpatialModelResult",
139
+ "SpatialDiagnosticsResult",
140
+ "WeightsRobustnessResult",
141
+ "MarkovTransitionsResult",
142
+ "SpatialMarkovResult",
143
+ "InequalityOverTimeResult",
144
+ "TheilDecompositionResult",
145
+ "GWRResult",
146
+ "MGWRResult",
147
+ ]
geometrics/_common.py ADDED
@@ -0,0 +1,193 @@
1
+ """Shared low-level helpers used across the analytical modules.
2
+
3
+ These are pure, dependency-light utilities (numeric-aware level sorting, a sample-size
4
+ default opacity, time-axis coercion, the standard error, and an x-axis layout builder) that
5
+ several feature modules need. Centralizing them here keeps the feature modules from
6
+ importing private helpers out of one another.
7
+
8
+ This module imports only :mod:`numpy` / :mod:`pandas`, so it can be imported anywhere without
9
+ risking a cycle.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import re
15
+ from math import log
16
+
17
+ import numpy as np
18
+ import pandas as pd
19
+ from pandas.api import types as pdt
20
+
21
+ __all__ = [
22
+ "sorted_levels",
23
+ "argsort_levels",
24
+ "default_alpha",
25
+ "try_convert_ts_id",
26
+ "se",
27
+ "xaxis",
28
+ "entity_display_map",
29
+ "entity_display_series",
30
+ "lead_columns",
31
+ ]
32
+
33
+ # Full date strings only (YYYY-MM-DD / YYYY/MM/DD) — bare-year strings like "2013" must fall
34
+ # through to numeric (R's ``as.Date("2013")`` fails).
35
+ _FULL_DATE = re.compile(r"^\d{4}[-/]\d{1,2}[-/]\d{1,2}")
36
+
37
+
38
+ def sorted_levels(values: pd.Series) -> list[str]:
39
+ """Return the distinct levels of ``values`` sorted numerically when possible.
40
+
41
+ Group labels like ``"2"`` and ``"10"`` must order as 2 < 10, not lexically.
42
+ """
43
+ levels = list(dict.fromkeys(values.astype(str)))
44
+ num = pd.to_numeric(pd.Series(levels), errors="coerce")
45
+ if not num.isna().any():
46
+ return [lvl for _, lvl in sorted(zip(num, levels, strict=True))]
47
+ return sorted(levels)
48
+
49
+
50
def argsort_levels(index: pd.Index) -> np.ndarray:
    """Compute a stable, numeric-aware sort order for ``index``.

    The index is stringified; if every entry parses as a number the order follows the
    numeric values (``2`` before ``10``), otherwise it follows the strings lexically.

    Parameters
    ----------
    index
        The index to order.

    Returns
    -------
    numpy.ndarray
        Positions that would sort ``index`` (as returned by a stable ``argsort``).
    """
    as_text = index.astype(str)
    as_numbers = pd.to_numeric(pd.Series(as_text), errors="coerce")
    if as_numbers.isna().any():
        sort_keys = as_text.to_numpy()
    else:
        sort_keys = as_numbers.to_numpy()
    order = np.argsort(sort_keys, kind="stable")
    return np.asarray(order)
59
+
60
+
61
def default_alpha(n: int) -> float:
    """Pick a default marker opacity from the sample size (ExPanDaR's formula).

    Opacity is ``1.0`` up to 100 observations and then decays as
    ``1 / (1 + log(n) - log(100))``. A non-positive ``n`` also yields ``1.0``.

    Parameters
    ----------
    n
        Number of observations to be drawn.

    Returns
    -------
    float
        An opacity in ``(0, 1]``.
    """
    if n <= 0:
        return 1.0
    # Only sample sizes above 100 reduce the opacity.
    penalty = max(0.0, log(n) - log(100))
    opacity = 1.0 / (1.0 + penalty)
    return min(1.0, opacity)
66
+
67
+
68
def try_convert_ts_id(s: pd.Series) -> tuple[pd.Series, bool]:
    """Coerce a time identifier to a type that gives nicer axis ticks.

    The cascade mirrors ExPanDaR's ``try_convert_ts_id``:

    1. datetime and (non-boolean) numeric series are returned untouched;
    2. otherwise, if every stringified value starts with a full date
       (``YYYY-MM-DD`` / ``YYYY/MM/DD``) and parses, a datetime series is returned;
    3. otherwise, if every stringified value is numeric, a numeric series is returned;
    4. otherwise an ordered categorical over the sorted distinct strings is returned.

    Parameters
    ----------
    s
        The time identifier column.

    Returns
    -------
    tuple of (pandas.Series, bool)
        The converted series and whether it is an ordered categorical (discrete axis).
    """
    already_typed = pdt.is_datetime64_any_dtype(s) or (
        pdt.is_numeric_dtype(s) and not pdt.is_bool_dtype(s)
    )
    if already_typed:
        return s, False

    # Factor/categorical/object values go through the same cascade R applies to
    # character values: full date -> numeric -> ordered categorical.
    text = s.astype(str)
    if text.str.match(_FULL_DATE).all():
        try:
            parsed = pd.to_datetime(text)
        except (ValueError, TypeError):
            pass  # looked like dates but did not parse; try the next step
        else:
            return parsed, False

    numeric = pd.to_numeric(text, errors="coerce")
    if not numeric.isna().any():
        return pd.Series(numeric.to_numpy(), index=s.index), False

    levels = sorted(s.dropna().astype(str).unique(), key=str)
    ordered_dtype = pd.CategoricalDtype(levels, ordered=True)
    return s.astype(str).astype(ordered_dtype), True
97
+
98
+
99
def se(s: pd.Series) -> float:
    """Compute the standard error of the mean: ``sd / sqrt(n_non_missing)``.

    Parameters
    ----------
    s
        The sample; missing values are excluded from the count.

    Returns
    -------
    float
        The standard error, or NaN when there is no non-missing observation (a single
        observation also gives NaN because the ``ddof=1`` standard deviation is undefined).
    """
    n_obs = int(s.notna().sum())
    if not n_obs:
        return np.nan
    spread = s.std(ddof=1)
    return float(spread / np.sqrt(n_obs))
105
+
106
+
107
+ def xaxis(
108
+ time: str, ordered: bool, ts_values: pd.Series, title: str | None = None
109
+ ) -> dict:
110
+ """Build x-axis layout kwargs, fixing category order when discrete.
111
+
112
+ ``title`` overrides the axis title (default: the bare ``time`` name).
113
+ """
114
+ axis: dict = {"title": title if title is not None else time}
115
+ if ordered:
116
+ cats = [str(c) for c in ts_values.cat.categories]
117
+ axis.update(type="category", categoryorder="array", categoryarray=cats)
118
+ return axis
119
+
120
+
121
def _is_blank_name(value: object) -> bool:
    """Tell whether ``value`` is unusable as a display name.

    Blank means ``None``, ``pandas.NA``, a float NaN, or anything whose string form is
    empty or whitespace-only.
    """
    missing = (
        value is None
        or value is pd.NA
        or (isinstance(value, float) and np.isnan(value))
    )
    if missing:
        return True
    return str(value).strip() == ""
128
+
129
+
130
+ def entity_display_map(
131
+ df: pd.DataFrame, entity: str, entity_name: str | None
132
+ ) -> dict[str, str]:
133
+ """Map each entity id (as ``str``) to a ``"Name (id)"`` display string.
134
+
135
+ Used by panel figures/tables so a unit shows a readable label (e.g. ``"Bolivia (BOL)"``)
136
+ instead of the bare id. The mapping is keyed by ``str(id)`` so a lookup is robust to the id
137
+ being re-typed along the way (e.g. an int id stringified by a cross-section reshape): look
138
+ up with ``disp.get(str(u), str(u))``.
139
+
140
+ Falls back to an identity map ``{str(id): str(id)}`` when ``entity_name`` is ``None``, not a
141
+ column of ``df``, or equal to ``entity`` (no ``"X (X)"``); per id, when the name is blank or
142
+ missing the display is the bare ``str(id)``.
143
+
144
+ Parameters
145
+ ----------
146
+ df
147
+ The frame holding the ``entity`` (and optionally ``entity_name``) columns.
148
+ entity
149
+ The entity (unit) id column.
150
+ entity_name
151
+ The human-readable name column constant within each entity, or ``None``.
152
+
153
+ Returns
154
+ -------
155
+ dict
156
+ ``{str(id): display_string}`` for every distinct id in ``df[entity]``.
157
+ """
158
+ ids = df[entity].dropna().unique()
159
+ if entity_name is None or entity_name == entity or entity_name not in df.columns:
160
+ return {str(uid): str(uid) for uid in ids}
161
+ pairs = df[[entity, entity_name]].drop_duplicates(subset=[entity])
162
+ names = dict(zip(pairs[entity], pairs[entity_name], strict=True))
163
+ out: dict[str, str] = {}
164
+ for uid in ids:
165
+ name = names.get(uid)
166
+ out[str(uid)] = str(uid) if _is_blank_name(name) else f"{name} ({uid})"
167
+ return out
168
+
169
+
170
def entity_display_series(
    df: pd.DataFrame, entity: str, entity_name: str | None
) -> pd.Series:
    """Build per-row ``"Name (id)"`` display labels aligned to ``df.index``.

    A row-wise convenience over :func:`entity_display_map`: each value of ``df[entity]`` is
    looked up by its string form, and an id with no entry falls back to that string.

    Parameters
    ----------
    df
        The frame holding the ``entity`` (and optionally ``entity_name``) columns.
    entity
        The entity (unit) id column.
    entity_name
        The human-readable name column, or ``None``.

    Returns
    -------
    pandas.Series
        One display label per row of ``df``.
    """
    labels = entity_display_map(df, entity, entity_name)

    def _display(uid: object) -> str:
        key = str(uid)
        return labels.get(key, key)

    return df[entity].map(_display)
179
+
180
+
181
+ def lead_columns(names: list[str], lead: list[str | None]) -> list[str]:
182
+ """Reorder ``names`` so any of ``lead`` (in order, ignoring ``None``/absent) come first.
183
+
184
+ Stable for the remaining columns. Used to float the declared key variables (main outcome,
185
+ then covariates) to the front of a table or correlation matrix when roles are set; a no-op
186
+ when none of ``lead`` is present (so role-less data keeps its original column order).
187
+ """
188
+ present = set(names)
189
+ front = list(dict.fromkeys(c for c in lead if c is not None and c in present))
190
+ if not front:
191
+ return list(names)
192
+ front_set = set(front)
193
+ return [*front, *[n for n in names if n not in front_set]]
@@ -0,0 +1,369 @@
1
+ """Infer a data dictionary (``df_dict``) from a raw DataFrame.
2
+
3
+ The bundled datasets ship a *data dictionary* — a ``df_dict`` frame describing each column's
4
+ human-readable label and its role in the panel (``entity`` / ``time`` / ``factor`` /
5
+ ``logical`` / ``numeric``). :func:`build_data_dict` produces a best-guess dictionary for *any*
6
+ DataFrame, so a user who brings only a data file still gets labelled figures and panel-aware
7
+ views — and, in the ``geometrics`` apps, an editable starting point.
8
+
9
+ The result is a plain frame with the same columns the loaders return
10
+ (``var_name`` / ``var_def`` / ``label`` / ``type`` / ``role`` / ``can_be_na``), consumable
11
+ directly by :func:`~geometrics.set_labels`::
12
+
13
+ df = gm.set_labels(df, gm.build_data_dict(df), set_panel=True)
14
+
15
+ The inference is deliberately conservative: column-name hints and dtypes pick the most likely
16
+ roles, but it is only a *guess* — pass ``entity=`` / ``time=`` to pin the panel ids, or edit
17
+ the returned frame.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import re
23
+ from collections.abc import Sequence
24
+
25
+ import pandas as pd
26
+ from pandas.api import types as pdt
27
+
28
+ from geometrics._validation import ensure_dataframe
29
+
30
+ __all__ = ["build_data_dict"]
31
+
32
+ #: The six columns of a data-dictionary frame, in order.
33
+ _COLUMNS = ["var_name", "var_def", "label", "type", "role", "can_be_na"]
34
+
35
+ #: Lower-cased name tokens that hint a column is a cross-sectional (unit) identifier.
36
+ _ENTITY_HINTS = {
37
+ "id",
38
+ "ids",
39
+ "code",
40
+ "iso",
41
+ "iso2",
42
+ "iso3",
43
+ "country",
44
+ "countries",
45
+ "nation",
46
+ "firm",
47
+ "company",
48
+ "unit",
49
+ "entity",
50
+ "region",
51
+ "state",
52
+ "province",
53
+ "prov",
54
+ "municipality",
55
+ "muni",
56
+ "department",
57
+ "dept",
58
+ "district",
59
+ "ticker",
60
+ "gvkey",
61
+ "permno",
62
+ "cusip",
63
+ "individual",
64
+ "person",
65
+ "household",
66
+ }
67
+
68
+ #: Lower-cased name tokens that hint a column holds a human-readable *entity name* (a label
69
+ #: for the unit, e.g. a country/province name), used to pick the entity-name column.
70
+ _NAME_TOKENS = {
71
+ "name",
72
+ "names",
73
+ "country",
74
+ "countries",
75
+ "nation",
76
+ "province",
77
+ "prov",
78
+ "region",
79
+ "state",
80
+ "district",
81
+ "municipality",
82
+ "muni",
83
+ "department",
84
+ "dept",
85
+ "firm",
86
+ "company",
87
+ "person",
88
+ "household",
89
+ "individual",
90
+ "label",
91
+ "title",
92
+ }
93
+
94
+ #: Lower-cased name tokens that hint a column is a *code/id* (the opposite of a readable name).
95
+ _CODE_TOKENS = {
96
+ "id",
97
+ "ids",
98
+ "code",
99
+ "iso",
100
+ "iso2",
101
+ "iso3",
102
+ "ticker",
103
+ "gvkey",
104
+ "permno",
105
+ "cusip",
106
+ "key",
107
+ "num",
108
+ "no",
109
+ "gid",
110
+ }
111
+
112
+ #: Lower-cased name tokens that hint a column is the time identifier.
113
+ _TIME_HINTS = {
114
+ "year",
115
+ "yr",
116
+ "date",
117
+ "time",
118
+ "period",
119
+ "quarter",
120
+ "qtr",
121
+ "month",
122
+ "week",
123
+ "wave",
124
+ "fyear",
125
+ "datadate",
126
+ }
127
+
128
+
129
def _tokens(name: object) -> set[str]:
    """Break a column name into its set of lower-cased word tokens.

    The name is stringified, trimmed and lower-cased, then split on runs of whitespace,
    ``_``, ``-``, ``.`` and ``/``; empty pieces are discarded.
    """
    pieces = re.split(r"[\s_\-./]+", str(name).strip().lower())
    return {piece for piece in pieces if piece}
132
+
133
+
134
def _name_matches(name: object, hints: set[str]) -> bool:
    """Report whether at least one word token of ``name`` appears in ``hints``."""
    return not _tokens(name).isdisjoint(hints)
137
+
138
+
139
def _humanize(name: object) -> str:
    """Derive a title-cased display label from a column name (``gdp_pc`` -> ``Gdp Pc``).

    Runs of whitespace, ``_``, ``-``, ``.`` and ``/`` become single spaces. If nothing is
    left after trimming (the name was only separators), the stringified name is returned
    unchanged.
    """
    raw = str(name)
    spaced = re.sub(r"[\s_\-./]+", " ", raw)
    label = spaced.strip().title()
    return label if label else raw
142
+
143
+
144
def _looks_like_year(s: pd.Series) -> bool:
    """Judge whether a column's values look like calendar years.

    Requires at least two distinct non-missing values, an integer dtype (or a float dtype
    whose non-missing values are all whole numbers), and every non-missing value inside
    ``[1500, 2200]``.
    """
    observed = s.dropna()
    if observed.empty or observed.nunique() < 2:
        return False
    if pdt.is_integer_dtype(s):
        whole = True
    elif pdt.is_float_dtype(s):
        whole = bool((observed % 1 == 0).all())
    else:
        whole = False
    if not whole:
        return False
    in_range = (observed >= 1500) & (observed <= 2200)
    return bool(in_range.all())
155
+
156
+
157
def _value_type(s: pd.Series, factor_cutoff: int) -> str:
    """Classify a non-id column as ``logical`` / ``factor`` / ``numeric``.

    Rules, in order: a boolean dtype or exactly two distinct non-missing values is
    ``logical``; categorical, object and any other non-numeric dtype is ``factor``; a numeric
    column is ``factor`` when it has more than one and at most ``factor_cutoff`` distinct
    values, else ``numeric``.
    """
    distinct = int(s.dropna().nunique())
    if distinct == 2 or pdt.is_bool_dtype(s):
        return "logical"
    labelled = isinstance(s.dtype, pd.CategoricalDtype) or pdt.is_object_dtype(s)
    if labelled or not pdt.is_numeric_dtype(s):
        return "factor"
    # Numeric from here on: few distinct values read as categories.
    if 1 < distinct <= factor_cutoff:
        return "factor"
    return "numeric"
167
+
168
+
169
def _detect_time(df: pd.DataFrame, cols: list[str]) -> str | None:
    """Pick the most likely time column among ``cols``, or ``None``.

    Candidates are ranked in tiers and the first column of the first non-empty tier wins:
    name-hinted columns that also look like years or are datetimes, then any name-hinted
    column, then any datetime column, then any year-like column.
    """
    by_name = [c for c in cols if _name_matches(c, _TIME_HINTS)]
    by_dtype = [c for c in cols if pdt.is_datetime64_any_dtype(df[c])]
    by_values = [c for c in cols if _looks_like_year(df[c])]
    corroborated = [c for c in by_name if c in by_values or c in by_dtype]
    tiers = (corroborated, by_name, by_dtype, by_values)
    return next((tier[0] for tier in tiers if tier), None)
183
+
184
+
185
def _detect_entities(
    df: pd.DataFrame, cols: list[str], time_col: str | None
) -> list[str]:
    """Find the cross-sectional identifier column(s), in column order, or ``[]``.

    Every non-time column whose name carries an entity hint and whose dtype is not float is
    returned. Failing that — and only when a time column is known — the first object,
    categorical or integer column that has no missing values and forms a unique key together
    with ``time_col`` is returned on its own.
    """
    pool = [c for c in cols if c != time_col]
    by_name = [
        c
        for c in pool
        if _name_matches(c, _ENTITY_HINTS) and not pdt.is_float_dtype(df[c])
    ]
    if by_name:
        return by_name
    if time_col is None:
        return []
    # Fall back: a column that forms a key with the time id.
    for c in pool:
        column = df[c]
        key_like = (
            pdt.is_object_dtype(column)
            or isinstance(column.dtype, pd.CategoricalDtype)
            or pdt.is_integer_dtype(column)
        )
        if not key_like or column.isna().any():
            continue
        if not df.duplicated([c, time_col]).any():
            return [c]
    return []
211
+
212
+
213
def _avg_len(s: pd.Series) -> float:
    """Mean string length of a column's non-missing values (a name-likeness tiebreak).

    Returns ``0.0`` when the column has no non-missing value.
    """
    text = s.dropna().astype(str)
    if text.empty:
        return 0.0
    return float(text.str.len().mean())
217
+
218
+
219
def _name_likeness(name: object, s: pd.Series) -> int:
    """Score how *name-like* (vs *code-like*) a column is — higher means more readable.

    A name token in the column name adds 2, a code token subtracts 2, and an integer dtype
    subtracts 1.
    """
    words = _tokens(name)
    score = 0
    if words & _NAME_TOKENS:
        score += 2
    if words & _CODE_TOKENS:
        score -= 2
    if pdt.is_integer_dtype(s):
        score -= 1
    return score
226
+
227
+
228
def _detect_entity_name(
    df: pd.DataFrame, entities: list[str], time_col: str | None
) -> str | None:
    """Find a human-readable entity-name column, or ``None``.

    A candidate is a text/categorical column that is **constant within** the primary entity id
    and **~1:1** with it (at least 95% as many distinct labels as units). The most name-like
    candidate wins, but only when it is strictly more name-like than the entity id itself (so
    an id that is already a name — e.g. ``country`` paired with ``iso`` — yields ``None``
    rather than a backwards label).

    Parameters
    ----------
    df
        The frame to inspect.
    entities
        Detected or pinned entity id columns; only the first one is used as the reference.
    time_col
        The time column (never considered a candidate), or ``None``.

    Returns
    -------
    str or None
        The chosen name column, or ``None`` when no column qualifies.
    """
    if not entities:
        return None
    primary = entities[0]
    n_units = int(df[primary].dropna().nunique())
    if n_units == 0:
        return None

    def _rank(col: str) -> tuple[int, float]:
        # Name-likeness first, average text length as the tiebreak.
        return (_name_likeness(col, df[col]), _avg_len(df[col]))

    def _labels_units(col: str) -> bool:
        series = df[col]
        if pdt.is_numeric_dtype(series) or pdt.is_bool_dtype(series):
            return False
        textual = (
            pdt.is_object_dtype(series)
            or isinstance(series.dtype, pd.CategoricalDtype)
            or pdt.is_string_dtype(series)
        )
        if not textual:
            return False
        paired = df[[primary, col]].dropna()
        if paired.empty:
            return False
        if (paired.groupby(primary)[col].nunique() > 1).any():
            return False  # not constant within the entity
        # ~1:1 with the entities
        return bool(paired[col].nunique() >= 0.95 * n_units)

    candidates = [
        c for c in df.columns if c not in (primary, time_col) and _labels_units(c)
    ]
    if not candidates:
        return None
    best = max(candidates, key=_rank)
    return best if _rank(best) > _rank(primary) else None
270
+
271
+
272
def build_data_dict(
    df: pd.DataFrame,
    *,
    entity: str | Sequence[str] | None = None,
    time: str | None = None,
    factor_cutoff: int = 10,
) -> pd.DataFrame:
    """Infer a best-guess data dictionary (``df_dict``) for ``df``.

    Produces one row per column with an inferred ``type`` and a humanized ``label``, ready to
    pass to :func:`~geometrics.set_labels`. Column-name hints and dtypes drive the guess: a
    column is typed ``entity`` (name hints like ``country`` / ``iso`` / ``id``, or — failing
    that — the column that uniquely keys the rows together with the time id), ``time`` (name
    hints like ``year`` / ``date``, a datetime dtype, or an integer column in the calendar-year
    range), ``logical`` (boolean or two-valued), ``factor`` (categorical/object, or numeric
    with at most ``factor_cutoff`` distinct values), else ``numeric``.

    A best-guess ``role`` is also filled: a text column that is constant within the entity and
    ~1:1 with it (a readable label for the unit, e.g. a country name beside an ISO code) is
    tagged ``entity_name``; all other rows are left blank. The analytical roles ``outcome`` /
    ``covariate`` are never guessed — mark them yourself (in the dictionary or via
    :func:`~geometrics.set_roles`).

    Parameters
    ----------
    df
        The data frame to describe.
    entity
        Explicit entity (unit) identifier column name(s); when given, these win over
        detection (and are validated against ``df``). A pinned entity column is never
        auto-detected as the time id.
    time
        Explicit time identifier column name; when given, it wins over detection.
    factor_cutoff
        Numeric columns with at most this many distinct values are typed ``factor``.

    Returns
    -------
    pandas.DataFrame
        A dictionary frame with columns ``var_name``, ``var_def``, ``label``, ``type``,
        ``role`` and ``can_be_na`` (one row per column of ``df``, in column order).

    Raises
    ------
    ValueError
        If an explicit ``entity`` or ``time`` name is not a column of ``df``.

    Examples
    --------
    Build a dictionary for any frame, then attach labels + declare the panel in one step:

    ```python
    import pandas as pd

    import geometrics as gm

    df = pd.DataFrame(
        {
            "region": ["A", "A", "B", "B"],
            "year": [2000, 2001, 2000, 2001],
            "gdp_pc": [1.0, 1.1, 2.0, 2.1],
        }
    )
    ddict = gm.build_data_dict(df)
    df = gm.set_labels(df, ddict, set_panel=True)
    ddict.head()
    ```
    """
    df = ensure_dataframe(df)
    cols = list(df.columns)

    explicit_entities = [entity] if isinstance(entity, str) else list(entity or [])
    for col in (*explicit_entities, *([time] if time is not None else [])):
        if col not in cols:
            raise ValueError(f"column {col!r} is not in df")

    if time is not None:
        time_col = time
    else:
        # Columns the caller pinned as entity ids are not candidates for the time id.
        # Otherwise e.g. integer unit codes in the calendar-year range could be picked as
        # "time"; they are still typed ``entity`` below, so the real time column would
        # never be typed ``time``. Without pinned entities this is the full column list.
        pinned_entities = set(explicit_entities)
        time_col = _detect_time(df, [c for c in cols if c not in pinned_entities])
    entities = explicit_entities or _detect_entities(df, cols, time_col)
    entity_set = set(entities)
    name_col = _detect_entity_name(df, entities, time_col)

    rows = []
    for col in cols:
        if col in entity_set:
            typ = "entity"
        elif col == time_col:
            typ = "time"
        else:
            typ = _value_type(df[col], factor_cutoff)
        label = _humanize(col)
        rows.append(
            {
                "var_name": col,
                "var_def": label,
                "label": label,
                "type": typ,
                "role": "entity_name" if col == name_col else "",
                # Panel ids must be complete; every other column may hold missing values.
                "can_be_na": typ not in ("entity", "time"),
            }
        )

    out = pd.DataFrame(rows, columns=_COLUMNS)
    out["can_be_na"] = out["can_be_na"].astype(bool)
    return out