pyestat 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pyestat/__init__.py ADDED
@@ -0,0 +1,62 @@
1
+ """Python client for the e-Stat API.
2
+
3
+ The names re-exported here are pyestat's public surface. For the 0.x series
4
+ stability splits two ways:
5
+
6
+ * **settled** (stability promised) — the consumption path: :class:`EstatClient`
7
+ and its getters (``get_stats_data``, ``get_meta_info``, ``list_stats``,
8
+ ``iter_stats_data_pages``); the response objects :class:`StatsDataResponse`
9
+ (and its ``to_flat``), :class:`MetaInfoResponse`, :class:`StatsListResponse`,
10
+ :class:`Page`, :class:`ClassObj`; :class:`EstatHttpClient`,
11
+ :class:`ProgressEvent`; and the error hierarchy :class:`EstatError`,
12
+ :class:`EstatApiError`, :class:`HttpRetryExhaustedError`,
13
+ :class:`TooManyRowsError`, :class:`AmbiguousRuleError`.
14
+ * **evolving** (may change during 0.x) — the rule-authoring path:
15
+ :class:`RuleV2`, :func:`load_builtin_rules`, and the
16
+ :class:`RuleAuthoringError` category. The rule schema is not frozen yet.
17
+
18
+ The authoring *leaf* errors (``RoleResolutionError``, ``RuleExpansionError``,
19
+ ``UnknownTransformError``, ``TimeFormatError``) and the rule-file
20
+ ``RuleLoadError`` are intentionally not re-exported. Reach them through
21
+ ``pyestat._errors`` if you must, accepting that an underscore path carries no
22
+ stability promise; a coarse ``except EstatError`` catches them all regardless.
23
+ """
24
+ from pyestat._endpoint import (
25
+ ClassObj,
26
+ EstatClient,
27
+ MetaInfoResponse,
28
+ Page,
29
+ StatsDataResponse,
30
+ StatsListResponse,
31
+ )
32
+ from pyestat._engine.builtin import load_builtin_rules
33
+ from pyestat._engine.rule import RuleV2
34
+ from pyestat._errors import (
35
+ AmbiguousRuleError,
36
+ EstatApiError,
37
+ EstatError,
38
+ HttpRetryExhaustedError,
39
+ RuleAuthoringError,
40
+ TooManyRowsError,
41
+ )
42
+ from pyestat._http import EstatHttpClient, ProgressEvent
43
+
44
+
45
+ __all__ = [
46
+ "AmbiguousRuleError",
47
+ "ClassObj",
48
+ "EstatApiError",
49
+ "EstatClient",
50
+ "EstatError",
51
+ "EstatHttpClient",
52
+ "HttpRetryExhaustedError",
53
+ "MetaInfoResponse",
54
+ "Page",
55
+ "ProgressEvent",
56
+ "RuleAuthoringError",
57
+ "RuleV2",
58
+ "StatsDataResponse",
59
+ "StatsListResponse",
60
+ "TooManyRowsError",
61
+ "load_builtin_rules",
62
+ ]
pyestat/_endpoint.py ADDED
@@ -0,0 +1,480 @@
1
+ """Layer 2: Endpoint surface.
2
+
3
+ Maps Python kwargs to e-Stat query parameters, parses the JSON response
4
+ into typed dataclasses, raises :class:`EstatApiError` on
5
+ ``RESULT.STATUS != 0``, and walks ``NEXT_KEY`` pagination. Transport
6
+ mechanics (retry, timeout, ``appId`` injection) live in Layer 1.
7
+
8
+ Out of scope here: rule matching, label substitution, standard-code
9
+ normalization — those are Layer 3.
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import math
14
+ from collections.abc import Callable, Iterator, Mapping, Sequence
15
+ from dataclasses import dataclass
16
+ from pathlib import Path
17
+ from typing import TYPE_CHECKING, Any, Literal
18
+
19
+ from pyestat._http import EstatHttpClient, ProgressEvent
20
+ from pyestat._errors import EstatApiError, TooManyRowsError
21
+
22
+ if TYPE_CHECKING:
23
+ from pyestat._engine.rule import RuleV2
24
+
25
+
26
+ # --- response models -------------------------------------------------------
27
+
28
+
29
@dataclass(frozen=True)
class ClassObj:
    """A single classification axis taken from ``CLASS_INF.CLASS_OBJ``.

    The axis is stored as e-Stat delivered it. Each ``CLASS`` entry becomes
    one dict in ``classes`` whose keys (``@code``, ``@name``, ``@level``,
    ``@parentCode``, ``@unit`` and so on) have lost their ``@`` prefix.
    Names stay raw: normalization such as the axis classifier's NFKC
    folding is Layer 3's job, not this model's.
    """

    # Axis identifier (the ``@id`` attribute of the CLASS_OBJ).
    id: str
    # Axis display name (the ``@name`` attribute of the CLASS_OBJ).
    name: str
    # Flattened ``CLASS`` entries, one dict per member of the axis.
    classes: tuple[dict[str, Any], ...]
42
+
43
+
44
@dataclass(frozen=True)
class Page:
    """A single physical page returned by ``getStatsData``.

    Every page repeats the complete ``table_inf`` and ``class_objs``, so a
    consumer can process pages independently and never has to hold on to
    the first one. On the last page ``next_key`` is ``None``.
    """

    # 1-based position of this page within the pagination walk.
    page_number: int
    # The page's flattened ``VALUE`` rows.
    values: tuple[dict[str, Any], ...]
    # Start position of the following page, or None when this is the last.
    next_key: int | None
    # Row count reported for the whole table, when e-Stat supplied one.
    total_number: int | None
    # The response's ``TABLE_INF`` mapping.
    table_inf: dict[str, Any]
    # The axes parsed from the response's ``CLASS_INF``.
    class_objs: tuple[ClassObj, ...]
59
+
60
+
61
@dataclass(frozen=True)
class StatsDataResponse:
    """The aggregated outcome of :meth:`EstatClient.get_stats_data`.

    ``values`` holds the canonical *nested* rows. Every field is a
    self-describing object: a ``{code, label}`` dimension, a time cell
    (``{code, label, normalized, granularity}``), or a ``{value, unit}``
    measure. An agent can therefore read ``row["cat01"]["label"]`` with no
    suffix convention to learn. Callers who want one column per field use
    :meth:`to_flat`. A raw (``rule=None``) response carries Layer 2's flat
    rows instead, and :meth:`to_flat` returns those unchanged.

    The shape makes two contractual promises:

    * **Raw e-Stat strings, never coerced.** A measure's ``value`` and every
      ``code`` are exactly what e-Stat sent: numbers arrive as strings
      (``"1097352"``) and suppression markers (``"-"`` / ``"***"`` /
      ``"X"``) pass through verbatim. Casting belongs to the caller, because
      a guessed numeric type would corrupt those markers; a pandas user
      applies ``pd.to_numeric(..., errors="coerce")`` themselves. Only a
      time cell's ``normalized`` / ``granularity`` are derived
      (best-effort); the observation ``value`` is never touched.
    * **Row keys follow the mode.** With ``"auto"`` / ``"heuristic"`` the
      keys are e-Stat's own axis ids (``cat01``, ``area``, ``time`` …) plus
      ``value`` for the observation. Those ids are opaque and
      table-specific, so the *meaning* of an axis lives in
      :attr:`class_objs`, not in the key. Stable, semantic column names
      (``commodity``, ``month`` …) come only from a rule; a pivot names its
      folded columns after the meta-axis member name.
    """

    # Identifier of the table that was fetched.
    stats_data_id: str
    # Row count reported by e-Stat, when available.
    total_number: int | None
    # The table's ``TABLE_INF`` mapping.
    table_inf: dict[str, Any]
    # The table's axes.
    class_objs: tuple[ClassObj, ...]
    # Result rows: nested cells, or flat rows for a raw response.
    values: tuple[dict[str, Any], ...]

    def to_flat(self) -> tuple[dict[str, Any], ...]:
        """Return ``values`` projected onto the flat suffix convention.

        * a ``{code, label}`` dimension becomes ``K`` / ``K_label``;
        * a time cell becomes ``K`` (normalized) / ``K_code`` / ``K_label`` /
          ``K_granularity``;
        * a ``{value, unit}`` measure becomes ``K`` plus its unit. The lone
          observation column's unit takes the bare ``unit`` key, while a
          pivot measure gets a per-column ``K_unit``.

        The projection is lossless and idempotent: an already-flat
        (``rule=None``) row passes through untouched. For a DataFrame use
        ``pandas.DataFrame(resp.to_flat())``.

        Raises :class:`FlatProjectionError` when two of the rule's output
        columns land on the same flat key (for example a column ``unit``
        next to a ``value`` measure). The nested ``values`` are unaffected,
        so the remedy is to rename a column. A built-in rule that would
        collide degrades to raw output instead, so this only fires on a rule
        you authored.
        """
        # Imported at call time so Layer 2 does not depend on Layer 3 at
        # module load: the rule subsystem imports this module, not the
        # reverse.
        from pyestat._engine.canonical import to_flat_rows

        flat_rows = to_flat_rows(self.values)
        return flat_rows
119
+
120
+
121
@dataclass(frozen=True)
class MetaInfoResponse:
    """What :meth:`EstatClient.get_meta_info` returns: axis metadata only."""

    # Identifier of the table whose metadata was requested.
    stats_data_id: str
    # The table's ``TABLE_INF`` mapping.
    table_inf: dict[str, Any]
    # The table's axes, parsed from ``CLASS_INF``.
    class_objs: tuple[ClassObj, ...]
128
+
129
+
130
@dataclass(frozen=True)
class StatsListResponse:
    """What :meth:`EstatClient.list_stats` returns.

    ``tables`` deliberately stays a tuple of raw dicts. The ``TABLE_INF``
    schema drifts between statistics families, and modeling it would slow
    down keeping pyestat in step with the search API.
    """

    # Number of matching tables.
    total_number: int
    # The matching ``TABLE_INF`` entries, unmodeled.
    tables: tuple[dict[str, Any], ...]
141
+
142
+
143
+ # --- helpers ---------------------------------------------------------------
144
+
145
+
146
+ def _ensure_list(x: Any) -> list[Any]:
147
+ """Normalize e-Stat's "single value collapses to a bare dict" quirk.
148
+
149
+ The API inlines a one-element array as the underlying dict whenever
150
+ it can; downstream iteration over ``dict`` keys silently produces
151
+ the wrong result, so the fix-up is centralized here.
152
+ """
153
+ if x is None:
154
+ return []
155
+ if isinstance(x, list):
156
+ return x
157
+ return [x]
158
+
159
+
160
+ def _flatten(entry: Mapping[str, Any]) -> dict[str, Any]:
161
+ """Strip ``@`` prefixes and rename ``$`` to ``value``.
162
+
163
+ Layer 2's only structural rewrite; every other transformation
164
+ (label substitution, standard-code mapping, value casting) is
165
+ Layer 3's responsibility.
166
+ """
167
+ result: dict[str, Any] = {}
168
+ for key, val in entry.items():
169
+ if key.startswith("@"):
170
+ result[key[1:]] = val
171
+ elif key == "$":
172
+ result["value"] = val
173
+ else:
174
+ result[key] = val
175
+ return result
176
+
177
+
178
+ def _parse_class_objs(class_inf: Mapping[str, Any] | None) -> tuple[ClassObj, ...]:
179
+ if not class_inf:
180
+ return ()
181
+ result: list[ClassObj] = []
182
+ for obj in _ensure_list(class_inf.get("CLASS_OBJ")):
183
+ result.append(
184
+ ClassObj(
185
+ id=obj["@id"],
186
+ name=obj["@name"],
187
+ classes=tuple(_flatten(c) for c in _ensure_list(obj.get("CLASS"))),
188
+ )
189
+ )
190
+ return tuple(result)
191
+
192
+
193
+ def _check_status(result: Mapping[str, Any]) -> None:
194
+ status = result.get("STATUS", 0)
195
+ if status != 0:
196
+ raise EstatApiError(status=status, message=result.get("ERROR_MSG", ""))
197
+
198
+
199
+ # --- client ----------------------------------------------------------------
200
+
201
+
202
class EstatClient:
    """Synchronous, high-level client for the e-Stat API.

    The client is built around an injected :class:`EstatHttpClient` rather
    than raw transport configuration. Tests can hand in a mock transport
    without monkey-patching, and a future async / cached transport can be
    swapped in without touching this surface.

    The ``"auto"`` path resolves rules by role pattern through three layers,
    ``user > project > builtin``. A rule in a higher layer shadows a lower
    one matching the same pattern, while an unrelated rule leaves the lower
    layers free to fire on other tables.

    * ``user_rules`` — caller-defined v2 rules injected into the top layer.
    * ``project_rules_dir`` — a directory of ``*.yaml`` / ``*.yml`` rules
      auto-discovered into the middle layer. It is the escape hatch for
      tables no built-in covers: drop a rule file in the directory and it
      applies with no code change. Defaults to ``"pyestat_rules"`` (that is,
      ``./pyestat_rules`` relative to the working directory); pass another
      path to relocate it, or ``None`` / ``""`` to opt out. The
      pyestat-specific name keeps a plain client from silently adopting an
      unrelated directory's rules. An absent directory means "no project
      rules", not an error, so the common no-rules case never raises.
      Discovery is working-directory dependent, and a *malformed* file in
      the directory raises :class:`RuleLoadError` at construction (the
      caller authored it, so it surfaces — ARCHITECTURE.md).
    * ``builtin_rules`` — the library-bundled rules (the bottom layer),
      loaded from the package by default.
    """

    # Accepted ``aggregates`` values, kept here so ``get_stats_data`` can
    # reject a bad value before any network traffic.
    _AGGREGATE_MODES = ("include", "exclude", "only")

    def __init__(
        self,
        *,
        app_id: str | None = None,
        http: EstatHttpClient | None = None,
        builtin_rules: "Sequence[RuleV2] | None" = None,
        user_rules: "Sequence[RuleV2] | None" = None,
        project_rules_dir: "str | Path | None" = "pyestat_rules",
    ) -> None:
        """Create a client from a transport or an application id.

        Args:
            app_id: e-Stat application id, used to build a default
                :class:`EstatHttpClient` when ``http`` is not given.
            http: Ready-made transport; takes precedence over ``app_id``.
            builtin_rules: Replacement for the library-bundled rules; the
                bundled rules are loaded when ``None``.
            user_rules: Caller-defined rules for the top layer.
            project_rules_dir: Directory scanned for project rules; any
                falsy value disables the scan.

        Raises:
            ValueError: Neither ``app_id`` nor ``http`` was supplied.
        """
        if http is None:
            if app_id is None:
                raise ValueError("Either app_id or http is required")
            http = EstatHttpClient(app_id=app_id)
        self._http = http
        # Imported lazily to keep the import graph one-way: the rule
        # subsystem may depend on the endpoint module, but not the other
        # way around at module-import time.
        from pyestat._engine.builtin import load_builtin_rules
        from pyestat._engine.loader import YamlRuleLoader

        # All three layers hold v2 rules; the auto path resolves them by
        # role pattern (user > project > builtin). The project layer is
        # populated by scanning ``project_rules_dir``. Any falsy value
        # (``None`` / ``""``) opts out — the latter matters because
        # ``Path("")`` would otherwise collapse to the cwd and scan it.
        # ``load_dir`` returns [] for an absent directory, so a missing
        # default ``./pyestat_rules`` is a no-op; a malformed file present
        # in the directory raises RuleLoadError.
        self._user_rules: list[RuleV2] = (
            list(user_rules) if user_rules is not None else []
        )
        self._project_rules: list[RuleV2] = (
            YamlRuleLoader().load_dir(Path(project_rules_dir))
            if project_rules_dir
            else []
        )
        self._builtin_rules: list[RuleV2] = (
            list(builtin_rules) if builtin_rules is not None else load_builtin_rules()
        )

    # ----- getStatsData -----

    def get_stats_data(
        self,
        stats_data_id: str,
        *,
        rule: "RuleV2 | Literal['auto', 'heuristic'] | None" = "auto",
        aggregates: Literal["include", "exclude", "only"] = "include",
        max_rows: int | None = None,
        progress: Callable[[ProgressEvent], None] | None = None,
    ) -> StatsDataResponse:
        """Fetch one table, walking ``NEXT_KEY`` until all rows are pulled.

        Every transformed mode returns the canonical *nested* row shape:
        each axis is a ``{code, label}`` cell (``time`` adds ``normalized``
        / ``granularity``) and the observation is a ``{value, unit}``
        measure. Call :meth:`StatsDataResponse.to_flat` for the
        one-column-per-field flat shape (pandas). ``rule`` selects the
        transformation mode:

        * ``"auto"`` (default) — classify the table's axes, then resolve a
          rule through Layers C > B > A > D: a matching v2 rule
          (user/project, then built-in), else a generic rule built from the
          classified roles (Layer A), else the Layer D fallback when the
          table cannot be structured (a low-confidence axis, or a shape the
          generic rule declines). A rule you supplied that then fails to
          apply surfaces as a typed :class:`EstatError`; a library-provided
          rule degrades to Layer D instead (ARCHITECTURE.md).
        * ``"heuristic"`` — Layer D fallback. The axis classifier detects
          the ``time`` axis and normalizes it best-effort; every axis
          becomes a ``{code, label}`` cell. Raw codes are preserved (in each
          cell's ``code``), the cell value is never coerced, and an
          unrecognized time code keeps ``normalized == code`` — data is
          preserved, axes are not normalized to standard codes (that is out
          of scope here). Useful when you want predictable, lossless output
          regardless of which built-in rules ship.
        * ``None`` — raw mode. Returns Layer 2's untransformed flattened
          rows verbatim (flat scalars, not nested cells).
        * :class:`RuleV2` — apply this rule directly against the table's
          classification, bypassing the resolution chain.

        ``aggregates`` selects which rows of a hierarchical table you
        receive, independent of ``rule``. e-Stat marks a code hierarchy with
        ``@parentCode`` (総数 → 大分類 → 品目, 全国 → 都道府県); summing a
        measure across a total and its children double-counts. The filter
        runs on the raw rows before any rule, so every mode honors it:

        * ``"include"`` (default) — every row.
        * ``"exclude"`` — drop the aggregates, keeping only the leaves (the
          detail grain), so the result is safe to sum. With several
          hierarchical dimensions a row is kept only when it is a leaf on
          every one.
        * ``"only"`` — keep the aggregates (subtotals / totals), the exact
          complement of ``"exclude"``.

        Detection is per-response and ``category`` / ``area`` only: a code
        is an aggregate when a child of it is present in the fetched rows,
        so a table holding just a total is not filtered. A hierarchy e-Stat
        ships without ``@parentCode`` is invisible to this filter.

        When ``max_rows`` is set, a cheap ``cntGetFlg=Y`` probe runs first
        and the call raises :class:`TooManyRowsError` before any data page
        is downloaded if the table exceeds the cap.

        Raises:
            ValueError: ``aggregates`` is not one of the accepted values;
                raised before any request is sent.
            TooManyRowsError: ``max_rows`` is set and the table is larger.
            EstatApiError: e-Stat answered with a non-zero ``STATUS``.
        """
        # Fail fast: an invalid selector would otherwise only be rejected
        # after every page of the table had been downloaded.
        if aggregates not in self._AGGREGATE_MODES:
            raise ValueError(
                f"`aggregates` must be one of {self._AGGREGATE_MODES}, "
                f"got {aggregates!r}"
            )

        if max_rows is not None:
            payload = self._http.request(
                "/getStatsData",
                params={"statsDataId": stats_data_id, "cntGetFlg": "Y"},
            )
            root = payload["GET_STATS_DATA"]
            _check_status(root["RESULT"])
            # Coerced like NEXT_KEY so the comparison below cannot fail on
            # a count that arrives as a numeric string.
            total = int(root["STATISTICAL_DATA"]["RESULT_INF"]["TOTAL_NUMBER"])
            if total > max_rows:
                raise TooManyRowsError(
                    stats_data_id=stats_data_id, total=total, limit=max_rows
                )

        pages = list(self.iter_stats_data_pages(stats_data_id, progress=progress))
        # The iterator always yields at least one page, and every page
        # carries the full table metadata, so the first one is the source
        # for it.
        first = pages[0]
        values = tuple(v for p in pages for v in p.values)
        # Imported lazily so the (L3 → L2) dependency direction stays
        # one-way: the rule subsystem consumes ``ClassObj`` from this
        # module. The pipeline owns the classify → aggregate → resolve →
        # apply order and the Layer A–D routing; this method keeps only
        # HTTP, paging, and response typing.
        from pyestat._engine.pipeline import run_pipeline

        transformed = run_pipeline(
            values,
            first.class_objs,
            first.table_inf,
            stats_data_id,
            rule,
            aggregates,
            user_rules=self._user_rules,
            project_rules=self._project_rules,
            builtin_rules=self._builtin_rules,
        )
        return StatsDataResponse(
            stats_data_id=stats_data_id,
            total_number=first.total_number,
            table_inf=first.table_inf,
            class_objs=first.class_objs,
            values=transformed,
        )

    def iter_stats_data_pages(
        self,
        stats_data_id: str,
        *,
        progress: Callable[[ProgressEvent], None] | None = None,
    ) -> Iterator[Page]:
        """Yield each ``NEXT_KEY`` page one at a time.

        Lower-level than :meth:`get_stats_data`: callers can stream a
        3.8M-row table without materializing the whole list. ``progress``
        is fired *after* each page has been parsed, so a tqdm bridge sees
        the count reflect what was actually received.

        Raises:
            EstatApiError: A page came back with a non-zero ``STATUS``.
        """
        next_key: int | None = None
        page_number = 0
        rows_fetched = 0
        # Size of the first non-empty page; used to estimate the page count.
        page_size: int | None = None
        while True:
            page_number += 1
            params: dict[str, Any] = {"statsDataId": stats_data_id}
            if next_key is not None:
                params["startPosition"] = next_key
            payload = self._http.request("/getStatsData", params=params)
            page = self._parse_page(payload, page_number)
            rows_fetched += len(page.values)
            if page_size is None and page.values:
                page_size = len(page.values)
            if progress is not None:
                # The page count is only an estimate, and only available
                # once both the total and a page size are known.
                total_pages = (
                    math.ceil(page.total_number / page_size)
                    if page.total_number and page_size
                    else None
                )
                progress(
                    ProgressEvent(
                        page=page_number,
                        total_pages=total_pages,
                        rows_fetched=rows_fetched,
                        rows_total=page.total_number,
                    )
                )
            yield page
            if page.next_key is None:
                break
            next_key = page.next_key

    @staticmethod
    def _parse_page(payload: Mapping[str, Any], page_number: int) -> Page:
        """Convert one raw ``getStatsData`` payload into a :class:`Page`.

        Raises :class:`EstatApiError` when the payload's ``RESULT`` reports
        a non-zero ``STATUS``. ``NEXT_KEY`` and ``TOTAL_NUMBER`` are both
        coerced to ``int`` when present, so the page's counters match their
        declared types.
        """
        root = payload["GET_STATS_DATA"]
        _check_status(root["RESULT"])
        sd = root["STATISTICAL_DATA"]
        result_inf = sd.get("RESULT_INF", {})
        next_key_raw = result_inf.get("NEXT_KEY")
        total_raw = result_inf.get("TOTAL_NUMBER")
        return Page(
            page_number=page_number,
            values=tuple(
                _flatten(v) for v in _ensure_list(sd.get("DATA_INF", {}).get("VALUE"))
            ),
            next_key=int(next_key_raw) if next_key_raw is not None else None,
            total_number=int(total_raw) if total_raw is not None else None,
            table_inf=dict(sd.get("TABLE_INF", {})),
            class_objs=_parse_class_objs(sd.get("CLASS_INF")),
        )

    # ----- getMetaInfo -----

    def get_meta_info(self, stats_data_id: str) -> MetaInfoResponse:
        """Fetch axis metadata without downloading data.

        Lets a caller inspect a table's axes before committing to a
        potentially huge fetch.

        Raises:
            EstatApiError: e-Stat answered with a non-zero ``STATUS``.
        """
        payload = self._http.request(
            "/getMetaInfo", params={"statsDataId": stats_data_id}
        )
        root = payload["GET_META_INFO"]
        _check_status(root["RESULT"])
        metadata = root.get("METADATA_INF", {})
        return MetaInfoResponse(
            stats_data_id=stats_data_id,
            table_inf=dict(metadata.get("TABLE_INF", {})),
            class_objs=_parse_class_objs(metadata.get("CLASS_INF")),
        )

    # ----- getStatsList -----

    def list_stats(self, **params: Any) -> StatsListResponse:
        """Search the e-Stat catalog.

        Parameters are forwarded raw because the search API has many
        rarely-used knobs (``searchWord``, ``statsCode``, ``surveyYears``,
        ``openYears``, ``statsField``…); a Python-side enumeration would
        lag behind the published API without adding safety.

        ``total_number`` is the reported ``TOTAL_NUMBER`` as an ``int``, or
        the number of returned tables when e-Stat supplied no count.

        Raises:
            EstatApiError: e-Stat answered with a non-zero ``STATUS``.
        """
        payload = self._http.request("/getStatsList", params=params)
        root = payload["GET_STATS_LIST"]
        _check_status(root["RESULT"])
        dl = root.get("DATALIST_INF", {})
        result_inf = dl.get("RESULT_INF", {})
        tables = tuple(_ensure_list(dl.get("TABLE_INF")))
        total_raw = result_inf.get("TOTAL_NUMBER")
        return StatsListResponse(
            total_number=int(total_raw) if total_raw is not None else len(tables),
            tables=tables,
        )
@@ -0,0 +1,18 @@
1
+ """Layer 3 — the rule-driven transformation engine.
2
+
3
+ Submodules form a small DAG:
4
+
5
+ * :mod:`pyestat._engine.registry` — name → impl lookup primitive.
6
+ * :mod:`pyestat._engine.time` — built-in time parsers + ``best_effort``.
7
+ * :mod:`pyestat._engine.rule` — RuleV2 output-schema pydantic model.
8
+ * :mod:`pyestat._engine.loader` — YAML loader for the schema.
9
+ * :mod:`pyestat._engine.classifier` — axis classifier (role + confidence; Layer A).
10
+ * :mod:`pyestat._engine.role_defaults` — role-default registry + short-form expansion.
11
+ * :mod:`pyestat._engine.resolver` — v2 rule resolution (Layers C > B > A).
12
+ * :mod:`pyestat._engine.apply` — glue that runs the resolved rule over rows.
13
+ * :mod:`pyestat._engine.builtin` — loader for library-bundled rules.
14
+
15
+ Public symbols (``EstatClient``, ``RuleV2``, ``load_builtin_rules`` …)
16
+ are re-exported from :mod:`pyestat`. Direct ``pyestat._engine.X`` imports
17
+ are internal.
18
+ """
@@ -0,0 +1,131 @@
1
+ """Aggregate vs. detail row selection.
2
+
3
+ e-Stat encodes a code hierarchy with ``@parentCode``: a member that another
4
+ member names as its parent has children — it is an *aggregate* (a total or
5
+ subtotal: 総数, 大分類, 全国). A member with no children is a *leaf* — a
6
+ *detail* row. Summing a measure across a mix of aggregate and leaf rows
7
+ double-counts (食料 plus its 品目 plus 総数), so a caller filtering to leaves
8
+ (``"exclude"`` the aggregates) selects a single, self-consistent grain safe
9
+ to aggregate; filtering to aggregates (``"only"``) selects the rolled-up
10
+ figures.
11
+
12
+ Two deliberate choices, both deterministic:
13
+
14
+ * **Per-response, not absolute.** The parent links present in *this* table
15
+ decide. A table holding only a total (no children fetched) names no parent,
16
+ so nothing is an aggregate and nothing is dropped — there is no
17
+ double-counting with a single grain. The flip side is the contract's edge:
18
+ a hierarchy e-Stat ships *without* ``@parentCode`` (a flat 男女別 総数 / 男 /
19
+ 女) is invisible here and stays unfiltered.
20
+ * **Leaf on every dimension (AND).** Across several hierarchical axes
21
+ (建築主 × 用途) a row is detail only when it is a leaf on *all* of them — the
22
+ safe grain for the cross. ``"only"`` is the exact complement (an aggregate
23
+ on at least one axis), so the two selections partition the rows.
24
+
25
+ Only the dimension axes (``category`` / ``area``) range over the selection:
26
+ ``time`` granularity is the time normalizer's concern, a ``meta-axis`` hierarchy is the
27
+ pivot's to fold, and a ``value`` axis carries no code hierarchy. This
28
+ keeps the selection orthogonal to the conversion rule — it filters the raw
29
+ rows before any rule runs, so ``"auto"``, a built-in, a custom rule, and raw
30
+ mode all honor it uniformly.
31
+ """
32
+ from __future__ import annotations
33
+
34
+ from collections.abc import Mapping, Sequence
35
+ from typing import Any, Literal, get_args
36
+
37
+ from pyestat._endpoint import ClassObj
38
+ from pyestat._engine.classifier import AxisRole, TableClassification
39
+
40
+ AggregateSelection = Literal["include", "exclude", "only"]
41
+
42
+ # The roles whose code hierarchy this selection ranges over. time is
43
+ # granularity, a meta-axis is the pivot's domain, value carries no
44
+ # codes — so the dimension roles are the only ones a parent/leaf split applies
45
+ # to.
46
+ _DIMENSION_ROLES = frozenset({AxisRole.CATEGORY, AxisRole.AREA})
47
+
48
+
49
+ def _aggregate_codes(axis: ClassObj, present: set[Any]) -> set[str]:
50
+ """The codes on ``axis`` that have a child *present in the fetched rows* —
51
+ the aggregates whose presence alongside their children would double-count.
52
+
53
+ Data-driven on purpose (see the module docstring): a parent is an aggregate
54
+ only when one of its children is actually in ``present``. A total fetched
55
+ on its own names no present child, so it is a leaf here and is kept. The
56
+ child itself need not have its own parent present — 食料 is still a subtotal
57
+ over the 品目 below it even if 総数 was not fetched.
58
+ """
59
+ parent_of = {
60
+ str(c["code"]): str(c["parentCode"])
61
+ for c in axis.classes
62
+ if "code" in c and c.get("parentCode") not in (None, "")
63
+ }
64
+ return {
65
+ parent_of[str(code)]
66
+ for code in present
67
+ if code is not None and str(code) in parent_of
68
+ }
69
+
70
+
71
def select_rows(
    values: Sequence[Mapping[str, Any]],
    classification: TableClassification,
    class_objs: Sequence[ClassObj],
    selection: AggregateSelection,
) -> tuple[dict[str, Any], ...]:
    """Pick detail rows, aggregate rows, or every row out of ``values``.

    * ``"include"`` — all rows, untouched (the default; backward compatible).
    * ``"exclude"`` — aggregates removed: a row survives only if it is a
      leaf on every hierarchical dimension axis.
    * ``"only"`` — aggregates kept: exactly the rows ``"exclude"`` drops.

    Only axes classified as ``category`` / ``area`` are inspected, and an
    axis contributes only when :func:`_aggregate_codes` reports at least one
    aggregate code for it. If no axis does, ``"exclude"`` returns every row
    and ``"only"`` returns none. Input order is preserved and the returned
    tuple holds the original row objects; this is a pure filter.

    Raises:
        ValueError: ``selection`` is not an accepted value.
    """
    allowed = get_args(AggregateSelection)
    if selection not in allowed:
        raise ValueError(
            f"`aggregates` must be one of {allowed}, got {selection!r}"
        )
    if selection == "include":
        return tuple(values)

    dimension_axes = {
        axis.axis_id for axis in classification.axes if axis.role in _DIMENSION_ROLES
    }

    # Map each dimension axis to its aggregate codes. An axis for which none
    # are found (flat, or only leaves fetched) is left out and therefore
    # constrains nothing.
    parents_by_axis: dict[str, set[str]] = {}
    for obj in class_objs:
        if obj.id in dimension_axes:
            codes_in_rows = {row.get(obj.id) for row in values}
            found = _aggregate_codes(obj, codes_in_rows)
            if found:
                parents_by_axis[obj.id] = found

    want_detail = selection == "exclude"
    if not parents_by_axis:
        # No aggregate anywhere: every row is detail.
        return tuple(values) if want_detail else ()

    return tuple(
        row for row in values if _is_detail(row, parents_by_axis) == want_detail
    )
122
+
123
+
124
+ def _is_detail(row: Mapping[str, Any], parents_by_axis: Mapping[str, set[str]]) -> bool:
125
+ """True when ``row`` is a leaf on *every* hierarchical dimension axis — the
126
+ pure-detail grain. A row that is an aggregate on any one axis is not
127
+ detail."""
128
+ return all(
129
+ str(row.get(axis_id)) not in parents
130
+ for axis_id, parents in parents_by_axis.items()
131
+ )