pyestat 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyestat/__init__.py +62 -0
- pyestat/_endpoint.py +480 -0
- pyestat/_engine/__init__.py +18 -0
- pyestat/_engine/aggregate.py +131 -0
- pyestat/_engine/apply.py +885 -0
- pyestat/_engine/builtin.py +41 -0
- pyestat/_engine/canonical.py +140 -0
- pyestat/_engine/classifier.py +356 -0
- pyestat/_engine/loader.py +88 -0
- pyestat/_engine/pipeline.py +98 -0
- pyestat/_engine/registry.py +57 -0
- pyestat/_engine/resolver.py +129 -0
- pyestat/_engine/role_defaults.py +346 -0
- pyestat/_engine/rule.py +272 -0
- pyestat/_engine/time.py +180 -0
- pyestat/_errors.py +260 -0
- pyestat/_http.py +145 -0
- pyestat/py.typed +0 -0
- pyestat/rules/__init__.py +7 -0
- pyestat/rules/builtin/__init__.py +14 -0
- pyestat/rules/builtin/foreign_trade.yaml +60 -0
- pyestat/rules/builtin/foreign_trade_customs.yaml +49 -0
- pyestat-0.1.0.dist-info/METADATA +283 -0
- pyestat-0.1.0.dist-info/RECORD +26 -0
- pyestat-0.1.0.dist-info/WHEEL +4 -0
- pyestat-0.1.0.dist-info/licenses/LICENSE +21 -0
pyestat/__init__.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""Python client for the e-Stat API.
|
|
2
|
+
|
|
3
|
+
The names re-exported here are pyestat's public surface. For the 0.x series
|
|
4
|
+
stability splits two ways:
|
|
5
|
+
|
|
6
|
+
* **settled** (stability promised) — the consumption path: :class:`EstatClient`
|
|
7
|
+
and its getters (``get_stats_data``, ``get_meta_info``, ``list_stats``,
|
|
8
|
+
``iter_stats_data_pages``); the response objects :class:`StatsDataResponse`
|
|
9
|
+
(and its ``to_flat``), :class:`MetaInfoResponse`, :class:`StatsListResponse`,
|
|
10
|
+
:class:`Page`, :class:`ClassObj`; :class:`EstatHttpClient`,
|
|
11
|
+
:class:`ProgressEvent`; and the error hierarchy :class:`EstatError`,
|
|
12
|
+
:class:`EstatApiError`, :class:`HttpRetryExhaustedError`,
|
|
13
|
+
:class:`TooManyRowsError`, :class:`AmbiguousRuleError`.
|
|
14
|
+
* **evolving** (may change during 0.x) — the rule-authoring path:
|
|
15
|
+
:class:`RuleV2`, :func:`load_builtin_rules`, and the
|
|
16
|
+
:class:`RuleAuthoringError` category. The rule schema is not frozen yet.
|
|
17
|
+
|
|
18
|
+
The authoring *leaf* errors (``RoleResolutionError``, ``RuleExpansionError``,
|
|
19
|
+
``UnknownTransformError``, ``TimeFormatError``) and the rule-file
|
|
20
|
+
``RuleLoadError`` are intentionally not re-exported. Reach them through
|
|
21
|
+
``pyestat._errors`` if you must, accepting that an underscore path carries no
|
|
22
|
+
stability promise; a coarse ``except EstatError`` catches them all regardless.
|
|
23
|
+
"""
|
|
24
|
+
from pyestat._endpoint import (
|
|
25
|
+
ClassObj,
|
|
26
|
+
EstatClient,
|
|
27
|
+
MetaInfoResponse,
|
|
28
|
+
Page,
|
|
29
|
+
StatsDataResponse,
|
|
30
|
+
StatsListResponse,
|
|
31
|
+
)
|
|
32
|
+
from pyestat._engine.builtin import load_builtin_rules
|
|
33
|
+
from pyestat._engine.rule import RuleV2
|
|
34
|
+
from pyestat._errors import (
|
|
35
|
+
AmbiguousRuleError,
|
|
36
|
+
EstatApiError,
|
|
37
|
+
EstatError,
|
|
38
|
+
HttpRetryExhaustedError,
|
|
39
|
+
RuleAuthoringError,
|
|
40
|
+
TooManyRowsError,
|
|
41
|
+
)
|
|
42
|
+
from pyestat._http import EstatHttpClient, ProgressEvent
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
__all__ = [
|
|
46
|
+
"AmbiguousRuleError",
|
|
47
|
+
"ClassObj",
|
|
48
|
+
"EstatApiError",
|
|
49
|
+
"EstatClient",
|
|
50
|
+
"EstatError",
|
|
51
|
+
"EstatHttpClient",
|
|
52
|
+
"HttpRetryExhaustedError",
|
|
53
|
+
"MetaInfoResponse",
|
|
54
|
+
"Page",
|
|
55
|
+
"ProgressEvent",
|
|
56
|
+
"RuleAuthoringError",
|
|
57
|
+
"RuleV2",
|
|
58
|
+
"StatsDataResponse",
|
|
59
|
+
"StatsListResponse",
|
|
60
|
+
"TooManyRowsError",
|
|
61
|
+
"load_builtin_rules",
|
|
62
|
+
]
|
pyestat/_endpoint.py
ADDED
|
@@ -0,0 +1,480 @@
|
|
|
1
|
+
"""Layer 2: Endpoint surface.
|
|
2
|
+
|
|
3
|
+
Maps Python kwargs to e-Stat query parameters, parses the JSON response
|
|
4
|
+
into typed dataclasses, raises :class:`EstatApiError` on
|
|
5
|
+
``RESULT.STATUS != 0``, and walks ``NEXT_KEY`` pagination. Transport
|
|
6
|
+
mechanics (retry, timeout, ``appId`` injection) live in Layer 1.
|
|
7
|
+
|
|
8
|
+
Out of scope here: rule matching, label substitution, standard-code
|
|
9
|
+
normalization — those are Layer 3.
|
|
10
|
+
"""
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import math
|
|
14
|
+
from collections.abc import Callable, Iterator, Mapping, Sequence
|
|
15
|
+
from dataclasses import dataclass
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import TYPE_CHECKING, Any, Literal
|
|
18
|
+
|
|
19
|
+
from pyestat._http import EstatHttpClient, ProgressEvent
|
|
20
|
+
from pyestat._errors import EstatApiError, TooManyRowsError
|
|
21
|
+
|
|
22
|
+
if TYPE_CHECKING:
|
|
23
|
+
from pyestat._engine.rule import RuleV2
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# --- response models -------------------------------------------------------
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass(frozen=True)
class ClassObj:
    """A single classification axis taken from ``CLASS_INF.CLASS_OBJ``.

    Layer 2 keeps the axis as e-Stat sent it; the only rewrite is that each
    ``CLASS`` entry has the ``@`` prefix removed from its keys (``@code`` →
    ``code``, ``@name`` → ``name``, likewise ``@level``, ``@parentCode``,
    ``@unit`` …). Names stay raw: any normalization, such as the axis
    classifier's NFKC folding, is Layer 3's job.
    """

    # Axis identifier, read from the entry's ``@id`` attribute.
    id: str
    # Axis display name, read from the entry's ``@name`` attribute (raw).
    name: str
    # The axis members: one flattened ``CLASS`` dict per member.
    classes: tuple[dict[str, Any], ...]
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@dataclass(frozen=True)
class Page:
    """A single physical page returned by ``getStatsData``.

    Every page repeats the complete ``table_inf`` and ``class_objs``, so a
    consumer can process any page on its own without holding on to the first
    one. The last page of a walk is the one whose ``next_key`` is ``None``.
    """

    # 1-based position of this page within the pagination walk.
    page_number: int
    # The page's rows, each a flattened ``VALUE`` entry.
    values: tuple[dict[str, Any], ...]
    # Start position of the following page; ``None`` on the final page.
    next_key: int | None
    # ``RESULT_INF.TOTAL_NUMBER`` as reported with this page, if present.
    total_number: int | None
    # The table's ``TABLE_INF`` block, copied into a plain dict.
    table_inf: dict[str, Any]
    # The table's axes parsed from ``CLASS_INF``.
    class_objs: tuple[ClassObj, ...]
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@dataclass(frozen=True)
class StatsDataResponse:
    """The aggregated outcome of :meth:`EstatClient.get_stats_data`.

    ``values`` holds the canonical *nested* rows: every field describes
    itself. A dimension is ``{code, label}``, a time cell is
    ``{code, label, normalized, granularity}`` and a measure is
    ``{value, unit}``, so ``row["cat01"]["label"]`` works with no suffix
    convention. Callers who want one column per field use :meth:`to_flat`.
    In raw mode (``rule=None``) the rows are Layer 2's flat rows, which
    :meth:`to_flat` returns unchanged.

    The shape carries two guarantees:

    * **Raw e-Stat strings, no coercion.** Each ``code`` and each measure
      ``value`` is exactly what e-Stat sent: numbers are strings
      (``"1097352"``) and suppression markers (``"-"`` / ``"***"`` /
      ``"X"``) are passed through verbatim. Casting is left to the caller,
      because guessing a numeric type would corrupt the markers; a pandas
      user runs ``pd.to_numeric(..., errors="coerce")``. The only derived
      parts are a time cell's best-effort ``normalized`` / ``granularity``;
      the observation ``value`` is never modified.
    * **Mode-dependent row keys.** With ``"auto"`` / ``"heuristic"`` the
      keys are e-Stat's own axis ids (``cat01``, ``area``, ``time`` …) plus
      ``value`` for the observation. They are opaque and table-specific, so
      the meaning of an axis is found in :attr:`class_objs`, not in the key.
      Stable semantic column names (``commodity``, ``month`` …) exist only
      when a rule provides them; a pivot names its folded columns after the
      meta-axis member name.
    """

    # Identifier of the table that was fetched.
    stats_data_id: str
    # Row count reported by e-Stat, when available.
    total_number: int | None
    # The table's ``TABLE_INF`` block.
    table_inf: dict[str, Any]
    # The table's axes; this is where an axis id's meaning is looked up.
    class_objs: tuple[ClassObj, ...]
    # The rows: nested cells, or flat rows when fetched with ``rule=None``.
    values: tuple[dict[str, Any], ...]

    def to_flat(self) -> tuple[dict[str, Any], ...]:
        """Return ``values`` projected onto the flat suffix convention.

        * a ``{code, label}`` dimension becomes ``K`` / ``K_label``;
        * a time cell becomes ``K`` (the normalized value) / ``K_code`` /
          ``K_label`` / ``K_granularity``;
        * a ``{value, unit}`` measure becomes ``K`` plus its unit: the lone
          observation column uses the bare ``unit`` key, a pivot measure a
          per-column ``K_unit``.

        The projection is lossless and idempotent: a row that is already
        flat (``rule=None``) comes back untouched. To build a DataFrame use
        ``pandas.DataFrame(resp.to_flat())``.

        Raises :class:`FlatProjectionError` when two output columns of the
        rule land on the same flat key (for example a column named ``unit``
        next to a ``value`` measure). The nested ``values`` are not affected,
        so the remedy is to rename a column. A built-in rule that would
        collide degrades to raw output instead, which means this error is
        only seen with a rule you wrote yourself.
        """
        # Deferred import: the rule subsystem imports this module, so pulling
        # it in at module-load time would invert the L2 → L3 dependency.
        from pyestat._engine.canonical import to_flat_rows

        flat_rows = to_flat_rows(self.values)
        return flat_rows
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
@dataclass(frozen=True)
class MetaInfoResponse:
    """What :meth:`EstatClient.get_meta_info` returns: table metadata only."""

    # Identifier of the table the metadata belongs to.
    stats_data_id: str
    # The table's ``TABLE_INF`` block, copied into a plain dict.
    table_inf: dict[str, Any]
    # The table's axes parsed from ``CLASS_INF``.
    class_objs: tuple[ClassObj, ...]
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
@dataclass(frozen=True)
class StatsListResponse:
    """What :meth:`EstatClient.list_stats` returns.

    ``tables`` deliberately stays a tuple of raw dicts. The ``TABLE_INF``
    schema drifts between statistics families, and modeling it would slow
    down keeping pyestat in step with the search API.
    """

    # Number of matching tables.
    total_number: int
    # The raw ``TABLE_INF`` entries, one dict per table.
    tables: tuple[dict[str, Any], ...]
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
# --- helpers ---------------------------------------------------------------
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def _ensure_list(x: Any) -> list[Any]:
    """Undo e-Stat's habit of collapsing a one-element array to a bare value.

    Whenever it can, the API inlines a single-element array as the element
    itself. Iterating that bare ``dict`` would walk its keys and silently
    yield the wrong thing, so every caller funnels through this one fix-up.

    Returns ``x`` itself when it already is a list, an empty list for
    ``None``, and a one-element list wrapping ``x`` otherwise.
    """
    if isinstance(x, list):
        return x
    return [] if x is None else [x]
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def _flatten(entry: Mapping[str, Any]) -> dict[str, Any]:
    """Return a copy of ``entry`` with ``@`` prefixes dropped and ``$`` → ``value``.

    This is the only structural rewrite Layer 2 performs. Label substitution,
    standard-code mapping and value casting are all left to Layer 3.
    """

    def _plain_key(raw_key: str) -> str:
        # "$" carries the element text; "@x" is an attribute named "x".
        if raw_key == "$":
            return "value"
        return raw_key[1:] if raw_key.startswith("@") else raw_key

    return {_plain_key(raw_key): payload for raw_key, payload in entry.items()}
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def _parse_class_objs(class_inf: Mapping[str, Any] | None) -> tuple[ClassObj, ...]:
    """Turn a ``CLASS_INF`` block into a tuple of :class:`ClassObj` axes.

    A missing or empty ``class_inf`` yields an empty tuple. Both ``CLASS_OBJ``
    and each axis's ``CLASS`` go through ``_ensure_list``, because either may
    arrive as a bare dict when it holds a single entry. Every ``CLASS`` entry
    is flattened. An axis without ``@id`` or ``@name`` raises ``KeyError``.
    """
    if not class_inf:
        return ()
    return tuple(
        ClassObj(
            id=raw_axis["@id"],
            name=raw_axis["@name"],
            classes=tuple(
                _flatten(member) for member in _ensure_list(raw_axis.get("CLASS"))
            ),
        )
        for raw_axis in _ensure_list(class_inf.get("CLASS_OBJ"))
    )
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def _check_status(result: Mapping[str, Any]) -> None:
    """Raise :class:`EstatApiError` unless the ``RESULT`` block reports success.

    A ``STATUS`` of ``0`` means success, and so does a missing ``STATUS``.
    Any other value is raised together with ``ERROR_MSG`` (empty string when
    that field is absent).
    """
    code = result.get("STATUS", 0)
    if code == 0:
        return
    raise EstatApiError(status=code, message=result.get("ERROR_MSG", ""))
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
# --- client ----------------------------------------------------------------
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
class EstatClient:
    """High-level e-Stat API client (sync).

    The client is built around an injected :class:`EstatHttpClient` rather
    than raw configuration. Tests can therefore hand in a mock transport
    without monkey-patching, and a future async or cached variant can swap
    the transport without touching this surface.

    The ``"auto"`` path resolves rules by role pattern through three layers,
    ``user > project > builtin``. A rule in a higher layer shadows a lower
    one that matches the same pattern; an unrelated rule leaves the lower
    layers free to fire on other tables.

    * ``user_rules`` — caller-defined v2 rules injected into the top layer.
    * ``project_rules_dir`` — a directory of ``*.yaml`` / ``*.yml`` rules
      auto-discovered into the middle layer. It is the escape hatch for
      tables no built-in covers: drop a rule file in the directory and it
      applies with no code change. Defaults to ``"pyestat_rules"`` (that is,
      ``./pyestat_rules`` relative to the working directory); pass another
      path to relocate it, or ``None`` / ``""`` to opt out. The
      pyestat-specific name keeps a plain client from silently adopting an
      unrelated directory's rules. An absent directory means "no project
      rules", not an error, so the common no-rules case never raises.
      Discovery depends on the working directory, and a *malformed* file in
      the directory raises :class:`RuleLoadError` at construction (the
      caller authored it, so it surfaces — ARCHITECTURE.md).
    * ``builtin_rules`` — the library-bundled rules (the bottom layer),
      loaded from the package by default.
    """

    def __init__(
        self,
        *,
        app_id: str | None = None,
        http: EstatHttpClient | None = None,
        builtin_rules: "Sequence[RuleV2] | None" = None,
        user_rules: "Sequence[RuleV2] | None" = None,
        project_rules_dir: "str | Path | None" = "pyestat_rules",
    ) -> None:
        """Create a client from an ``app_id`` or a ready-made transport.

        Raises :class:`ValueError` when neither ``app_id`` nor ``http`` is
        given. When ``http`` is supplied it is used as-is and ``app_id`` is
        ignored.
        """
        if http is None:
            if app_id is None:
                raise ValueError("Either app_id or http is required")
            http = EstatHttpClient(app_id=app_id)
        self._http = http
        # Imported lazily to keep the import graph one-way: the rule
        # subsystem may depend on the endpoint module, but not the other way
        # around at module-import time.
        from pyestat._engine.builtin import load_builtin_rules
        from pyestat._engine.loader import YamlRuleLoader

        # All three layers hold v2 rules; the auto path resolves them by role
        # pattern (user > project > builtin). The project layer is populated
        # by scanning ``project_rules_dir``. Any falsy value (``None`` /
        # ``""``) opts out — the latter matters because ``Path("")`` would
        # otherwise collapse to the cwd and scan it. ``load_dir`` returns []
        # for an absent directory, so a missing default ``./pyestat_rules``
        # is a no-op; a malformed file present in the directory raises
        # RuleLoadError.
        self._user_rules: list[RuleV2] = (
            list(user_rules) if user_rules is not None else []
        )
        self._project_rules: list[RuleV2] = (
            YamlRuleLoader().load_dir(Path(project_rules_dir))
            if project_rules_dir
            else []
        )
        self._builtin_rules: list[RuleV2] = (
            list(builtin_rules) if builtin_rules is not None else load_builtin_rules()
        )

    @staticmethod
    def _coerce_count(raw: Any) -> int | None:
        """Return a ``TOTAL_NUMBER``-style count as ``int``, else ``None``.

        ``NEXT_KEY`` is already coerced with ``int()``; counts get the same
        treatment so a numeric string cannot break the arithmetic or leak
        into the ``int``-typed response fields. An absent or non-numeric
        value becomes ``None`` ("count unknown") instead of raising, because
        the count is only advisory on the paging path.
        """
        if raw is None:
            return None
        try:
            return int(raw)
        except (TypeError, ValueError):
            return None

    # ----- getStatsData -----

    def get_stats_data(
        self,
        stats_data_id: str,
        *,
        rule: "RuleV2 | Literal['auto', 'heuristic'] | None" = "auto",
        aggregates: Literal["include", "exclude", "only"] = "include",
        max_rows: int | None = None,
        progress: Callable[[ProgressEvent], None] | None = None,
    ) -> StatsDataResponse:
        """Fetch one table, walking ``NEXT_KEY`` until all rows are pulled.

        Every transformed mode returns the canonical *nested* row shape:
        each axis is a ``{code, label}`` cell (``time`` adds ``normalized`` /
        ``granularity``) and the observation is a ``{value, unit}`` measure.
        Call :meth:`StatsDataResponse.to_flat` for the one-column-per-field
        flat shape (pandas). ``rule`` selects the transformation mode:

        * ``"auto"`` (default) — classify the table's axes, then resolve a
          rule through Layers C > B > A > D: a matching v2 rule
          (user/project, then built-in), else a generic rule built from the
          classified roles (Layer A), else the Layer D fallback when the
          table cannot be structured (a low-confidence axis, or a shape the
          generic rule declines). A rule you supplied that then fails to
          apply surfaces as a typed :class:`EstatError`; a library-provided
          rule degrades to Layer D instead (ARCHITECTURE.md).
        * ``"heuristic"`` — Layer D fallback. The axis classifier detects
          the ``time`` axis and normalizes it best-effort; every axis becomes
          a ``{code, label}`` cell. Raw codes are preserved (in each cell's
          ``code``), the cell value is never coerced, and an unrecognized
          time code keeps ``normalized == code``. Data is preserved, but
          axes are not normalized to standard codes (out of scope here).
          Useful when you want predictable, lossless output regardless of
          which built-in rules ship.
        * ``None`` — raw mode. Returns Layer 2's untransformed flattened
          rows verbatim (flat scalars, not nested cells).
        * :class:`RuleV2` — apply this rule directly against the table's
          classification, bypassing the resolution chain.

        ``aggregates`` selects which rows of a hierarchical table you
        receive, independent of ``rule``. e-Stat marks a code hierarchy with
        ``@parentCode`` (grand total → major category → item; nationwide →
        prefecture); summing a measure across a total and its children
        double-counts. The filter runs on the raw rows before any rule, so
        every mode honors it:

        * ``"include"`` (default) — every row.
        * ``"exclude"`` — drop the aggregates, keeping only the leaves (the
          detail grain), so the result is safe to sum. With several
          hierarchical dimensions a row is kept only when it is a leaf on
          every one.
        * ``"only"`` — keep the aggregates (subtotals / totals), the exact
          complement of ``"exclude"``.

        Detection is per-response and ``category`` / ``area`` only: a table
        holding just a total is not filtered, and a hierarchy e-Stat ships
        without ``@parentCode`` is invisible to this filter.

        When ``max_rows`` is set, a cheap ``cntGetFlg=Y`` probe runs first
        and the call raises :class:`TooManyRowsError` before any data page
        is downloaded if the table exceeds the cap.
        """
        if max_rows is not None:
            payload = self._http.request(
                "/getStatsData",
                params={"statsDataId": stats_data_id, "cntGetFlg": "Y"},
            )
            root = payload["GET_STATS_DATA"]
            _check_status(root["RESULT"])
            # Strict on purpose: a cap cannot be enforced against an unknown
            # count, so a missing or garbage TOTAL_NUMBER still fails loudly.
            # ``int()`` only makes a numeric string comparable to max_rows.
            total = int(root["STATISTICAL_DATA"]["RESULT_INF"]["TOTAL_NUMBER"])
            if total > max_rows:
                raise TooManyRowsError(
                    stats_data_id=stats_data_id, total=total, limit=max_rows
                )

        pages = list(self.iter_stats_data_pages(stats_data_id, progress=progress))
        # The page iterator always yields at least one page, so index 0 is
        # safe; table metadata is identical on every page.
        first = pages[0]
        values = tuple(v for p in pages for v in p.values)
        # Imported lazily so the (L3 → L2) dependency direction stays
        # one-way: the rule subsystem consumes ``ClassObj`` from this module.
        # The pipeline owns the classify → aggregate → resolve → apply order
        # and the Layer A–D routing; this method keeps only HTTP, paging, and
        # response typing.
        from pyestat._engine.pipeline import run_pipeline

        transformed = run_pipeline(
            values,
            first.class_objs,
            first.table_inf,
            stats_data_id,
            rule,
            aggregates,
            user_rules=self._user_rules,
            project_rules=self._project_rules,
            builtin_rules=self._builtin_rules,
        )
        return StatsDataResponse(
            stats_data_id=stats_data_id,
            total_number=first.total_number,
            table_inf=first.table_inf,
            class_objs=first.class_objs,
            values=transformed,
        )

    def iter_stats_data_pages(
        self,
        stats_data_id: str,
        *,
        progress: Callable[[ProgressEvent], None] | None = None,
    ) -> Iterator[Page]:
        """Yield each ``NEXT_KEY`` page one at a time.

        Lower-level than :meth:`get_stats_data`: callers can stream a
        3.8M-row table without materializing the whole list. ``progress``
        is fired *after* each page has been parsed, so a tqdm bridge sees
        the count reflect what was actually received.
        """
        next_key: int | None = None
        page_number = 0
        rows_fetched = 0
        # Size of the first non-empty page; used to estimate the page count.
        page_size: int | None = None
        while True:
            page_number += 1
            params: dict[str, Any] = {"statsDataId": stats_data_id}
            if next_key is not None:
                params["startPosition"] = next_key
            payload = self._http.request("/getStatsData", params=params)
            page = self._parse_page(payload, page_number)
            rows_fetched += len(page.values)
            if page_size is None and page.values:
                page_size = len(page.values)
            if progress is not None:
                # Unknown (None) when either the total or the page size is
                # missing or zero.
                total_pages = (
                    math.ceil(page.total_number / page_size)
                    if page.total_number and page_size
                    else None
                )
                progress(
                    ProgressEvent(
                        page=page_number,
                        total_pages=total_pages,
                        rows_fetched=rows_fetched,
                        rows_total=page.total_number,
                    )
                )
            yield page
            if page.next_key is None:
                break
            next_key = page.next_key

    @staticmethod
    def _parse_page(payload: Mapping[str, Any], page_number: int) -> Page:
        """Build a :class:`Page` from one raw ``getStatsData`` payload.

        Raises :class:`EstatApiError` via ``_check_status`` on a non-zero
        status, and ``KeyError`` when the ``GET_STATS_DATA`` envelope,
        ``RESULT`` or ``STATISTICAL_DATA`` is missing.
        """
        root = payload["GET_STATS_DATA"]
        _check_status(root["RESULT"])
        sd = root["STATISTICAL_DATA"]
        result_inf = sd.get("RESULT_INF", {})
        next_key_raw = result_inf.get("NEXT_KEY")
        next_key = int(next_key_raw) if next_key_raw is not None else None
        return Page(
            page_number=page_number,
            values=tuple(
                _flatten(v) for v in _ensure_list(sd.get("DATA_INF", {}).get("VALUE"))
            ),
            next_key=next_key,
            # Coerced like NEXT_KEY so the progress arithmetic and the
            # ``int | None`` field type hold even for a numeric string.
            total_number=EstatClient._coerce_count(result_inf.get("TOTAL_NUMBER")),
            table_inf=dict(sd.get("TABLE_INF", {})),
            class_objs=_parse_class_objs(sd.get("CLASS_INF")),
        )

    # ----- getMetaInfo -----

    def get_meta_info(self, stats_data_id: str) -> MetaInfoResponse:
        """Fetch axis metadata without downloading data.

        Lets a caller inspect a table's axes before committing to a
        potentially huge fetch.
        """
        payload = self._http.request(
            "/getMetaInfo", params={"statsDataId": stats_data_id}
        )
        root = payload["GET_META_INFO"]
        _check_status(root["RESULT"])
        metadata = root.get("METADATA_INF", {})
        return MetaInfoResponse(
            stats_data_id=stats_data_id,
            table_inf=dict(metadata.get("TABLE_INF", {})),
            class_objs=_parse_class_objs(metadata.get("CLASS_INF")),
        )

    # ----- getStatsList -----

    def list_stats(self, **params: Any) -> StatsListResponse:
        """Search the e-Stat catalog.

        Parameters are forwarded raw because the search API has many
        rarely-used knobs (``searchWord``, ``statsCode``, ``surveyYears``,
        ``openYears``, ``statsField``…); a Python-side enumeration would lag
        behind the published API without adding safety.

        ``total_number`` falls back to the number of tables returned when
        the response carries no usable ``TOTAL_NUMBER``.
        """
        payload = self._http.request("/getStatsList", params=params)
        root = payload["GET_STATS_LIST"]
        _check_status(root["RESULT"])
        dl = root.get("DATALIST_INF", {})
        result_inf = dl.get("RESULT_INF", {})
        tables = tuple(_ensure_list(dl.get("TABLE_INF")))
        reported = self._coerce_count(result_inf.get("TOTAL_NUMBER"))
        return StatsListResponse(
            total_number=len(tables) if reported is None else reported,
            tables=tables,
        )
|
|
pyestat/_engine/__init__.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""Layer 3 — the rule-driven transformation engine.
|
|
2
|
+
|
|
3
|
+
Submodules form a small DAG:
|
|
4
|
+
|
|
5
|
+
* :mod:`pyestat._engine.registry` — name → impl lookup primitive.
|
|
6
|
+
* :mod:`pyestat._engine.time` — built-in time parsers + ``best_effort``.
|
|
7
|
+
* :mod:`pyestat._engine.rule` — RuleV2 output-schema pydantic model.
|
|
8
|
+
* :mod:`pyestat._engine.loader` — YAML loader for the schema.
|
|
9
|
+
* :mod:`pyestat._engine.classifier` — axis classifier (role + confidence; Layer A).
|
|
10
|
+
* :mod:`pyestat._engine.role_defaults` — role-default registry + short-form expansion.
|
|
11
|
+
* :mod:`pyestat._engine.resolver` — v2 rule resolution (Layers C > B > A).
|
|
12
|
+
* :mod:`pyestat._engine.apply` — glue that runs the resolved rule over rows.
|
|
13
|
+
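* :mod:`pyestat._engine.builtin` — loader for library-bundled rules.
* :mod:`pyestat._engine.aggregate` — aggregate vs. detail row selection.
* :mod:`pyestat._engine.canonical` — nested → flat row projection (``to_flat_rows``).
* :mod:`pyestat._engine.pipeline` — classify → aggregate → resolve → apply orchestration.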
|
|
14
|
+
|
|
15
|
+
Public symbols (``EstatClient``, ``RuleV2``, ``load_builtin_rules`` …)
|
|
16
|
+
re-export from :mod:`pyestat`. Direct ``pyestat._engine.X`` imports are
|
|
17
|
+
internal.
|
|
18
|
+
"""
|
|
pyestat/_engine/aggregate.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
"""Aggregate vs. detail row selection.
|
|
2
|
+
|
|
3
|
+
e-Stat encodes a code hierarchy with ``@parentCode``: a member that another
|
|
4
|
+
member names as its parent has children — it is an *aggregate* (a total or
|
|
5
|
+
subtotal: 総数, 大分類, 全国). A member with no children is a *leaf* — a
|
|
6
|
+
*detail* row. Summing a measure across a mix of aggregate and leaf rows
|
|
7
|
+
double-counts (食料 plus its 品目 plus 総数), so a caller filtering to leaves
|
|
8
|
+
(``"exclude"`` the aggregates) selects a single, self-consistent grain safe
|
|
9
|
+
to aggregate; filtering to aggregates (``"only"``) selects the rolled-up
|
|
10
|
+
figures.
|
|
11
|
+
|
|
12
|
+
Two deliberate choices, both deterministic:
|
|
13
|
+
|
|
14
|
+
* **Per-response, not absolute.** The parent links present in *this* table
|
|
15
|
+
decide. A table holding only a total (no children fetched) names no parent,
|
|
16
|
+
so nothing is an aggregate and nothing is dropped — there is no
|
|
17
|
+
double-counting with a single grain. The flip side is the contract's edge:
|
|
18
|
+
a hierarchy e-Stat ships *without* ``@parentCode`` (a flat 男女別 総数 / 男 /
|
|
19
|
+
女) is invisible here and stays unfiltered.
|
|
20
|
+
* **Leaf on every dimension (AND).** Across several hierarchical axes
|
|
21
|
+
(建築主 × 用途) a row is detail only when it is a leaf on *all* of them — the
|
|
22
|
+
safe grain for the cross. ``"only"`` is the exact complement (an aggregate
|
|
23
|
+
on at least one axis), so the two selections partition the rows.
|
|
24
|
+
|
|
25
|
+
Only the dimension axes (``category`` / ``area``) range over the selection:
|
|
26
|
+
``time`` granularity is the time normalizer's concern, a ``meta-axis`` hierarchy is the
|
|
27
|
+
pivot's to fold, and a ``value`` axis carries no code hierarchy. This
|
|
28
|
+
keeps the selection orthogonal to the conversion rule — it filters the raw
|
|
29
|
+
rows before any rule runs, so ``"auto"``, a built-in, a custom rule, and raw
|
|
30
|
+
mode all honor it uniformly.
|
|
31
|
+
"""
|
|
32
|
+
from __future__ import annotations
|
|
33
|
+
|
|
34
|
+
from collections.abc import Mapping, Sequence
|
|
35
|
+
from typing import Any, Literal, get_args
|
|
36
|
+
|
|
37
|
+
from pyestat._endpoint import ClassObj
|
|
38
|
+
from pyestat._engine.classifier import AxisRole, TableClassification
|
|
39
|
+
|
|
40
|
+
AggregateSelection = Literal["include", "exclude", "only"]
|
|
41
|
+
|
|
42
|
+
# The roles whose code hierarchy this selection ranges over. time is
|
|
43
|
+
# granularity, a meta-axis is the pivot's domain, value carries no
|
|
44
|
+
# codes — so the dimension roles are the only ones a parent/leaf split applies
|
|
45
|
+
# to.
|
|
46
|
+
_DIMENSION_ROLES = frozenset({AxisRole.CATEGORY, AxisRole.AREA})
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _aggregate_codes(axis: ClassObj, present: set[Any]) -> set[str]:
    """Return the codes on ``axis`` that roll up a code in ``present``.

    These are the aggregates: rows whose presence next to their descendants
    would double-count a summed measure.

    Detection is data-driven. A code counts as an aggregate only when one of
    its descendants actually occurs in ``present`` (the codes found in the
    fetched rows). A total fetched on its own has no present descendant, so
    it is a leaf here and is kept.

    The whole ancestor chain of every present code is flagged, not just its
    direct parent. Otherwise a table holding a grand total and the leaf
    items, but no rows for the subtotal between them, would leave the grand
    total looking like a leaf and still double-count. A present code does
    not need its own parent to be present either: a subtotal is an aggregate
    over the items below it even if the grand total was not fetched.

    Args:
        axis: The axis whose ``classes`` carry ``code`` / ``parentCode``.
        present: Codes seen on this axis in the rows; ``None`` entries (rows
            without the axis) are ignored.

    Returns:
        The aggregate codes as strings. May include codes that are not in
        ``present``; those simply match no row.
    """
    # child code → parent code, for members that name a non-empty parent.
    parent_of = {
        str(c["code"]): str(c["parentCode"])
        for c in axis.classes
        if "code" in c and c.get("parentCode") not in (None, "")
    }
    aggregates: set[str] = set()
    for code in present:
        if code is None:
            continue
        current = str(code)
        while current in parent_of:
            parent = parent_of[current]
            if parent in aggregates:
                # Already walked from here upward on an earlier pass; this
                # also stops a malformed (cyclic) parent chain from looping.
                break
            aggregates.add(parent)
            current = parent
    return aggregates
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def select_rows(
    values: Sequence[Mapping[str, Any]],
    classification: TableClassification,
    class_objs: Sequence[ClassObj],
    selection: AggregateSelection,
) -> tuple[dict[str, Any], ...]:
    """Return the detail rows, the aggregate rows, or every row of ``values``.

    * ``"include"`` — all rows, unchanged (the default).
    * ``"exclude"`` — aggregates dropped: a row survives only if it is a
      leaf on every hierarchical dimension axis.
    * ``"only"`` — aggregates kept: exactly the rows ``"exclude"`` drops.

    Only axes classified as ``category`` / ``area`` are inspected, and an
    aggregate is recognised through ``@parentCode``. When no dimension
    encodes a hierarchy there are no aggregates, so ``"exclude"`` returns
    everything and ``"only"`` returns nothing. Input order is preserved and
    the result holds the original row objects, never copies.

    Raises :class:`ValueError` for a ``selection`` outside the three values.
    """
    if selection not in get_args(AggregateSelection):
        raise ValueError(
            f"`aggregates` must be one of {get_args(AggregateSelection)}, got {selection!r}"
        )
    if selection == "include":
        return tuple(values)

    dimension_ids = {
        axis.axis_id for axis in classification.axes if axis.role in _DIMENSION_ROLES
    }
    # axis id → aggregate codes on that axis. An axis that contributes no
    # aggregate (flat, or only leaves fetched) is left out: it imposes nothing.
    aggregates_by_axis: dict[str, set[str]] = {}
    for axis_obj in class_objs:
        if axis_obj.id in dimension_ids:
            seen_codes = {row.get(axis_obj.id) for row in values}
            found = _aggregate_codes(axis_obj, seen_codes)
            if found:
                aggregates_by_axis[axis_obj.id] = found

    want_detail = selection == "exclude"
    if not aggregates_by_axis:
        # No aggregate anywhere: every row is detail.
        return tuple(values) if want_detail else ()
    return tuple(
        row for row in values if _is_detail(row, aggregates_by_axis) == want_detail
    )
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _is_detail(row: Mapping[str, Any], parents_by_axis: Mapping[str, set[str]]) -> bool:
    """Tell whether ``row`` sits at the pure-detail grain.

    The row is detail only when, on every axis listed in ``parents_by_axis``,
    its code (compared as a string) is not one of that axis's aggregate
    codes. One aggregate code on any axis is enough to make it non-detail.
    """
    for axis_id, parents in parents_by_axis.items():
        if str(row.get(axis_id)) in parents:
            return False
    return True
|