pyestat 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyestat-0.1.0/LICENSE +21 -0
- pyestat-0.1.0/PKG-INFO +283 -0
- pyestat-0.1.0/README.md +267 -0
- pyestat-0.1.0/pyproject.toml +42 -0
- pyestat-0.1.0/src/pyestat/__init__.py +62 -0
- pyestat-0.1.0/src/pyestat/_endpoint.py +480 -0
- pyestat-0.1.0/src/pyestat/_engine/__init__.py +18 -0
- pyestat-0.1.0/src/pyestat/_engine/aggregate.py +131 -0
- pyestat-0.1.0/src/pyestat/_engine/apply.py +885 -0
- pyestat-0.1.0/src/pyestat/_engine/builtin.py +41 -0
- pyestat-0.1.0/src/pyestat/_engine/canonical.py +140 -0
- pyestat-0.1.0/src/pyestat/_engine/classifier.py +356 -0
- pyestat-0.1.0/src/pyestat/_engine/loader.py +88 -0
- pyestat-0.1.0/src/pyestat/_engine/pipeline.py +98 -0
- pyestat-0.1.0/src/pyestat/_engine/registry.py +57 -0
- pyestat-0.1.0/src/pyestat/_engine/resolver.py +129 -0
- pyestat-0.1.0/src/pyestat/_engine/role_defaults.py +346 -0
- pyestat-0.1.0/src/pyestat/_engine/rule.py +272 -0
- pyestat-0.1.0/src/pyestat/_engine/time.py +180 -0
- pyestat-0.1.0/src/pyestat/_errors.py +260 -0
- pyestat-0.1.0/src/pyestat/_http.py +145 -0
- pyestat-0.1.0/src/pyestat/py.typed +0 -0
- pyestat-0.1.0/src/pyestat/rules/__init__.py +7 -0
- pyestat-0.1.0/src/pyestat/rules/builtin/__init__.py +14 -0
- pyestat-0.1.0/src/pyestat/rules/builtin/foreign_trade.yaml +60 -0
- pyestat-0.1.0/src/pyestat/rules/builtin/foreign_trade_customs.yaml +49 -0
pyestat-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 khaym
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
pyestat-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,283 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pyestat
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Python client for the e-Stat API (Japanese government statistics portal) with structured outputs for LLMs and data scientists.
|
|
5
|
+
Author: khaym
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Requires-Dist: httpx>=0.28.1
|
|
9
|
+
Requires-Dist: pydantic>=2.13.4
|
|
10
|
+
Requires-Dist: pyyaml>=6.0.3
|
|
11
|
+
Requires-Python: >=3.11
|
|
12
|
+
Project-URL: Homepage, https://github.com/khaym/pyestat
|
|
13
|
+
Project-URL: Repository, https://github.com/khaym/pyestat
|
|
14
|
+
Project-URL: Issues, https://github.com/khaym/pyestat/issues
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
|
|
17
|
+
<p align="center">
|
|
18
|
+
<picture>
|
|
19
|
+
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/khaym/pyestat/main/assets/logo-dark.png">
|
|
20
|
+
<img src="https://raw.githubusercontent.com/khaym/pyestat/main/assets/logo.png" alt="pyestat" width="420">
|
|
21
|
+
</picture>
|
|
22
|
+
</p>
|
|
23
|
+
|
|
24
|
+
<p align="center">
|
|
25
|
+
<em>Structured, typed results from Japan's official statistics portal
|
|
26
|
+
(<a href="https://www.e-stat.go.jp/api/">e-Stat</a>) — ready for LLMs and data scientists.</em>
|
|
27
|
+
</p>
|
|
28
|
+
|
|
29
|
+
<p align="center">
|
|
30
|
+
<a href="https://pypi.org/project/pyestat/"><img src="https://img.shields.io/pypi/v/pyestat?color=082060" alt="PyPI version"></a>
|
|
31
|
+
<a href="https://pypi.org/project/pyestat/"><img src="https://img.shields.io/pypi/pyversions/pyestat?color=082060" alt="Python versions"></a>
|
|
32
|
+
<a href="https://github.com/khaym/pyestat/blob/main/LICENSE"><img src="https://img.shields.io/badge/license-MIT-blue" alt="License: MIT"></a>
|
|
33
|
+
</p>
|
|
34
|
+
|
|
35
|
+
<p align="center">
|
|
36
|
+
<a href="#why-another-e-stat-library">Why</a> •
|
|
37
|
+
<a href="#install">Install</a> •
|
|
38
|
+
<a href="#usage">Usage</a> •
|
|
39
|
+
<a href="https://github.com/khaym/pyestat/blob/main/docs/AUTHORING_RULES.md">Writing rules</a> •
|
|
40
|
+
<a href="#license">License</a>
|
|
41
|
+
</p>
|
|
42
|
+
|
|
43
|
+
<p align="center">
|
|
44
|
+
<b>English</b> • <a href="https://github.com/khaym/pyestat/blob/main/README.ja.md">日本語</a>
|
|
45
|
+
</p>
|
|
46
|
+
|
|
47
|
+
## Why another e-Stat library?
|
|
48
|
+
|
|
49
|
+
The e-Stat API returns JSON that is a thin re-encoding of the original XML:
|
|
50
|
+
dimension codes hide under `@`-prefixed keys and the cell value under `$`, while
|
|
51
|
+
the human-readable labels and units for those codes live in a separate
|
|
52
|
+
`CLASS_INF` block you must join yourself. Logical errors arrive as HTTP 200 with
|
|
53
|
+
a non-zero `RESULT.STATUS`. Existing Python wrappers stop at "give me a
|
|
54
|
+
DataFrame" and pass these quirks through to the caller.
|
|
55
|
+
|
|
56
|
+
`pyestat` resolves them. By default (`rule="auto"`) it classifies each axis and
|
|
57
|
+
returns self-describing cells — no per-table rule required:
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
# e-Stat's raw VALUE cell: codes only — labels and units live in CLASS_INF
|
|
61
|
+
{"@cat01": "000", "@time": "2020000000", "$": "126146"}
|
|
62
|
+
|
|
63
|
+
# pyestat (rule="auto"): codes resolved to labels, value carries its unit
|
|
64
|
+
{"cat01": {"code": "000", "label": "男女計"},
|
|
65
|
+
"time": {"code": "2020000000", "label": "2020年",
|
|
66
|
+
"normalized": "2020", "granularity": "yearly"},
|
|
67
|
+
"value": {"value": "126146", "unit": "千人"}}
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
So an LLM agent or a researcher consumes a response without learning the e-Stat
|
|
71
|
+
wire format, joining `CLASS_INF` by hand, or special-casing the HTTP-200 error
|
|
72
|
+
channel.
|
|
73
|
+
|
|
74
|
+
### What you get
|
|
75
|
+
|
|
76
|
+
**Find and fetch**
|
|
77
|
+
|
|
78
|
+
- Search the catalog with `list_stats` (`searchWord`, `statsCode`, …).
|
|
79
|
+
- Inspect a table's axes with `get_meta_info` before downloading anything.
|
|
80
|
+
- Stream millions of rows page by page with `iter_stats_data_pages` — cap a
|
|
81
|
+
fetch up front with `max_rows` (raises `TooManyRowsError`) or follow it with a
|
|
82
|
+
`progress` callback.
|
|
83
|
+
|
|
84
|
+
**Structure** (`rule="auto"`, the default — no rule needed)
|
|
85
|
+
|
|
86
|
+
- Dimension codes resolved to `{code, label}` — `CLASS_INF` joined for you.
|
|
87
|
+
- Time normalized, with granularity tagged (yearly / monthly / …).
|
|
88
|
+
- A measure spread across rows folded into one record, key auto-detected — for
|
|
89
|
+
tables whose measure axis is flat (GDP, CPI, 建築着工 …). Hierarchical crosses
|
|
90
|
+
(trade's measure × period) and multi-category tables stay as lossless raw.
|
|
91
|
+
- `aggregates="exclude"` drops subtotal/total rows for a sum-safe leaf grain
|
|
92
|
+
(`"only"` keeps just the totals), read from `@parentCode` — a fetch option, so
|
|
93
|
+
it works in any mode.
|
|
94
|
+
- Your own `RuleV2` adds domain-specific column names and explicit pivots of
|
|
95
|
+
hierarchical crosses (`where` / `key` / `unit_from`).
|
|
96
|
+
|
|
97
|
+
**Hand off**
|
|
98
|
+
|
|
99
|
+
- `to_flat()` projects the nested cells to one column per field, for pandas.
|
|
100
|
+
|
|
101
|
+
Throughout, values pass through verbatim — numbers stay strings and suppression
|
|
102
|
+
markers (`-` / `***` / `X`) are preserved; e-Stat's HTTP-200 logical errors
|
|
103
|
+
surface as a typed `EstatApiError`, and transient network failures are retried.
|
|
104
|
+
|
|
105
|
+
## Status
|
|
106
|
+
|
|
107
|
+
pyestat is pre-1.0. Two parts of the surface move at different speeds:
|
|
108
|
+
|
|
109
|
+
- **Settled** — what you *consume*: the nested `StatsDataResponse` shape
|
|
110
|
+
(with its `to_flat()` projection) and the `EstatError` hierarchy hold
|
|
111
|
+
across 0.x.
|
|
112
|
+
- **Evolving** — what you *author*: the `RuleV2` rule schema may still
|
|
113
|
+
change across 0.x as built-in coverage grows.
|
|
114
|
+
|
|
115
|
+
## Install
|
|
116
|
+
|
|
117
|
+
```sh
|
|
118
|
+
uv add pyestat
|
|
119
|
+
# or
|
|
120
|
+
pip install pyestat
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
## Usage
|
|
124
|
+
|
|
125
|
+
Register for an `appId` at <https://www.e-stat.go.jp/api/>, then pass it
|
|
126
|
+
explicitly to `EstatClient(app_id=...)`. A common convention is to keep it
|
|
127
|
+
in an `ESTAT_APP_ID` environment variable and read it yourself:
|
|
128
|
+
|
|
129
|
+
```python
|
|
130
|
+
import os
|
|
131
|
+
|
|
132
|
+
from pyestat import EstatClient, EstatApiError
|
|
133
|
+
|
|
134
|
+
client = EstatClient(app_id=os.environ["ESTAT_APP_ID"])
|
|
135
|
+
|
|
136
|
+
try:
|
|
137
|
+
response = client.get_stats_data(stats_data_id="0003448237")
|
|
138
|
+
except EstatApiError as exc:
|
|
139
|
+
# e-Stat reports logical errors with HTTP 200 + STATUS != 0.
|
|
140
|
+
print(f"e-Stat refused the query: {exc.status} {exc.message}")
|
|
141
|
+
else:
|
|
142
|
+
print(response.stats_data_id) # "0003448237"
|
|
143
|
+
for row in response.values:
|
|
144
|
+
# The default rule="auto" returns self-describing *nested* cells:
|
|
145
|
+
# each axis is {code, label}, time adds normalized/granularity,
|
|
146
|
+
# the observation is {value, unit}.
|
|
147
|
+
print(row)
|
|
148
|
+
# -> {"cat01": {"code": "000", "label": "男女計"},
|
|
149
|
+
# "time": {"code": "2020000000", "label": "2020年",
|
|
150
|
+
# "normalized": "2020", "granularity": "yearly"},
|
|
151
|
+
# "value": {"value": "126146", "unit": "千人"}}
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
Prefer one column per field (e.g. for pandas)? `to_flat()` projects the
|
|
155
|
+
nested cells to the familiar suffix shape — losslessly, and as a no-op on a
|
|
156
|
+
raw (`rule=None`) response:
|
|
157
|
+
|
|
158
|
+
```python
|
|
159
|
+
flat = response.to_flat()
|
|
160
|
+
# -> [{"cat01": "000", "cat01_label": "男女計",
|
|
161
|
+
# "time": "2020", "time_code": "2020000000",
|
|
162
|
+
# "time_label": "2020年", "time_granularity": "yearly",
|
|
163
|
+
# "value": "126146", "unit": "千人"}, ...]
|
|
164
|
+
|
|
165
|
+
import pandas as pd
|
|
166
|
+
df = pd.DataFrame(flat)
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
Pass `rule=None` instead to get e-Stat's raw rows with only the keys renamed (`@`-prefixed
|
|
170
|
+
dimensions become plain keys, `"$"` becomes `"value"`) — flat scalars, no
|
|
171
|
+
labels or normalization.
|
|
172
|
+
|
|
173
|
+
## Writing your own rules
|
|
174
|
+
|
|
175
|
+
`pyestat` ships built-in rules for a small set of tables and falls back to
|
|
176
|
+
`rule="auto"` for the rest. When you want different structuring — or
|
|
177
|
+
domain-specific column names — supply your own `RuleV2`:
|
|
178
|
+
|
|
179
|
+
```python
|
|
180
|
+
from pyestat import EstatClient, RuleV2
|
|
181
|
+
|
|
182
|
+
custom = RuleV2.model_validate({
|
|
183
|
+
"schema_version": "2",
|
|
184
|
+
"match": {"role_pattern": ["value", "area", "time"]},
|
|
185
|
+
"output": [
|
|
186
|
+
{"column": "year", "source": {"role": "time"}, "transform": "yearly"},
|
|
187
|
+
{"column": "region", "source": {"role": "area"}},
|
|
188
|
+
{"column": "value", "source": {"role": "value"}},
|
|
189
|
+
],
|
|
190
|
+
})
|
|
191
|
+
|
|
192
|
+
client = EstatClient(user_rules=[custom])
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
A rule declares the **output columns** you want, each drawn from an axis *role*
|
|
196
|
+
the classifier infers, so one rule covers every table sharing that role pattern.
|
|
197
|
+
Pivoting rows split across a `meta-axis`, naming columns for `to_flat()`, and
|
|
198
|
+
dropping rule files into a directory are covered in
|
|
199
|
+
**[Writing rules →](https://github.com/khaym/pyestat/blob/main/docs/AUTHORING_RULES.md)**.
|
|
200
|
+
|
|
201
|
+
> The `RuleV2` schema is evolving across 0.x — see [Status](#status).
|
|
202
|
+
|
|
203
|
+
## Error behavior
|
|
204
|
+
|
|
205
|
+
On the default `rule="auto"` path, whether a *rule* failure reaches you
|
|
206
|
+
turns on who authored the failing rule — fall back when it is pyestat's,
|
|
207
|
+
surface when it is yours:
|
|
208
|
+
|
|
209
|
+
- A built-in rule that cannot apply degrades to lossless raw output
|
|
210
|
+
instead of raising: its failure is internal and you cannot edit it, so
|
|
211
|
+
preserved data beats a crash.
|
|
212
|
+
- A rule you supplied — an explicit `rule=RuleV2(...)`, a
|
|
213
|
+
`user_rules=` entry, or a file in `./pyestat_rules` — that cannot apply
|
|
214
|
+
raises a typed error so you can fix it and re-run.
|
|
215
|
+
|
|
216
|
+
So `get_stats_data(id)` on a table pyestat does not yet handle returns
|
|
217
|
+
usable raw rows rather than failing, while a mistake in your own rule is
|
|
218
|
+
reported.
|
|
219
|
+
|
|
220
|
+
Every pyestat error inherits from `EstatError`, so a coarse
|
|
221
|
+
`except EstatError` catches them all; catch a leaf (`EstatApiError`,
|
|
222
|
+
`TooManyRowsError`, …) when you want to act on one case.
|
|
223
|
+
|
|
224
|
+
## Configuring the appId
|
|
225
|
+
|
|
226
|
+
[Usage](#usage) shows the basic convention — pass `app_id` explicitly, kept in
|
|
227
|
+
an `ESTAT_APP_ID` variable. How that variable reaches the environment is your
|
|
228
|
+
project's call; a few common patterns:
|
|
229
|
+
|
|
230
|
+
**Shell export** (interactive use):
|
|
231
|
+
|
|
232
|
+
```sh
|
|
233
|
+
export ESTAT_APP_ID="<your-app-id>"
|
|
234
|
+
python your_script.py
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
**`.env` file + [python-dotenv](https://github.com/theskumar/python-dotenv)**
|
|
238
|
+
(local development, Jupyter):
|
|
239
|
+
|
|
240
|
+
```sh
|
|
241
|
+
echo 'ESTAT_APP_ID=<your-app-id>' > .env
|
|
242
|
+
# in your code (or notebook cell):
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
```python
|
|
246
|
+
import os
|
|
247
|
+
|
|
248
|
+
from dotenv import load_dotenv
|
|
249
|
+
from pyestat import EstatClient
|
|
250
|
+
|
|
251
|
+
load_dotenv()
|
|
252
|
+
client = EstatClient(app_id=os.environ["ESTAT_APP_ID"])
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
**Docker / Compose**: pass `-e ESTAT_APP_ID=...` or set it under
|
|
256
|
+
`environment:` in your compose file.
|
|
257
|
+
|
|
258
|
+
**CI (GitHub Actions, etc.)**: store the appId as an encrypted secret and
|
|
259
|
+
inject it as an env var in the workflow step.
|
|
260
|
+
|
|
261
|
+
**Production**: pull it from your secret manager
|
|
262
|
+
(AWS Secrets Manager / GCP Secret Manager / HashiCorp Vault / ...) at
|
|
263
|
+
startup and pass it to `EstatClient(app_id=...)`.
|
|
264
|
+
|
|
265
|
+
`pyestat` deliberately avoids reading the environment or bundling a dotenv
|
|
266
|
+
loader, so it does not constrain how you manage secrets.
|
|
267
|
+
|
|
268
|
+
## Development
|
|
269
|
+
|
|
270
|
+
```sh
|
|
271
|
+
uv sync # install runtime + dev deps
|
|
272
|
+
cp .env.example .env # then fill in your ESTAT_APP_ID
|
|
273
|
+
uv run pytest # runs unit + live API tests
|
|
274
|
+
uv run pytest -m "not integration" # unit only (no network)
|
|
275
|
+
```
|
|
276
|
+
|
|
277
|
+
The live integration test in `tests/test_get_stats_data_integration.py`
|
|
278
|
+
auto-skips if `ESTAT_APP_ID` is not set, so the unit suite stays
|
|
279
|
+
hermetic without extra flags.
|
|
280
|
+
|
|
281
|
+
## License
|
|
282
|
+
|
|
283
|
+
MIT License. See [LICENSE](https://github.com/khaym/pyestat/blob/main/LICENSE) for details.
|
pyestat-0.1.0/README.md
ADDED
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<picture>
|
|
3
|
+
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/khaym/pyestat/main/assets/logo-dark.png">
|
|
4
|
+
<img src="https://raw.githubusercontent.com/khaym/pyestat/main/assets/logo.png" alt="pyestat" width="420">
|
|
5
|
+
</picture>
|
|
6
|
+
</p>
|
|
7
|
+
|
|
8
|
+
<p align="center">
|
|
9
|
+
<em>Structured, typed results from Japan's official statistics portal
|
|
10
|
+
(<a href="https://www.e-stat.go.jp/api/">e-Stat</a>) — ready for LLMs and data scientists.</em>
|
|
11
|
+
</p>
|
|
12
|
+
|
|
13
|
+
<p align="center">
|
|
14
|
+
<a href="https://pypi.org/project/pyestat/"><img src="https://img.shields.io/pypi/v/pyestat?color=082060" alt="PyPI version"></a>
|
|
15
|
+
<a href="https://pypi.org/project/pyestat/"><img src="https://img.shields.io/pypi/pyversions/pyestat?color=082060" alt="Python versions"></a>
|
|
16
|
+
<a href="https://github.com/khaym/pyestat/blob/main/LICENSE"><img src="https://img.shields.io/badge/license-MIT-blue" alt="License: MIT"></a>
|
|
17
|
+
</p>
|
|
18
|
+
|
|
19
|
+
<p align="center">
|
|
20
|
+
<a href="#why-another-e-stat-library">Why</a> •
|
|
21
|
+
<a href="#install">Install</a> •
|
|
22
|
+
<a href="#usage">Usage</a> •
|
|
23
|
+
<a href="https://github.com/khaym/pyestat/blob/main/docs/AUTHORING_RULES.md">Writing rules</a> •
|
|
24
|
+
<a href="#license">License</a>
|
|
25
|
+
</p>
|
|
26
|
+
|
|
27
|
+
<p align="center">
|
|
28
|
+
<b>English</b> • <a href="https://github.com/khaym/pyestat/blob/main/README.ja.md">日本語</a>
|
|
29
|
+
</p>
|
|
30
|
+
|
|
31
|
+
## Why another e-Stat library?
|
|
32
|
+
|
|
33
|
+
The e-Stat API returns JSON that is a thin re-encoding of the original XML:
|
|
34
|
+
dimension codes hide under `@`-prefixed keys and the cell value under `$`, while
|
|
35
|
+
the human-readable labels and units for those codes live in a separate
|
|
36
|
+
`CLASS_INF` block you must join yourself. Logical errors arrive as HTTP 200 with
|
|
37
|
+
a non-zero `RESULT.STATUS`. Existing Python wrappers stop at "give me a
|
|
38
|
+
DataFrame" and pass these quirks through to the caller.
|
|
39
|
+
|
|
40
|
+
`pyestat` resolves them. By default (`rule="auto"`) it classifies each axis and
|
|
41
|
+
returns self-describing cells — no per-table rule required:
|
|
42
|
+
|
|
43
|
+
```python
|
|
44
|
+
# e-Stat's raw VALUE cell: codes only — labels and units live in CLASS_INF
|
|
45
|
+
{"@cat01": "000", "@time": "2020000000", "$": "126146"}
|
|
46
|
+
|
|
47
|
+
# pyestat (rule="auto"): codes resolved to labels, value carries its unit
|
|
48
|
+
{"cat01": {"code": "000", "label": "男女計"},
|
|
49
|
+
"time": {"code": "2020000000", "label": "2020年",
|
|
50
|
+
"normalized": "2020", "granularity": "yearly"},
|
|
51
|
+
"value": {"value": "126146", "unit": "千人"}}
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
So an LLM agent or a researcher consumes a response without learning the e-Stat
|
|
55
|
+
wire format, joining `CLASS_INF` by hand, or special-casing the HTTP-200 error
|
|
56
|
+
channel.
|
|
57
|
+
|
|
58
|
+
### What you get
|
|
59
|
+
|
|
60
|
+
**Find and fetch**
|
|
61
|
+
|
|
62
|
+
- Search the catalog with `list_stats` (`searchWord`, `statsCode`, …).
|
|
63
|
+
- Inspect a table's axes with `get_meta_info` before downloading anything.
|
|
64
|
+
- Stream millions of rows page by page with `iter_stats_data_pages` — cap a
|
|
65
|
+
fetch up front with `max_rows` (raises `TooManyRowsError`) or follow it with a
|
|
66
|
+
`progress` callback.
|
|
67
|
+
|
|
68
|
+
**Structure** (`rule="auto"`, the default — no rule needed)
|
|
69
|
+
|
|
70
|
+
- Dimension codes resolved to `{code, label}` — `CLASS_INF` joined for you.
|
|
71
|
+
- Time normalized, with granularity tagged (yearly / monthly / …).
|
|
72
|
+
- A measure spread across rows folded into one record, key auto-detected — for
|
|
73
|
+
tables whose measure axis is flat (GDP, CPI, 建築着工 …). Hierarchical crosses
|
|
74
|
+
(trade's measure × period) and multi-category tables stay as lossless raw.
|
|
75
|
+
- `aggregates="exclude"` drops subtotal/total rows for a sum-safe leaf grain
|
|
76
|
+
(`"only"` keeps just the totals), read from `@parentCode` — a fetch option, so
|
|
77
|
+
it works in any mode.
|
|
78
|
+
- Your own `RuleV2` adds domain-specific column names and explicit pivots of
|
|
79
|
+
hierarchical crosses (`where` / `key` / `unit_from`).
|
|
80
|
+
|
|
81
|
+
**Hand off**
|
|
82
|
+
|
|
83
|
+
- `to_flat()` projects the nested cells to one column per field, for pandas.
|
|
84
|
+
|
|
85
|
+
Throughout, values pass through verbatim — numbers stay strings and suppression
|
|
86
|
+
markers (`-` / `***` / `X`) are preserved; e-Stat's HTTP-200 logical errors
|
|
87
|
+
surface as a typed `EstatApiError`, and transient network failures are retried.
|
|
88
|
+
|
|
89
|
+
## Status
|
|
90
|
+
|
|
91
|
+
pyestat is pre-1.0. Two parts of the surface move at different speeds:
|
|
92
|
+
|
|
93
|
+
- **Settled** — what you *consume*: the nested `StatsDataResponse` shape
|
|
94
|
+
(with its `to_flat()` projection) and the `EstatError` hierarchy hold
|
|
95
|
+
across 0.x.
|
|
96
|
+
- **Evolving** — what you *author*: the `RuleV2` rule schema may still
|
|
97
|
+
change across 0.x as built-in coverage grows.
|
|
98
|
+
|
|
99
|
+
## Install
|
|
100
|
+
|
|
101
|
+
```sh
|
|
102
|
+
uv add pyestat
|
|
103
|
+
# or
|
|
104
|
+
pip install pyestat
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
## Usage
|
|
108
|
+
|
|
109
|
+
Register for an `appId` at <https://www.e-stat.go.jp/api/>, then pass it
|
|
110
|
+
explicitly to `EstatClient(app_id=...)`. A common convention is to keep it
|
|
111
|
+
in an `ESTAT_APP_ID` environment variable and read it yourself:
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
import os
|
|
115
|
+
|
|
116
|
+
from pyestat import EstatClient, EstatApiError
|
|
117
|
+
|
|
118
|
+
client = EstatClient(app_id=os.environ["ESTAT_APP_ID"])
|
|
119
|
+
|
|
120
|
+
try:
|
|
121
|
+
response = client.get_stats_data(stats_data_id="0003448237")
|
|
122
|
+
except EstatApiError as exc:
|
|
123
|
+
# e-Stat reports logical errors with HTTP 200 + STATUS != 0.
|
|
124
|
+
print(f"e-Stat refused the query: {exc.status} {exc.message}")
|
|
125
|
+
else:
|
|
126
|
+
print(response.stats_data_id) # "0003448237"
|
|
127
|
+
for row in response.values:
|
|
128
|
+
# The default rule="auto" returns self-describing *nested* cells:
|
|
129
|
+
# each axis is {code, label}, time adds normalized/granularity,
|
|
130
|
+
# the observation is {value, unit}.
|
|
131
|
+
print(row)
|
|
132
|
+
# -> {"cat01": {"code": "000", "label": "男女計"},
|
|
133
|
+
# "time": {"code": "2020000000", "label": "2020年",
|
|
134
|
+
# "normalized": "2020", "granularity": "yearly"},
|
|
135
|
+
# "value": {"value": "126146", "unit": "千人"}}
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
Prefer one column per field (e.g. for pandas)? `to_flat()` projects the
|
|
139
|
+
nested cells to the familiar suffix shape — losslessly, and as a no-op on a
|
|
140
|
+
raw (`rule=None`) response:
|
|
141
|
+
|
|
142
|
+
```python
|
|
143
|
+
flat = response.to_flat()
|
|
144
|
+
# -> [{"cat01": "000", "cat01_label": "男女計",
|
|
145
|
+
# "time": "2020", "time_code": "2020000000",
|
|
146
|
+
# "time_label": "2020年", "time_granularity": "yearly",
|
|
147
|
+
# "value": "126146", "unit": "千人"}, ...]
|
|
148
|
+
|
|
149
|
+
import pandas as pd
|
|
150
|
+
df = pd.DataFrame(flat)
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
Pass `rule=None` instead to get e-Stat's raw rows with only the keys renamed (`@`-prefixed
|
|
154
|
+
dimensions become plain keys, `"$"` becomes `"value"`) — flat scalars, no
|
|
155
|
+
labels or normalization.
|
|
156
|
+
|
|
157
|
+
## Writing your own rules
|
|
158
|
+
|
|
159
|
+
`pyestat` ships built-in rules for a small set of tables and falls back to
|
|
160
|
+
`rule="auto"` for the rest. When you want different structuring — or
|
|
161
|
+
domain-specific column names — supply your own `RuleV2`:
|
|
162
|
+
|
|
163
|
+
```python
|
|
164
|
+
from pyestat import EstatClient, RuleV2
|
|
165
|
+
|
|
166
|
+
custom = RuleV2.model_validate({
|
|
167
|
+
"schema_version": "2",
|
|
168
|
+
"match": {"role_pattern": ["value", "area", "time"]},
|
|
169
|
+
"output": [
|
|
170
|
+
{"column": "year", "source": {"role": "time"}, "transform": "yearly"},
|
|
171
|
+
{"column": "region", "source": {"role": "area"}},
|
|
172
|
+
{"column": "value", "source": {"role": "value"}},
|
|
173
|
+
],
|
|
174
|
+
})
|
|
175
|
+
|
|
176
|
+
client = EstatClient(user_rules=[custom])
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
A rule declares the **output columns** you want, each drawn from an axis *role*
|
|
180
|
+
the classifier infers, so one rule covers every table sharing that role pattern.
|
|
181
|
+
Pivoting rows split across a `meta-axis`, naming columns for `to_flat()`, and
|
|
182
|
+
dropping rule files into a directory are covered in
|
|
183
|
+
**[Writing rules →](https://github.com/khaym/pyestat/blob/main/docs/AUTHORING_RULES.md)**.
|
|
184
|
+
|
|
185
|
+
> The `RuleV2` schema is evolving across 0.x — see [Status](#status).
|
|
186
|
+
|
|
187
|
+
## Error behavior
|
|
188
|
+
|
|
189
|
+
On the default `rule="auto"` path, whether a *rule* failure reaches you
|
|
190
|
+
turns on who authored the failing rule — fall back when it is pyestat's,
|
|
191
|
+
surface when it is yours:
|
|
192
|
+
|
|
193
|
+
- A built-in rule that cannot apply degrades to lossless raw output
|
|
194
|
+
instead of raising: its failure is internal and you cannot edit it, so
|
|
195
|
+
preserved data beats a crash.
|
|
196
|
+
- A rule you supplied — an explicit `rule=RuleV2(...)`, a
|
|
197
|
+
`user_rules=` entry, or a file in `./pyestat_rules` — that cannot apply
|
|
198
|
+
raises a typed error so you can fix it and re-run.
|
|
199
|
+
|
|
200
|
+
So `get_stats_data(id)` on a table pyestat does not yet handle returns
|
|
201
|
+
usable raw rows rather than failing, while a mistake in your own rule is
|
|
202
|
+
reported.
|
|
203
|
+
|
|
204
|
+
Every pyestat error inherits from `EstatError`, so a coarse
|
|
205
|
+
`except EstatError` catches them all; catch a leaf (`EstatApiError`,
|
|
206
|
+
`TooManyRowsError`, …) when you want to act on one case.
|
|
207
|
+
|
|
208
|
+
## Configuring the appId
|
|
209
|
+
|
|
210
|
+
[Usage](#usage) shows the basic convention — pass `app_id` explicitly, kept in
|
|
211
|
+
an `ESTAT_APP_ID` variable. How that variable reaches the environment is your
|
|
212
|
+
project's call; a few common patterns:
|
|
213
|
+
|
|
214
|
+
**Shell export** (interactive use):
|
|
215
|
+
|
|
216
|
+
```sh
|
|
217
|
+
export ESTAT_APP_ID="<your-app-id>"
|
|
218
|
+
python your_script.py
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
**`.env` file + [python-dotenv](https://github.com/theskumar/python-dotenv)**
|
|
222
|
+
(local development, Jupyter):
|
|
223
|
+
|
|
224
|
+
```sh
|
|
225
|
+
echo 'ESTAT_APP_ID=<your-app-id>' > .env
|
|
226
|
+
# in your code (or notebook cell):
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
```python
|
|
230
|
+
import os
|
|
231
|
+
|
|
232
|
+
from dotenv import load_dotenv
|
|
233
|
+
from pyestat import EstatClient
|
|
234
|
+
|
|
235
|
+
load_dotenv()
|
|
236
|
+
client = EstatClient(app_id=os.environ["ESTAT_APP_ID"])
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
**Docker / Compose**: pass `-e ESTAT_APP_ID=...` or set it under
|
|
240
|
+
`environment:` in your compose file.
|
|
241
|
+
|
|
242
|
+
**CI (GitHub Actions, etc.)**: store the appId as an encrypted secret and
|
|
243
|
+
inject it as an env var in the workflow step.
|
|
244
|
+
|
|
245
|
+
**Production**: pull it from your secret manager
|
|
246
|
+
(AWS Secrets Manager / GCP Secret Manager / HashiCorp Vault / ...) at
|
|
247
|
+
startup and pass it to `EstatClient(app_id=...)`.
|
|
248
|
+
|
|
249
|
+
`pyestat` deliberately avoids reading the environment or bundling a dotenv
|
|
250
|
+
loader, so it does not constrain how you manage secrets.
|
|
251
|
+
|
|
252
|
+
## Development
|
|
253
|
+
|
|
254
|
+
```sh
|
|
255
|
+
uv sync # install runtime + dev deps
|
|
256
|
+
cp .env.example .env # then fill in your ESTAT_APP_ID
|
|
257
|
+
uv run pytest # runs unit + live API tests
|
|
258
|
+
uv run pytest -m "not integration" # unit only (no network)
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
The live integration test in `tests/test_get_stats_data_integration.py`
|
|
262
|
+
auto-skips if `ESTAT_APP_ID` is not set, so the unit suite stays
|
|
263
|
+
hermetic without extra flags.
|
|
264
|
+
|
|
265
|
+
## License
|
|
266
|
+
|
|
267
|
+
MIT License. See [LICENSE](https://github.com/khaym/pyestat/blob/main/LICENSE) for details.
|
pyestat-0.1.0/pyproject.toml
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "pyestat"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Python client for the e-Stat API (Japanese government statistics portal) with structured outputs for LLMs and data scientists."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
license = "MIT"
|
|
7
|
+
license-files = ["LICENSE"]
|
|
8
|
+
authors = [
|
|
9
|
+
{ name = "khaym" }
|
|
10
|
+
]
|
|
11
|
+
requires-python = ">=3.11"
|
|
12
|
+
dependencies = [
|
|
13
|
+
"httpx>=0.28.1",
|
|
14
|
+
"pydantic>=2.13.4",
|
|
15
|
+
"pyyaml>=6.0.3",
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
[project.urls]
|
|
19
|
+
Homepage = "https://github.com/khaym/pyestat"
|
|
20
|
+
Repository = "https://github.com/khaym/pyestat"
|
|
21
|
+
Issues = "https://github.com/khaym/pyestat/issues"
|
|
22
|
+
|
|
23
|
+
[build-system]
|
|
24
|
+
requires = ["uv_build>=0.11.14,<0.12.0"]
|
|
25
|
+
build-backend = "uv_build"
|
|
26
|
+
|
|
27
|
+
[dependency-groups]
|
|
28
|
+
dev = [
|
|
29
|
+
"pytest>=9.0.3",
|
|
30
|
+
"python-dotenv>=1.2.2",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[tool.uv]
|
|
34
|
+
required-version = ">=0.11.14"
|
|
35
|
+
exclude-newer = "P3D"
|
|
36
|
+
index-strategy = "first-index"
|
|
37
|
+
no-build-isolation-package = []
|
|
38
|
+
|
|
39
|
+
[tool.pytest.ini_options]
|
|
40
|
+
markers = [
|
|
41
|
+
"integration: tests that hit the live e-Stat API (require ESTAT_APP_ID)",
|
|
42
|
+
]
|
pyestat-0.1.0/src/pyestat/__init__.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""Python client for the e-Stat API.
|
|
2
|
+
|
|
3
|
+
The names re-exported here are pyestat's public surface. For the 0.x series,
|
|
4
|
+
stability splits two ways:
|
|
5
|
+
|
|
6
|
+
* **settled** (stability promised) — the consumption path: :class:`EstatClient`
|
|
7
|
+
and its getters (``get_stats_data``, ``get_meta_info``, ``list_stats``,
|
|
8
|
+
``iter_stats_data_pages``); the response objects :class:`StatsDataResponse`
|
|
9
|
+
(and its ``to_flat``), :class:`MetaInfoResponse`, :class:`StatsListResponse`,
|
|
10
|
+
:class:`Page`, :class:`ClassObj`; :class:`EstatHttpClient`,
|
|
11
|
+
:class:`ProgressEvent`; and the error hierarchy :class:`EstatError`,
|
|
12
|
+
:class:`EstatApiError`, :class:`HttpRetryExhaustedError`,
|
|
13
|
+
:class:`TooManyRowsError`, :class:`AmbiguousRuleError`.
|
|
14
|
+
* **evolving** (may change during 0.x) — the rule-authoring path:
|
|
15
|
+
:class:`RuleV2`, :func:`load_builtin_rules`, and the
|
|
16
|
+
:class:`RuleAuthoringError` category. The rule schema is not frozen yet.
|
|
17
|
+
|
|
18
|
+
The authoring *leaf* errors (``RoleResolutionError``, ``RuleExpansionError``,
|
|
19
|
+
``UnknownTransformError``, ``TimeFormatError``) and the rule-file
|
|
20
|
+
``RuleLoadError`` are intentionally not re-exported. Reach them through
|
|
21
|
+
``pyestat._errors`` if you must, accepting that an underscore path carries no
|
|
22
|
+
stability promise; a coarse ``except EstatError`` catches them all regardless.
|
|
23
|
+
"""
|
|
24
|
+
from pyestat._endpoint import (
|
|
25
|
+
ClassObj,
|
|
26
|
+
EstatClient,
|
|
27
|
+
MetaInfoResponse,
|
|
28
|
+
Page,
|
|
29
|
+
StatsDataResponse,
|
|
30
|
+
StatsListResponse,
|
|
31
|
+
)
|
|
32
|
+
from pyestat._engine.builtin import load_builtin_rules
|
|
33
|
+
from pyestat._engine.rule import RuleV2
|
|
34
|
+
from pyestat._errors import (
|
|
35
|
+
AmbiguousRuleError,
|
|
36
|
+
EstatApiError,
|
|
37
|
+
EstatError,
|
|
38
|
+
HttpRetryExhaustedError,
|
|
39
|
+
RuleAuthoringError,
|
|
40
|
+
TooManyRowsError,
|
|
41
|
+
)
|
|
42
|
+
from pyestat._http import EstatHttpClient, ProgressEvent
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
__all__ = [
|
|
46
|
+
"AmbiguousRuleError",
|
|
47
|
+
"ClassObj",
|
|
48
|
+
"EstatApiError",
|
|
49
|
+
"EstatClient",
|
|
50
|
+
"EstatError",
|
|
51
|
+
"EstatHttpClient",
|
|
52
|
+
"HttpRetryExhaustedError",
|
|
53
|
+
"MetaInfoResponse",
|
|
54
|
+
"Page",
|
|
55
|
+
"ProgressEvent",
|
|
56
|
+
"RuleAuthoringError",
|
|
57
|
+
"RuleV2",
|
|
58
|
+
"StatsDataResponse",
|
|
59
|
+
"StatsListResponse",
|
|
60
|
+
"TooManyRowsError",
|
|
61
|
+
"load_builtin_rules",
|
|
62
|
+
]
|