ukam-os-builder 0.1.0.dev5__tar.gz → 0.1.0.dev8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/.github/workflows/ci.yml +3 -3
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/PKG-INFO +7 -7
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/README.md +6 -6
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/pyproject.toml +1 -1
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/tests/test_api.py +72 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/tests/test_inspect_results.py +2 -2
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/tests/test_public_api_integration.py +1 -1
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/tests/test_smoke.py +4 -7
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/ukam_os_builder/__init__.py +1 -1
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/ukam_os_builder/api/api.py +11 -1
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/ukam_os_builder/data_sources/abp/transform/runner.py +2 -2
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/ukam_os_builder/data_sources/abp/transform/stages/combine.py +1 -1
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/ukam_os_builder/data_sources/ngd/to_flatfile.py +68 -36
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/ukam_os_builder/os_builder/inspect_results.py +17 -17
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/ukam_os_builder/os_builder/os_hub.py +38 -14
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/uv.lock +1 -1
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/.env.example +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/.github/workflows/e2e.yml +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/.github/workflows/release-pypi.yml +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/.gitignore +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/AGENTS.md +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/config.example.yaml +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/prompt.md +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/shell/test_release_locally.sh +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/tests/data/README.md +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/tests/data/add_gb_builtaddress.csv +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/tests/data/add_gb_builtaddress_altadd.csv +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/tests/data/add_gb_historicaddress.csv +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/tests/data/add_gb_prebuildaddress.csv +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/tests/data/add_gb_royalmailaddress.csv +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/tests/test_cli.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/tests/test_cli_errors.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/tests/test_extract_source_filtering.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/tests/test_settings.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/tests/test_setup_wizard.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/ukam_os_builder/_exceptions.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/ukam_os_builder/api/cli_errors.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/ukam_os_builder/api/settings.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/ukam_os_builder/cli.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/ukam_os_builder/data_sources/abp/schemas/abp_schema.yaml +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/ukam_os_builder/data_sources/abp/split_raw.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/ukam_os_builder/data_sources/abp/transform/__init__.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/ukam_os_builder/data_sources/abp/transform/common.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/ukam_os_builder/data_sources/abp/transform/stages/__init__.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/ukam_os_builder/data_sources/abp/transform/stages/business.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/ukam_os_builder/data_sources/abp/transform/stages/lpi.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/ukam_os_builder/data_sources/abp/transform/stages/misc.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/ukam_os_builder/data_sources/abp/transform/stages/postal.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/ukam_os_builder/os_builder/__init__.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/ukam_os_builder/os_builder/extract.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/ukam_os_builder/os_builder/pipeline_factory.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/ukam_os_builder/pipeline.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/ukam_os_builder/setup_wizard.py +0 -0
|
@@ -3,9 +3,9 @@ name: Build & package
|
|
|
3
3
|
on:
|
|
4
4
|
workflow_dispatch:
|
|
5
5
|
push:
|
|
6
|
-
branches: [main]
|
|
6
|
+
branches: [ main ]
|
|
7
7
|
paths:
|
|
8
|
-
- "
|
|
8
|
+
- "ukam_os_builder/**"
|
|
9
9
|
- "tests/**"
|
|
10
10
|
- "pyproject.toml"
|
|
11
11
|
- "uv.lock"
|
|
@@ -65,4 +65,4 @@ jobs:
|
|
|
65
65
|
uses: actions/upload-artifact@v4
|
|
66
66
|
with:
|
|
67
67
|
name: dist
|
|
68
|
-
path: dist/*
|
|
68
|
+
path: dist/*
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ukam-os-builder
|
|
3
|
-
Version: 0.1.0.dev5
|
|
3
|
+
Version: 0.1.0.dev8
|
|
4
4
|
Summary: Download, process and transform OS address data (NGD or ABP) for UK address matching
|
|
5
5
|
Project-URL: Homepage, https://github.com/moj-analytical-services/prepare_ngd_for_address_matching
|
|
6
6
|
Project-URL: Repository, https://github.com/moj-analytical-services/prepare_ngd_for_address_matching
|
|
@@ -32,11 +32,14 @@ Build OS address data for `uk_address_matcher` from either NGD (National Geograp
|
|
|
32
32
|
|
|
33
33
|
- Python `3.10+`
|
|
34
34
|
- OS Data Hub package and version IDs
|
|
35
|
-
- Network access to OS Downloads API
|
|
35
|
+
- Network access to OS Downloads API for downloads or remote listing
|
|
36
|
+
- Existing downloaded archives if you want to run offline without re-downloading
|
|
36
37
|
- Credentials in `.env`:
|
|
37
38
|
- `OS_PROJECT_API_KEY`
|
|
38
39
|
- `OS_PROJECT_API_SECRET`
|
|
39
40
|
|
|
41
|
+
If the required zip files already exist in your downloads directory, the build can now continue offline without contacting OS Data Hub. `--list-only` still requires network access because it queries remote package metadata.
|
|
42
|
+
|
|
40
43
|
## Install from PyPI
|
|
41
44
|
|
|
42
45
|
```bash
|
|
@@ -221,13 +224,10 @@ Each file contains:
|
|
|
221
224
|
| `filename` | VARCHAR | Source file name (for example `add_gb_builtaddress.parquet`) |
|
|
222
225
|
| `classificationcode` | VARCHAR | Property classification code (for example RD06 for residential) |
|
|
223
226
|
| `parentuprn` | BIGINT | Parent UPRN for hierarchical addresses |
|
|
224
|
-
| `
|
|
225
|
-
| `hierarchylevel` | INTEGER | Level in the address hierarchy (1 = root) |
|
|
227
|
+
| `lowertierlocalauthoritygsscode` | VARCHAR | Lower-tier local authority GSS code |
|
|
226
228
|
| `floorlevel` | VARCHAR | Floor level identifier |
|
|
227
|
-
| `lowestfloorlevel` | DOUBLE | Lowest floor number |
|
|
228
|
-
| `highestfloorlevel` | DOUBLE | Highest floor number |
|
|
229
229
|
|
|
230
|
-
Metadata
|
|
230
|
+
Metadata used in output (`classificationcode`, `parentuprn`, `lowertierlocalauthoritygsscode`, `floorlevel`) is enriched via UPRN lookup from core address files. This means Royal Mail addresses and alternate address records receive metadata from their corresponding Built, Historic, or Pre-Build records. `lowertierlocalauthoritygsscode` is always sourced from Built Address via UPRN lookup.
|
|
231
231
|
|
|
232
232
|
</details>
|
|
233
233
|
|
|
@@ -6,11 +6,14 @@ Build OS address data for `uk_address_matcher` from either NGD (National Geograp
|
|
|
6
6
|
|
|
7
7
|
- Python `3.10+`
|
|
8
8
|
- OS Data Hub package and version IDs
|
|
9
|
-
- Network access to OS Downloads API
|
|
9
|
+
- Network access to OS Downloads API for downloads or remote listing
|
|
10
|
+
- Existing downloaded archives if you want to run offline without re-downloading
|
|
10
11
|
- Credentials in `.env`:
|
|
11
12
|
- `OS_PROJECT_API_KEY`
|
|
12
13
|
- `OS_PROJECT_API_SECRET`
|
|
13
14
|
|
|
15
|
+
If the required zip files already exist in your downloads directory, the build can now continue offline without contacting OS Data Hub. `--list-only` still requires network access because it queries remote package metadata.
|
|
16
|
+
|
|
14
17
|
## Install from PyPI
|
|
15
18
|
|
|
16
19
|
```bash
|
|
@@ -195,13 +198,10 @@ Each file contains:
|
|
|
195
198
|
| `filename` | VARCHAR | Source file name (for example `add_gb_builtaddress.parquet`) |
|
|
196
199
|
| `classificationcode` | VARCHAR | Property classification code (for example RD06 for residential) |
|
|
197
200
|
| `parentuprn` | BIGINT | Parent UPRN for hierarchical addresses |
|
|
198
|
-
| `
|
|
199
|
-
| `hierarchylevel` | INTEGER | Level in the address hierarchy (1 = root) |
|
|
201
|
+
| `lowertierlocalauthoritygsscode` | VARCHAR | Lower-tier local authority GSS code |
|
|
200
202
|
| `floorlevel` | VARCHAR | Floor level identifier |
|
|
201
|
-
| `lowestfloorlevel` | DOUBLE | Lowest floor number |
|
|
202
|
-
| `highestfloorlevel` | DOUBLE | Highest floor number |
|
|
203
203
|
|
|
204
|
-
Metadata
|
|
204
|
+
Metadata used in output (`classificationcode`, `parentuprn`, `lowertierlocalauthoritygsscode`, `floorlevel`) is enriched via UPRN lookup from core address files. This means Royal Mail addresses and alternate address records receive metadata from their corresponding Built, Historic, or Pre-Build records. `lowertierlocalauthoritygsscode` is always sourced from Built Address via UPRN lookup.
|
|
205
205
|
|
|
206
206
|
</details>
|
|
207
207
|
|
|
@@ -6,6 +6,7 @@ from textwrap import dedent
|
|
|
6
6
|
from typing import Literal
|
|
7
7
|
|
|
8
8
|
import pytest
|
|
9
|
+
import requests
|
|
9
10
|
|
|
10
11
|
from ukam_os_builder.api.api import create_config_and_env, run_from_config
|
|
11
12
|
|
|
@@ -295,3 +296,74 @@ def test_run_from_config_applies_schema_path_override(
|
|
|
295
296
|
|
|
296
297
|
assert calls["step"] == "split"
|
|
297
298
|
assert calls["schema_path"] == custom_schema.resolve()
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
def test_run_from_config_continues_when_api_preflight_is_offline(
|
|
302
|
+
monkeypatch: pytest.MonkeyPatch,
|
|
303
|
+
tmp_path: Path,
|
|
304
|
+
caplog: pytest.LogCaptureFixture,
|
|
305
|
+
) -> None:
|
|
306
|
+
monkeypatch.setenv("OS_PROJECT_API_KEY", "key")
|
|
307
|
+
monkeypatch.setenv("OS_PROJECT_API_SECRET", "secret")
|
|
308
|
+
|
|
309
|
+
config_path = tmp_path / "config.yaml"
|
|
310
|
+
_write_config(
|
|
311
|
+
config_path,
|
|
312
|
+
"""
|
|
313
|
+
source:
|
|
314
|
+
type: ngd
|
|
315
|
+
|
|
316
|
+
os_downloads:
|
|
317
|
+
package_id: "16465"
|
|
318
|
+
version_id: "104444"
|
|
319
|
+
""",
|
|
320
|
+
)
|
|
321
|
+
|
|
322
|
+
calls: dict[str, object] = {}
|
|
323
|
+
|
|
324
|
+
def fake_check_api(_settings: object) -> None:
|
|
325
|
+
raise requests.exceptions.ConnectionError("offline")
|
|
326
|
+
|
|
327
|
+
def fake_run_pipeline(step: str, settings: object, force: bool, list_only: bool) -> None:
|
|
328
|
+
calls["step"] = step
|
|
329
|
+
calls["list_only"] = list_only
|
|
330
|
+
|
|
331
|
+
monkeypatch.setattr("ukam_os_builder.api.api.get_package_version", fake_check_api)
|
|
332
|
+
monkeypatch.setattr("ukam_os_builder.api.api.run_pipeline", fake_run_pipeline)
|
|
333
|
+
|
|
334
|
+
with caplog.at_level("WARNING"):
|
|
335
|
+
run_from_config(config_path=config_path, step="all")
|
|
336
|
+
|
|
337
|
+
assert calls["step"] == "all"
|
|
338
|
+
assert calls["list_only"] is False
|
|
339
|
+
assert "Could not reach OS Data Hub during API preflight" in caplog.text
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
def test_run_from_config_raises_when_list_only_api_preflight_is_offline(
|
|
343
|
+
monkeypatch: pytest.MonkeyPatch,
|
|
344
|
+
tmp_path: Path,
|
|
345
|
+
) -> None:
|
|
346
|
+
monkeypatch.setenv("OS_PROJECT_API_KEY", "key")
|
|
347
|
+
monkeypatch.setenv("OS_PROJECT_API_SECRET", "secret")
|
|
348
|
+
|
|
349
|
+
config_path = tmp_path / "config.yaml"
|
|
350
|
+
_write_config(
|
|
351
|
+
config_path,
|
|
352
|
+
"""
|
|
353
|
+
source:
|
|
354
|
+
type: ngd
|
|
355
|
+
|
|
356
|
+
os_downloads:
|
|
357
|
+
package_id: "16465"
|
|
358
|
+
version_id: "104444"
|
|
359
|
+
""",
|
|
360
|
+
)
|
|
361
|
+
|
|
362
|
+
def fake_check_api(_settings: object) -> None:
|
|
363
|
+
raise requests.exceptions.ConnectionError("offline")
|
|
364
|
+
|
|
365
|
+
monkeypatch.setattr("ukam_os_builder.api.api.get_package_version", fake_check_api)
|
|
366
|
+
monkeypatch.setattr("ukam_os_builder.api.api.run_pipeline", lambda **_kwargs: None)
|
|
367
|
+
|
|
368
|
+
with pytest.raises(requests.exceptions.ConnectionError, match="offline"):
|
|
369
|
+
run_from_config(config_path=config_path, step="download", list_only=True)
|
|
@@ -26,7 +26,7 @@ def test_inspect_flatfile_variants_uses_config_defaults(tmp_path: Path) -> None:
|
|
|
26
26
|
(1001::BIGINT, 'A'::VARCHAR),
|
|
27
27
|
(1001::BIGINT, 'B'::VARCHAR),
|
|
28
28
|
(1002::BIGINT, 'C'::VARCHAR)
|
|
29
|
-
) AS t(
|
|
29
|
+
) AS t(unique_id, address_concat)
|
|
30
30
|
) TO '{parquet_path.as_posix()}' (FORMAT PARQUET)
|
|
31
31
|
"""
|
|
32
32
|
)
|
|
@@ -61,7 +61,7 @@ def test_inspect_flatfile_variants_supports_abp_pattern(tmp_path: Path) -> None:
|
|
|
61
61
|
(2001::BIGINT, 'A'::VARCHAR),
|
|
62
62
|
(2002::BIGINT, 'B'::VARCHAR),
|
|
63
63
|
(2002::BIGINT, 'C'::VARCHAR)
|
|
64
|
-
) AS t(
|
|
64
|
+
) AS t(unique_id, address_concat)
|
|
65
65
|
) TO '{parquet_path.as_posix()}' (FORMAT PARQUET)
|
|
66
66
|
"""
|
|
67
67
|
)
|
{ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/tests/test_public_api_integration.py
RENAMED
|
@@ -71,7 +71,7 @@ def test_package_root_inspect_flatfile_variants(tmp_path: Path) -> None:
|
|
|
71
71
|
(4001::BIGINT, 'A'::VARCHAR),
|
|
72
72
|
(4001::BIGINT, 'B'::VARCHAR),
|
|
73
73
|
(4002::BIGINT, 'C'::VARCHAR)
|
|
74
|
-
) AS t(
|
|
74
|
+
) AS t(unique_id, address_concat)
|
|
75
75
|
) TO '{parquet_path.as_posix()}' (FORMAT PARQUET)
|
|
76
76
|
"""
|
|
77
77
|
)
|
|
@@ -172,17 +172,14 @@ def test_flatfile_single_chunk(temp_settings: Settings) -> None:
|
|
|
172
172
|
column_names = [row[0] for row in schema]
|
|
173
173
|
|
|
174
174
|
expected_columns = [
|
|
175
|
-
"
|
|
175
|
+
"unique_id",
|
|
176
176
|
"address_concat",
|
|
177
177
|
"postcode",
|
|
178
178
|
"filename",
|
|
179
179
|
"classificationcode",
|
|
180
180
|
"parentuprn",
|
|
181
|
-
"
|
|
182
|
-
"hierarchylevel",
|
|
181
|
+
"lowertierlocalauthoritygsscode",
|
|
183
182
|
"floorlevel",
|
|
184
|
-
"lowestfloorlevel",
|
|
185
|
-
"highestfloorlevel",
|
|
186
183
|
]
|
|
187
184
|
for col in expected_columns:
|
|
188
185
|
assert col in column_names, f"Column {col} should exist in output"
|
|
@@ -231,9 +228,9 @@ def test_deduplication(temp_settings: Settings) -> None:
|
|
|
231
228
|
# Verify no exact duplicates
|
|
232
229
|
con = duckdb.connect()
|
|
233
230
|
result = con.execute(f"""
|
|
234
|
-
SELECT
|
|
231
|
+
SELECT unique_id, address_concat, COUNT(*) as cnt
|
|
235
232
|
FROM read_parquet('{output_files[0].as_posix()}')
|
|
236
|
-
GROUP BY
|
|
233
|
+
GROUP BY unique_id, address_concat
|
|
237
234
|
HAVING COUNT(*) > 1
|
|
238
235
|
""").fetchall()
|
|
239
236
|
|
|
@@ -5,6 +5,7 @@ import os
|
|
|
5
5
|
from pathlib import Path
|
|
6
6
|
from typing import Any, Literal
|
|
7
7
|
|
|
8
|
+
import requests
|
|
8
9
|
import yaml
|
|
9
10
|
|
|
10
11
|
from ukam_os_builder.api.settings import Settings, SettingsError, load_settings
|
|
@@ -344,7 +345,16 @@ def run_from_config(
|
|
|
344
345
|
|
|
345
346
|
has_api_key = bool(os.environ.get("OS_PROJECT_API_KEY"))
|
|
346
347
|
if check_api and has_api_key:
|
|
347
|
-
|
|
348
|
+
try:
|
|
349
|
+
get_package_version(settings)
|
|
350
|
+
except requests.exceptions.RequestException as exc:
|
|
351
|
+
if list_only:
|
|
352
|
+
raise
|
|
353
|
+
logger.warning(
|
|
354
|
+
"Could not reach OS Data Hub during API preflight (%s). "
|
|
355
|
+
"Continuing so local downloads can be used if available.",
|
|
356
|
+
exc.__class__.__name__,
|
|
357
|
+
)
|
|
348
358
|
|
|
349
359
|
overwrite_effective = overwrite if overwrite is not None else bool(force)
|
|
350
360
|
run_pipeline(step=step, settings=settings, force=overwrite_effective, list_only=list_only)
|
|
@@ -170,7 +170,7 @@ def _transform_to_flatfile_chunk(
|
|
|
170
170
|
logger.debug("Combination and deduplication in %.2f seconds", perf_counter() - t0)
|
|
171
171
|
|
|
172
172
|
# Get chunk metrics
|
|
173
|
-
chunk_metrics = con.execute("SELECT COUNT(DISTINCT uprn), COUNT(*) FROM result").fetchone()
|
|
173
|
+
chunk_metrics = con.execute("SELECT COUNT(DISTINCT unique_id), COUNT(*) FROM result").fetchone()
|
|
174
174
|
chunk_uprns = chunk_metrics[0]
|
|
175
175
|
chunk_rows = chunk_metrics[1]
|
|
176
176
|
|
|
@@ -244,7 +244,7 @@ def transform_to_flatfile(
|
|
|
244
244
|
con = create_duckdb_connection(settings)
|
|
245
245
|
output_path = output_paths[0]
|
|
246
246
|
stats = con.execute(f"""
|
|
247
|
-
SELECT COUNT(DISTINCT
|
|
247
|
+
SELECT COUNT(DISTINCT unique_id), COUNT(*)
|
|
248
248
|
FROM read_parquet('{output_path.as_posix()}')
|
|
249
249
|
""").fetchone()
|
|
250
250
|
total_uprns = stats[0]
|
|
@@ -97,7 +97,6 @@ def _create_metadata_lookup_view(
|
|
|
97
97
|
""")
|
|
98
98
|
|
|
99
99
|
if not union_parts:
|
|
100
|
-
# No core files found - create empty lookup
|
|
101
100
|
logger.warning("No core feature files found. Metadata lookup will be empty.")
|
|
102
101
|
con.execute("""
|
|
103
102
|
CREATE OR REPLACE TEMP VIEW uprn_metadata_lookup AS
|
|
@@ -112,37 +111,48 @@ def _create_metadata_lookup_view(
|
|
|
112
111
|
CAST(NULL AS DOUBLE) AS highestfloorlevel
|
|
113
112
|
WHERE 1=0
|
|
114
113
|
""")
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
union_sql = "\nUNION ALL\n".join(union_parts)
|
|
114
|
+
else:
|
|
115
|
+
union_sql = "\nUNION ALL\n".join(union_parts)
|
|
118
116
|
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
117
|
+
sql = f"""
|
|
118
|
+
CREATE OR REPLACE TEMP VIEW uprn_metadata_lookup AS
|
|
119
|
+
WITH core_data AS (
|
|
120
|
+
{union_sql}
|
|
121
|
+
),
|
|
122
|
+
ranked AS (
|
|
123
|
+
SELECT
|
|
124
|
+
*,
|
|
125
|
+
ROW_NUMBER() OVER (
|
|
126
|
+
PARTITION BY uprn
|
|
127
|
+
ORDER BY source_priority
|
|
128
|
+
) AS rn
|
|
129
|
+
FROM core_data
|
|
130
|
+
)
|
|
125
131
|
SELECT
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
132
|
+
uprn,
|
|
133
|
+
classificationcode,
|
|
134
|
+
parentuprn,
|
|
135
|
+
rootuprn,
|
|
136
|
+
hierarchylevel,
|
|
137
|
+
floorlevel,
|
|
138
|
+
lowestfloorlevel,
|
|
139
|
+
highestfloorlevel
|
|
140
|
+
FROM ranked
|
|
141
|
+
WHERE rn = 1;
|
|
142
|
+
"""
|
|
143
|
+
con.execute(sql)
|
|
144
|
+
|
|
145
|
+
built_path = parquet_dir / "add_gb_builtaddress.parquet"
|
|
146
|
+
built_sql = f"""
|
|
147
|
+
CREATE OR REPLACE TEMP VIEW builtaddress_ltla_lookup AS
|
|
133
148
|
SELECT
|
|
134
|
-
uprn,
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
floorlevel,
|
|
140
|
-
lowestfloorlevel,
|
|
141
|
-
highestfloorlevel
|
|
142
|
-
FROM ranked
|
|
143
|
-
WHERE rn = 1;
|
|
149
|
+
CAST(uprn AS BIGINT) AS uprn,
|
|
150
|
+
MAX(CAST(lowertierlocalauthoritygsscode AS VARCHAR)) AS lowertierlocalauthoritygsscode
|
|
151
|
+
FROM read_parquet('{built_path.as_posix()}')
|
|
152
|
+
{where_clause}
|
|
153
|
+
GROUP BY CAST(uprn AS BIGINT)
|
|
144
154
|
"""
|
|
145
|
-
con.execute(
|
|
155
|
+
con.execute(built_sql)
|
|
146
156
|
|
|
147
157
|
|
|
148
158
|
def _create_core_feature_view(
|
|
@@ -183,6 +193,7 @@ def _create_core_feature_view(
|
|
|
183
193
|
CAST(floorlevel AS VARCHAR) AS floorlevel,
|
|
184
194
|
CAST(lowestfloorlevel AS DOUBLE) AS lowestfloorlevel,
|
|
185
195
|
CAST(highestfloorlevel AS DOUBLE) AS highestfloorlevel,
|
|
196
|
+
CAST(NULL AS VARCHAR) AS lowertierlocalauthoritygsscode,
|
|
186
197
|
-- Internal columns for deduplication (not in final output)
|
|
187
198
|
CAST(description AS VARCHAR) AS feature_type,
|
|
188
199
|
CAST(addressstatus AS VARCHAR) AS address_status,
|
|
@@ -222,6 +233,7 @@ def _create_core_feature_view(
|
|
|
222
233
|
CAST(floorlevel AS VARCHAR) AS floorlevel,
|
|
223
234
|
CAST(lowestfloorlevel AS DOUBLE) AS lowestfloorlevel,
|
|
224
235
|
CAST(highestfloorlevel AS DOUBLE) AS highestfloorlevel,
|
|
236
|
+
CAST(NULL AS VARCHAR) AS lowertierlocalauthoritygsscode,
|
|
225
237
|
-- Internal columns for deduplication (not in final output)
|
|
226
238
|
CAST(description AS VARCHAR) AS feature_type,
|
|
227
239
|
CAST(addressstatus AS VARCHAR) AS address_status,
|
|
@@ -277,6 +289,7 @@ def _create_altadd_view(
|
|
|
277
289
|
CAST(floorlevel AS VARCHAR) AS floorlevel,
|
|
278
290
|
CAST(lowestfloorlevel AS DOUBLE) AS lowestfloorlevel,
|
|
279
291
|
CAST(highestfloorlevel AS DOUBLE) AS highestfloorlevel,
|
|
292
|
+
CAST(NULL AS VARCHAR) AS lowertierlocalauthoritygsscode,
|
|
280
293
|
-- Internal columns for deduplication (not in final output)
|
|
281
294
|
'{feature_type}' AS feature_type,
|
|
282
295
|
CAST(addressstatus AS VARCHAR) AS address_status,
|
|
@@ -333,6 +346,7 @@ def _create_royal_mail_view(
|
|
|
333
346
|
CAST(NULL AS VARCHAR) AS floorlevel,
|
|
334
347
|
CAST(NULL AS DOUBLE) AS lowestfloorlevel,
|
|
335
348
|
CAST(NULL AS DOUBLE) AS highestfloorlevel,
|
|
349
|
+
CAST(NULL AS VARCHAR) AS lowertierlocalauthoritygsscode,
|
|
336
350
|
-- Internal columns for deduplication (not in final output)
|
|
337
351
|
'Royal Mail Address' AS feature_type,
|
|
338
352
|
CAST(NULL AS VARCHAR) AS address_status,
|
|
@@ -363,6 +377,7 @@ def _create_royal_mail_view(
|
|
|
363
377
|
CAST(NULL AS VARCHAR) AS floorlevel,
|
|
364
378
|
CAST(NULL AS DOUBLE) AS lowestfloorlevel,
|
|
365
379
|
CAST(NULL AS DOUBLE) AS highestfloorlevel,
|
|
380
|
+
CAST(NULL AS VARCHAR) AS lowertierlocalauthoritygsscode,
|
|
366
381
|
-- Internal columns for deduplication (not in final output)
|
|
367
382
|
'Royal Mail Address' AS feature_type,
|
|
368
383
|
CAST(NULL AS VARCHAR) AS address_status,
|
|
@@ -398,12 +413,14 @@ def _enrich_with_metadata(con: duckdb.DuckDBPyConnection) -> None:
|
|
|
398
413
|
COALESCE(a.floorlevel, m.floorlevel) AS floorlevel,
|
|
399
414
|
COALESCE(a.lowestfloorlevel, m.lowestfloorlevel) AS lowestfloorlevel,
|
|
400
415
|
COALESCE(a.highestfloorlevel, m.highestfloorlevel) AS highestfloorlevel,
|
|
416
|
+
b.lowertierlocalauthoritygsscode AS lowertierlocalauthoritygsscode,
|
|
401
417
|
-- Internal columns for deduplication
|
|
402
418
|
a.feature_type,
|
|
403
419
|
a.address_status,
|
|
404
420
|
a.build_status
|
|
405
421
|
FROM all_full_addresses a
|
|
406
|
-
LEFT JOIN uprn_metadata_lookup m ON a.uprn = m.uprn
|
|
422
|
+
LEFT JOIN uprn_metadata_lookup m ON a.uprn = m.uprn
|
|
423
|
+
LEFT JOIN builtaddress_ltla_lookup b ON a.uprn = b.uprn;
|
|
407
424
|
"""
|
|
408
425
|
con.execute(sql)
|
|
409
426
|
|
|
@@ -419,11 +436,28 @@ def _create_custom_level_rows(con: duckdb.DuckDBPyConnection) -> None:
|
|
|
419
436
|
dedup priority and never override official address data.
|
|
420
437
|
"""
|
|
421
438
|
sql = """
|
|
422
|
-
INSERT INTO all_full_addresses_enriched
|
|
439
|
+
INSERT INTO all_full_addresses_enriched (
|
|
440
|
+
uprn,
|
|
441
|
+
address_concat,
|
|
442
|
+
postcode,
|
|
443
|
+
filename,
|
|
444
|
+
classificationcode,
|
|
445
|
+
parentuprn,
|
|
446
|
+
rootuprn,
|
|
447
|
+
hierarchylevel,
|
|
448
|
+
floorlevel,
|
|
449
|
+
lowestfloorlevel,
|
|
450
|
+
highestfloorlevel,
|
|
451
|
+
lowertierlocalauthoritygsscode,
|
|
452
|
+
feature_type,
|
|
453
|
+
address_status,
|
|
454
|
+
build_status
|
|
455
|
+
)
|
|
423
456
|
WITH level_parsed AS (
|
|
424
457
|
SELECT
|
|
425
458
|
uprn, address_concat, postcode, filename,
|
|
426
459
|
classificationcode, parentuprn, rootuprn,
|
|
460
|
+
lowertierlocalauthoritygsscode,
|
|
427
461
|
hierarchylevel, floorlevel, lowestfloorlevel, highestfloorlevel,
|
|
428
462
|
address_status, build_status,
|
|
429
463
|
CASE
|
|
@@ -464,6 +498,7 @@ def _create_custom_level_rows(con: duckdb.DuckDBPyConnection) -> None:
|
|
|
464
498
|
floorlevel,
|
|
465
499
|
lowestfloorlevel,
|
|
466
500
|
highestfloorlevel,
|
|
501
|
+
lowertierlocalauthoritygsscode,
|
|
467
502
|
'Custom Level' AS feature_type,
|
|
468
503
|
address_status,
|
|
469
504
|
build_status
|
|
@@ -523,17 +558,14 @@ def _create_dedup_view(con: duckdb.DuckDBPyConnection) -> None:
|
|
|
523
558
|
WHERE feature_type NOT IN ('Non-Addressable Object')
|
|
524
559
|
)
|
|
525
560
|
SELECT
|
|
526
|
-
uprn,
|
|
561
|
+
uprn AS unique_id,
|
|
527
562
|
address_concat,
|
|
528
563
|
postcode,
|
|
529
564
|
filename,
|
|
530
565
|
classificationcode,
|
|
531
566
|
parentuprn,
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
floorlevel,
|
|
535
|
-
lowestfloorlevel,
|
|
536
|
-
highestfloorlevel
|
|
567
|
+
lowertierlocalauthoritygsscode,
|
|
568
|
+
floorlevel
|
|
537
569
|
FROM ranked
|
|
538
570
|
WHERE rn = 1;
|
|
539
571
|
"""
|
|
@@ -12,7 +12,7 @@ SourceType = Literal["ngd", "abp"]
|
|
|
12
12
|
logger = logging.getLogger(__name__)
|
|
13
13
|
|
|
14
14
|
_DEFAULT_SELECT_COLUMNS = [
|
|
15
|
-
"
|
|
15
|
+
"unique_id",
|
|
16
16
|
"address_concat",
|
|
17
17
|
"postcode",
|
|
18
18
|
"source",
|
|
@@ -128,9 +128,9 @@ def get_variant_statistics(
|
|
|
128
128
|
|
|
129
129
|
stats = con.sql(f"""
|
|
130
130
|
WITH variant_counts AS (
|
|
131
|
-
SELECT
|
|
131
|
+
SELECT unique_id, COUNT(*) AS variant_count
|
|
132
132
|
FROM read_parquet('{files_sql}')
|
|
133
|
-
GROUP BY
|
|
133
|
+
GROUP BY unique_id
|
|
134
134
|
)
|
|
135
135
|
SELECT
|
|
136
136
|
COUNT(*) AS total_uprns,
|
|
@@ -179,7 +179,7 @@ def get_random_uprn(
|
|
|
179
179
|
|
|
180
180
|
select_columns = _choose_select_columns(con, files_sql, columns)
|
|
181
181
|
random_uprn = con.sql(f"""
|
|
182
|
-
SELECT DISTINCT
|
|
182
|
+
SELECT DISTINCT unique_id
|
|
183
183
|
FROM read_parquet('{files_sql}')
|
|
184
184
|
ORDER BY RANDOM()
|
|
185
185
|
LIMIT 1
|
|
@@ -192,7 +192,7 @@ def get_random_uprn(
|
|
|
192
192
|
SELECT
|
|
193
193
|
{select_columns}
|
|
194
194
|
FROM read_parquet('{files_sql}')
|
|
195
|
-
WHERE
|
|
195
|
+
WHERE unique_id = {int(random_uprn[0])}
|
|
196
196
|
ORDER BY is_primary DESC NULLS LAST, source NULLS LAST, variant_label NULLS LAST
|
|
197
197
|
""")
|
|
198
198
|
|
|
@@ -220,14 +220,14 @@ def get_random_large_uprn(
|
|
|
220
220
|
|
|
221
221
|
selected = con.sql(f"""
|
|
222
222
|
WITH variant_counts AS (
|
|
223
|
-
SELECT
|
|
223
|
+
SELECT unique_id, COUNT(*) AS variant_count
|
|
224
224
|
FROM read_parquet('{files_sql}')
|
|
225
225
|
{where_filter}
|
|
226
|
-
GROUP BY
|
|
227
|
-
ORDER BY variant_count DESC,
|
|
226
|
+
GROUP BY unique_id
|
|
227
|
+
ORDER BY variant_count DESC, unique_id ASC
|
|
228
228
|
LIMIT {int(top_n)}
|
|
229
229
|
)
|
|
230
|
-
SELECT
|
|
230
|
+
SELECT unique_id
|
|
231
231
|
FROM variant_counts
|
|
232
232
|
ORDER BY RANDOM()
|
|
233
233
|
LIMIT 1
|
|
@@ -240,7 +240,7 @@ def get_random_large_uprn(
|
|
|
240
240
|
SELECT
|
|
241
241
|
{select_columns}
|
|
242
242
|
FROM read_parquet('{files_sql}')
|
|
243
|
-
WHERE
|
|
243
|
+
WHERE unique_id = {int(selected[0])}
|
|
244
244
|
{and_filter}
|
|
245
245
|
ORDER BY is_primary DESC NULLS LAST, source NULLS LAST, variant_label NULLS LAST
|
|
246
246
|
""")
|
|
@@ -269,7 +269,7 @@ def get_uprn_variants(
|
|
|
269
269
|
SELECT
|
|
270
270
|
{select_columns}
|
|
271
271
|
FROM read_parquet('{files_sql}')
|
|
272
|
-
WHERE
|
|
272
|
+
WHERE unique_id = {int(uprn)}
|
|
273
273
|
{and_filter}
|
|
274
274
|
ORDER BY is_primary DESC NULLS LAST, source NULLS LAST, variant_label NULLS LAST
|
|
275
275
|
""")
|
|
@@ -317,10 +317,10 @@ def inspect_flatfile_variants(
|
|
|
317
317
|
WITH data AS (
|
|
318
318
|
SELECT * FROM read_parquet('{files_sql}')
|
|
319
319
|
)
|
|
320
|
-
SELECT
|
|
320
|
+
SELECT unique_id, COUNT(*) AS variant_count
|
|
321
321
|
FROM data
|
|
322
|
-
GROUP BY
|
|
323
|
-
ORDER BY variant_count DESC,
|
|
322
|
+
GROUP BY unique_id
|
|
323
|
+
ORDER BY variant_count DESC, unique_id ASC
|
|
324
324
|
LIMIT 1 OFFSET {top_offset}
|
|
325
325
|
"""
|
|
326
326
|
).fetchone()
|
|
@@ -333,7 +333,7 @@ def inspect_flatfile_variants(
|
|
|
333
333
|
f"""
|
|
334
334
|
SELECT COUNT(*)
|
|
335
335
|
FROM read_parquet('{files_sql}')
|
|
336
|
-
WHERE
|
|
336
|
+
WHERE unique_id = ?
|
|
337
337
|
""",
|
|
338
338
|
[target_uprn],
|
|
339
339
|
).fetchone()
|
|
@@ -343,7 +343,7 @@ def inspect_flatfile_variants(
|
|
|
343
343
|
f"""
|
|
344
344
|
SELECT *
|
|
345
345
|
FROM read_parquet('{files_sql}')
|
|
346
|
-
WHERE
|
|
346
|
+
WHERE unique_id = ?
|
|
347
347
|
ORDER BY 1
|
|
348
348
|
""",
|
|
349
349
|
[target_uprn],
|
|
@@ -358,7 +358,7 @@ def inspect_flatfile_variants(
|
|
|
358
358
|
max_width=10_000
|
|
359
359
|
)
|
|
360
360
|
logger.info("Selected UPRN rows:")
|
|
361
|
-
con.sql(f"SELECT * FROM read_parquet('{files_sql}') WHERE
|
|
361
|
+
con.sql(f"SELECT * FROM read_parquet('{files_sql}') WHERE unique_id = {target_uprn}").show(
|
|
362
362
|
max_width=10_000
|
|
363
363
|
)
|
|
364
364
|
|
{ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/ukam_os_builder/os_builder/os_hub.py
RENAMED
|
@@ -253,6 +253,29 @@ def download_file(
|
|
|
253
253
|
return True
|
|
254
254
|
|
|
255
255
|
|
|
256
|
+
def _use_existing_archives_or_raise(
|
|
257
|
+
downloads_dir: Path,
|
|
258
|
+
reason: str,
|
|
259
|
+
original_exc: Exception,
|
|
260
|
+
) -> list[Path]:
|
|
261
|
+
"""Fall back to existing local archives, or re-raise with a helpful message."""
|
|
262
|
+
existing_archives = _find_existing_download_archives(downloads_dir)
|
|
263
|
+
if existing_archives:
|
|
264
|
+
logger.warning(
|
|
265
|
+
"%s; using %d existing archive(s) in %s and skipping download "
|
|
266
|
+
"(MD5 verification against the OS Data Hub will be skipped).",
|
|
267
|
+
reason,
|
|
268
|
+
len(existing_archives),
|
|
269
|
+
downloads_dir,
|
|
270
|
+
)
|
|
271
|
+
return existing_archives
|
|
272
|
+
|
|
273
|
+
raise ValueError(
|
|
274
|
+
f"{reason}. No local zip files were found in {downloads_dir}, "
|
|
275
|
+
"so download cannot be skipped."
|
|
276
|
+
) from original_exc
|
|
277
|
+
|
|
278
|
+
|
|
256
279
|
def run_download_step(
|
|
257
280
|
settings: Any,
|
|
258
281
|
force: bool = False,
|
|
@@ -266,22 +289,23 @@ def run_download_step(
|
|
|
266
289
|
except ValueError as exc:
|
|
267
290
|
if list_only:
|
|
268
291
|
raise
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
len(existing_archives),
|
|
275
|
-
downloads_dir,
|
|
276
|
-
)
|
|
277
|
-
return existing_archives
|
|
278
|
-
|
|
279
|
-
raise ValueError(
|
|
280
|
-
f"{exc} No local zip files were found in {downloads_dir}, so download cannot be skipped."
|
|
281
|
-
) from exc
|
|
292
|
+
return _use_existing_archives_or_raise(
|
|
293
|
+
downloads_dir,
|
|
294
|
+
reason="No API key found",
|
|
295
|
+
original_exc=exc,
|
|
296
|
+
)
|
|
282
297
|
|
|
283
298
|
logger.info("Fetching package metadata...")
|
|
284
|
-
|
|
299
|
+
try:
|
|
300
|
+
metadata = get_package_version(settings)
|
|
301
|
+
except (requests.exceptions.RequestException, OSError) as exc:
|
|
302
|
+
if list_only:
|
|
303
|
+
raise
|
|
304
|
+
return _use_existing_archives_or_raise(
|
|
305
|
+
downloads_dir,
|
|
306
|
+
reason=f"Could not reach OS Data Hub ({exc.__class__.__name__})",
|
|
307
|
+
original_exc=exc,
|
|
308
|
+
)
|
|
285
309
|
items = list_downloads(metadata)
|
|
286
310
|
|
|
287
311
|
if list_only:
|
|
File without changes
|
|
File without changes
|
{ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/.github/workflows/release-pypi.yml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/tests/data/add_gb_builtaddress.csv
RENAMED
|
File without changes
|
{ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/tests/data/add_gb_builtaddress_altadd.csv
RENAMED
|
File without changes
|
{ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/tests/data/add_gb_historicaddress.csv
RENAMED
|
File without changes
|
{ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/tests/data/add_gb_prebuildaddress.csv
RENAMED
|
File without changes
|
{ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/tests/data/add_gb_royalmailaddress.csv
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/tests/test_extract_source_filtering.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/ukam_os_builder/os_builder/__init__.py
RENAMED
|
File without changes
|
{ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev8}/ukam_os_builder/os_builder/extract.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|