ukam-os-builder 0.1.0.dev4__tar.gz → 0.1.0.dev6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/PKG-INFO +3 -6
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/README.md +2 -5
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/pyproject.toml +1 -1
- ukam_os_builder-0.1.0.dev6/tests/test_extract_source_filtering.py +49 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/tests/test_inspect_results.py +2 -2
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/tests/test_public_api_integration.py +1 -1
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/tests/test_smoke.py +4 -8
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/__init__.py +1 -1
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/api/api.py +15 -6
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/cli.py +0 -1
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/transform/runner.py +2 -2
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/transform/stages/combine.py +2 -2
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/transform/stages/lpi.py +4 -6
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/ngd/to_flatfile.py +142 -46
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/os_builder/extract.py +15 -2
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/os_builder/inspect_results.py +17 -17
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/os_builder/os_hub.py +72 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/uv.lock +1 -1
- ukam_os_builder-0.1.0.dev4/tests/test_extract_source_filtering.py +0 -27
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/.env.example +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/.github/workflows/ci.yml +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/.github/workflows/e2e.yml +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/.github/workflows/release-pypi.yml +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/.gitignore +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/AGENTS.md +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/config.example.yaml +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/prompt.md +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/shell/test_release_locally.sh +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/tests/data/README.md +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/tests/data/add_gb_builtaddress.csv +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/tests/data/add_gb_builtaddress_altadd.csv +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/tests/data/add_gb_historicaddress.csv +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/tests/data/add_gb_prebuildaddress.csv +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/tests/data/add_gb_royalmailaddress.csv +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/tests/test_api.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/tests/test_cli.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/tests/test_cli_errors.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/tests/test_settings.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/tests/test_setup_wizard.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/_exceptions.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/api/cli_errors.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/api/settings.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/schemas/abp_schema.yaml +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/split_raw.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/transform/__init__.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/transform/common.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/transform/stages/__init__.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/transform/stages/business.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/transform/stages/misc.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/transform/stages/postal.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/os_builder/__init__.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/os_builder/pipeline_factory.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/pipeline.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/setup_wizard.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ukam-os-builder
|
|
3
|
-
Version: 0.1.0.dev4
|
|
3
|
+
Version: 0.1.0.dev6
|
|
4
4
|
Summary: Download, process and transform OS address data (NGD or ABP) for UK address matching
|
|
5
5
|
Project-URL: Homepage, https://github.com/moj-analytical-services/prepare_ngd_for_address_matching
|
|
6
6
|
Project-URL: Repository, https://github.com/moj-analytical-services/prepare_ngd_for_address_matching
|
|
@@ -221,13 +221,10 @@ Each file contains:
|
|
|
221
221
|
| `filename` | VARCHAR | Source file name (for example `add_gb_builtaddress.parquet`) |
|
|
222
222
|
| `classificationcode` | VARCHAR | Property classification code (for example RD06 for residential) |
|
|
223
223
|
| `parentuprn` | BIGINT | Parent UPRN for hierarchical addresses |
|
|
224
|
-
| `
|
|
225
|
-
| `hierarchylevel` | INTEGER | Level in the address hierarchy (1 = root) |
|
|
224
|
+
| `lowertierlocalauthoritygsscode` | VARCHAR | Lower-tier local authority GSS code |
|
|
226
225
|
| `floorlevel` | VARCHAR | Floor level identifier |
|
|
227
|
-
| `lowestfloorlevel` | DOUBLE | Lowest floor number |
|
|
228
|
-
| `highestfloorlevel` | DOUBLE | Highest floor number |
|
|
229
226
|
|
|
230
|
-
Metadata
|
|
227
|
+
Metadata used in output (`classificationcode`, `parentuprn`, `lowertierlocalauthoritygsscode`, `floorlevel`) is enriched via UPRN lookup from core address files. This means Royal Mail addresses and alternate address records receive metadata from their corresponding Built, Historic, or Pre-Build records. `lowertierlocalauthoritygsscode` is always sourced from Built Address via UPRN lookup.
|
|
231
228
|
|
|
232
229
|
</details>
|
|
233
230
|
|
|
@@ -195,13 +195,10 @@ Each file contains:
|
|
|
195
195
|
| `filename` | VARCHAR | Source file name (for example `add_gb_builtaddress.parquet`) |
|
|
196
196
|
| `classificationcode` | VARCHAR | Property classification code (for example RD06 for residential) |
|
|
197
197
|
| `parentuprn` | BIGINT | Parent UPRN for hierarchical addresses |
|
|
198
|
-
| `
|
|
199
|
-
| `hierarchylevel` | INTEGER | Level in the address hierarchy (1 = root) |
|
|
198
|
+
| `lowertierlocalauthoritygsscode` | VARCHAR | Lower-tier local authority GSS code |
|
|
200
199
|
| `floorlevel` | VARCHAR | Floor level identifier |
|
|
201
|
-
| `lowestfloorlevel` | DOUBLE | Lowest floor number |
|
|
202
|
-
| `highestfloorlevel` | DOUBLE | Highest floor number |
|
|
203
200
|
|
|
204
|
-
Metadata
|
|
201
|
+
Metadata used in output (`classificationcode`, `parentuprn`, `lowertierlocalauthoritygsscode`, `floorlevel`) is enriched via UPRN lookup from core address files. This means Royal Mail addresses and alternate address records receive metadata from their corresponding Built, Historic, or Pre-Build records. `lowertierlocalauthoritygsscode` is always sourced from Built Address via UPRN lookup.
|
|
205
202
|
|
|
206
203
|
</details>
|
|
207
204
|
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from ukam_os_builder.os_builder.extract import (
|
|
6
|
+
_filter_zips_for_source,
|
|
7
|
+
_should_convert_csv_to_parquet,
|
|
8
|
+
)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def test_filter_zips_for_source_prefers_ngd_named_zips() -> None:
|
|
12
|
+
zip_files = [
|
|
13
|
+
Path("add_gb_builtaddress.zip"),
|
|
14
|
+
Path("AddressBasePremium_FULL_2025-12-15_002.zip"),
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
filtered = _filter_zips_for_source(zip_files, "ngd")
|
|
18
|
+
|
|
19
|
+
assert filtered == [Path("add_gb_builtaddress.zip")]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def test_should_convert_csv_to_parquet_skips_non_ngd_for_ngd_source() -> None:
|
|
23
|
+
ngd_csv = Path("add_gb_builtaddress.csv")
|
|
24
|
+
abp_csv = Path("AddressBasePremium_FULL_2025-12-15_002.csv")
|
|
25
|
+
|
|
26
|
+
assert _should_convert_csv_to_parquet(ngd_csv, "ngd") is True
|
|
27
|
+
assert _should_convert_csv_to_parquet(abp_csv, "ngd") is False
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def test_filter_zips_for_source_excludes_ngd_historicaddress() -> None:
|
|
31
|
+
zip_files = [
|
|
32
|
+
Path("add_gb_builtaddress.zip"),
|
|
33
|
+
Path("add_gb_historicaddress.zip"),
|
|
34
|
+
Path("add_gb_historicaddress_altadd.zip"),
|
|
35
|
+
Path("add_gb_prebuildaddress.zip"),
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
filtered = _filter_zips_for_source(zip_files, "ngd")
|
|
39
|
+
|
|
40
|
+
assert Path("add_gb_builtaddress.zip") in filtered
|
|
41
|
+
assert Path("add_gb_prebuildaddress.zip") in filtered
|
|
42
|
+
assert Path("add_gb_historicaddress.zip") not in filtered
|
|
43
|
+
assert Path("add_gb_historicaddress_altadd.zip") not in filtered
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def test_should_convert_csv_to_parquet_skips_ngd_historicaddress() -> None:
|
|
47
|
+
assert _should_convert_csv_to_parquet(Path("add_gb_builtaddress.csv"), "ngd") is True
|
|
48
|
+
assert _should_convert_csv_to_parquet(Path("add_gb_historicaddress.csv"), "ngd") is False
|
|
49
|
+
assert _should_convert_csv_to_parquet(Path("add_gb_historicaddress_altadd.csv"), "ngd") is False
|
|
@@ -26,7 +26,7 @@ def test_inspect_flatfile_variants_uses_config_defaults(tmp_path: Path) -> None:
|
|
|
26
26
|
(1001::BIGINT, 'A'::VARCHAR),
|
|
27
27
|
(1001::BIGINT, 'B'::VARCHAR),
|
|
28
28
|
(1002::BIGINT, 'C'::VARCHAR)
|
|
29
|
-
) AS t(uprn, address_concat)
|
|
29
|
+
) AS t(unique_id, address_concat)
|
|
30
30
|
) TO '{parquet_path.as_posix()}' (FORMAT PARQUET)
|
|
31
31
|
"""
|
|
32
32
|
)
|
|
@@ -61,7 +61,7 @@ def test_inspect_flatfile_variants_supports_abp_pattern(tmp_path: Path) -> None:
|
|
|
61
61
|
(2001::BIGINT, 'A'::VARCHAR),
|
|
62
62
|
(2002::BIGINT, 'B'::VARCHAR),
|
|
63
63
|
(2002::BIGINT, 'C'::VARCHAR)
|
|
64
|
-
) AS t(uprn, address_concat)
|
|
64
|
+
) AS t(unique_id, address_concat)
|
|
65
65
|
) TO '{parquet_path.as_posix()}' (FORMAT PARQUET)
|
|
66
66
|
"""
|
|
67
67
|
)
|
{ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/tests/test_public_api_integration.py
RENAMED
|
@@ -71,7 +71,7 @@ def test_package_root_inspect_flatfile_variants(tmp_path: Path) -> None:
|
|
|
71
71
|
(4001::BIGINT, 'A'::VARCHAR),
|
|
72
72
|
(4001::BIGINT, 'B'::VARCHAR),
|
|
73
73
|
(4002::BIGINT, 'C'::VARCHAR)
|
|
74
|
-
) AS t(uprn, address_concat)
|
|
74
|
+
) AS t(unique_id, address_concat)
|
|
75
75
|
) TO '{parquet_path.as_posix()}' (FORMAT PARQUET)
|
|
76
76
|
"""
|
|
77
77
|
)
|
|
@@ -121,7 +121,6 @@ def _prepare_test_parquet(settings: Settings) -> None:
|
|
|
121
121
|
"add_gb_builtaddress_altadd.csv",
|
|
122
122
|
"add_gb_royalmailaddress.csv",
|
|
123
123
|
"add_gb_prebuildaddress.csv",
|
|
124
|
-
"add_gb_historicaddress.csv",
|
|
125
124
|
]
|
|
126
125
|
|
|
127
126
|
for csv_name in sample_files:
|
|
@@ -173,17 +172,14 @@ def test_flatfile_single_chunk(temp_settings: Settings) -> None:
|
|
|
173
172
|
column_names = [row[0] for row in schema]
|
|
174
173
|
|
|
175
174
|
expected_columns = [
|
|
176
|
-
"uprn",
|
|
175
|
+
"unique_id",
|
|
177
176
|
"address_concat",
|
|
178
177
|
"postcode",
|
|
179
178
|
"filename",
|
|
180
179
|
"classificationcode",
|
|
181
180
|
"parentuprn",
|
|
182
|
-
"rootuprn",
|
|
183
|
-
"hierarchylevel",
|
|
181
|
+
"lowertierlocalauthoritygsscode",
|
|
184
182
|
"floorlevel",
|
|
185
|
-
"lowestfloorlevel",
|
|
186
|
-
"highestfloorlevel",
|
|
187
183
|
]
|
|
188
184
|
for col in expected_columns:
|
|
189
185
|
assert col in column_names, f"Column {col} should exist in output"
|
|
@@ -232,9 +228,9 @@ def test_deduplication(temp_settings: Settings) -> None:
|
|
|
232
228
|
# Verify no exact duplicates
|
|
233
229
|
con = duckdb.connect()
|
|
234
230
|
result = con.execute(f"""
|
|
235
|
-
SELECT uprn, address_concat, COUNT(*) as cnt
|
|
231
|
+
SELECT unique_id, address_concat, COUNT(*) as cnt
|
|
236
232
|
FROM read_parquet('{output_files[0].as_posix()}')
|
|
237
|
-
GROUP BY uprn, address_concat
|
|
233
|
+
GROUP BY unique_id, address_concat
|
|
238
234
|
HAVING COUNT(*) > 1
|
|
239
235
|
""").fetchall()
|
|
240
236
|
|
|
@@ -8,7 +8,7 @@ from typing import Any, Literal
|
|
|
8
8
|
import yaml
|
|
9
9
|
|
|
10
10
|
from ukam_os_builder.api.settings import Settings, SettingsError, load_settings
|
|
11
|
-
from ukam_os_builder.os_builder.os_hub import get_package_version
|
|
11
|
+
from ukam_os_builder.os_builder.os_hub import _get_manifest_path, get_package_version
|
|
12
12
|
from ukam_os_builder.pipeline import run as run_pipeline
|
|
13
13
|
from ukam_os_builder.pipeline import supported_steps_for_source
|
|
14
14
|
|
|
@@ -333,11 +333,6 @@ def run_from_config(
|
|
|
333
333
|
parquet_compression_level=parquet_compression_level,
|
|
334
334
|
)
|
|
335
335
|
logger.info("Resolved work_dir: %s", settings.paths.work_dir)
|
|
336
|
-
logger.info("Resolved downloads_dir: %s", settings.paths.downloads_dir)
|
|
337
|
-
logger.info("Resolved extracted_dir: %s", settings.paths.extracted_dir)
|
|
338
|
-
logger.info("Resolved parquet_dir: %s", settings.paths.parquet_dir)
|
|
339
|
-
logger.info("Resolved output_dir: %s", settings.paths.output_dir)
|
|
340
|
-
|
|
341
336
|
source_type = settings.source.type
|
|
342
337
|
if step != "all":
|
|
343
338
|
supported_steps = supported_steps_for_source(source_type)
|
|
@@ -353,4 +348,18 @@ def run_from_config(
|
|
|
353
348
|
|
|
354
349
|
overwrite_effective = overwrite if overwrite is not None else bool(force)
|
|
355
350
|
run_pipeline(step=step, settings=settings, force=overwrite_effective, list_only=list_only)
|
|
351
|
+
|
|
352
|
+
logger.info(
|
|
353
|
+
"✅ Pipeline run completed\n\n"
|
|
354
|
+
"Where you need to look:\n"
|
|
355
|
+
" • downloads_dir (raw OS Hub extracts): %s%s\n"
|
|
356
|
+
" • output_dir (final files for address matcher): %s%s\n",
|
|
357
|
+
str(settings.paths.downloads_dir),
|
|
358
|
+
"",
|
|
359
|
+
str(settings.paths.output_dir),
|
|
360
|
+
"",
|
|
361
|
+
)
|
|
362
|
+
|
|
363
|
+
_get_manifest_path(settings)
|
|
364
|
+
|
|
356
365
|
return settings
|
|
@@ -145,7 +145,6 @@ def main(argv: list[str] | None = None) -> int:
|
|
|
145
145
|
parquet_compression=args.parquet_compression,
|
|
146
146
|
parquet_compression_level=args.parquet_compression_level,
|
|
147
147
|
)
|
|
148
|
-
logger.info("Pipeline run completed")
|
|
149
148
|
console.print("[bold green]Build completed successfully[/bold green]")
|
|
150
149
|
return 0
|
|
151
150
|
except (SettingsError, ValueError) as exc:
|
|
@@ -170,7 +170,7 @@ def _transform_to_flatfile_chunk(
|
|
|
170
170
|
logger.debug("Combination and deduplication in %.2f seconds", perf_counter() - t0)
|
|
171
171
|
|
|
172
172
|
# Get chunk metrics
|
|
173
|
-
chunk_metrics = con.execute("SELECT COUNT(DISTINCT uprn), COUNT(*) FROM result").fetchone()
|
|
173
|
+
chunk_metrics = con.execute("SELECT COUNT(DISTINCT unique_id), COUNT(*) FROM result").fetchone()
|
|
174
174
|
chunk_uprns = chunk_metrics[0]
|
|
175
175
|
chunk_rows = chunk_metrics[1]
|
|
176
176
|
|
|
@@ -244,7 +244,7 @@ def transform_to_flatfile(
|
|
|
244
244
|
con = create_duckdb_connection(settings)
|
|
245
245
|
output_path = output_paths[0]
|
|
246
246
|
stats = con.execute(f"""
|
|
247
|
-
SELECT COUNT(DISTINCT uprn), COUNT(*)
|
|
247
|
+
SELECT COUNT(DISTINCT unique_id), COUNT(*)
|
|
248
248
|
FROM read_parquet('{output_path.as_posix()}')
|
|
249
249
|
""").fetchone()
|
|
250
250
|
total_uprns = stats[0]
|
|
@@ -33,7 +33,7 @@ def combine_and_dedupe(con: duckdb.DuckDBPyConnection) -> duckdb.DuckDBPyRelatio
|
|
|
33
33
|
),
|
|
34
34
|
ranked AS (
|
|
35
35
|
SELECT *,
|
|
36
|
-
CASE logical_status WHEN 1 THEN 0 WHEN 3 THEN 1 WHEN 6 THEN 2 WHEN 8 THEN 3 ELSE 9 END AS status_rank,
|
|
36
|
+
CASE logical_status WHEN 1 THEN 0 WHEN 3 THEN 1 WHEN 6 THEN 2 ELSE 9 END AS status_rank,
|
|
37
37
|
CASE source WHEN 'LPI' THEN 0 WHEN 'ORGANISATION' THEN 1 WHEN 'DELIVERY_POINT' THEN 2 WHEN 'CUSTOM_LEVEL' THEN 3 ELSE 4 END AS source_rank
|
|
38
38
|
FROM normalized
|
|
39
39
|
),
|
|
@@ -62,7 +62,7 @@ def combine_and_dedupe(con: duckdb.DuckDBPyConnection) -> duckdb.DuckDBPyRelatio
|
|
|
62
62
|
FROM deduped_filtered
|
|
63
63
|
)
|
|
64
64
|
SELECT
|
|
65
|
-
sr.uprn,
|
|
65
|
+
sr.uprn AS unique_id,
|
|
66
66
|
sr.postcode,
|
|
67
67
|
sr.address_concat,
|
|
68
68
|
cb.classification_code,
|
|
@@ -70,15 +70,15 @@ matching messy user input. We output variants based on **Logical Status**:
|
|
|
70
70
|
locally known as "Rose Cottage").
|
|
71
71
|
3. **Provisional (6):** The address assigned during planning/construction, which
|
|
72
72
|
might change before the house is built.
|
|
73
|
-
|
|
74
|
-
|
|
73
|
+
|
|
74
|
+
Historic addresses (logical_status=8) are excluded from output.
|
|
75
75
|
|
|
76
76
|
------------------------------------------------------------------------------
|
|
77
77
|
Key Columns Explained
|
|
78
78
|
------------------------------------------------------------------------------
|
|
79
79
|
* `uprn`: The "Golden Key". Use this to link this address to other data.
|
|
80
80
|
* `base_address`: The constructed full address string.
|
|
81
|
-
* `logical_status`: 1=Current, 6=Provisional
|
|
81
|
+
* `logical_status`: 1=Current, 6=Provisional.
|
|
82
82
|
* `official_flag`: 'Y' indicates this is the "official" version, 'N' suggests
|
|
83
83
|
it might be an unofficial alias.
|
|
84
84
|
* `language`: 'ENG' (English) or 'CYM' (Welsh). Streets in Wales often have
|
|
@@ -183,7 +183,6 @@ def prepare_lpi_base(con: duckdb.DuckDBPyConnection) -> None:
|
|
|
183
183
|
WHEN 1 THEN 0
|
|
184
184
|
WHEN 3 THEN 1
|
|
185
185
|
WHEN 6 THEN 2
|
|
186
|
-
WHEN 8 THEN 3
|
|
187
186
|
ELSE 9
|
|
188
187
|
END AS status_rank
|
|
189
188
|
FROM lpi l
|
|
@@ -192,7 +191,7 @@ def prepare_lpi_base(con: duckdb.DuckDBPyConnection) -> None:
|
|
|
192
191
|
LEFT JOIN _sd_best_by_lang sd_lang ON sd_lang.usrn = l.usrn AND sd_lang.language = l.language
|
|
193
192
|
LEFT JOIN _sd_best_any sd_any ON sd_any.usrn = l.usrn
|
|
194
193
|
WHERE (b.addressbase_postal != 'N' OR b.addressbase_postal IS NULL)
|
|
195
|
-
AND l.logical_status IN (1, 3, 6, 8)
|
|
194
|
+
AND l.logical_status IN (1, 3, 6)
|
|
196
195
|
""")
|
|
197
196
|
|
|
198
197
|
# Deduplicated distinct addresses
|
|
@@ -266,7 +265,6 @@ def render_variants(con: duckdb.DuckDBPyConnection) -> None:
|
|
|
266
265
|
WHEN 1 THEN 'APPROVED'
|
|
267
266
|
WHEN 3 THEN 'ALTERNATIVE'
|
|
268
267
|
WHEN 6 THEN 'PROVISIONAL'
|
|
269
|
-
WHEN 8 THEN 'HISTORICAL'
|
|
270
268
|
END AS variant_label,
|
|
271
269
|
(logical_status = 1) AS is_primary
|
|
272
270
|
FROM lpi_base_distinct
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
Transforms the extracted parquet files into a single flatfile suitable for
|
|
4
4
|
UK address matching. This includes:
|
|
5
|
-
- Processing core feature types (Built Address,
|
|
5
|
+
- Processing core feature types (Built Address, Pre-Build Address, etc.)
|
|
6
6
|
- Processing alternate address records
|
|
7
7
|
- Processing Royal Mail addresses
|
|
8
8
|
- Handling Welsh language variants
|
|
@@ -27,8 +27,6 @@ logger = logging.getLogger(__name__)
|
|
|
27
27
|
FEATURE_TYPE_BY_STEM = {
|
|
28
28
|
"add_gb_builtaddress": "Built Address",
|
|
29
29
|
"add_gb_builtaddress_altadd": "Built Address",
|
|
30
|
-
"add_gb_historicaddress": "Historic Address",
|
|
31
|
-
"add_gb_historicaddress_altadd": "Historic Address",
|
|
32
30
|
"add_gb_nonaddressableobject": "Non-Addressable Object",
|
|
33
31
|
"add_gb_nonaddressableobject_altadd": "Non-Addressable Object",
|
|
34
32
|
"add_gb_prebuildaddress": "Pre-Build Address",
|
|
@@ -39,7 +37,6 @@ FEATURE_TYPE_BY_STEM = {
|
|
|
39
37
|
# Core feature stems (contain fulladdress and classification fields)
|
|
40
38
|
CORE_FEATURE_STEMS = {
|
|
41
39
|
"add_gb_builtaddress",
|
|
42
|
-
"add_gb_historicaddress",
|
|
43
40
|
"add_gb_nonaddressableobject",
|
|
44
41
|
"add_gb_prebuildaddress",
|
|
45
42
|
}
|
|
@@ -47,7 +44,6 @@ CORE_FEATURE_STEMS = {
|
|
|
47
44
|
# Alternate address stems (no classification fields)
|
|
48
45
|
ALTADD_STEMS = {
|
|
49
46
|
"add_gb_builtaddress_altadd",
|
|
50
|
-
"add_gb_historicaddress_altadd",
|
|
51
47
|
"add_gb_nonaddressableobject_altadd",
|
|
52
48
|
"add_gb_prebuildaddress_altadd",
|
|
53
49
|
}
|
|
@@ -57,7 +53,6 @@ CORE_FEATURE_PRIORITY = {
|
|
|
57
53
|
"add_gb_builtaddress": 1,
|
|
58
54
|
"add_gb_prebuildaddress": 2,
|
|
59
55
|
"add_gb_nonaddressableobject": 3,
|
|
60
|
-
"add_gb_historicaddress": 4,
|
|
61
56
|
}
|
|
62
57
|
|
|
63
58
|
|
|
@@ -71,7 +66,7 @@ def _create_metadata_lookup_view(
|
|
|
71
66
|
This view is used to enrich Royal Mail and alternate address records
|
|
72
67
|
with metadata (classificationcode, parentuprn, etc.) by UPRN lookup.
|
|
73
68
|
|
|
74
|
-
Uses priority ranking (Built > Pre-Build > Non-Addressable > Historic)
|
|
69
|
+
Uses priority ranking (Built > Pre-Build > Non-Addressable)
|
|
75
70
|
to dedupe when a UPRN exists in multiple core files.
|
|
76
71
|
|
|
77
72
|
Args:
|
|
@@ -102,7 +97,6 @@ def _create_metadata_lookup_view(
|
|
|
102
97
|
""")
|
|
103
98
|
|
|
104
99
|
if not union_parts:
|
|
105
|
-
# No core files found - create empty lookup
|
|
106
100
|
logger.warning("No core feature files found. Metadata lookup will be empty.")
|
|
107
101
|
con.execute("""
|
|
108
102
|
CREATE OR REPLACE TEMP VIEW uprn_metadata_lookup AS
|
|
@@ -117,37 +111,48 @@ def _create_metadata_lookup_view(
|
|
|
117
111
|
CAST(NULL AS DOUBLE) AS highestfloorlevel
|
|
118
112
|
WHERE 1=0
|
|
119
113
|
""")
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
union_sql = "\nUNION ALL\n".join(union_parts)
|
|
114
|
+
else:
|
|
115
|
+
union_sql = "\nUNION ALL\n".join(union_parts)
|
|
123
116
|
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
117
|
+
sql = f"""
|
|
118
|
+
CREATE OR REPLACE TEMP VIEW uprn_metadata_lookup AS
|
|
119
|
+
WITH core_data AS (
|
|
120
|
+
{union_sql}
|
|
121
|
+
),
|
|
122
|
+
ranked AS (
|
|
123
|
+
SELECT
|
|
124
|
+
*,
|
|
125
|
+
ROW_NUMBER() OVER (
|
|
126
|
+
PARTITION BY uprn
|
|
127
|
+
ORDER BY source_priority
|
|
128
|
+
) AS rn
|
|
129
|
+
FROM core_data
|
|
130
|
+
)
|
|
130
131
|
SELECT
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
132
|
+
uprn,
|
|
133
|
+
classificationcode,
|
|
134
|
+
parentuprn,
|
|
135
|
+
rootuprn,
|
|
136
|
+
hierarchylevel,
|
|
137
|
+
floorlevel,
|
|
138
|
+
lowestfloorlevel,
|
|
139
|
+
highestfloorlevel
|
|
140
|
+
FROM ranked
|
|
141
|
+
WHERE rn = 1;
|
|
142
|
+
"""
|
|
143
|
+
con.execute(sql)
|
|
144
|
+
|
|
145
|
+
built_path = parquet_dir / "add_gb_builtaddress.parquet"
|
|
146
|
+
built_sql = f"""
|
|
147
|
+
CREATE OR REPLACE TEMP VIEW builtaddress_ltla_lookup AS
|
|
138
148
|
SELECT
|
|
139
|
-
uprn,
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
floorlevel,
|
|
145
|
-
lowestfloorlevel,
|
|
146
|
-
highestfloorlevel
|
|
147
|
-
FROM ranked
|
|
148
|
-
WHERE rn = 1;
|
|
149
|
+
CAST(uprn AS BIGINT) AS uprn,
|
|
150
|
+
MAX(CAST(lowertierlocalauthoritygsscode AS VARCHAR)) AS lowertierlocalauthoritygsscode
|
|
151
|
+
FROM read_parquet('{built_path.as_posix()}')
|
|
152
|
+
{where_clause}
|
|
153
|
+
GROUP BY CAST(uprn AS BIGINT)
|
|
149
154
|
"""
|
|
150
|
-
con.execute(
|
|
155
|
+
con.execute(built_sql)
|
|
151
156
|
|
|
152
157
|
|
|
153
158
|
def _create_core_feature_view(
|
|
@@ -156,7 +161,7 @@ def _create_core_feature_view(
|
|
|
156
161
|
parquet_path: Path,
|
|
157
162
|
uprn_predicate: str | None = None,
|
|
158
163
|
) -> None:
|
|
159
|
-
"""Create view for core feature types (Built,
|
|
164
|
+
"""Create view for core feature types (Built, Pre-Build, Non-Addressable).
|
|
160
165
|
|
|
161
166
|
These tables have fulladdress, classification fields, and Welsh language columns.
|
|
162
167
|
Produces both English and Welsh (where available) address records.
|
|
@@ -188,6 +193,7 @@ def _create_core_feature_view(
|
|
|
188
193
|
CAST(floorlevel AS VARCHAR) AS floorlevel,
|
|
189
194
|
CAST(lowestfloorlevel AS DOUBLE) AS lowestfloorlevel,
|
|
190
195
|
CAST(highestfloorlevel AS DOUBLE) AS highestfloorlevel,
|
|
196
|
+
CAST(NULL AS VARCHAR) AS lowertierlocalauthoritygsscode,
|
|
191
197
|
-- Internal columns for deduplication (not in final output)
|
|
192
198
|
CAST(description AS VARCHAR) AS feature_type,
|
|
193
199
|
CAST(addressstatus AS VARCHAR) AS address_status,
|
|
@@ -227,6 +233,7 @@ def _create_core_feature_view(
|
|
|
227
233
|
CAST(floorlevel AS VARCHAR) AS floorlevel,
|
|
228
234
|
CAST(lowestfloorlevel AS DOUBLE) AS lowestfloorlevel,
|
|
229
235
|
CAST(highestfloorlevel AS DOUBLE) AS highestfloorlevel,
|
|
236
|
+
CAST(NULL AS VARCHAR) AS lowertierlocalauthoritygsscode,
|
|
230
237
|
-- Internal columns for deduplication (not in final output)
|
|
231
238
|
CAST(description AS VARCHAR) AS feature_type,
|
|
232
239
|
CAST(addressstatus AS VARCHAR) AS address_status,
|
|
@@ -282,6 +289,7 @@ def _create_altadd_view(
|
|
|
282
289
|
CAST(floorlevel AS VARCHAR) AS floorlevel,
|
|
283
290
|
CAST(lowestfloorlevel AS DOUBLE) AS lowestfloorlevel,
|
|
284
291
|
CAST(highestfloorlevel AS DOUBLE) AS highestfloorlevel,
|
|
292
|
+
CAST(NULL AS VARCHAR) AS lowertierlocalauthoritygsscode,
|
|
285
293
|
-- Internal columns for deduplication (not in final output)
|
|
286
294
|
'{feature_type}' AS feature_type,
|
|
287
295
|
CAST(addressstatus AS VARCHAR) AS address_status,
|
|
@@ -338,6 +346,7 @@ def _create_royal_mail_view(
|
|
|
338
346
|
CAST(NULL AS VARCHAR) AS floorlevel,
|
|
339
347
|
CAST(NULL AS DOUBLE) AS lowestfloorlevel,
|
|
340
348
|
CAST(NULL AS DOUBLE) AS highestfloorlevel,
|
|
349
|
+
CAST(NULL AS VARCHAR) AS lowertierlocalauthoritygsscode,
|
|
341
350
|
-- Internal columns for deduplication (not in final output)
|
|
342
351
|
'Royal Mail Address' AS feature_type,
|
|
343
352
|
CAST(NULL AS VARCHAR) AS address_status,
|
|
@@ -368,6 +377,7 @@ def _create_royal_mail_view(
|
|
|
368
377
|
CAST(NULL AS VARCHAR) AS floorlevel,
|
|
369
378
|
CAST(NULL AS DOUBLE) AS lowestfloorlevel,
|
|
370
379
|
CAST(NULL AS DOUBLE) AS highestfloorlevel,
|
|
380
|
+
CAST(NULL AS VARCHAR) AS lowertierlocalauthoritygsscode,
|
|
371
381
|
-- Internal columns for deduplication (not in final output)
|
|
372
382
|
'Royal Mail Address' AS feature_type,
|
|
373
383
|
CAST(NULL AS VARCHAR) AS address_status,
|
|
@@ -403,12 +413,97 @@ def _enrich_with_metadata(con: duckdb.DuckDBPyConnection) -> None:
|
|
|
403
413
|
COALESCE(a.floorlevel, m.floorlevel) AS floorlevel,
|
|
404
414
|
COALESCE(a.lowestfloorlevel, m.lowestfloorlevel) AS lowestfloorlevel,
|
|
405
415
|
COALESCE(a.highestfloorlevel, m.highestfloorlevel) AS highestfloorlevel,
|
|
416
|
+
b.lowertierlocalauthoritygsscode AS lowertierlocalauthoritygsscode,
|
|
406
417
|
-- Internal columns for deduplication
|
|
407
418
|
a.feature_type,
|
|
408
419
|
a.address_status,
|
|
409
420
|
a.build_status
|
|
410
421
|
FROM all_full_addresses a
|
|
411
|
-
LEFT JOIN uprn_metadata_lookup m ON a.uprn = m.uprn
|
|
422
|
+
LEFT JOIN uprn_metadata_lookup m ON a.uprn = m.uprn
|
|
423
|
+
LEFT JOIN builtaddress_ltla_lookup b ON a.uprn = b.uprn;
|
|
424
|
+
"""
|
|
425
|
+
con.execute(sql)
|
|
426
|
+
|
|
427
|
+
|
|
428
|
+
def _create_custom_level_rows(con: duckdb.DuckDBPyConnection) -> None:
|
|
429
|
+
"""Generate custom level-based address variants and insert into enriched table.
|
|
430
|
+
|
|
431
|
+
Parses the ``floorlevel`` column (VARCHAR) from the enriched address table,
|
|
432
|
+
maps integer floor levels to words (-1=BASEMENT … 6=SIXTH), and prepends the
|
|
433
|
+
word to the existing ``address_concat`` to create additional address variants.
|
|
434
|
+
|
|
435
|
+
These rows use ``feature_type='Custom Level'`` so they receive the lowest
|
|
436
|
+
dedup priority and never override official address data.
|
|
437
|
+
"""
|
|
438
|
+
sql = """
|
|
439
|
+
INSERT INTO all_full_addresses_enriched (
|
|
440
|
+
uprn,
|
|
441
|
+
address_concat,
|
|
442
|
+
postcode,
|
|
443
|
+
filename,
|
|
444
|
+
classificationcode,
|
|
445
|
+
parentuprn,
|
|
446
|
+
rootuprn,
|
|
447
|
+
hierarchylevel,
|
|
448
|
+
floorlevel,
|
|
449
|
+
lowestfloorlevel,
|
|
450
|
+
highestfloorlevel,
|
|
451
|
+
lowertierlocalauthoritygsscode,
|
|
452
|
+
feature_type,
|
|
453
|
+
address_status,
|
|
454
|
+
build_status
|
|
455
|
+
)
|
|
456
|
+
WITH level_parsed AS (
|
|
457
|
+
SELECT
|
|
458
|
+
uprn, address_concat, postcode, filename,
|
|
459
|
+
classificationcode, parentuprn, rootuprn,
|
|
460
|
+
lowertierlocalauthoritygsscode,
|
|
461
|
+
hierarchylevel, floorlevel, lowestfloorlevel, highestfloorlevel,
|
|
462
|
+
address_status, build_status,
|
|
463
|
+
CASE
|
|
464
|
+
WHEN split_part(floorlevel, ',', 1) ~ '^-?[0-9]+$'
|
|
465
|
+
THEN CAST(split_part(floorlevel, ',', 1) AS INTEGER)
|
|
466
|
+
ELSE NULL
|
|
467
|
+
END AS level_int
|
|
468
|
+
FROM all_full_addresses_enriched
|
|
469
|
+
WHERE floorlevel IS NOT NULL
|
|
470
|
+
AND address_concat IS NOT NULL
|
|
471
|
+
AND address_concat <> ''
|
|
472
|
+
),
|
|
473
|
+
level_words AS (
|
|
474
|
+
SELECT
|
|
475
|
+
*,
|
|
476
|
+
CASE level_int
|
|
477
|
+
WHEN -1 THEN 'BASEMENT'
|
|
478
|
+
WHEN 0 THEN 'GROUND'
|
|
479
|
+
WHEN 1 THEN 'FIRST'
|
|
480
|
+
WHEN 2 THEN 'SECOND'
|
|
481
|
+
WHEN 3 THEN 'THIRD'
|
|
482
|
+
WHEN 4 THEN 'FOURTH'
|
|
483
|
+
WHEN 5 THEN 'FIFTH'
|
|
484
|
+
WHEN 6 THEN 'SIXTH'
|
|
485
|
+
END AS level_word
|
|
486
|
+
FROM level_parsed
|
|
487
|
+
WHERE level_int BETWEEN -1 AND 6
|
|
488
|
+
)
|
|
489
|
+
SELECT
|
|
490
|
+
uprn,
|
|
491
|
+
TRIM(concat(level_word, ' ', address_concat)) AS address_concat,
|
|
492
|
+
postcode,
|
|
493
|
+
'CUSTOM_LEVEL' AS filename,
|
|
494
|
+
classificationcode,
|
|
495
|
+
parentuprn,
|
|
496
|
+
rootuprn,
|
|
497
|
+
hierarchylevel,
|
|
498
|
+
floorlevel,
|
|
499
|
+
lowestfloorlevel,
|
|
500
|
+
highestfloorlevel,
|
|
501
|
+
lowertierlocalauthoritygsscode,
|
|
502
|
+
'Custom Level' AS feature_type,
|
|
503
|
+
address_status,
|
|
504
|
+
build_status
|
|
505
|
+
FROM level_words
|
|
506
|
+
WHERE level_word IS NOT NULL;
|
|
412
507
|
"""
|
|
413
508
|
con.execute(sql)
|
|
414
509
|
|
|
@@ -417,7 +512,7 @@ def _create_dedup_view(con: duckdb.DuckDBPyConnection) -> None:
|
|
|
417
512
|
"""Create deduplicated view of all addresses.
|
|
418
513
|
|
|
419
514
|
Priority rules for deduplication:
|
|
420
|
-
- Feature type: Built Address -> Pre-Build -> Royal Mail -> Historic -> Non-Addressable
|
|
515
|
+
- Feature type: Built Address -> Pre-Build -> Royal Mail -> Non-Addressable
|
|
421
516
|
- Address status: Approved -> Provisional -> Alternative -> Historical
|
|
422
517
|
- Build status: Built Complete -> Under Construction -> Prebuild -> Historic -> Demolished
|
|
423
518
|
|
|
@@ -433,8 +528,8 @@ def _create_dedup_view(con: duckdb.DuckDBPyConnection) -> None:
|
|
|
433
528
|
WHEN 'Built Address' THEN 1
|
|
434
529
|
WHEN 'Pre-Build Address' THEN 2
|
|
435
530
|
WHEN 'Royal Mail Address' THEN 3
|
|
436
|
-
WHEN 'Historic Address' THEN 4
|
|
437
531
|
WHEN 'Non-Addressable Object' THEN 5
|
|
532
|
+
WHEN 'Custom Level' THEN 6
|
|
438
533
|
ELSE 9
|
|
439
534
|
END AS feature_type_rank,
|
|
440
535
|
CASE
|
|
@@ -460,20 +555,17 @@ def _create_dedup_view(con: duckdb.DuckDBPyConnection) -> None:
|
|
|
460
555
|
build_status_rank
|
|
461
556
|
) AS rn
|
|
462
557
|
FROM all_full_addresses_enriched
|
|
463
|
-
WHERE feature_type
|
|
558
|
+
WHERE feature_type NOT IN ('Non-Addressable Object')
|
|
464
559
|
)
|
|
465
560
|
SELECT
|
|
466
|
-
uprn,
|
|
561
|
+
uprn AS unique_id,
|
|
467
562
|
address_concat,
|
|
468
563
|
postcode,
|
|
469
564
|
filename,
|
|
470
565
|
classificationcode,
|
|
471
566
|
parentuprn,
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
floorlevel,
|
|
475
|
-
lowestfloorlevel,
|
|
476
|
-
highestfloorlevel
|
|
567
|
+
lowertierlocalauthoritygsscode,
|
|
568
|
+
floorlevel
|
|
477
569
|
FROM ranked
|
|
478
570
|
WHERE rn = 1;
|
|
479
571
|
"""
|
|
@@ -641,6 +733,10 @@ def run_flatfile_step(settings: Settings, force: bool = False) -> list[Path]:
|
|
|
641
733
|
logger.info("Enriching addresses with metadata from core files...")
|
|
642
734
|
_enrich_with_metadata(con)
|
|
643
735
|
|
|
736
|
+
# Generate custom level variants
|
|
737
|
+
logger.info("Generating custom level address variants...")
|
|
738
|
+
_create_custom_level_rows(con)
|
|
739
|
+
|
|
644
740
|
# Create deduplicated view
|
|
645
741
|
logger.info("Creating deduplicated view...")
|
|
646
742
|
_create_dedup_view(con)
|
{ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/os_builder/extract.py
RENAMED
|
@@ -11,6 +11,9 @@ from ukam_os_builder.api.settings import Settings
|
|
|
11
11
|
|
|
12
12
|
logger = logging.getLogger(__name__)
|
|
13
13
|
|
|
14
|
+
# NGD file stems to exclude (historic addresses are not used in output)
|
|
15
|
+
_NGD_EXCLUDED_STEMS = {"historicaddress"}
|
|
16
|
+
|
|
14
17
|
|
|
15
18
|
def find_downloaded_zips(downloads_dir: Path) -> list[Path]:
|
|
16
19
|
"""Find all downloaded zip files in a directory."""
|
|
@@ -22,11 +25,20 @@ def find_downloaded_zips(downloads_dir: Path) -> list[Path]:
|
|
|
22
25
|
return zip_files
|
|
23
26
|
|
|
24
27
|
|
|
28
|
+
def _is_excluded_ngd_file(name: str) -> bool:
|
|
29
|
+
"""Return True if *name* matches an excluded NGD stem (e.g. historicaddress)."""
|
|
30
|
+
name_lower = name.lower()
|
|
31
|
+
return any(stem in name_lower for stem in _NGD_EXCLUDED_STEMS)
|
|
32
|
+
|
|
33
|
+
|
|
25
34
|
def _filter_zips_for_source(zip_files: list[Path], source: str) -> list[Path]:
|
|
26
35
|
source_lower = source.lower()
|
|
27
36
|
if source_lower == "ngd":
|
|
28
37
|
ngd_zips = [
|
|
29
|
-
zip_path
|
|
38
|
+
zip_path
|
|
39
|
+
for zip_path in zip_files
|
|
40
|
+
if zip_path.name.lower().startswith("add_gb_")
|
|
41
|
+
and not _is_excluded_ngd_file(zip_path.name)
|
|
30
42
|
]
|
|
31
43
|
return ngd_zips or zip_files
|
|
32
44
|
if source_lower == "abp":
|
|
@@ -39,7 +51,8 @@ def _filter_zips_for_source(zip_files: list[Path], source: str) -> list[Path]:
|
|
|
39
51
|
|
|
40
52
|
def _should_convert_csv_to_parquet(csv_path: Path, source: str) -> bool:
|
|
41
53
|
if source.lower() == "ngd":
|
|
42
|
-
|
|
54
|
+
name_lower = csv_path.name.lower()
|
|
55
|
+
return name_lower.startswith("add_gb_") and not _is_excluded_ngd_file(name_lower)
|
|
43
56
|
return True
|
|
44
57
|
|
|
45
58
|
|
|
@@ -12,7 +12,7 @@ SourceType = Literal["ngd", "abp"]
|
|
|
12
12
|
logger = logging.getLogger(__name__)
|
|
13
13
|
|
|
14
14
|
_DEFAULT_SELECT_COLUMNS = [
|
|
15
|
-
"
|
|
15
|
+
"unique_id",
|
|
16
16
|
"address_concat",
|
|
17
17
|
"postcode",
|
|
18
18
|
"source",
|
|
@@ -128,9 +128,9 @@ def get_variant_statistics(
|
|
|
128
128
|
|
|
129
129
|
stats = con.sql(f"""
|
|
130
130
|
WITH variant_counts AS (
|
|
131
|
-
SELECT
|
|
131
|
+
SELECT unique_id, COUNT(*) AS variant_count
|
|
132
132
|
FROM read_parquet('{files_sql}')
|
|
133
|
-
GROUP BY
|
|
133
|
+
GROUP BY unique_id
|
|
134
134
|
)
|
|
135
135
|
SELECT
|
|
136
136
|
COUNT(*) AS total_uprns,
|
|
@@ -179,7 +179,7 @@ def get_random_uprn(
|
|
|
179
179
|
|
|
180
180
|
select_columns = _choose_select_columns(con, files_sql, columns)
|
|
181
181
|
random_uprn = con.sql(f"""
|
|
182
|
-
SELECT DISTINCT
|
|
182
|
+
SELECT DISTINCT unique_id
|
|
183
183
|
FROM read_parquet('{files_sql}')
|
|
184
184
|
ORDER BY RANDOM()
|
|
185
185
|
LIMIT 1
|
|
@@ -192,7 +192,7 @@ def get_random_uprn(
|
|
|
192
192
|
SELECT
|
|
193
193
|
{select_columns}
|
|
194
194
|
FROM read_parquet('{files_sql}')
|
|
195
|
-
WHERE
|
|
195
|
+
WHERE unique_id = {int(random_uprn[0])}
|
|
196
196
|
ORDER BY is_primary DESC NULLS LAST, source NULLS LAST, variant_label NULLS LAST
|
|
197
197
|
""")
|
|
198
198
|
|
|
@@ -220,14 +220,14 @@ def get_random_large_uprn(
|
|
|
220
220
|
|
|
221
221
|
selected = con.sql(f"""
|
|
222
222
|
WITH variant_counts AS (
|
|
223
|
-
SELECT
|
|
223
|
+
SELECT unique_id, COUNT(*) AS variant_count
|
|
224
224
|
FROM read_parquet('{files_sql}')
|
|
225
225
|
{where_filter}
|
|
226
|
-
GROUP BY
|
|
227
|
-
ORDER BY variant_count DESC,
|
|
226
|
+
GROUP BY unique_id
|
|
227
|
+
ORDER BY variant_count DESC, unique_id ASC
|
|
228
228
|
LIMIT {int(top_n)}
|
|
229
229
|
)
|
|
230
|
-
SELECT
|
|
230
|
+
SELECT unique_id
|
|
231
231
|
FROM variant_counts
|
|
232
232
|
ORDER BY RANDOM()
|
|
233
233
|
LIMIT 1
|
|
@@ -240,7 +240,7 @@ def get_random_large_uprn(
|
|
|
240
240
|
SELECT
|
|
241
241
|
{select_columns}
|
|
242
242
|
FROM read_parquet('{files_sql}')
|
|
243
|
-
WHERE
|
|
243
|
+
WHERE unique_id = {int(selected[0])}
|
|
244
244
|
{and_filter}
|
|
245
245
|
ORDER BY is_primary DESC NULLS LAST, source NULLS LAST, variant_label NULLS LAST
|
|
246
246
|
""")
|
|
@@ -269,7 +269,7 @@ def get_uprn_variants(
|
|
|
269
269
|
SELECT
|
|
270
270
|
{select_columns}
|
|
271
271
|
FROM read_parquet('{files_sql}')
|
|
272
|
-
WHERE
|
|
272
|
+
WHERE unique_id = {int(uprn)}
|
|
273
273
|
{and_filter}
|
|
274
274
|
ORDER BY is_primary DESC NULLS LAST, source NULLS LAST, variant_label NULLS LAST
|
|
275
275
|
""")
|
|
@@ -317,10 +317,10 @@ def inspect_flatfile_variants(
|
|
|
317
317
|
WITH data AS (
|
|
318
318
|
SELECT * FROM read_parquet('{files_sql}')
|
|
319
319
|
)
|
|
320
|
-
SELECT
|
|
320
|
+
SELECT unique_id, COUNT(*) AS variant_count
|
|
321
321
|
FROM data
|
|
322
|
-
GROUP BY
|
|
323
|
-
ORDER BY variant_count DESC,
|
|
322
|
+
GROUP BY unique_id
|
|
323
|
+
ORDER BY variant_count DESC, unique_id ASC
|
|
324
324
|
LIMIT 1 OFFSET {top_offset}
|
|
325
325
|
"""
|
|
326
326
|
).fetchone()
|
|
@@ -333,7 +333,7 @@ def inspect_flatfile_variants(
|
|
|
333
333
|
f"""
|
|
334
334
|
SELECT COUNT(*)
|
|
335
335
|
FROM read_parquet('{files_sql}')
|
|
336
|
-
WHERE
|
|
336
|
+
WHERE unique_id = ?
|
|
337
337
|
""",
|
|
338
338
|
[target_uprn],
|
|
339
339
|
).fetchone()
|
|
@@ -343,7 +343,7 @@ def inspect_flatfile_variants(
|
|
|
343
343
|
f"""
|
|
344
344
|
SELECT *
|
|
345
345
|
FROM read_parquet('{files_sql}')
|
|
346
|
-
WHERE
|
|
346
|
+
WHERE unique_id = ?
|
|
347
347
|
ORDER BY 1
|
|
348
348
|
""",
|
|
349
349
|
[target_uprn],
|
|
@@ -358,7 +358,7 @@ def inspect_flatfile_variants(
|
|
|
358
358
|
max_width=10_000
|
|
359
359
|
)
|
|
360
360
|
logger.info("Selected UPRN rows:")
|
|
361
|
-
con.sql(f"SELECT * FROM read_parquet('{files_sql}') WHERE
|
|
361
|
+
con.sql(f"SELECT * FROM read_parquet('{files_sql}') WHERE unique_id = {target_uprn}").show(
|
|
362
362
|
max_width=10_000
|
|
363
363
|
)
|
|
364
364
|
|
{ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/os_builder/os_hub.py
RENAMED
|
@@ -9,9 +9,25 @@ from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse
|
|
|
9
9
|
|
|
10
10
|
import requests
|
|
11
11
|
|
|
12
|
+
from ukam_os_builder.api.settings import Settings
|
|
13
|
+
|
|
12
14
|
logger = logging.getLogger(__name__)
|
|
13
15
|
|
|
14
16
|
API_BASE_URL = "https://api.os.uk/downloads/v1"
|
|
17
|
+
|
|
18
|
+
# NGD file stems to exclude (historic addresses are not used in output)
|
|
19
|
+
_NGD_EXCLUDED_STEMS = {"historicaddress"}
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _should_skip_ngd_download(filename: str, settings: object) -> bool:
|
|
23
|
+
"""Return True if *filename* is an NGD historic-address archive."""
|
|
24
|
+
source_type = getattr(getattr(settings, "source", None), "type", "")
|
|
25
|
+
if source_type != "ngd":
|
|
26
|
+
return False
|
|
27
|
+
name_lower = filename.lower()
|
|
28
|
+
return any(stem in name_lower for stem in _NGD_EXCLUDED_STEMS)
|
|
29
|
+
|
|
30
|
+
|
|
15
31
|
DEFAULT_CHUNK_SIZE = 1024 * 1024 * 20 # 20 MiB
|
|
16
32
|
DEFAULT_CONNECT_TIMEOUT_SECONDS = 30
|
|
17
33
|
DEFAULT_READ_TIMEOUT_SECONDS = 300
|
|
@@ -293,6 +309,11 @@ def run_download_step(
|
|
|
293
309
|
logger.warning("No URL for %s, skipping", item.filename)
|
|
294
310
|
continue
|
|
295
311
|
|
|
312
|
+
# Skip NGD historic address files — they are excluded from output
|
|
313
|
+
if _should_skip_ngd_download(item.filename, settings):
|
|
314
|
+
logger.info("Skipping historic address file: %s", item.filename)
|
|
315
|
+
continue
|
|
316
|
+
|
|
296
317
|
dest_path = downloads_dir / item.filename
|
|
297
318
|
was_downloaded = download_file(
|
|
298
319
|
url=item.url,
|
|
@@ -312,3 +333,54 @@ def run_download_step(
|
|
|
312
333
|
|
|
313
334
|
logger.info("Download complete: %d file(s)", len(downloaded))
|
|
314
335
|
return downloaded
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def _get_manifest_path(settings: Settings) -> Path | None:
|
|
339
|
+
downloads_dir = settings.paths.downloads_dir.resolve()
|
|
340
|
+
source_type = settings.source.type # "abp" | "ngd"
|
|
341
|
+
|
|
342
|
+
if source_type == "abp":
|
|
343
|
+
candidates = list(downloads_dir.glob("*-Order_Details.txt"))
|
|
344
|
+
if not candidates:
|
|
345
|
+
logger.info("➡️ Manifest (ABP order details) not found. Check: %s", downloads_dir)
|
|
346
|
+
return None
|
|
347
|
+
|
|
348
|
+
manifest = max(candidates, key=lambda p: p.stat().st_mtime).resolve()
|
|
349
|
+
|
|
350
|
+
if len(candidates) > 1:
|
|
351
|
+
logger.warning(
|
|
352
|
+
"Multiple ABP manifests found in %s. Using newest: %s",
|
|
353
|
+
downloads_dir,
|
|
354
|
+
manifest,
|
|
355
|
+
)
|
|
356
|
+
|
|
357
|
+
logger.info("➡️ Manifest (ABP order details): %s", manifest)
|
|
358
|
+
return manifest
|
|
359
|
+
|
|
360
|
+
elif source_type == "ngd":
|
|
361
|
+
candidates = list(
|
|
362
|
+
downloads_dir.glob("*_orderSummary.json")
|
|
363
|
+
) # adjust if it's "*.orderSummary.json"
|
|
364
|
+
if not candidates:
|
|
365
|
+
logger.info("➡️ Manifests (NGD order summaries) not found. Check: %s", downloads_dir)
|
|
366
|
+
return None
|
|
367
|
+
|
|
368
|
+
built_candidates = list(downloads_dir.glob("*builtaddress*_orderSummary.json"))
|
|
369
|
+
built_manifest = (
|
|
370
|
+
max(built_candidates, key=lambda p: p.stat().st_mtime).resolve()
|
|
371
|
+
if built_candidates
|
|
372
|
+
else None
|
|
373
|
+
)
|
|
374
|
+
|
|
375
|
+
logger.info(
|
|
376
|
+
"➡️ Manifests (NGD order summaries): %s (%d files)\n"
|
|
377
|
+
" ↳ Built address order summary: %s",
|
|
378
|
+
downloads_dir,
|
|
379
|
+
len(candidates),
|
|
380
|
+
built_manifest if built_manifest else "(not found)",
|
|
381
|
+
)
|
|
382
|
+
|
|
383
|
+
return downloads_dir
|
|
384
|
+
|
|
385
|
+
logger.warning("Unknown source type %r. No manifest lookup performed.", source_type)
|
|
386
|
+
return None
|
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
|
|
5
|
-
from ukam_os_builder.os_builder.extract import (
|
|
6
|
-
_filter_zips_for_source,
|
|
7
|
-
_should_convert_csv_to_parquet,
|
|
8
|
-
)
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
def test_filter_zips_for_source_prefers_ngd_named_zips() -> None:
|
|
12
|
-
zip_files = [
|
|
13
|
-
Path("add_gb_builtaddress.zip"),
|
|
14
|
-
Path("AddressBasePremium_FULL_2025-12-15_002.zip"),
|
|
15
|
-
]
|
|
16
|
-
|
|
17
|
-
filtered = _filter_zips_for_source(zip_files, "ngd")
|
|
18
|
-
|
|
19
|
-
assert filtered == [Path("add_gb_builtaddress.zip")]
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
def test_should_convert_csv_to_parquet_skips_non_ngd_for_ngd_source() -> None:
|
|
23
|
-
ngd_csv = Path("add_gb_builtaddress.csv")
|
|
24
|
-
abp_csv = Path("AddressBasePremium_FULL_2025-12-15_002.csv")
|
|
25
|
-
|
|
26
|
-
assert _should_convert_csv_to_parquet(ngd_csv, "ngd") is True
|
|
27
|
-
assert _should_convert_csv_to_parquet(abp_csv, "ngd") is False
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/.github/workflows/release-pypi.yml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/tests/data/add_gb_builtaddress.csv
RENAMED
|
File without changes
|
{ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/tests/data/add_gb_builtaddress_altadd.csv
RENAMED
|
File without changes
|
{ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/tests/data/add_gb_historicaddress.csv
RENAMED
|
File without changes
|
{ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/tests/data/add_gb_prebuildaddress.csv
RENAMED
|
File without changes
|
{ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/tests/data/add_gb_royalmailaddress.csv
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/os_builder/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|