ukam-os-builder 0.1.0.dev4__tar.gz → 0.1.0.dev5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/PKG-INFO +1 -1
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/pyproject.toml +1 -1
- ukam_os_builder-0.1.0.dev5/tests/test_extract_source_filtering.py +49 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/tests/test_smoke.py +0 -1
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/__init__.py +1 -1
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/api/api.py +15 -6
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/cli.py +0 -1
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/transform/stages/combine.py +1 -1
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/transform/stages/lpi.py +4 -6
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/ngd/to_flatfile.py +75 -11
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/os_builder/extract.py +15 -2
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/os_builder/os_hub.py +72 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/uv.lock +1 -1
- ukam_os_builder-0.1.0.dev4/tests/test_extract_source_filtering.py +0 -27
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/.env.example +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/.github/workflows/ci.yml +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/.github/workflows/e2e.yml +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/.github/workflows/release-pypi.yml +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/.gitignore +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/AGENTS.md +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/README.md +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/config.example.yaml +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/prompt.md +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/shell/test_release_locally.sh +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/tests/data/README.md +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/tests/data/add_gb_builtaddress.csv +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/tests/data/add_gb_builtaddress_altadd.csv +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/tests/data/add_gb_historicaddress.csv +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/tests/data/add_gb_prebuildaddress.csv +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/tests/data/add_gb_royalmailaddress.csv +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/tests/test_api.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/tests/test_cli.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/tests/test_cli_errors.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/tests/test_inspect_results.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/tests/test_public_api_integration.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/tests/test_settings.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/tests/test_setup_wizard.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/_exceptions.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/api/cli_errors.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/api/settings.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/schemas/abp_schema.yaml +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/split_raw.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/transform/__init__.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/transform/common.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/transform/runner.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/transform/stages/__init__.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/transform/stages/business.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/transform/stages/misc.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/transform/stages/postal.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/os_builder/__init__.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/os_builder/inspect_results.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/os_builder/pipeline_factory.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/pipeline.py +0 -0
- {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/setup_wizard.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ukam-os-builder
|
|
3
|
-
Version: 0.1.0.dev4
|
|
3
|
+
Version: 0.1.0.dev5
|
|
4
4
|
Summary: Download, process and transform OS address data (NGD or ABP) for UK address matching
|
|
5
5
|
Project-URL: Homepage, https://github.com/moj-analytical-services/prepare_ngd_for_address_matching
|
|
6
6
|
Project-URL: Repository, https://github.com/moj-analytical-services/prepare_ngd_for_address_matching
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from ukam_os_builder.os_builder.extract import (
|
|
6
|
+
_filter_zips_for_source,
|
|
7
|
+
_should_convert_csv_to_parquet,
|
|
8
|
+
)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def test_filter_zips_for_source_prefers_ngd_named_zips() -> None:
|
|
12
|
+
zip_files = [
|
|
13
|
+
Path("add_gb_builtaddress.zip"),
|
|
14
|
+
Path("AddressBasePremium_FULL_2025-12-15_002.zip"),
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
filtered = _filter_zips_for_source(zip_files, "ngd")
|
|
18
|
+
|
|
19
|
+
assert filtered == [Path("add_gb_builtaddress.zip")]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def test_should_convert_csv_to_parquet_skips_non_ngd_for_ngd_source() -> None:
|
|
23
|
+
ngd_csv = Path("add_gb_builtaddress.csv")
|
|
24
|
+
abp_csv = Path("AddressBasePremium_FULL_2025-12-15_002.csv")
|
|
25
|
+
|
|
26
|
+
assert _should_convert_csv_to_parquet(ngd_csv, "ngd") is True
|
|
27
|
+
assert _should_convert_csv_to_parquet(abp_csv, "ngd") is False
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def test_filter_zips_for_source_excludes_ngd_historicaddress() -> None:
|
|
31
|
+
zip_files = [
|
|
32
|
+
Path("add_gb_builtaddress.zip"),
|
|
33
|
+
Path("add_gb_historicaddress.zip"),
|
|
34
|
+
Path("add_gb_historicaddress_altadd.zip"),
|
|
35
|
+
Path("add_gb_prebuildaddress.zip"),
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
filtered = _filter_zips_for_source(zip_files, "ngd")
|
|
39
|
+
|
|
40
|
+
assert Path("add_gb_builtaddress.zip") in filtered
|
|
41
|
+
assert Path("add_gb_prebuildaddress.zip") in filtered
|
|
42
|
+
assert Path("add_gb_historicaddress.zip") not in filtered
|
|
43
|
+
assert Path("add_gb_historicaddress_altadd.zip") not in filtered
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def test_should_convert_csv_to_parquet_skips_ngd_historicaddress() -> None:
|
|
47
|
+
assert _should_convert_csv_to_parquet(Path("add_gb_builtaddress.csv"), "ngd") is True
|
|
48
|
+
assert _should_convert_csv_to_parquet(Path("add_gb_historicaddress.csv"), "ngd") is False
|
|
49
|
+
assert _should_convert_csv_to_parquet(Path("add_gb_historicaddress_altadd.csv"), "ngd") is False
|
|
@@ -8,7 +8,7 @@ from typing import Any, Literal
|
|
|
8
8
|
import yaml
|
|
9
9
|
|
|
10
10
|
from ukam_os_builder.api.settings import Settings, SettingsError, load_settings
|
|
11
|
-
from ukam_os_builder.os_builder.os_hub import get_package_version
|
|
11
|
+
from ukam_os_builder.os_builder.os_hub import _get_manifest_path, get_package_version
|
|
12
12
|
from ukam_os_builder.pipeline import run as run_pipeline
|
|
13
13
|
from ukam_os_builder.pipeline import supported_steps_for_source
|
|
14
14
|
|
|
@@ -333,11 +333,6 @@ def run_from_config(
|
|
|
333
333
|
parquet_compression_level=parquet_compression_level,
|
|
334
334
|
)
|
|
335
335
|
logger.info("Resolved work_dir: %s", settings.paths.work_dir)
|
|
336
|
-
logger.info("Resolved downloads_dir: %s", settings.paths.downloads_dir)
|
|
337
|
-
logger.info("Resolved extracted_dir: %s", settings.paths.extracted_dir)
|
|
338
|
-
logger.info("Resolved parquet_dir: %s", settings.paths.parquet_dir)
|
|
339
|
-
logger.info("Resolved output_dir: %s", settings.paths.output_dir)
|
|
340
|
-
|
|
341
336
|
source_type = settings.source.type
|
|
342
337
|
if step != "all":
|
|
343
338
|
supported_steps = supported_steps_for_source(source_type)
|
|
@@ -353,4 +348,18 @@ def run_from_config(
|
|
|
353
348
|
|
|
354
349
|
overwrite_effective = overwrite if overwrite is not None else bool(force)
|
|
355
350
|
run_pipeline(step=step, settings=settings, force=overwrite_effective, list_only=list_only)
|
|
351
|
+
|
|
352
|
+
logger.info(
|
|
353
|
+
"✅ Pipeline run completed\n\n"
|
|
354
|
+
"Where you need to look:\n"
|
|
355
|
+
" • downloads_dir (raw OS Hub extracts): %s%s\n"
|
|
356
|
+
" • output_dir (final files for address matcher): %s%s\n",
|
|
357
|
+
str(settings.paths.downloads_dir),
|
|
358
|
+
"",
|
|
359
|
+
str(settings.paths.output_dir),
|
|
360
|
+
"",
|
|
361
|
+
)
|
|
362
|
+
|
|
363
|
+
_get_manifest_path(settings)
|
|
364
|
+
|
|
356
365
|
return settings
|
|
@@ -145,7 +145,6 @@ def main(argv: list[str] | None = None) -> int:
|
|
|
145
145
|
parquet_compression=args.parquet_compression,
|
|
146
146
|
parquet_compression_level=args.parquet_compression_level,
|
|
147
147
|
)
|
|
148
|
-
logger.info("Pipeline run completed")
|
|
149
148
|
console.print("[bold green]Build completed successfully[/bold green]")
|
|
150
149
|
return 0
|
|
151
150
|
except (SettingsError, ValueError) as exc:
|
|
@@ -33,7 +33,7 @@ def combine_and_dedupe(con: duckdb.DuckDBPyConnection) -> duckdb.DuckDBPyRelatio
|
|
|
33
33
|
),
|
|
34
34
|
ranked AS (
|
|
35
35
|
SELECT *,
|
|
36
|
-
CASE logical_status WHEN 1 THEN 0 WHEN 3 THEN 1 WHEN 6 THEN 2 WHEN 8 THEN 3 ELSE 9 END AS status_rank,
|
|
36
|
+
CASE logical_status WHEN 1 THEN 0 WHEN 3 THEN 1 WHEN 6 THEN 2 ELSE 9 END AS status_rank,
|
|
37
37
|
CASE source WHEN 'LPI' THEN 0 WHEN 'ORGANISATION' THEN 1 WHEN 'DELIVERY_POINT' THEN 2 WHEN 'CUSTOM_LEVEL' THEN 3 ELSE 4 END AS source_rank
|
|
38
38
|
FROM normalized
|
|
39
39
|
),
|
|
@@ -70,15 +70,15 @@ matching messy user input. We output variants based on **Logical Status**:
|
|
|
70
70
|
locally known as "Rose Cottage").
|
|
71
71
|
3. **Provisional (6):** The address assigned during planning/construction, which
|
|
72
72
|
might change before the house is built.
|
|
73
|
-
|
|
74
|
-
|
|
73
|
+
|
|
74
|
+
Historic addresses (logical_status=8) are excluded from output.
|
|
75
75
|
|
|
76
76
|
------------------------------------------------------------------------------
|
|
77
77
|
Key Columns Explained
|
|
78
78
|
------------------------------------------------------------------------------
|
|
79
79
|
* `uprn`: The "Golden Key". Use this to link this address to other data.
|
|
80
80
|
* `base_address`: The constructed full address string.
|
|
81
|
-
* `logical_status`: 1=Current, 6=Provisional
|
|
81
|
+
* `logical_status`: 1=Current, 6=Provisional.
|
|
82
82
|
* `official_flag`: 'Y' indicates this is the "official" version, 'N' suggests
|
|
83
83
|
it might be an unofficial alias.
|
|
84
84
|
* `language`: 'ENG' (English) or 'CYM' (Welsh). Streets in Wales often have
|
|
@@ -183,7 +183,6 @@ def prepare_lpi_base(con: duckdb.DuckDBPyConnection) -> None:
|
|
|
183
183
|
WHEN 1 THEN 0
|
|
184
184
|
WHEN 3 THEN 1
|
|
185
185
|
WHEN 6 THEN 2
|
|
186
|
-
WHEN 8 THEN 3
|
|
187
186
|
ELSE 9
|
|
188
187
|
END AS status_rank
|
|
189
188
|
FROM lpi l
|
|
@@ -192,7 +191,7 @@ def prepare_lpi_base(con: duckdb.DuckDBPyConnection) -> None:
|
|
|
192
191
|
LEFT JOIN _sd_best_by_lang sd_lang ON sd_lang.usrn = l.usrn AND sd_lang.language = l.language
|
|
193
192
|
LEFT JOIN _sd_best_any sd_any ON sd_any.usrn = l.usrn
|
|
194
193
|
WHERE (b.addressbase_postal != 'N' OR b.addressbase_postal IS NULL)
|
|
195
|
-
AND l.logical_status IN (1, 3, 6, 8)
|
|
194
|
+
AND l.logical_status IN (1, 3, 6)
|
|
196
195
|
""")
|
|
197
196
|
|
|
198
197
|
# Deduplicated distinct addresses
|
|
@@ -266,7 +265,6 @@ def render_variants(con: duckdb.DuckDBPyConnection) -> None:
|
|
|
266
265
|
WHEN 1 THEN 'APPROVED'
|
|
267
266
|
WHEN 3 THEN 'ALTERNATIVE'
|
|
268
267
|
WHEN 6 THEN 'PROVISIONAL'
|
|
269
|
-
WHEN 8 THEN 'HISTORICAL'
|
|
270
268
|
END AS variant_label,
|
|
271
269
|
(logical_status = 1) AS is_primary
|
|
272
270
|
FROM lpi_base_distinct
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
Transforms the extracted parquet files into a single flatfile suitable for
|
|
4
4
|
UK address matching. This includes:
|
|
5
|
-
- Processing core feature types (Built Address,
|
|
5
|
+
- Processing core feature types (Built Address, Pre-Build Address, etc.)
|
|
6
6
|
- Processing alternate address records
|
|
7
7
|
- Processing Royal Mail addresses
|
|
8
8
|
- Handling Welsh language variants
|
|
@@ -27,8 +27,6 @@ logger = logging.getLogger(__name__)
|
|
|
27
27
|
FEATURE_TYPE_BY_STEM = {
|
|
28
28
|
"add_gb_builtaddress": "Built Address",
|
|
29
29
|
"add_gb_builtaddress_altadd": "Built Address",
|
|
30
|
-
"add_gb_historicaddress": "Historic Address",
|
|
31
|
-
"add_gb_historicaddress_altadd": "Historic Address",
|
|
32
30
|
"add_gb_nonaddressableobject": "Non-Addressable Object",
|
|
33
31
|
"add_gb_nonaddressableobject_altadd": "Non-Addressable Object",
|
|
34
32
|
"add_gb_prebuildaddress": "Pre-Build Address",
|
|
@@ -39,7 +37,6 @@ FEATURE_TYPE_BY_STEM = {
|
|
|
39
37
|
# Core feature stems (contain fulladdress and classification fields)
|
|
40
38
|
CORE_FEATURE_STEMS = {
|
|
41
39
|
"add_gb_builtaddress",
|
|
42
|
-
"add_gb_historicaddress",
|
|
43
40
|
"add_gb_nonaddressableobject",
|
|
44
41
|
"add_gb_prebuildaddress",
|
|
45
42
|
}
|
|
@@ -47,7 +44,6 @@ CORE_FEATURE_STEMS = {
|
|
|
47
44
|
# Alternate address stems (no classification fields)
|
|
48
45
|
ALTADD_STEMS = {
|
|
49
46
|
"add_gb_builtaddress_altadd",
|
|
50
|
-
"add_gb_historicaddress_altadd",
|
|
51
47
|
"add_gb_nonaddressableobject_altadd",
|
|
52
48
|
"add_gb_prebuildaddress_altadd",
|
|
53
49
|
}
|
|
@@ -57,7 +53,6 @@ CORE_FEATURE_PRIORITY = {
|
|
|
57
53
|
"add_gb_builtaddress": 1,
|
|
58
54
|
"add_gb_prebuildaddress": 2,
|
|
59
55
|
"add_gb_nonaddressableobject": 3,
|
|
60
|
-
"add_gb_historicaddress": 4,
|
|
61
56
|
}
|
|
62
57
|
|
|
63
58
|
|
|
@@ -71,7 +66,7 @@ def _create_metadata_lookup_view(
|
|
|
71
66
|
This view is used to enrich Royal Mail and alternate address records
|
|
72
67
|
with metadata (classificationcode, parentuprn, etc.) by UPRN lookup.
|
|
73
68
|
|
|
74
|
-
Uses priority ranking (Built > Pre-Build > Non-Addressable > Historic)
|
|
69
|
+
Uses priority ranking (Built > Pre-Build > Non-Addressable)
|
|
75
70
|
to dedupe when a UPRN exists in multiple core files.
|
|
76
71
|
|
|
77
72
|
Args:
|
|
@@ -156,7 +151,7 @@ def _create_core_feature_view(
|
|
|
156
151
|
parquet_path: Path,
|
|
157
152
|
uprn_predicate: str | None = None,
|
|
158
153
|
) -> None:
|
|
159
|
-
"""Create view for core feature types (Built,
|
|
154
|
+
"""Create view for core feature types (Built, Pre-Build, Non-Addressable).
|
|
160
155
|
|
|
161
156
|
These tables have fulladdress, classification fields, and Welsh language columns.
|
|
162
157
|
Produces both English and Welsh (where available) address records.
|
|
@@ -413,11 +408,76 @@ def _enrich_with_metadata(con: duckdb.DuckDBPyConnection) -> None:
|
|
|
413
408
|
con.execute(sql)
|
|
414
409
|
|
|
415
410
|
|
|
411
|
+
def _create_custom_level_rows(con: duckdb.DuckDBPyConnection) -> None:
|
|
412
|
+
"""Generate custom level-based address variants and insert into enriched table.
|
|
413
|
+
|
|
414
|
+
Parses the ``floorlevel`` column (VARCHAR) from the enriched address table,
|
|
415
|
+
maps integer floor levels to words (-1=BASEMENT … 6=SIXTH), and prepends the
|
|
416
|
+
word to the existing ``address_concat`` to create additional address variants.
|
|
417
|
+
|
|
418
|
+
These rows use ``feature_type='Custom Level'`` so they receive the lowest
|
|
419
|
+
dedup priority and never override official address data.
|
|
420
|
+
"""
|
|
421
|
+
sql = """
|
|
422
|
+
INSERT INTO all_full_addresses_enriched
|
|
423
|
+
WITH level_parsed AS (
|
|
424
|
+
SELECT
|
|
425
|
+
uprn, address_concat, postcode, filename,
|
|
426
|
+
classificationcode, parentuprn, rootuprn,
|
|
427
|
+
hierarchylevel, floorlevel, lowestfloorlevel, highestfloorlevel,
|
|
428
|
+
address_status, build_status,
|
|
429
|
+
CASE
|
|
430
|
+
WHEN split_part(floorlevel, ',', 1) ~ '^-?[0-9]+$'
|
|
431
|
+
THEN CAST(split_part(floorlevel, ',', 1) AS INTEGER)
|
|
432
|
+
ELSE NULL
|
|
433
|
+
END AS level_int
|
|
434
|
+
FROM all_full_addresses_enriched
|
|
435
|
+
WHERE floorlevel IS NOT NULL
|
|
436
|
+
AND address_concat IS NOT NULL
|
|
437
|
+
AND address_concat <> ''
|
|
438
|
+
),
|
|
439
|
+
level_words AS (
|
|
440
|
+
SELECT
|
|
441
|
+
*,
|
|
442
|
+
CASE level_int
|
|
443
|
+
WHEN -1 THEN 'BASEMENT'
|
|
444
|
+
WHEN 0 THEN 'GROUND'
|
|
445
|
+
WHEN 1 THEN 'FIRST'
|
|
446
|
+
WHEN 2 THEN 'SECOND'
|
|
447
|
+
WHEN 3 THEN 'THIRD'
|
|
448
|
+
WHEN 4 THEN 'FOURTH'
|
|
449
|
+
WHEN 5 THEN 'FIFTH'
|
|
450
|
+
WHEN 6 THEN 'SIXTH'
|
|
451
|
+
END AS level_word
|
|
452
|
+
FROM level_parsed
|
|
453
|
+
WHERE level_int BETWEEN -1 AND 6
|
|
454
|
+
)
|
|
455
|
+
SELECT
|
|
456
|
+
uprn,
|
|
457
|
+
TRIM(concat(level_word, ' ', address_concat)) AS address_concat,
|
|
458
|
+
postcode,
|
|
459
|
+
'CUSTOM_LEVEL' AS filename,
|
|
460
|
+
classificationcode,
|
|
461
|
+
parentuprn,
|
|
462
|
+
rootuprn,
|
|
463
|
+
hierarchylevel,
|
|
464
|
+
floorlevel,
|
|
465
|
+
lowestfloorlevel,
|
|
466
|
+
highestfloorlevel,
|
|
467
|
+
'Custom Level' AS feature_type,
|
|
468
|
+
address_status,
|
|
469
|
+
build_status
|
|
470
|
+
FROM level_words
|
|
471
|
+
WHERE level_word IS NOT NULL;
|
|
472
|
+
"""
|
|
473
|
+
con.execute(sql)
|
|
474
|
+
|
|
475
|
+
|
|
416
476
|
def _create_dedup_view(con: duckdb.DuckDBPyConnection) -> None:
|
|
417
477
|
"""Create deduplicated view of all addresses.
|
|
418
478
|
|
|
419
479
|
Priority rules for deduplication:
|
|
420
|
-
- Feature type: Built Address -> Pre-Build -> Royal Mail -> Historic -> Non-Addressable
|
|
480
|
+
- Feature type: Built Address -> Pre-Build -> Royal Mail -> Non-Addressable
|
|
421
481
|
- Address status: Approved -> Provisional -> Alternative -> Historical
|
|
422
482
|
- Build status: Built Complete -> Under Construction -> Prebuild -> Historic -> Demolished
|
|
423
483
|
|
|
@@ -433,8 +493,8 @@ def _create_dedup_view(con: duckdb.DuckDBPyConnection) -> None:
|
|
|
433
493
|
WHEN 'Built Address' THEN 1
|
|
434
494
|
WHEN 'Pre-Build Address' THEN 2
|
|
435
495
|
WHEN 'Royal Mail Address' THEN 3
|
|
436
|
-
WHEN 'Historic Address' THEN 4
|
|
437
496
|
WHEN 'Non-Addressable Object' THEN 5
|
|
497
|
+
WHEN 'Custom Level' THEN 6
|
|
438
498
|
ELSE 9
|
|
439
499
|
END AS feature_type_rank,
|
|
440
500
|
CASE
|
|
@@ -460,7 +520,7 @@ def _create_dedup_view(con: duckdb.DuckDBPyConnection) -> None:
|
|
|
460
520
|
build_status_rank
|
|
461
521
|
) AS rn
|
|
462
522
|
FROM all_full_addresses_enriched
|
|
463
|
-
WHERE feature_type
|
|
523
|
+
WHERE feature_type NOT IN ('Non-Addressable Object')
|
|
464
524
|
)
|
|
465
525
|
SELECT
|
|
466
526
|
uprn,
|
|
@@ -641,6 +701,10 @@ def run_flatfile_step(settings: Settings, force: bool = False) -> list[Path]:
|
|
|
641
701
|
logger.info("Enriching addresses with metadata from core files...")
|
|
642
702
|
_enrich_with_metadata(con)
|
|
643
703
|
|
|
704
|
+
# Generate custom level variants
|
|
705
|
+
logger.info("Generating custom level address variants...")
|
|
706
|
+
_create_custom_level_rows(con)
|
|
707
|
+
|
|
644
708
|
# Create deduplicated view
|
|
645
709
|
logger.info("Creating deduplicated view...")
|
|
646
710
|
_create_dedup_view(con)
|
{ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/os_builder/extract.py
RENAMED
|
@@ -11,6 +11,9 @@ from ukam_os_builder.api.settings import Settings
|
|
|
11
11
|
|
|
12
12
|
logger = logging.getLogger(__name__)
|
|
13
13
|
|
|
14
|
+
# NGD file stems to exclude (historic addresses are not used in output)
|
|
15
|
+
_NGD_EXCLUDED_STEMS = {"historicaddress"}
|
|
16
|
+
|
|
14
17
|
|
|
15
18
|
def find_downloaded_zips(downloads_dir: Path) -> list[Path]:
|
|
16
19
|
"""Find all downloaded zip files in a directory."""
|
|
@@ -22,11 +25,20 @@ def find_downloaded_zips(downloads_dir: Path) -> list[Path]:
|
|
|
22
25
|
return zip_files
|
|
23
26
|
|
|
24
27
|
|
|
28
|
+
def _is_excluded_ngd_file(name: str) -> bool:
|
|
29
|
+
"""Return True if *name* matches an excluded NGD stem (e.g. historicaddress)."""
|
|
30
|
+
name_lower = name.lower()
|
|
31
|
+
return any(stem in name_lower for stem in _NGD_EXCLUDED_STEMS)
|
|
32
|
+
|
|
33
|
+
|
|
25
34
|
def _filter_zips_for_source(zip_files: list[Path], source: str) -> list[Path]:
|
|
26
35
|
source_lower = source.lower()
|
|
27
36
|
if source_lower == "ngd":
|
|
28
37
|
ngd_zips = [
|
|
29
|
-
zip_path
|
|
38
|
+
zip_path
|
|
39
|
+
for zip_path in zip_files
|
|
40
|
+
if zip_path.name.lower().startswith("add_gb_")
|
|
41
|
+
and not _is_excluded_ngd_file(zip_path.name)
|
|
30
42
|
]
|
|
31
43
|
return ngd_zips or zip_files
|
|
32
44
|
if source_lower == "abp":
|
|
@@ -39,7 +51,8 @@ def _filter_zips_for_source(zip_files: list[Path], source: str) -> list[Path]:
|
|
|
39
51
|
|
|
40
52
|
def _should_convert_csv_to_parquet(csv_path: Path, source: str) -> bool:
|
|
41
53
|
if source.lower() == "ngd":
|
|
42
|
-
|
|
54
|
+
name_lower = csv_path.name.lower()
|
|
55
|
+
return name_lower.startswith("add_gb_") and not _is_excluded_ngd_file(name_lower)
|
|
43
56
|
return True
|
|
44
57
|
|
|
45
58
|
|
{ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/os_builder/os_hub.py
RENAMED
|
@@ -9,9 +9,25 @@ from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse
|
|
|
9
9
|
|
|
10
10
|
import requests
|
|
11
11
|
|
|
12
|
+
from ukam_os_builder.api.settings import Settings
|
|
13
|
+
|
|
12
14
|
logger = logging.getLogger(__name__)
|
|
13
15
|
|
|
14
16
|
API_BASE_URL = "https://api.os.uk/downloads/v1"
|
|
17
|
+
|
|
18
|
+
# NGD file stems to exclude (historic addresses are not used in output)
|
|
19
|
+
_NGD_EXCLUDED_STEMS = {"historicaddress"}
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _should_skip_ngd_download(filename: str, settings: object) -> bool:
|
|
23
|
+
"""Return True if *filename* is an NGD historic-address archive."""
|
|
24
|
+
source_type = getattr(getattr(settings, "source", None), "type", "")
|
|
25
|
+
if source_type != "ngd":
|
|
26
|
+
return False
|
|
27
|
+
name_lower = filename.lower()
|
|
28
|
+
return any(stem in name_lower for stem in _NGD_EXCLUDED_STEMS)
|
|
29
|
+
|
|
30
|
+
|
|
15
31
|
DEFAULT_CHUNK_SIZE = 1024 * 1024 * 20 # 20 MiB
|
|
16
32
|
DEFAULT_CONNECT_TIMEOUT_SECONDS = 30
|
|
17
33
|
DEFAULT_READ_TIMEOUT_SECONDS = 300
|
|
@@ -293,6 +309,11 @@ def run_download_step(
|
|
|
293
309
|
logger.warning("No URL for %s, skipping", item.filename)
|
|
294
310
|
continue
|
|
295
311
|
|
|
312
|
+
# Skip NGD historic address files — they are excluded from output
|
|
313
|
+
if _should_skip_ngd_download(item.filename, settings):
|
|
314
|
+
logger.info("Skipping historic address file: %s", item.filename)
|
|
315
|
+
continue
|
|
316
|
+
|
|
296
317
|
dest_path = downloads_dir / item.filename
|
|
297
318
|
was_downloaded = download_file(
|
|
298
319
|
url=item.url,
|
|
@@ -312,3 +333,54 @@ def run_download_step(
|
|
|
312
333
|
|
|
313
334
|
logger.info("Download complete: %d file(s)", len(downloaded))
|
|
314
335
|
return downloaded
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def _get_manifest_path(settings: Settings) -> Path | None:
|
|
339
|
+
downloads_dir = settings.paths.downloads_dir.resolve()
|
|
340
|
+
source_type = settings.source.type # "abp" | "ngd"
|
|
341
|
+
|
|
342
|
+
if source_type == "abp":
|
|
343
|
+
candidates = list(downloads_dir.glob("*-Order_Details.txt"))
|
|
344
|
+
if not candidates:
|
|
345
|
+
logger.info("➡️ Manifest (ABP order details) not found. Check: %s", downloads_dir)
|
|
346
|
+
return None
|
|
347
|
+
|
|
348
|
+
manifest = max(candidates, key=lambda p: p.stat().st_mtime).resolve()
|
|
349
|
+
|
|
350
|
+
if len(candidates) > 1:
|
|
351
|
+
logger.warning(
|
|
352
|
+
"Multiple ABP manifests found in %s. Using newest: %s",
|
|
353
|
+
downloads_dir,
|
|
354
|
+
manifest,
|
|
355
|
+
)
|
|
356
|
+
|
|
357
|
+
logger.info("➡️ Manifest (ABP order details): %s", manifest)
|
|
358
|
+
return manifest
|
|
359
|
+
|
|
360
|
+
elif source_type == "ngd":
|
|
361
|
+
candidates = list(
|
|
362
|
+
downloads_dir.glob("*_orderSummary.json")
|
|
363
|
+
) # adjust if it's "*.orderSummary.json"
|
|
364
|
+
if not candidates:
|
|
365
|
+
logger.info("➡️ Manifests (NGD order summaries) not found. Check: %s", downloads_dir)
|
|
366
|
+
return None
|
|
367
|
+
|
|
368
|
+
built_candidates = list(downloads_dir.glob("*builtaddress*_orderSummary.json"))
|
|
369
|
+
built_manifest = (
|
|
370
|
+
max(built_candidates, key=lambda p: p.stat().st_mtime).resolve()
|
|
371
|
+
if built_candidates
|
|
372
|
+
else None
|
|
373
|
+
)
|
|
374
|
+
|
|
375
|
+
logger.info(
|
|
376
|
+
"➡️ Manifests (NGD order summaries): %s (%d files)\n"
|
|
377
|
+
" ↳ Built address order summary: %s",
|
|
378
|
+
downloads_dir,
|
|
379
|
+
len(candidates),
|
|
380
|
+
built_manifest if built_manifest else "(not found)",
|
|
381
|
+
)
|
|
382
|
+
|
|
383
|
+
return downloads_dir
|
|
384
|
+
|
|
385
|
+
logger.warning("Unknown source type %r. No manifest lookup performed.", source_type)
|
|
386
|
+
return None
|
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
|
|
5
|
-
from ukam_os_builder.os_builder.extract import (
|
|
6
|
-
_filter_zips_for_source,
|
|
7
|
-
_should_convert_csv_to_parquet,
|
|
8
|
-
)
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
def test_filter_zips_for_source_prefers_ngd_named_zips() -> None:
|
|
12
|
-
zip_files = [
|
|
13
|
-
Path("add_gb_builtaddress.zip"),
|
|
14
|
-
Path("AddressBasePremium_FULL_2025-12-15_002.zip"),
|
|
15
|
-
]
|
|
16
|
-
|
|
17
|
-
filtered = _filter_zips_for_source(zip_files, "ngd")
|
|
18
|
-
|
|
19
|
-
assert filtered == [Path("add_gb_builtaddress.zip")]
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
def test_should_convert_csv_to_parquet_skips_non_ngd_for_ngd_source() -> None:
|
|
23
|
-
ngd_csv = Path("add_gb_builtaddress.csv")
|
|
24
|
-
abp_csv = Path("AddressBasePremium_FULL_2025-12-15_002.csv")
|
|
25
|
-
|
|
26
|
-
assert _should_convert_csv_to_parquet(ngd_csv, "ngd") is True
|
|
27
|
-
assert _should_convert_csv_to_parquet(abp_csv, "ngd") is False
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/.github/workflows/release-pypi.yml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/tests/data/add_gb_builtaddress.csv
RENAMED
|
File without changes
|
{ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/tests/data/add_gb_builtaddress_altadd.csv
RENAMED
|
File without changes
|
{ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/tests/data/add_gb_historicaddress.csv
RENAMED
|
File without changes
|
{ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/tests/data/add_gb_prebuildaddress.csv
RENAMED
|
File without changes
|
{ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/tests/data/add_gb_royalmailaddress.csv
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/tests/test_public_api_integration.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/os_builder/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|