ukam-os-builder 0.1.0.dev4__tar.gz → 0.1.0.dev6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/PKG-INFO +3 -6
  2. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/README.md +2 -5
  3. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/pyproject.toml +1 -1
  4. ukam_os_builder-0.1.0.dev6/tests/test_extract_source_filtering.py +49 -0
  5. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/tests/test_inspect_results.py +2 -2
  6. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/tests/test_public_api_integration.py +1 -1
  7. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/tests/test_smoke.py +4 -8
  8. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/__init__.py +1 -1
  9. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/api/api.py +15 -6
  10. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/cli.py +0 -1
  11. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/transform/runner.py +2 -2
  12. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/transform/stages/combine.py +2 -2
  13. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/transform/stages/lpi.py +4 -6
  14. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/ngd/to_flatfile.py +142 -46
  15. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/os_builder/extract.py +15 -2
  16. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/os_builder/inspect_results.py +17 -17
  17. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/os_builder/os_hub.py +72 -0
  18. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/uv.lock +1 -1
  19. ukam_os_builder-0.1.0.dev4/tests/test_extract_source_filtering.py +0 -27
  20. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/.env.example +0 -0
  21. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/.github/workflows/ci.yml +0 -0
  22. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/.github/workflows/e2e.yml +0 -0
  23. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/.github/workflows/release-pypi.yml +0 -0
  24. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/.gitignore +0 -0
  25. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/AGENTS.md +0 -0
  26. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/config.example.yaml +0 -0
  27. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/prompt.md +0 -0
  28. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/shell/test_release_locally.sh +0 -0
  29. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/tests/data/README.md +0 -0
  30. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/tests/data/add_gb_builtaddress.csv +0 -0
  31. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/tests/data/add_gb_builtaddress_altadd.csv +0 -0
  32. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/tests/data/add_gb_historicaddress.csv +0 -0
  33. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/tests/data/add_gb_prebuildaddress.csv +0 -0
  34. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/tests/data/add_gb_royalmailaddress.csv +0 -0
  35. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/tests/test_api.py +0 -0
  36. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/tests/test_cli.py +0 -0
  37. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/tests/test_cli_errors.py +0 -0
  38. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/tests/test_settings.py +0 -0
  39. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/tests/test_setup_wizard.py +0 -0
  40. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/_exceptions.py +0 -0
  41. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/api/cli_errors.py +0 -0
  42. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/api/settings.py +0 -0
  43. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/schemas/abp_schema.yaml +0 -0
  44. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/split_raw.py +0 -0
  45. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/transform/__init__.py +0 -0
  46. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/transform/common.py +0 -0
  47. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/transform/stages/__init__.py +0 -0
  48. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/transform/stages/business.py +0 -0
  49. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/transform/stages/misc.py +0 -0
  50. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/transform/stages/postal.py +0 -0
  51. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/os_builder/__init__.py +0 -0
  52. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/os_builder/pipeline_factory.py +0 -0
  53. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/pipeline.py +0 -0
  54. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/setup_wizard.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ukam-os-builder
3
- Version: 0.1.0.dev4
3
+ Version: 0.1.0.dev6
4
4
  Summary: Download, process and transform OS address data (NGD or ABP) for UK address matching
5
5
  Project-URL: Homepage, https://github.com/moj-analytical-services/prepare_ngd_for_address_matching
6
6
  Project-URL: Repository, https://github.com/moj-analytical-services/prepare_ngd_for_address_matching
@@ -221,13 +221,10 @@ Each file contains:
221
221
  | `filename` | VARCHAR | Source file name (for example `add_gb_builtaddress.parquet`) |
222
222
  | `classificationcode` | VARCHAR | Property classification code (for example RD06 for residential) |
223
223
  | `parentuprn` | BIGINT | Parent UPRN for hierarchical addresses |
224
- | `rootuprn` | BIGINT | Root UPRN at the top of the hierarchy |
225
- | `hierarchylevel` | INTEGER | Level in the address hierarchy (1 = root) |
224
+ | `lowertierlocalauthoritygsscode` | VARCHAR | Lower-tier local authority GSS code |
226
225
  | `floorlevel` | VARCHAR | Floor level identifier |
227
- | `lowestfloorlevel` | DOUBLE | Lowest floor number |
228
- | `highestfloorlevel` | DOUBLE | Highest floor number |
229
226
 
230
- Metadata columns (`classificationcode`, `parentuprn`, `rootuprn`, `hierarchylevel`, `floorlevel`, `lowestfloorlevel`, `highestfloorlevel`) are enriched via UPRN lookup from core address files. This means Royal Mail addresses and alternate address records receive metadata from their corresponding Built, Historic, or Pre-Build records.
227
+ Metadata used in output (`classificationcode`, `parentuprn`, `lowertierlocalauthoritygsscode`, `floorlevel`) is enriched via UPRN lookup from core address files. This means Royal Mail addresses and alternate address records receive metadata from their corresponding Built or Pre-Build records (Historic Address files are no longer processed). `lowertierlocalauthoritygsscode` is always sourced from Built Address via UPRN lookup.
231
228
 
232
229
  </details>
233
230
 
@@ -195,13 +195,10 @@ Each file contains:
195
195
  | `filename` | VARCHAR | Source file name (for example `add_gb_builtaddress.parquet`) |
196
196
  | `classificationcode` | VARCHAR | Property classification code (for example RD06 for residential) |
197
197
  | `parentuprn` | BIGINT | Parent UPRN for hierarchical addresses |
198
- | `rootuprn` | BIGINT | Root UPRN at the top of the hierarchy |
199
- | `hierarchylevel` | INTEGER | Level in the address hierarchy (1 = root) |
198
+ | `lowertierlocalauthoritygsscode` | VARCHAR | Lower-tier local authority GSS code |
200
199
  | `floorlevel` | VARCHAR | Floor level identifier |
201
- | `lowestfloorlevel` | DOUBLE | Lowest floor number |
202
- | `highestfloorlevel` | DOUBLE | Highest floor number |
203
200
 
204
- Metadata columns (`classificationcode`, `parentuprn`, `rootuprn`, `hierarchylevel`, `floorlevel`, `lowestfloorlevel`, `highestfloorlevel`) are enriched via UPRN lookup from core address files. This means Royal Mail addresses and alternate address records receive metadata from their corresponding Built, Historic, or Pre-Build records.
201
+ Metadata used in output (`classificationcode`, `parentuprn`, `lowertierlocalauthoritygsscode`, `floorlevel`) is enriched via UPRN lookup from core address files. This means Royal Mail addresses and alternate address records receive metadata from their corresponding Built or Pre-Build records (Historic Address files are no longer processed). `lowertierlocalauthoritygsscode` is always sourced from Built Address via UPRN lookup.
205
202
 
206
203
  </details>
207
204
 
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "ukam-os-builder"
3
- version = "0.1.0.dev4"
3
+ version = "0.1.0.dev6"
4
4
  description = "Download, process and transform OS address data (NGD or ABP) for UK address matching"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.10"
@@ -0,0 +1,49 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+
5
+ from ukam_os_builder.os_builder.extract import (
6
+ _filter_zips_for_source,
7
+ _should_convert_csv_to_parquet,
8
+ )
9
+
10
+
11
+ def test_filter_zips_for_source_prefers_ngd_named_zips() -> None:
12
+ zip_files = [
13
+ Path("add_gb_builtaddress.zip"),
14
+ Path("AddressBasePremium_FULL_2025-12-15_002.zip"),
15
+ ]
16
+
17
+ filtered = _filter_zips_for_source(zip_files, "ngd")
18
+
19
+ assert filtered == [Path("add_gb_builtaddress.zip")]
20
+
21
+
22
+ def test_should_convert_csv_to_parquet_skips_non_ngd_for_ngd_source() -> None:
23
+ ngd_csv = Path("add_gb_builtaddress.csv")
24
+ abp_csv = Path("AddressBasePremium_FULL_2025-12-15_002.csv")
25
+
26
+ assert _should_convert_csv_to_parquet(ngd_csv, "ngd") is True
27
+ assert _should_convert_csv_to_parquet(abp_csv, "ngd") is False
28
+
29
+
30
+ def test_filter_zips_for_source_excludes_ngd_historicaddress() -> None:
31
+ zip_files = [
32
+ Path("add_gb_builtaddress.zip"),
33
+ Path("add_gb_historicaddress.zip"),
34
+ Path("add_gb_historicaddress_altadd.zip"),
35
+ Path("add_gb_prebuildaddress.zip"),
36
+ ]
37
+
38
+ filtered = _filter_zips_for_source(zip_files, "ngd")
39
+
40
+ assert Path("add_gb_builtaddress.zip") in filtered
41
+ assert Path("add_gb_prebuildaddress.zip") in filtered
42
+ assert Path("add_gb_historicaddress.zip") not in filtered
43
+ assert Path("add_gb_historicaddress_altadd.zip") not in filtered
44
+
45
+
46
+ def test_should_convert_csv_to_parquet_skips_ngd_historicaddress() -> None:
47
+ assert _should_convert_csv_to_parquet(Path("add_gb_builtaddress.csv"), "ngd") is True
48
+ assert _should_convert_csv_to_parquet(Path("add_gb_historicaddress.csv"), "ngd") is False
49
+ assert _should_convert_csv_to_parquet(Path("add_gb_historicaddress_altadd.csv"), "ngd") is False
@@ -26,7 +26,7 @@ def test_inspect_flatfile_variants_uses_config_defaults(tmp_path: Path) -> None:
26
26
  (1001::BIGINT, 'A'::VARCHAR),
27
27
  (1001::BIGINT, 'B'::VARCHAR),
28
28
  (1002::BIGINT, 'C'::VARCHAR)
29
- ) AS t(uprn, address_concat)
29
+ ) AS t(unique_id, address_concat)
30
30
  ) TO '{parquet_path.as_posix()}' (FORMAT PARQUET)
31
31
  """
32
32
  )
@@ -61,7 +61,7 @@ def test_inspect_flatfile_variants_supports_abp_pattern(tmp_path: Path) -> None:
61
61
  (2001::BIGINT, 'A'::VARCHAR),
62
62
  (2002::BIGINT, 'B'::VARCHAR),
63
63
  (2002::BIGINT, 'C'::VARCHAR)
64
- ) AS t(uprn, address_concat)
64
+ ) AS t(unique_id, address_concat)
65
65
  ) TO '{parquet_path.as_posix()}' (FORMAT PARQUET)
66
66
  """
67
67
  )
@@ -71,7 +71,7 @@ def test_package_root_inspect_flatfile_variants(tmp_path: Path) -> None:
71
71
  (4001::BIGINT, 'A'::VARCHAR),
72
72
  (4001::BIGINT, 'B'::VARCHAR),
73
73
  (4002::BIGINT, 'C'::VARCHAR)
74
- ) AS t(uprn, address_concat)
74
+ ) AS t(unique_id, address_concat)
75
75
  ) TO '{parquet_path.as_posix()}' (FORMAT PARQUET)
76
76
  """
77
77
  )
@@ -121,7 +121,6 @@ def _prepare_test_parquet(settings: Settings) -> None:
121
121
  "add_gb_builtaddress_altadd.csv",
122
122
  "add_gb_royalmailaddress.csv",
123
123
  "add_gb_prebuildaddress.csv",
124
- "add_gb_historicaddress.csv",
125
124
  ]
126
125
 
127
126
  for csv_name in sample_files:
@@ -173,17 +172,14 @@ def test_flatfile_single_chunk(temp_settings: Settings) -> None:
173
172
  column_names = [row[0] for row in schema]
174
173
 
175
174
  expected_columns = [
176
- "uprn",
175
+ "unique_id",
177
176
  "address_concat",
178
177
  "postcode",
179
178
  "filename",
180
179
  "classificationcode",
181
180
  "parentuprn",
182
- "rootuprn",
183
- "hierarchylevel",
181
+ "lowertierlocalauthoritygsscode",
184
182
  "floorlevel",
185
- "lowestfloorlevel",
186
- "highestfloorlevel",
187
183
  ]
188
184
  for col in expected_columns:
189
185
  assert col in column_names, f"Column {col} should exist in output"
@@ -232,9 +228,9 @@ def test_deduplication(temp_settings: Settings) -> None:
232
228
  # Verify no exact duplicates
233
229
  con = duckdb.connect()
234
230
  result = con.execute(f"""
235
- SELECT uprn, address_concat, COUNT(*) as cnt
231
+ SELECT unique_id, address_concat, COUNT(*) as cnt
236
232
  FROM read_parquet('{output_files[0].as_posix()}')
237
- GROUP BY uprn, address_concat
233
+ GROUP BY unique_id, address_concat
238
234
  HAVING COUNT(*) > 1
239
235
  """).fetchall()
240
236
 
@@ -8,7 +8,7 @@ from ukam_os_builder.os_builder.inspect_results import (
8
8
  inspect_flatfile_variants,
9
9
  )
10
10
 
11
- __version__ = "0.1.0.dev4"
11
+ __version__ = "0.1.0.dev6"
12
12
 
13
13
  __all__ = [
14
14
  "create_config_and_env",
@@ -8,7 +8,7 @@ from typing import Any, Literal
8
8
  import yaml
9
9
 
10
10
  from ukam_os_builder.api.settings import Settings, SettingsError, load_settings
11
- from ukam_os_builder.os_builder.os_hub import get_package_version
11
+ from ukam_os_builder.os_builder.os_hub import _get_manifest_path, get_package_version
12
12
  from ukam_os_builder.pipeline import run as run_pipeline
13
13
  from ukam_os_builder.pipeline import supported_steps_for_source
14
14
 
@@ -333,11 +333,6 @@ def run_from_config(
333
333
  parquet_compression_level=parquet_compression_level,
334
334
  )
335
335
  logger.info("Resolved work_dir: %s", settings.paths.work_dir)
336
- logger.info("Resolved downloads_dir: %s", settings.paths.downloads_dir)
337
- logger.info("Resolved extracted_dir: %s", settings.paths.extracted_dir)
338
- logger.info("Resolved parquet_dir: %s", settings.paths.parquet_dir)
339
- logger.info("Resolved output_dir: %s", settings.paths.output_dir)
340
-
341
336
  source_type = settings.source.type
342
337
  if step != "all":
343
338
  supported_steps = supported_steps_for_source(source_type)
@@ -353,4 +348,18 @@ def run_from_config(
353
348
 
354
349
  overwrite_effective = overwrite if overwrite is not None else bool(force)
355
350
  run_pipeline(step=step, settings=settings, force=overwrite_effective, list_only=list_only)
351
+
352
+ logger.info(
353
+ "✅ Pipeline run completed\n\n"
354
+ "Where you need to look:\n"
355
+ " • downloads_dir (raw OS Hub extracts): %s%s\n"
356
+ " • output_dir (final files for address matcher): %s%s\n",
357
+ str(settings.paths.downloads_dir),
358
+ "",
359
+ str(settings.paths.output_dir),
360
+ "",
361
+ )
362
+
363
+ _get_manifest_path(settings)
364
+
356
365
  return settings
@@ -145,7 +145,6 @@ def main(argv: list[str] | None = None) -> int:
145
145
  parquet_compression=args.parquet_compression,
146
146
  parquet_compression_level=args.parquet_compression_level,
147
147
  )
148
- logger.info("Pipeline run completed")
149
148
  console.print("[bold green]Build completed successfully[/bold green]")
150
149
  return 0
151
150
  except (SettingsError, ValueError) as exc:
@@ -170,7 +170,7 @@ def _transform_to_flatfile_chunk(
170
170
  logger.debug("Combination and deduplication in %.2f seconds", perf_counter() - t0)
171
171
 
172
172
  # Get chunk metrics
173
- chunk_metrics = con.execute("SELECT COUNT(DISTINCT uprn), COUNT(*) FROM result").fetchone()
173
+ chunk_metrics = con.execute("SELECT COUNT(DISTINCT unique_id), COUNT(*) FROM result").fetchone()
174
174
  chunk_uprns = chunk_metrics[0]
175
175
  chunk_rows = chunk_metrics[1]
176
176
 
@@ -244,7 +244,7 @@ def transform_to_flatfile(
244
244
  con = create_duckdb_connection(settings)
245
245
  output_path = output_paths[0]
246
246
  stats = con.execute(f"""
247
- SELECT COUNT(DISTINCT uprn), COUNT(*)
247
+ SELECT COUNT(DISTINCT unique_id), COUNT(*)
248
248
  FROM read_parquet('{output_path.as_posix()}')
249
249
  """).fetchone()
250
250
  total_uprns = stats[0]
@@ -33,7 +33,7 @@ def combine_and_dedupe(con: duckdb.DuckDBPyConnection) -> duckdb.DuckDBPyRelatio
33
33
  ),
34
34
  ranked AS (
35
35
  SELECT *,
36
- CASE logical_status WHEN 1 THEN 0 WHEN 3 THEN 1 WHEN 6 THEN 2 WHEN 8 THEN 3 ELSE 9 END AS status_rank,
36
+ CASE logical_status WHEN 1 THEN 0 WHEN 3 THEN 1 WHEN 6 THEN 2 ELSE 9 END AS status_rank,
37
37
  CASE source WHEN 'LPI' THEN 0 WHEN 'ORGANISATION' THEN 1 WHEN 'DELIVERY_POINT' THEN 2 WHEN 'CUSTOM_LEVEL' THEN 3 ELSE 4 END AS source_rank
38
38
  FROM normalized
39
39
  ),
@@ -62,7 +62,7 @@ def combine_and_dedupe(con: duckdb.DuckDBPyConnection) -> duckdb.DuckDBPyRelatio
62
62
  FROM deduped_filtered
63
63
  )
64
64
  SELECT
65
- sr.uprn,
65
+ sr.uprn AS unique_id,
66
66
  sr.postcode,
67
67
  sr.address_concat,
68
68
  cb.classification_code,
@@ -70,15 +70,15 @@ matching messy user input. We output variants based on **Logical Status**:
70
70
  locally known as "Rose Cottage").
71
71
  3. **Provisional (6):** The address assigned during planning/construction, which
72
72
  might change before the house is built.
73
- 4. **Historic (8):** An old address. If "10 High St" is renumbered to "12 High St",
74
- the old address is kept as Historic. This helps match old datasets.
73
+
74
+ Historic addresses (logical_status=8) are excluded from output.
75
75
 
76
76
  ------------------------------------------------------------------------------
77
77
  Key Columns Explained
78
78
  ------------------------------------------------------------------------------
79
79
  * `uprn`: The "Golden Key". Use this to link this address to other data.
80
80
  * `base_address`: The constructed full address string.
81
- * `logical_status`: 1=Current, 6=Provisional, 8=Historic.
81
+ * `logical_status`: 1=Current, 3=Alternative, 6=Provisional.
82
82
  * `official_flag`: 'Y' indicates this is the "official" version, 'N' suggests
83
83
  it might be an unofficial alias.
84
84
  * `language`: 'ENG' (English) or 'CYM' (Welsh). Streets in Wales often have
@@ -183,7 +183,6 @@ def prepare_lpi_base(con: duckdb.DuckDBPyConnection) -> None:
183
183
  WHEN 1 THEN 0
184
184
  WHEN 3 THEN 1
185
185
  WHEN 6 THEN 2
186
- WHEN 8 THEN 3
187
186
  ELSE 9
188
187
  END AS status_rank
189
188
  FROM lpi l
@@ -192,7 +191,7 @@ def prepare_lpi_base(con: duckdb.DuckDBPyConnection) -> None:
192
191
  LEFT JOIN _sd_best_by_lang sd_lang ON sd_lang.usrn = l.usrn AND sd_lang.language = l.language
193
192
  LEFT JOIN _sd_best_any sd_any ON sd_any.usrn = l.usrn
194
193
  WHERE (b.addressbase_postal != 'N' OR b.addressbase_postal IS NULL)
195
- AND l.logical_status IN (1, 3, 6, 8)
194
+ AND l.logical_status IN (1, 3, 6)
196
195
  """)
197
196
 
198
197
  # Deduplicated distinct addresses
@@ -266,7 +265,6 @@ def render_variants(con: duckdb.DuckDBPyConnection) -> None:
266
265
  WHEN 1 THEN 'APPROVED'
267
266
  WHEN 3 THEN 'ALTERNATIVE'
268
267
  WHEN 6 THEN 'PROVISIONAL'
269
- WHEN 8 THEN 'HISTORICAL'
270
268
  END AS variant_label,
271
269
  (logical_status = 1) AS is_primary
272
270
  FROM lpi_base_distinct
@@ -2,7 +2,7 @@
2
2
 
3
3
  Transforms the extracted parquet files into a single flatfile suitable for
4
4
  UK address matching. This includes:
5
- - Processing core feature types (Built Address, Historic Address, etc.)
5
+ - Processing core feature types (Built Address, Pre-Build Address, etc.)
6
6
  - Processing alternate address records
7
7
  - Processing Royal Mail addresses
8
8
  - Handling Welsh language variants
@@ -27,8 +27,6 @@ logger = logging.getLogger(__name__)
27
27
  FEATURE_TYPE_BY_STEM = {
28
28
  "add_gb_builtaddress": "Built Address",
29
29
  "add_gb_builtaddress_altadd": "Built Address",
30
- "add_gb_historicaddress": "Historic Address",
31
- "add_gb_historicaddress_altadd": "Historic Address",
32
30
  "add_gb_nonaddressableobject": "Non-Addressable Object",
33
31
  "add_gb_nonaddressableobject_altadd": "Non-Addressable Object",
34
32
  "add_gb_prebuildaddress": "Pre-Build Address",
@@ -39,7 +37,6 @@ FEATURE_TYPE_BY_STEM = {
39
37
  # Core feature stems (contain fulladdress and classification fields)
40
38
  CORE_FEATURE_STEMS = {
41
39
  "add_gb_builtaddress",
42
- "add_gb_historicaddress",
43
40
  "add_gb_nonaddressableobject",
44
41
  "add_gb_prebuildaddress",
45
42
  }
@@ -47,7 +44,6 @@ CORE_FEATURE_STEMS = {
47
44
  # Alternate address stems (no classification fields)
48
45
  ALTADD_STEMS = {
49
46
  "add_gb_builtaddress_altadd",
50
- "add_gb_historicaddress_altadd",
51
47
  "add_gb_nonaddressableobject_altadd",
52
48
  "add_gb_prebuildaddress_altadd",
53
49
  }
@@ -57,7 +53,6 @@ CORE_FEATURE_PRIORITY = {
57
53
  "add_gb_builtaddress": 1,
58
54
  "add_gb_prebuildaddress": 2,
59
55
  "add_gb_nonaddressableobject": 3,
60
- "add_gb_historicaddress": 4,
61
56
  }
62
57
 
63
58
 
@@ -71,7 +66,7 @@ def _create_metadata_lookup_view(
71
66
  This view is used to enrich Royal Mail and alternate address records
72
67
  with metadata (classificationcode, parentuprn, etc.) by UPRN lookup.
73
68
 
74
- Uses priority ranking (Built > Pre-Build > Non-Addressable > Historic)
69
+ Uses priority ranking (Built > Pre-Build > Non-Addressable)
75
70
  to dedupe when a UPRN exists in multiple core files.
76
71
 
77
72
  Args:
@@ -102,7 +97,6 @@ def _create_metadata_lookup_view(
102
97
  """)
103
98
 
104
99
  if not union_parts:
105
- # No core files found - create empty lookup
106
100
  logger.warning("No core feature files found. Metadata lookup will be empty.")
107
101
  con.execute("""
108
102
  CREATE OR REPLACE TEMP VIEW uprn_metadata_lookup AS
@@ -117,37 +111,48 @@ def _create_metadata_lookup_view(
117
111
  CAST(NULL AS DOUBLE) AS highestfloorlevel
118
112
  WHERE 1=0
119
113
  """)
120
- return
121
-
122
- union_sql = "\nUNION ALL\n".join(union_parts)
114
+ else:
115
+ union_sql = "\nUNION ALL\n".join(union_parts)
123
116
 
124
- sql = f"""
125
- CREATE OR REPLACE TEMP VIEW uprn_metadata_lookup AS
126
- WITH core_data AS (
127
- {union_sql}
128
- ),
129
- ranked AS (
117
+ sql = f"""
118
+ CREATE OR REPLACE TEMP VIEW uprn_metadata_lookup AS
119
+ WITH core_data AS (
120
+ {union_sql}
121
+ ),
122
+ ranked AS (
123
+ SELECT
124
+ *,
125
+ ROW_NUMBER() OVER (
126
+ PARTITION BY uprn
127
+ ORDER BY source_priority
128
+ ) AS rn
129
+ FROM core_data
130
+ )
130
131
  SELECT
131
- *,
132
- ROW_NUMBER() OVER (
133
- PARTITION BY uprn
134
- ORDER BY source_priority
135
- ) AS rn
136
- FROM core_data
137
- )
132
+ uprn,
133
+ classificationcode,
134
+ parentuprn,
135
+ rootuprn,
136
+ hierarchylevel,
137
+ floorlevel,
138
+ lowestfloorlevel,
139
+ highestfloorlevel
140
+ FROM ranked
141
+ WHERE rn = 1;
142
+ """
143
+ con.execute(sql)
144
+
145
+ built_path = parquet_dir / "add_gb_builtaddress.parquet"
146
+ built_sql = f"""
147
+ CREATE OR REPLACE TEMP VIEW builtaddress_ltla_lookup AS
138
148
  SELECT
139
- uprn,
140
- classificationcode,
141
- parentuprn,
142
- rootuprn,
143
- hierarchylevel,
144
- floorlevel,
145
- lowestfloorlevel,
146
- highestfloorlevel
147
- FROM ranked
148
- WHERE rn = 1;
149
+ CAST(uprn AS BIGINT) AS uprn,
150
+ MAX(CAST(lowertierlocalauthoritygsscode AS VARCHAR)) AS lowertierlocalauthoritygsscode
151
+ FROM read_parquet('{built_path.as_posix()}')
152
+ {where_clause}
153
+ GROUP BY CAST(uprn AS BIGINT)
149
154
  """
150
- con.execute(sql)
155
+ con.execute(built_sql)
151
156
 
152
157
 
153
158
  def _create_core_feature_view(
@@ -156,7 +161,7 @@ def _create_core_feature_view(
156
161
  parquet_path: Path,
157
162
  uprn_predicate: str | None = None,
158
163
  ) -> None:
159
- """Create view for core feature types (Built, Historic, Pre-Build, Non-Addressable).
164
+ """Create view for core feature types (Built, Pre-Build, Non-Addressable).
160
165
 
161
166
  These tables have fulladdress, classification fields, and Welsh language columns.
162
167
  Produces both English and Welsh (where available) address records.
@@ -188,6 +193,7 @@ def _create_core_feature_view(
188
193
  CAST(floorlevel AS VARCHAR) AS floorlevel,
189
194
  CAST(lowestfloorlevel AS DOUBLE) AS lowestfloorlevel,
190
195
  CAST(highestfloorlevel AS DOUBLE) AS highestfloorlevel,
196
+ CAST(NULL AS VARCHAR) AS lowertierlocalauthoritygsscode,
191
197
  -- Internal columns for deduplication (not in final output)
192
198
  CAST(description AS VARCHAR) AS feature_type,
193
199
  CAST(addressstatus AS VARCHAR) AS address_status,
@@ -227,6 +233,7 @@ def _create_core_feature_view(
227
233
  CAST(floorlevel AS VARCHAR) AS floorlevel,
228
234
  CAST(lowestfloorlevel AS DOUBLE) AS lowestfloorlevel,
229
235
  CAST(highestfloorlevel AS DOUBLE) AS highestfloorlevel,
236
+ CAST(NULL AS VARCHAR) AS lowertierlocalauthoritygsscode,
230
237
  -- Internal columns for deduplication (not in final output)
231
238
  CAST(description AS VARCHAR) AS feature_type,
232
239
  CAST(addressstatus AS VARCHAR) AS address_status,
@@ -282,6 +289,7 @@ def _create_altadd_view(
282
289
  CAST(floorlevel AS VARCHAR) AS floorlevel,
283
290
  CAST(lowestfloorlevel AS DOUBLE) AS lowestfloorlevel,
284
291
  CAST(highestfloorlevel AS DOUBLE) AS highestfloorlevel,
292
+ CAST(NULL AS VARCHAR) AS lowertierlocalauthoritygsscode,
285
293
  -- Internal columns for deduplication (not in final output)
286
294
  '{feature_type}' AS feature_type,
287
295
  CAST(addressstatus AS VARCHAR) AS address_status,
@@ -338,6 +346,7 @@ def _create_royal_mail_view(
338
346
  CAST(NULL AS VARCHAR) AS floorlevel,
339
347
  CAST(NULL AS DOUBLE) AS lowestfloorlevel,
340
348
  CAST(NULL AS DOUBLE) AS highestfloorlevel,
349
+ CAST(NULL AS VARCHAR) AS lowertierlocalauthoritygsscode,
341
350
  -- Internal columns for deduplication (not in final output)
342
351
  'Royal Mail Address' AS feature_type,
343
352
  CAST(NULL AS VARCHAR) AS address_status,
@@ -368,6 +377,7 @@ def _create_royal_mail_view(
368
377
  CAST(NULL AS VARCHAR) AS floorlevel,
369
378
  CAST(NULL AS DOUBLE) AS lowestfloorlevel,
370
379
  CAST(NULL AS DOUBLE) AS highestfloorlevel,
380
+ CAST(NULL AS VARCHAR) AS lowertierlocalauthoritygsscode,
371
381
  -- Internal columns for deduplication (not in final output)
372
382
  'Royal Mail Address' AS feature_type,
373
383
  CAST(NULL AS VARCHAR) AS address_status,
@@ -403,12 +413,97 @@ def _enrich_with_metadata(con: duckdb.DuckDBPyConnection) -> None:
403
413
  COALESCE(a.floorlevel, m.floorlevel) AS floorlevel,
404
414
  COALESCE(a.lowestfloorlevel, m.lowestfloorlevel) AS lowestfloorlevel,
405
415
  COALESCE(a.highestfloorlevel, m.highestfloorlevel) AS highestfloorlevel,
416
+ b.lowertierlocalauthoritygsscode AS lowertierlocalauthoritygsscode,
406
417
  -- Internal columns for deduplication
407
418
  a.feature_type,
408
419
  a.address_status,
409
420
  a.build_status
410
421
  FROM all_full_addresses a
411
- LEFT JOIN uprn_metadata_lookup m ON a.uprn = m.uprn;
422
+ LEFT JOIN uprn_metadata_lookup m ON a.uprn = m.uprn
423
+ LEFT JOIN builtaddress_ltla_lookup b ON a.uprn = b.uprn;
424
+ """
425
+ con.execute(sql)
426
+
427
+
428
+ def _create_custom_level_rows(con: duckdb.DuckDBPyConnection) -> None:
429
+ """Generate custom level-based address variants and insert into enriched table.
430
+
431
+ Parses the ``floorlevel`` column (VARCHAR) from the enriched address table,
432
+ maps integer floor levels to words (-1=BASEMENT … 6=SIXTH), and prepends the
433
+ word to the existing ``address_concat`` to create additional address variants.
434
+
435
+ These rows use ``feature_type='Custom Level'`` so they receive the lowest
436
+ dedup priority and never override official address data.
437
+ """
438
+ sql = """
439
+ INSERT INTO all_full_addresses_enriched (
440
+ uprn,
441
+ address_concat,
442
+ postcode,
443
+ filename,
444
+ classificationcode,
445
+ parentuprn,
446
+ rootuprn,
447
+ hierarchylevel,
448
+ floorlevel,
449
+ lowestfloorlevel,
450
+ highestfloorlevel,
451
+ lowertierlocalauthoritygsscode,
452
+ feature_type,
453
+ address_status,
454
+ build_status
455
+ )
456
+ WITH level_parsed AS (
457
+ SELECT
458
+ uprn, address_concat, postcode, filename,
459
+ classificationcode, parentuprn, rootuprn,
460
+ lowertierlocalauthoritygsscode,
461
+ hierarchylevel, floorlevel, lowestfloorlevel, highestfloorlevel,
462
+ address_status, build_status,
463
+ CASE
464
+ WHEN split_part(floorlevel, ',', 1) ~ '^-?[0-9]+$'
465
+ THEN CAST(split_part(floorlevel, ',', 1) AS INTEGER)
466
+ ELSE NULL
467
+ END AS level_int
468
+ FROM all_full_addresses_enriched
469
+ WHERE floorlevel IS NOT NULL
470
+ AND address_concat IS NOT NULL
471
+ AND address_concat <> ''
472
+ ),
473
+ level_words AS (
474
+ SELECT
475
+ *,
476
+ CASE level_int
477
+ WHEN -1 THEN 'BASEMENT'
478
+ WHEN 0 THEN 'GROUND'
479
+ WHEN 1 THEN 'FIRST'
480
+ WHEN 2 THEN 'SECOND'
481
+ WHEN 3 THEN 'THIRD'
482
+ WHEN 4 THEN 'FOURTH'
483
+ WHEN 5 THEN 'FIFTH'
484
+ WHEN 6 THEN 'SIXTH'
485
+ END AS level_word
486
+ FROM level_parsed
487
+ WHERE level_int BETWEEN -1 AND 6
488
+ )
489
+ SELECT
490
+ uprn,
491
+ TRIM(concat(level_word, ' ', address_concat)) AS address_concat,
492
+ postcode,
493
+ 'CUSTOM_LEVEL' AS filename,
494
+ classificationcode,
495
+ parentuprn,
496
+ rootuprn,
497
+ hierarchylevel,
498
+ floorlevel,
499
+ lowestfloorlevel,
500
+ highestfloorlevel,
501
+ lowertierlocalauthoritygsscode,
502
+ 'Custom Level' AS feature_type,
503
+ address_status,
504
+ build_status
505
+ FROM level_words
506
+ WHERE level_word IS NOT NULL;
412
507
  """
413
508
  con.execute(sql)
414
509
 
@@ -417,7 +512,7 @@ def _create_dedup_view(con: duckdb.DuckDBPyConnection) -> None:
417
512
  """Create deduplicated view of all addresses.
418
513
 
419
514
  Priority rules for deduplication:
420
- - Feature type: Built Address -> Pre-Build -> Royal Mail -> Historic -> Non-Addressable
515
+ - Feature type: Built Address -> Pre-Build -> Royal Mail -> Non-Addressable -> Custom Level
421
516
  - Address status: Approved -> Provisional -> Alternative -> Historical
422
517
  - Build status: Built Complete -> Under Construction -> Prebuild -> Historic -> Demolished
423
518
 
@@ -433,8 +528,8 @@ def _create_dedup_view(con: duckdb.DuckDBPyConnection) -> None:
433
528
  WHEN 'Built Address' THEN 1
434
529
  WHEN 'Pre-Build Address' THEN 2
435
530
  WHEN 'Royal Mail Address' THEN 3
436
- WHEN 'Historic Address' THEN 4
437
531
  WHEN 'Non-Addressable Object' THEN 5
532
+ WHEN 'Custom Level' THEN 6
438
533
  ELSE 9
439
534
  END AS feature_type_rank,
440
535
  CASE
@@ -460,20 +555,17 @@ def _create_dedup_view(con: duckdb.DuckDBPyConnection) -> None:
460
555
  build_status_rank
461
556
  ) AS rn
462
557
  FROM all_full_addresses_enriched
463
- WHERE feature_type != 'Non-Addressable Object'
558
+ WHERE feature_type NOT IN ('Non-Addressable Object')
464
559
  )
465
560
  SELECT
466
- uprn,
561
+ uprn AS unique_id,
467
562
  address_concat,
468
563
  postcode,
469
564
  filename,
470
565
  classificationcode,
471
566
  parentuprn,
472
- rootuprn,
473
- hierarchylevel,
474
- floorlevel,
475
- lowestfloorlevel,
476
- highestfloorlevel
567
+ lowertierlocalauthoritygsscode,
568
+ floorlevel
477
569
  FROM ranked
478
570
  WHERE rn = 1;
479
571
  """
@@ -641,6 +733,10 @@ def run_flatfile_step(settings: Settings, force: bool = False) -> list[Path]:
641
733
  logger.info("Enriching addresses with metadata from core files...")
642
734
  _enrich_with_metadata(con)
643
735
 
736
+ # Generate custom level variants
737
+ logger.info("Generating custom level address variants...")
738
+ _create_custom_level_rows(con)
739
+
644
740
  # Create deduplicated view
645
741
  logger.info("Creating deduplicated view...")
646
742
  _create_dedup_view(con)
@@ -11,6 +11,9 @@ from ukam_os_builder.api.settings import Settings
11
11
 
12
12
  logger = logging.getLogger(__name__)
13
13
 
14
+ # NGD file stems to exclude (historic addresses are not used in output)
15
+ _NGD_EXCLUDED_STEMS = {"historicaddress"}
16
+
14
17
 
15
18
  def find_downloaded_zips(downloads_dir: Path) -> list[Path]:
16
19
  """Find all downloaded zip files in a directory."""
@@ -22,11 +25,20 @@ def find_downloaded_zips(downloads_dir: Path) -> list[Path]:
22
25
  return zip_files
23
26
 
24
27
 
28
+ def _is_excluded_ngd_file(name: str) -> bool:
29
+ """Return True if *name* matches an excluded NGD stem (e.g. historicaddress)."""
30
+ name_lower = name.lower()
31
+ return any(stem in name_lower for stem in _NGD_EXCLUDED_STEMS)
32
+
33
+
25
34
  def _filter_zips_for_source(zip_files: list[Path], source: str) -> list[Path]:
26
35
  source_lower = source.lower()
27
36
  if source_lower == "ngd":
28
37
  ngd_zips = [
29
- zip_path for zip_path in zip_files if zip_path.name.lower().startswith("add_gb_")
38
+ zip_path
39
+ for zip_path in zip_files
40
+ if zip_path.name.lower().startswith("add_gb_")
41
+ and not _is_excluded_ngd_file(zip_path.name)
30
42
  ]
31
43
  return ngd_zips or zip_files
32
44
  if source_lower == "abp":
@@ -39,7 +51,8 @@ def _filter_zips_for_source(zip_files: list[Path], source: str) -> list[Path]:
39
51
 
40
52
  def _should_convert_csv_to_parquet(csv_path: Path, source: str) -> bool:
41
53
  if source.lower() == "ngd":
42
- return csv_path.name.lower().startswith("add_gb_")
54
+ name_lower = csv_path.name.lower()
55
+ return name_lower.startswith("add_gb_") and not _is_excluded_ngd_file(name_lower)
43
56
  return True
44
57
 
45
58
 
@@ -12,7 +12,7 @@ SourceType = Literal["ngd", "abp"]
12
12
  logger = logging.getLogger(__name__)
13
13
 
14
14
  _DEFAULT_SELECT_COLUMNS = [
15
- "uprn",
15
+ "unique_id",
16
16
  "address_concat",
17
17
  "postcode",
18
18
  "source",
@@ -128,9 +128,9 @@ def get_variant_statistics(
128
128
 
129
129
  stats = con.sql(f"""
130
130
  WITH variant_counts AS (
131
- SELECT uprn, COUNT(*) AS variant_count
131
+ SELECT unique_id, COUNT(*) AS variant_count
132
132
  FROM read_parquet('{files_sql}')
133
- GROUP BY uprn
133
+ GROUP BY unique_id
134
134
  )
135
135
  SELECT
136
136
  COUNT(*) AS total_uprns,
@@ -179,7 +179,7 @@ def get_random_uprn(
179
179
 
180
180
  select_columns = _choose_select_columns(con, files_sql, columns)
181
181
  random_uprn = con.sql(f"""
182
- SELECT DISTINCT uprn
182
+ SELECT DISTINCT unique_id
183
183
  FROM read_parquet('{files_sql}')
184
184
  ORDER BY RANDOM()
185
185
  LIMIT 1
@@ -192,7 +192,7 @@ def get_random_uprn(
192
192
  SELECT
193
193
  {select_columns}
194
194
  FROM read_parquet('{files_sql}')
195
- WHERE uprn = {int(random_uprn[0])}
195
+ WHERE unique_id = {int(random_uprn[0])}
196
196
  ORDER BY is_primary DESC NULLS LAST, source NULLS LAST, variant_label NULLS LAST
197
197
  """)
198
198
 
@@ -220,14 +220,14 @@ def get_random_large_uprn(
220
220
 
221
221
  selected = con.sql(f"""
222
222
  WITH variant_counts AS (
223
- SELECT uprn, COUNT(*) AS variant_count
223
+ SELECT unique_id, COUNT(*) AS variant_count
224
224
  FROM read_parquet('{files_sql}')
225
225
  {where_filter}
226
- GROUP BY uprn
227
- ORDER BY variant_count DESC, uprn ASC
226
+ GROUP BY unique_id
227
+ ORDER BY variant_count DESC, unique_id ASC
228
228
  LIMIT {int(top_n)}
229
229
  )
230
- SELECT uprn
230
+ SELECT unique_id
231
231
  FROM variant_counts
232
232
  ORDER BY RANDOM()
233
233
  LIMIT 1
@@ -240,7 +240,7 @@ def get_random_large_uprn(
240
240
  SELECT
241
241
  {select_columns}
242
242
  FROM read_parquet('{files_sql}')
243
- WHERE uprn = {int(selected[0])}
243
+ WHERE unique_id = {int(selected[0])}
244
244
  {and_filter}
245
245
  ORDER BY is_primary DESC NULLS LAST, source NULLS LAST, variant_label NULLS LAST
246
246
  """)
@@ -269,7 +269,7 @@ def get_uprn_variants(
269
269
  SELECT
270
270
  {select_columns}
271
271
  FROM read_parquet('{files_sql}')
272
- WHERE uprn = {int(uprn)}
272
+ WHERE unique_id = {int(uprn)}
273
273
  {and_filter}
274
274
  ORDER BY is_primary DESC NULLS LAST, source NULLS LAST, variant_label NULLS LAST
275
275
  """)
@@ -317,10 +317,10 @@ def inspect_flatfile_variants(
317
317
  WITH data AS (
318
318
  SELECT * FROM read_parquet('{files_sql}')
319
319
  )
320
- SELECT uprn, COUNT(*) AS variant_count
320
+ SELECT unique_id, COUNT(*) AS variant_count
321
321
  FROM data
322
- GROUP BY uprn
323
- ORDER BY variant_count DESC, uprn ASC
322
+ GROUP BY unique_id
323
+ ORDER BY variant_count DESC, unique_id ASC
324
324
  LIMIT 1 OFFSET {top_offset}
325
325
  """
326
326
  ).fetchone()
@@ -333,7 +333,7 @@ def inspect_flatfile_variants(
333
333
  f"""
334
334
  SELECT COUNT(*)
335
335
  FROM read_parquet('{files_sql}')
336
- WHERE uprn = ?
336
+ WHERE unique_id = ?
337
337
  """,
338
338
  [target_uprn],
339
339
  ).fetchone()
@@ -343,7 +343,7 @@ def inspect_flatfile_variants(
343
343
  f"""
344
344
  SELECT *
345
345
  FROM read_parquet('{files_sql}')
346
- WHERE uprn = ?
346
+ WHERE unique_id = ?
347
347
  ORDER BY 1
348
348
  """,
349
349
  [target_uprn],
@@ -358,7 +358,7 @@ def inspect_flatfile_variants(
358
358
  max_width=10_000
359
359
  )
360
360
  logger.info("Selected UPRN rows:")
361
- con.sql(f"SELECT * FROM read_parquet('{files_sql}') WHERE uprn = {target_uprn}").show(
361
+ con.sql(f"SELECT * FROM read_parquet('{files_sql}') WHERE unique_id = {target_uprn}").show(
362
362
  max_width=10_000
363
363
  )
364
364
 
@@ -9,9 +9,25 @@ from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse
9
9
 
10
10
  import requests
11
11
 
12
+ from ukam_os_builder.api.settings import Settings
13
+
12
14
  logger = logging.getLogger(__name__)
13
15
 
14
16
  API_BASE_URL = "https://api.os.uk/downloads/v1"
17
+
18
+ # NGD file stems to exclude (historic addresses are not used in output)
19
+ _NGD_EXCLUDED_STEMS = {"historicaddress"}
20
+
21
+
22
+ def _should_skip_ngd_download(filename: str, settings: object) -> bool:
23
+ """Return True if *filename* is an NGD historic-address archive."""
24
+ source_type = getattr(getattr(settings, "source", None), "type", "")
25
+ if source_type != "ngd":
26
+ return False
27
+ name_lower = filename.lower()
28
+ return any(stem in name_lower for stem in _NGD_EXCLUDED_STEMS)
29
+
30
+
15
31
  DEFAULT_CHUNK_SIZE = 1024 * 1024 * 20 # 20 MiB
16
32
  DEFAULT_CONNECT_TIMEOUT_SECONDS = 30
17
33
  DEFAULT_READ_TIMEOUT_SECONDS = 300
@@ -293,6 +309,11 @@ def run_download_step(
293
309
  logger.warning("No URL for %s, skipping", item.filename)
294
310
  continue
295
311
 
312
+ # Skip NGD historic address files — they are excluded from output
313
+ if _should_skip_ngd_download(item.filename, settings):
314
+ logger.info("Skipping historic address file: %s", item.filename)
315
+ continue
316
+
296
317
  dest_path = downloads_dir / item.filename
297
318
  was_downloaded = download_file(
298
319
  url=item.url,
@@ -312,3 +333,54 @@ def run_download_step(
312
333
 
313
334
  logger.info("Download complete: %d file(s)", len(downloaded))
314
335
  return downloaded
336
+
337
+
338
+ def _get_manifest_path(settings: Settings) -> Path | None:
339
+ downloads_dir = settings.paths.downloads_dir.resolve()
340
+ source_type = settings.source.type # "abp" | "ngd"
341
+
342
+ if source_type == "abp":
343
+ candidates = list(downloads_dir.glob("*-Order_Details.txt"))
344
+ if not candidates:
345
+ logger.info("➡️ Manifest (ABP order details) not found. Check: %s", downloads_dir)
346
+ return None
347
+
348
+ manifest = max(candidates, key=lambda p: p.stat().st_mtime).resolve()
349
+
350
+ if len(candidates) > 1:
351
+ logger.warning(
352
+ "Multiple ABP manifests found in %s. Using newest: %s",
353
+ downloads_dir,
354
+ manifest,
355
+ )
356
+
357
+ logger.info("➡️ Manifest (ABP order details): %s", manifest)
358
+ return manifest
359
+
360
+ elif source_type == "ngd":
361
+ candidates = list(
362
+ downloads_dir.glob("*_orderSummary.json")
363
+ ) # adjust if it's "*.orderSummary.json"
364
+ if not candidates:
365
+ logger.info("➡️ Manifests (NGD order summaries) not found. Check: %s", downloads_dir)
366
+ return None
367
+
368
+ built_candidates = list(downloads_dir.glob("*builtaddress*_orderSummary.json"))
369
+ built_manifest = (
370
+ max(built_candidates, key=lambda p: p.stat().st_mtime).resolve()
371
+ if built_candidates
372
+ else None
373
+ )
374
+
375
+ logger.info(
376
+ "➡️ Manifests (NGD order summaries): %s (%d files)\n"
377
+ " ↳ Built address order summary: %s",
378
+ downloads_dir,
379
+ len(candidates),
380
+ built_manifest if built_manifest else "(not found)",
381
+ )
382
+
383
+ return downloads_dir
384
+
385
+ logger.warning("Unknown source type %r. No manifest lookup performed.", source_type)
386
+ return None
@@ -1421,7 +1421,7 @@ wheels = [
1421
1421
 
1422
1422
  [[package]]
1423
1423
  name = "ukam-os-builder"
1424
- version = "0.1.0.dev4"
1424
+ version = "0.1.0.dev6"
1425
1425
  source = { editable = "." }
1426
1426
  dependencies = [
1427
1427
  { name = "duckdb" },
@@ -1,27 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from pathlib import Path
4
-
5
- from ukam_os_builder.os_builder.extract import (
6
- _filter_zips_for_source,
7
- _should_convert_csv_to_parquet,
8
- )
9
-
10
-
11
- def test_filter_zips_for_source_prefers_ngd_named_zips() -> None:
12
- zip_files = [
13
- Path("add_gb_builtaddress.zip"),
14
- Path("AddressBasePremium_FULL_2025-12-15_002.zip"),
15
- ]
16
-
17
- filtered = _filter_zips_for_source(zip_files, "ngd")
18
-
19
- assert filtered == [Path("add_gb_builtaddress.zip")]
20
-
21
-
22
- def test_should_convert_csv_to_parquet_skips_non_ngd_for_ngd_source() -> None:
23
- ngd_csv = Path("add_gb_builtaddress.csv")
24
- abp_csv = Path("AddressBasePremium_FULL_2025-12-15_002.csv")
25
-
26
- assert _should_convert_csv_to_parquet(ngd_csv, "ngd") is True
27
- assert _should_convert_csv_to_parquet(abp_csv, "ngd") is False