ukam-os-builder 0.1.0.dev4__tar.gz → 0.1.0.dev5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/PKG-INFO +1 -1
  2. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/pyproject.toml +1 -1
  3. ukam_os_builder-0.1.0.dev5/tests/test_extract_source_filtering.py +49 -0
  4. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/tests/test_smoke.py +0 -1
  5. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/__init__.py +1 -1
  6. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/api/api.py +15 -6
  7. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/cli.py +0 -1
  8. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/transform/stages/combine.py +1 -1
  9. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/transform/stages/lpi.py +4 -6
  10. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/ngd/to_flatfile.py +75 -11
  11. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/os_builder/extract.py +15 -2
  12. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/os_builder/os_hub.py +72 -0
  13. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/uv.lock +1 -1
  14. ukam_os_builder-0.1.0.dev4/tests/test_extract_source_filtering.py +0 -27
  15. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/.env.example +0 -0
  16. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/.github/workflows/ci.yml +0 -0
  17. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/.github/workflows/e2e.yml +0 -0
  18. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/.github/workflows/release-pypi.yml +0 -0
  19. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/.gitignore +0 -0
  20. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/AGENTS.md +0 -0
  21. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/README.md +0 -0
  22. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/config.example.yaml +0 -0
  23. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/prompt.md +0 -0
  24. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/shell/test_release_locally.sh +0 -0
  25. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/tests/data/README.md +0 -0
  26. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/tests/data/add_gb_builtaddress.csv +0 -0
  27. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/tests/data/add_gb_builtaddress_altadd.csv +0 -0
  28. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/tests/data/add_gb_historicaddress.csv +0 -0
  29. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/tests/data/add_gb_prebuildaddress.csv +0 -0
  30. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/tests/data/add_gb_royalmailaddress.csv +0 -0
  31. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/tests/test_api.py +0 -0
  32. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/tests/test_cli.py +0 -0
  33. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/tests/test_cli_errors.py +0 -0
  34. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/tests/test_inspect_results.py +0 -0
  35. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/tests/test_public_api_integration.py +0 -0
  36. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/tests/test_settings.py +0 -0
  37. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/tests/test_setup_wizard.py +0 -0
  38. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/_exceptions.py +0 -0
  39. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/api/cli_errors.py +0 -0
  40. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/api/settings.py +0 -0
  41. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/schemas/abp_schema.yaml +0 -0
  42. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/split_raw.py +0 -0
  43. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/transform/__init__.py +0 -0
  44. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/transform/common.py +0 -0
  45. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/transform/runner.py +0 -0
  46. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/transform/stages/__init__.py +0 -0
  47. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/transform/stages/business.py +0 -0
  48. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/transform/stages/misc.py +0 -0
  49. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/transform/stages/postal.py +0 -0
  50. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/os_builder/__init__.py +0 -0
  51. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/os_builder/inspect_results.py +0 -0
  52. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/os_builder/pipeline_factory.py +0 -0
  53. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/pipeline.py +0 -0
  54. {ukam_os_builder-0.1.0.dev4 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/setup_wizard.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ukam-os-builder
3
- Version: 0.1.0.dev4
3
+ Version: 0.1.0.dev5
4
4
  Summary: Download, process and transform OS address data (NGD or ABP) for UK address matching
5
5
  Project-URL: Homepage, https://github.com/moj-analytical-services/prepare_ngd_for_address_matching
6
6
  Project-URL: Repository, https://github.com/moj-analytical-services/prepare_ngd_for_address_matching
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "ukam-os-builder"
3
- version = "0.1.0.dev4"
3
+ version = "0.1.0.dev5"
4
4
  description = "Download, process and transform OS address data (NGD or ABP) for UK address matching"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.10"
@@ -0,0 +1,49 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+
5
+ from ukam_os_builder.os_builder.extract import (
6
+ _filter_zips_for_source,
7
+ _should_convert_csv_to_parquet,
8
+ )
9
+
10
+
11
+ def test_filter_zips_for_source_prefers_ngd_named_zips() -> None:
12
+ zip_files = [
13
+ Path("add_gb_builtaddress.zip"),
14
+ Path("AddressBasePremium_FULL_2025-12-15_002.zip"),
15
+ ]
16
+
17
+ filtered = _filter_zips_for_source(zip_files, "ngd")
18
+
19
+ assert filtered == [Path("add_gb_builtaddress.zip")]
20
+
21
+
22
+ def test_should_convert_csv_to_parquet_skips_non_ngd_for_ngd_source() -> None:
23
+ ngd_csv = Path("add_gb_builtaddress.csv")
24
+ abp_csv = Path("AddressBasePremium_FULL_2025-12-15_002.csv")
25
+
26
+ assert _should_convert_csv_to_parquet(ngd_csv, "ngd") is True
27
+ assert _should_convert_csv_to_parquet(abp_csv, "ngd") is False
28
+
29
+
30
+ def test_filter_zips_for_source_excludes_ngd_historicaddress() -> None:
31
+ zip_files = [
32
+ Path("add_gb_builtaddress.zip"),
33
+ Path("add_gb_historicaddress.zip"),
34
+ Path("add_gb_historicaddress_altadd.zip"),
35
+ Path("add_gb_prebuildaddress.zip"),
36
+ ]
37
+
38
+ filtered = _filter_zips_for_source(zip_files, "ngd")
39
+
40
+ assert Path("add_gb_builtaddress.zip") in filtered
41
+ assert Path("add_gb_prebuildaddress.zip") in filtered
42
+ assert Path("add_gb_historicaddress.zip") not in filtered
43
+ assert Path("add_gb_historicaddress_altadd.zip") not in filtered
44
+
45
+
46
+ def test_should_convert_csv_to_parquet_skips_ngd_historicaddress() -> None:
47
+ assert _should_convert_csv_to_parquet(Path("add_gb_builtaddress.csv"), "ngd") is True
48
+ assert _should_convert_csv_to_parquet(Path("add_gb_historicaddress.csv"), "ngd") is False
49
+ assert _should_convert_csv_to_parquet(Path("add_gb_historicaddress_altadd.csv"), "ngd") is False
@@ -121,7 +121,6 @@ def _prepare_test_parquet(settings: Settings) -> None:
121
121
  "add_gb_builtaddress_altadd.csv",
122
122
  "add_gb_royalmailaddress.csv",
123
123
  "add_gb_prebuildaddress.csv",
124
- "add_gb_historicaddress.csv",
125
124
  ]
126
125
 
127
126
  for csv_name in sample_files:
@@ -8,7 +8,7 @@ from ukam_os_builder.os_builder.inspect_results import (
8
8
  inspect_flatfile_variants,
9
9
  )
10
10
 
11
- __version__ = "0.1.0.dev4"
11
+ __version__ = "0.1.0.dev5"
12
12
 
13
13
  __all__ = [
14
14
  "create_config_and_env",
@@ -8,7 +8,7 @@ from typing import Any, Literal
8
8
  import yaml
9
9
 
10
10
  from ukam_os_builder.api.settings import Settings, SettingsError, load_settings
11
- from ukam_os_builder.os_builder.os_hub import get_package_version
11
+ from ukam_os_builder.os_builder.os_hub import _get_manifest_path, get_package_version
12
12
  from ukam_os_builder.pipeline import run as run_pipeline
13
13
  from ukam_os_builder.pipeline import supported_steps_for_source
14
14
 
@@ -333,11 +333,6 @@ def run_from_config(
333
333
  parquet_compression_level=parquet_compression_level,
334
334
  )
335
335
  logger.info("Resolved work_dir: %s", settings.paths.work_dir)
336
- logger.info("Resolved downloads_dir: %s", settings.paths.downloads_dir)
337
- logger.info("Resolved extracted_dir: %s", settings.paths.extracted_dir)
338
- logger.info("Resolved parquet_dir: %s", settings.paths.parquet_dir)
339
- logger.info("Resolved output_dir: %s", settings.paths.output_dir)
340
-
341
336
  source_type = settings.source.type
342
337
  if step != "all":
343
338
  supported_steps = supported_steps_for_source(source_type)
@@ -353,4 +348,18 @@ def run_from_config(
353
348
 
354
349
  overwrite_effective = overwrite if overwrite is not None else bool(force)
355
350
  run_pipeline(step=step, settings=settings, force=overwrite_effective, list_only=list_only)
351
+
352
+ logger.info(
353
+ "✅ Pipeline run completed\n\n"
354
+ "Where you need to look:\n"
355
+ " • downloads_dir (raw OS Hub extracts): %s%s\n"
356
+ " • output_dir (final files for address matcher): %s%s\n",
357
+ str(settings.paths.downloads_dir),
358
+ "",
359
+ str(settings.paths.output_dir),
360
+ "",
361
+ )
362
+
363
+ _get_manifest_path(settings)
364
+
356
365
  return settings
@@ -145,7 +145,6 @@ def main(argv: list[str] | None = None) -> int:
145
145
  parquet_compression=args.parquet_compression,
146
146
  parquet_compression_level=args.parquet_compression_level,
147
147
  )
148
- logger.info("Pipeline run completed")
149
148
  console.print("[bold green]Build completed successfully[/bold green]")
150
149
  return 0
151
150
  except (SettingsError, ValueError) as exc:
@@ -33,7 +33,7 @@ def combine_and_dedupe(con: duckdb.DuckDBPyConnection) -> duckdb.DuckDBPyRelatio
33
33
  ),
34
34
  ranked AS (
35
35
  SELECT *,
36
- CASE logical_status WHEN 1 THEN 0 WHEN 3 THEN 1 WHEN 6 THEN 2 WHEN 8 THEN 3 ELSE 9 END AS status_rank,
36
+ CASE logical_status WHEN 1 THEN 0 WHEN 3 THEN 1 WHEN 6 THEN 2 ELSE 9 END AS status_rank,
37
37
  CASE source WHEN 'LPI' THEN 0 WHEN 'ORGANISATION' THEN 1 WHEN 'DELIVERY_POINT' THEN 2 WHEN 'CUSTOM_LEVEL' THEN 3 ELSE 4 END AS source_rank
38
38
  FROM normalized
39
39
  ),
@@ -70,15 +70,15 @@ matching messy user input. We output variants based on **Logical Status**:
70
70
  locally known as "Rose Cottage").
71
71
  3. **Provisional (6):** The address assigned during planning/construction, which
72
72
  might change before the house is built.
73
- 4. **Historic (8):** An old address. If "10 High St" is renumbered to "12 High St",
74
- the old address is kept as Historic. This helps match old datasets.
73
+
74
+ Historic addresses (logical_status=8) are excluded from output.
75
75
 
76
76
  ------------------------------------------------------------------------------
77
77
  Key Columns Explained
78
78
  ------------------------------------------------------------------------------
79
79
  * `uprn`: The "Golden Key". Use this to link this address to other data.
80
80
  * `base_address`: The constructed full address string.
81
- * `logical_status`: 1=Current, 6=Provisional, 8=Historic.
81
+ * `logical_status`: 1=Current, 6=Provisional.
82
82
  * `official_flag`: 'Y' indicates this is the "official" version, 'N' suggests
83
83
  it might be an unofficial alias.
84
84
  * `language`: 'ENG' (English) or 'CYM' (Welsh). Streets in Wales often have
@@ -183,7 +183,6 @@ def prepare_lpi_base(con: duckdb.DuckDBPyConnection) -> None:
183
183
  WHEN 1 THEN 0
184
184
  WHEN 3 THEN 1
185
185
  WHEN 6 THEN 2
186
- WHEN 8 THEN 3
187
186
  ELSE 9
188
187
  END AS status_rank
189
188
  FROM lpi l
@@ -192,7 +191,7 @@ def prepare_lpi_base(con: duckdb.DuckDBPyConnection) -> None:
192
191
  LEFT JOIN _sd_best_by_lang sd_lang ON sd_lang.usrn = l.usrn AND sd_lang.language = l.language
193
192
  LEFT JOIN _sd_best_any sd_any ON sd_any.usrn = l.usrn
194
193
  WHERE (b.addressbase_postal != 'N' OR b.addressbase_postal IS NULL)
195
- AND l.logical_status IN (1, 3, 6, 8)
194
+ AND l.logical_status IN (1, 3, 6)
196
195
  """)
197
196
 
198
197
  # Deduplicated distinct addresses
@@ -266,7 +265,6 @@ def render_variants(con: duckdb.DuckDBPyConnection) -> None:
266
265
  WHEN 1 THEN 'APPROVED'
267
266
  WHEN 3 THEN 'ALTERNATIVE'
268
267
  WHEN 6 THEN 'PROVISIONAL'
269
- WHEN 8 THEN 'HISTORICAL'
270
268
  END AS variant_label,
271
269
  (logical_status = 1) AS is_primary
272
270
  FROM lpi_base_distinct
@@ -2,7 +2,7 @@
2
2
 
3
3
  Transforms the extracted parquet files into a single flatfile suitable for
4
4
  UK address matching. This includes:
5
- - Processing core feature types (Built Address, Historic Address, etc.)
5
+ - Processing core feature types (Built Address, Pre-Build Address, etc.)
6
6
  - Processing alternate address records
7
7
  - Processing Royal Mail addresses
8
8
  - Handling Welsh language variants
@@ -27,8 +27,6 @@ logger = logging.getLogger(__name__)
27
27
  FEATURE_TYPE_BY_STEM = {
28
28
  "add_gb_builtaddress": "Built Address",
29
29
  "add_gb_builtaddress_altadd": "Built Address",
30
- "add_gb_historicaddress": "Historic Address",
31
- "add_gb_historicaddress_altadd": "Historic Address",
32
30
  "add_gb_nonaddressableobject": "Non-Addressable Object",
33
31
  "add_gb_nonaddressableobject_altadd": "Non-Addressable Object",
34
32
  "add_gb_prebuildaddress": "Pre-Build Address",
@@ -39,7 +37,6 @@ FEATURE_TYPE_BY_STEM = {
39
37
  # Core feature stems (contain fulladdress and classification fields)
40
38
  CORE_FEATURE_STEMS = {
41
39
  "add_gb_builtaddress",
42
- "add_gb_historicaddress",
43
40
  "add_gb_nonaddressableobject",
44
41
  "add_gb_prebuildaddress",
45
42
  }
@@ -47,7 +44,6 @@ CORE_FEATURE_STEMS = {
47
44
  # Alternate address stems (no classification fields)
48
45
  ALTADD_STEMS = {
49
46
  "add_gb_builtaddress_altadd",
50
- "add_gb_historicaddress_altadd",
51
47
  "add_gb_nonaddressableobject_altadd",
52
48
  "add_gb_prebuildaddress_altadd",
53
49
  }
@@ -57,7 +53,6 @@ CORE_FEATURE_PRIORITY = {
57
53
  "add_gb_builtaddress": 1,
58
54
  "add_gb_prebuildaddress": 2,
59
55
  "add_gb_nonaddressableobject": 3,
60
- "add_gb_historicaddress": 4,
61
56
  }
62
57
 
63
58
 
@@ -71,7 +66,7 @@ def _create_metadata_lookup_view(
71
66
  This view is used to enrich Royal Mail and alternate address records
72
67
  with metadata (classificationcode, parentuprn, etc.) by UPRN lookup.
73
68
 
74
- Uses priority ranking (Built > Pre-Build > Non-Addressable > Historic)
69
+ Uses priority ranking (Built > Pre-Build > Non-Addressable)
75
70
  to dedupe when a UPRN exists in multiple core files.
76
71
 
77
72
  Args:
@@ -156,7 +151,7 @@ def _create_core_feature_view(
156
151
  parquet_path: Path,
157
152
  uprn_predicate: str | None = None,
158
153
  ) -> None:
159
- """Create view for core feature types (Built, Historic, Pre-Build, Non-Addressable).
154
+ """Create view for core feature types (Built, Pre-Build, Non-Addressable).
160
155
 
161
156
  These tables have fulladdress, classification fields, and Welsh language columns.
162
157
  Produces both English and Welsh (where available) address records.
@@ -413,11 +408,76 @@ def _enrich_with_metadata(con: duckdb.DuckDBPyConnection) -> None:
413
408
  con.execute(sql)
414
409
 
415
410
 
411
+ def _create_custom_level_rows(con: duckdb.DuckDBPyConnection) -> None:
412
+ """Generate custom level-based address variants and insert into enriched table.
413
+
414
+ Parses the ``floorlevel`` column (VARCHAR) from the enriched address table,
415
+ maps integer floor levels to words (-1=BASEMENT … 6=SIXTH), and prepends the
416
+ word to the existing ``address_concat`` to create additional address variants.
417
+
418
+ These rows use ``feature_type='Custom Level'`` so they receive the lowest
419
+ dedup priority and never override official address data.
420
+ """
421
+ sql = """
422
+ INSERT INTO all_full_addresses_enriched
423
+ WITH level_parsed AS (
424
+ SELECT
425
+ uprn, address_concat, postcode, filename,
426
+ classificationcode, parentuprn, rootuprn,
427
+ hierarchylevel, floorlevel, lowestfloorlevel, highestfloorlevel,
428
+ address_status, build_status,
429
+ CASE
430
+ WHEN split_part(floorlevel, ',', 1) ~ '^-?[0-9]+$'
431
+ THEN CAST(split_part(floorlevel, ',', 1) AS INTEGER)
432
+ ELSE NULL
433
+ END AS level_int
434
+ FROM all_full_addresses_enriched
435
+ WHERE floorlevel IS NOT NULL
436
+ AND address_concat IS NOT NULL
437
+ AND address_concat <> ''
438
+ ),
439
+ level_words AS (
440
+ SELECT
441
+ *,
442
+ CASE level_int
443
+ WHEN -1 THEN 'BASEMENT'
444
+ WHEN 0 THEN 'GROUND'
445
+ WHEN 1 THEN 'FIRST'
446
+ WHEN 2 THEN 'SECOND'
447
+ WHEN 3 THEN 'THIRD'
448
+ WHEN 4 THEN 'FOURTH'
449
+ WHEN 5 THEN 'FIFTH'
450
+ WHEN 6 THEN 'SIXTH'
451
+ END AS level_word
452
+ FROM level_parsed
453
+ WHERE level_int BETWEEN -1 AND 6
454
+ )
455
+ SELECT
456
+ uprn,
457
+ TRIM(concat(level_word, ' ', address_concat)) AS address_concat,
458
+ postcode,
459
+ 'CUSTOM_LEVEL' AS filename,
460
+ classificationcode,
461
+ parentuprn,
462
+ rootuprn,
463
+ hierarchylevel,
464
+ floorlevel,
465
+ lowestfloorlevel,
466
+ highestfloorlevel,
467
+ 'Custom Level' AS feature_type,
468
+ address_status,
469
+ build_status
470
+ FROM level_words
471
+ WHERE level_word IS NOT NULL;
472
+ """
473
+ con.execute(sql)
474
+
475
+
416
476
  def _create_dedup_view(con: duckdb.DuckDBPyConnection) -> None:
417
477
  """Create deduplicated view of all addresses.
418
478
 
419
479
  Priority rules for deduplication:
420
- - Feature type: Built Address -> Pre-Build -> Royal Mail -> Historic -> Non-Addressable
480
+ - Feature type: Built Address -> Pre-Build -> Royal Mail -> Non-Addressable
421
481
  - Address status: Approved -> Provisional -> Alternative -> Historical
422
482
  - Build status: Built Complete -> Under Construction -> Prebuild -> Historic -> Demolished
423
483
 
@@ -433,8 +493,8 @@ def _create_dedup_view(con: duckdb.DuckDBPyConnection) -> None:
433
493
  WHEN 'Built Address' THEN 1
434
494
  WHEN 'Pre-Build Address' THEN 2
435
495
  WHEN 'Royal Mail Address' THEN 3
436
- WHEN 'Historic Address' THEN 4
437
496
  WHEN 'Non-Addressable Object' THEN 5
497
+ WHEN 'Custom Level' THEN 6
438
498
  ELSE 9
439
499
  END AS feature_type_rank,
440
500
  CASE
@@ -460,7 +520,7 @@ def _create_dedup_view(con: duckdb.DuckDBPyConnection) -> None:
460
520
  build_status_rank
461
521
  ) AS rn
462
522
  FROM all_full_addresses_enriched
463
- WHERE feature_type != 'Non-Addressable Object'
523
+ WHERE feature_type NOT IN ('Non-Addressable Object')
464
524
  )
465
525
  SELECT
466
526
  uprn,
@@ -641,6 +701,10 @@ def run_flatfile_step(settings: Settings, force: bool = False) -> list[Path]:
641
701
  logger.info("Enriching addresses with metadata from core files...")
642
702
  _enrich_with_metadata(con)
643
703
 
704
+ # Generate custom level variants
705
+ logger.info("Generating custom level address variants...")
706
+ _create_custom_level_rows(con)
707
+
644
708
  # Create deduplicated view
645
709
  logger.info("Creating deduplicated view...")
646
710
  _create_dedup_view(con)
@@ -11,6 +11,9 @@ from ukam_os_builder.api.settings import Settings
11
11
 
12
12
  logger = logging.getLogger(__name__)
13
13
 
14
+ # NGD file stems to exclude (historic addresses are not used in output)
15
+ _NGD_EXCLUDED_STEMS = {"historicaddress"}
16
+
14
17
 
15
18
  def find_downloaded_zips(downloads_dir: Path) -> list[Path]:
16
19
  """Find all downloaded zip files in a directory."""
@@ -22,11 +25,20 @@ def find_downloaded_zips(downloads_dir: Path) -> list[Path]:
22
25
  return zip_files
23
26
 
24
27
 
28
+ def _is_excluded_ngd_file(name: str) -> bool:
29
+ """Return True if *name* matches an excluded NGD stem (e.g. historicaddress)."""
30
+ name_lower = name.lower()
31
+ return any(stem in name_lower for stem in _NGD_EXCLUDED_STEMS)
32
+
33
+
25
34
  def _filter_zips_for_source(zip_files: list[Path], source: str) -> list[Path]:
26
35
  source_lower = source.lower()
27
36
  if source_lower == "ngd":
28
37
  ngd_zips = [
29
- zip_path for zip_path in zip_files if zip_path.name.lower().startswith("add_gb_")
38
+ zip_path
39
+ for zip_path in zip_files
40
+ if zip_path.name.lower().startswith("add_gb_")
41
+ and not _is_excluded_ngd_file(zip_path.name)
30
42
  ]
31
43
  return ngd_zips or zip_files
32
44
  if source_lower == "abp":
@@ -39,7 +51,8 @@ def _filter_zips_for_source(zip_files: list[Path], source: str) -> list[Path]:
39
51
 
40
52
  def _should_convert_csv_to_parquet(csv_path: Path, source: str) -> bool:
41
53
  if source.lower() == "ngd":
42
- return csv_path.name.lower().startswith("add_gb_")
54
+ name_lower = csv_path.name.lower()
55
+ return name_lower.startswith("add_gb_") and not _is_excluded_ngd_file(name_lower)
43
56
  return True
44
57
 
45
58
 
@@ -9,9 +9,25 @@ from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse
9
9
 
10
10
  import requests
11
11
 
12
+ from ukam_os_builder.api.settings import Settings
13
+
12
14
  logger = logging.getLogger(__name__)
13
15
 
14
16
  API_BASE_URL = "https://api.os.uk/downloads/v1"
17
+
18
+ # NGD file stems to exclude (historic addresses are not used in output)
19
+ _NGD_EXCLUDED_STEMS = {"historicaddress"}
20
+
21
+
22
+ def _should_skip_ngd_download(filename: str, settings: object) -> bool:
23
+ """Return True if *filename* is an NGD historic-address archive."""
24
+ source_type = getattr(getattr(settings, "source", None), "type", "")
25
+ if source_type != "ngd":
26
+ return False
27
+ name_lower = filename.lower()
28
+ return any(stem in name_lower for stem in _NGD_EXCLUDED_STEMS)
29
+
30
+
15
31
  DEFAULT_CHUNK_SIZE = 1024 * 1024 * 20 # 20 MiB
16
32
  DEFAULT_CONNECT_TIMEOUT_SECONDS = 30
17
33
  DEFAULT_READ_TIMEOUT_SECONDS = 300
@@ -293,6 +309,11 @@ def run_download_step(
293
309
  logger.warning("No URL for %s, skipping", item.filename)
294
310
  continue
295
311
 
312
+ # Skip NGD historic address files — they are excluded from output
313
+ if _should_skip_ngd_download(item.filename, settings):
314
+ logger.info("Skipping historic address file: %s", item.filename)
315
+ continue
316
+
296
317
  dest_path = downloads_dir / item.filename
297
318
  was_downloaded = download_file(
298
319
  url=item.url,
@@ -312,3 +333,54 @@ def run_download_step(
312
333
 
313
334
  logger.info("Download complete: %d file(s)", len(downloaded))
314
335
  return downloaded
336
+
337
+
338
+ def _get_manifest_path(settings: Settings) -> Path | None:
339
+ downloads_dir = settings.paths.downloads_dir.resolve()
340
+ source_type = settings.source.type # "abp" | "ngd"
341
+
342
+ if source_type == "abp":
343
+ candidates = list(downloads_dir.glob("*-Order_Details.txt"))
344
+ if not candidates:
345
+ logger.info("➡️ Manifest (ABP order details) not found. Check: %s", downloads_dir)
346
+ return None
347
+
348
+ manifest = max(candidates, key=lambda p: p.stat().st_mtime).resolve()
349
+
350
+ if len(candidates) > 1:
351
+ logger.warning(
352
+ "Multiple ABP manifests found in %s. Using newest: %s",
353
+ downloads_dir,
354
+ manifest,
355
+ )
356
+
357
+ logger.info("➡️ Manifest (ABP order details): %s", manifest)
358
+ return manifest
359
+
360
+ elif source_type == "ngd":
361
+ candidates = list(
362
+ downloads_dir.glob("*_orderSummary.json")
363
+ ) # adjust if it's "*.orderSummary.json"
364
+ if not candidates:
365
+ logger.info("➡️ Manifests (NGD order summaries) not found. Check: %s", downloads_dir)
366
+ return None
367
+
368
+ built_candidates = list(downloads_dir.glob("*builtaddress*_orderSummary.json"))
369
+ built_manifest = (
370
+ max(built_candidates, key=lambda p: p.stat().st_mtime).resolve()
371
+ if built_candidates
372
+ else None
373
+ )
374
+
375
+ logger.info(
376
+ "➡️ Manifests (NGD order summaries): %s (%d files)\n"
377
+ " ↳ Built address order summary: %s",
378
+ downloads_dir,
379
+ len(candidates),
380
+ built_manifest if built_manifest else "(not found)",
381
+ )
382
+
383
+ return downloads_dir
384
+
385
+ logger.warning("Unknown source type %r. No manifest lookup performed.", source_type)
386
+ return None
@@ -1421,7 +1421,7 @@ wheels = [
1421
1421
 
1422
1422
  [[package]]
1423
1423
  name = "ukam-os-builder"
1424
- version = "0.1.0.dev4"
1424
+ version = "0.1.0.dev5"
1425
1425
  source = { editable = "." }
1426
1426
  dependencies = [
1427
1427
  { name = "duckdb" },
@@ -1,27 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from pathlib import Path
4
-
5
- from ukam_os_builder.os_builder.extract import (
6
- _filter_zips_for_source,
7
- _should_convert_csv_to_parquet,
8
- )
9
-
10
-
11
- def test_filter_zips_for_source_prefers_ngd_named_zips() -> None:
12
- zip_files = [
13
- Path("add_gb_builtaddress.zip"),
14
- Path("AddressBasePremium_FULL_2025-12-15_002.zip"),
15
- ]
16
-
17
- filtered = _filter_zips_for_source(zip_files, "ngd")
18
-
19
- assert filtered == [Path("add_gb_builtaddress.zip")]
20
-
21
-
22
- def test_should_convert_csv_to_parquet_skips_non_ngd_for_ngd_source() -> None:
23
- ngd_csv = Path("add_gb_builtaddress.csv")
24
- abp_csv = Path("AddressBasePremium_FULL_2025-12-15_002.csv")
25
-
26
- assert _should_convert_csv_to_parquet(ngd_csv, "ngd") is True
27
- assert _should_convert_csv_to_parquet(abp_csv, "ngd") is False