ukam-os-builder 0.1.0.dev5__tar.gz → 0.1.0.dev6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/PKG-INFO +3 -6
  2. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/README.md +2 -5
  3. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/pyproject.toml +1 -1
  4. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/tests/test_inspect_results.py +2 -2
  5. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/tests/test_public_api_integration.py +1 -1
  6. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/tests/test_smoke.py +4 -7
  7. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/__init__.py +1 -1
  8. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/transform/runner.py +2 -2
  9. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/transform/stages/combine.py +1 -1
  10. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/ngd/to_flatfile.py +68 -36
  11. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/os_builder/inspect_results.py +17 -17
  12. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/uv.lock +1 -1
  13. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/.env.example +0 -0
  14. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/.github/workflows/ci.yml +0 -0
  15. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/.github/workflows/e2e.yml +0 -0
  16. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/.github/workflows/release-pypi.yml +0 -0
  17. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/.gitignore +0 -0
  18. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/AGENTS.md +0 -0
  19. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/config.example.yaml +0 -0
  20. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/prompt.md +0 -0
  21. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/shell/test_release_locally.sh +0 -0
  22. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/tests/data/README.md +0 -0
  23. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/tests/data/add_gb_builtaddress.csv +0 -0
  24. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/tests/data/add_gb_builtaddress_altadd.csv +0 -0
  25. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/tests/data/add_gb_historicaddress.csv +0 -0
  26. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/tests/data/add_gb_prebuildaddress.csv +0 -0
  27. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/tests/data/add_gb_royalmailaddress.csv +0 -0
  28. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/tests/test_api.py +0 -0
  29. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/tests/test_cli.py +0 -0
  30. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/tests/test_cli_errors.py +0 -0
  31. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/tests/test_extract_source_filtering.py +0 -0
  32. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/tests/test_settings.py +0 -0
  33. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/tests/test_setup_wizard.py +0 -0
  34. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/_exceptions.py +0 -0
  35. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/api/api.py +0 -0
  36. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/api/cli_errors.py +0 -0
  37. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/api/settings.py +0 -0
  38. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/cli.py +0 -0
  39. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/schemas/abp_schema.yaml +0 -0
  40. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/split_raw.py +0 -0
  41. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/transform/__init__.py +0 -0
  42. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/transform/common.py +0 -0
  43. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/transform/stages/__init__.py +0 -0
  44. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/transform/stages/business.py +0 -0
  45. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/transform/stages/lpi.py +0 -0
  46. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/transform/stages/misc.py +0 -0
  47. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/transform/stages/postal.py +0 -0
  48. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/os_builder/__init__.py +0 -0
  49. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/os_builder/extract.py +0 -0
  50. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/os_builder/os_hub.py +0 -0
  51. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/os_builder/pipeline_factory.py +0 -0
  52. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/pipeline.py +0 -0
  53. {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/setup_wizard.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ukam-os-builder
3
- Version: 0.1.0.dev5
3
+ Version: 0.1.0.dev6
4
4
  Summary: Download, process and transform OS address data (NGD or ABP) for UK address matching
5
5
  Project-URL: Homepage, https://github.com/moj-analytical-services/prepare_ngd_for_address_matching
6
6
  Project-URL: Repository, https://github.com/moj-analytical-services/prepare_ngd_for_address_matching
@@ -221,13 +221,10 @@ Each file contains:
221
221
  | `filename` | VARCHAR | Source file name (for example `add_gb_builtaddress.parquet`) |
222
222
  | `classificationcode` | VARCHAR | Property classification code (for example RD06 for residential) |
223
223
  | `parentuprn` | BIGINT | Parent UPRN for hierarchical addresses |
224
- | `rootuprn` | BIGINT | Root UPRN at the top of the hierarchy |
225
- | `hierarchylevel` | INTEGER | Level in the address hierarchy (1 = root) |
224
+ | `lowertierlocalauthoritygsscode` | VARCHAR | Lower-tier local authority GSS code |
226
225
  | `floorlevel` | VARCHAR | Floor level identifier |
227
- | `lowestfloorlevel` | DOUBLE | Lowest floor number |
228
- | `highestfloorlevel` | DOUBLE | Highest floor number |
229
226
 
230
- Metadata columns (`classificationcode`, `parentuprn`, `rootuprn`, `hierarchylevel`, `floorlevel`, `lowestfloorlevel`, `highestfloorlevel`) are enriched via UPRN lookup from core address files. This means Royal Mail addresses and alternate address records receive metadata from their corresponding Built, Historic, or Pre-Build records.
227
+ Metadata used in output (`classificationcode`, `parentuprn`, `lowertierlocalauthoritygsscode`, `floorlevel`) is enriched via UPRN lookup from core address files. This means Royal Mail addresses and alternate address records receive metadata from their corresponding Built, Historic, or Pre-Build records. `lowertierlocalauthoritygsscode` is always sourced from Built Address via UPRN lookup.
231
228
 
232
229
  </details>
233
230
 
@@ -195,13 +195,10 @@ Each file contains:
195
195
  | `filename` | VARCHAR | Source file name (for example `add_gb_builtaddress.parquet`) |
196
196
  | `classificationcode` | VARCHAR | Property classification code (for example RD06 for residential) |
197
197
  | `parentuprn` | BIGINT | Parent UPRN for hierarchical addresses |
198
- | `rootuprn` | BIGINT | Root UPRN at the top of the hierarchy |
199
- | `hierarchylevel` | INTEGER | Level in the address hierarchy (1 = root) |
198
+ | `lowertierlocalauthoritygsscode` | VARCHAR | Lower-tier local authority GSS code |
200
199
  | `floorlevel` | VARCHAR | Floor level identifier |
201
- | `lowestfloorlevel` | DOUBLE | Lowest floor number |
202
- | `highestfloorlevel` | DOUBLE | Highest floor number |
203
200
 
204
- Metadata columns (`classificationcode`, `parentuprn`, `rootuprn`, `hierarchylevel`, `floorlevel`, `lowestfloorlevel`, `highestfloorlevel`) are enriched via UPRN lookup from core address files. This means Royal Mail addresses and alternate address records receive metadata from their corresponding Built, Historic, or Pre-Build records.
201
+ Metadata used in output (`classificationcode`, `parentuprn`, `lowertierlocalauthoritygsscode`, `floorlevel`) is enriched via UPRN lookup from core address files. This means Royal Mail addresses and alternate address records receive metadata from their corresponding Built, Historic, or Pre-Build records. `lowertierlocalauthoritygsscode` is always sourced from Built Address via UPRN lookup.
205
202
 
206
203
  </details>
207
204
 
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "ukam-os-builder"
3
- version = "0.1.0.dev5"
3
+ version = "0.1.0.dev6"
4
4
  description = "Download, process and transform OS address data (NGD or ABP) for UK address matching"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.10"
@@ -26,7 +26,7 @@ def test_inspect_flatfile_variants_uses_config_defaults(tmp_path: Path) -> None:
26
26
  (1001::BIGINT, 'A'::VARCHAR),
27
27
  (1001::BIGINT, 'B'::VARCHAR),
28
28
  (1002::BIGINT, 'C'::VARCHAR)
29
- ) AS t(uprn, address_concat)
29
+ ) AS t(unique_id, address_concat)
30
30
  ) TO '{parquet_path.as_posix()}' (FORMAT PARQUET)
31
31
  """
32
32
  )
@@ -61,7 +61,7 @@ def test_inspect_flatfile_variants_supports_abp_pattern(tmp_path: Path) -> None:
61
61
  (2001::BIGINT, 'A'::VARCHAR),
62
62
  (2002::BIGINT, 'B'::VARCHAR),
63
63
  (2002::BIGINT, 'C'::VARCHAR)
64
- ) AS t(uprn, address_concat)
64
+ ) AS t(unique_id, address_concat)
65
65
  ) TO '{parquet_path.as_posix()}' (FORMAT PARQUET)
66
66
  """
67
67
  )
@@ -71,7 +71,7 @@ def test_package_root_inspect_flatfile_variants(tmp_path: Path) -> None:
71
71
  (4001::BIGINT, 'A'::VARCHAR),
72
72
  (4001::BIGINT, 'B'::VARCHAR),
73
73
  (4002::BIGINT, 'C'::VARCHAR)
74
- ) AS t(uprn, address_concat)
74
+ ) AS t(unique_id, address_concat)
75
75
  ) TO '{parquet_path.as_posix()}' (FORMAT PARQUET)
76
76
  """
77
77
  )
@@ -172,17 +172,14 @@ def test_flatfile_single_chunk(temp_settings: Settings) -> None:
172
172
  column_names = [row[0] for row in schema]
173
173
 
174
174
  expected_columns = [
175
- "uprn",
175
+ "unique_id",
176
176
  "address_concat",
177
177
  "postcode",
178
178
  "filename",
179
179
  "classificationcode",
180
180
  "parentuprn",
181
- "rootuprn",
182
- "hierarchylevel",
181
+ "lowertierlocalauthoritygsscode",
183
182
  "floorlevel",
184
- "lowestfloorlevel",
185
- "highestfloorlevel",
186
183
  ]
187
184
  for col in expected_columns:
188
185
  assert col in column_names, f"Column {col} should exist in output"
@@ -231,9 +228,9 @@ def test_deduplication(temp_settings: Settings) -> None:
231
228
  # Verify no exact duplicates
232
229
  con = duckdb.connect()
233
230
  result = con.execute(f"""
234
- SELECT uprn, address_concat, COUNT(*) as cnt
231
+ SELECT unique_id, address_concat, COUNT(*) as cnt
235
232
  FROM read_parquet('{output_files[0].as_posix()}')
236
- GROUP BY uprn, address_concat
233
+ GROUP BY unique_id, address_concat
237
234
  HAVING COUNT(*) > 1
238
235
  """).fetchall()
239
236
 
@@ -8,7 +8,7 @@ from ukam_os_builder.os_builder.inspect_results import (
8
8
  inspect_flatfile_variants,
9
9
  )
10
10
 
11
- __version__ = "0.1.0.dev5"
11
+ __version__ = "0.1.0.dev6"
12
12
 
13
13
  __all__ = [
14
14
  "create_config_and_env",
@@ -170,7 +170,7 @@ def _transform_to_flatfile_chunk(
170
170
  logger.debug("Combination and deduplication in %.2f seconds", perf_counter() - t0)
171
171
 
172
172
  # Get chunk metrics
173
- chunk_metrics = con.execute("SELECT COUNT(DISTINCT uprn), COUNT(*) FROM result").fetchone()
173
+ chunk_metrics = con.execute("SELECT COUNT(DISTINCT unique_id), COUNT(*) FROM result").fetchone()
174
174
  chunk_uprns = chunk_metrics[0]
175
175
  chunk_rows = chunk_metrics[1]
176
176
 
@@ -244,7 +244,7 @@ def transform_to_flatfile(
244
244
  con = create_duckdb_connection(settings)
245
245
  output_path = output_paths[0]
246
246
  stats = con.execute(f"""
247
- SELECT COUNT(DISTINCT uprn), COUNT(*)
247
+ SELECT COUNT(DISTINCT unique_id), COUNT(*)
248
248
  FROM read_parquet('{output_path.as_posix()}')
249
249
  """).fetchone()
250
250
  total_uprns = stats[0]
@@ -62,7 +62,7 @@ def combine_and_dedupe(con: duckdb.DuckDBPyConnection) -> duckdb.DuckDBPyRelatio
62
62
  FROM deduped_filtered
63
63
  )
64
64
  SELECT
65
- sr.uprn,
65
+ sr.uprn AS unique_id,
66
66
  sr.postcode,
67
67
  sr.address_concat,
68
68
  cb.classification_code,
@@ -97,7 +97,6 @@ def _create_metadata_lookup_view(
97
97
  """)
98
98
 
99
99
  if not union_parts:
100
- # No core files found - create empty lookup
101
100
  logger.warning("No core feature files found. Metadata lookup will be empty.")
102
101
  con.execute("""
103
102
  CREATE OR REPLACE TEMP VIEW uprn_metadata_lookup AS
@@ -112,37 +111,48 @@ def _create_metadata_lookup_view(
112
111
  CAST(NULL AS DOUBLE) AS highestfloorlevel
113
112
  WHERE 1=0
114
113
  """)
115
- return
116
-
117
- union_sql = "\nUNION ALL\n".join(union_parts)
114
+ else:
115
+ union_sql = "\nUNION ALL\n".join(union_parts)
118
116
 
119
- sql = f"""
120
- CREATE OR REPLACE TEMP VIEW uprn_metadata_lookup AS
121
- WITH core_data AS (
122
- {union_sql}
123
- ),
124
- ranked AS (
117
+ sql = f"""
118
+ CREATE OR REPLACE TEMP VIEW uprn_metadata_lookup AS
119
+ WITH core_data AS (
120
+ {union_sql}
121
+ ),
122
+ ranked AS (
123
+ SELECT
124
+ *,
125
+ ROW_NUMBER() OVER (
126
+ PARTITION BY uprn
127
+ ORDER BY source_priority
128
+ ) AS rn
129
+ FROM core_data
130
+ )
125
131
  SELECT
126
- *,
127
- ROW_NUMBER() OVER (
128
- PARTITION BY uprn
129
- ORDER BY source_priority
130
- ) AS rn
131
- FROM core_data
132
- )
132
+ uprn,
133
+ classificationcode,
134
+ parentuprn,
135
+ rootuprn,
136
+ hierarchylevel,
137
+ floorlevel,
138
+ lowestfloorlevel,
139
+ highestfloorlevel
140
+ FROM ranked
141
+ WHERE rn = 1;
142
+ """
143
+ con.execute(sql)
144
+
145
+ built_path = parquet_dir / "add_gb_builtaddress.parquet"
146
+ built_sql = f"""
147
+ CREATE OR REPLACE TEMP VIEW builtaddress_ltla_lookup AS
133
148
  SELECT
134
- uprn,
135
- classificationcode,
136
- parentuprn,
137
- rootuprn,
138
- hierarchylevel,
139
- floorlevel,
140
- lowestfloorlevel,
141
- highestfloorlevel
142
- FROM ranked
143
- WHERE rn = 1;
149
+ CAST(uprn AS BIGINT) AS uprn,
150
+ MAX(CAST(lowertierlocalauthoritygsscode AS VARCHAR)) AS lowertierlocalauthoritygsscode
151
+ FROM read_parquet('{built_path.as_posix()}')
152
+ {where_clause}
153
+ GROUP BY CAST(uprn AS BIGINT)
144
154
  """
145
- con.execute(sql)
155
+ con.execute(built_sql)
146
156
 
147
157
 
148
158
  def _create_core_feature_view(
@@ -183,6 +193,7 @@ def _create_core_feature_view(
183
193
  CAST(floorlevel AS VARCHAR) AS floorlevel,
184
194
  CAST(lowestfloorlevel AS DOUBLE) AS lowestfloorlevel,
185
195
  CAST(highestfloorlevel AS DOUBLE) AS highestfloorlevel,
196
+ CAST(NULL AS VARCHAR) AS lowertierlocalauthoritygsscode,
186
197
  -- Internal columns for deduplication (not in final output)
187
198
  CAST(description AS VARCHAR) AS feature_type,
188
199
  CAST(addressstatus AS VARCHAR) AS address_status,
@@ -222,6 +233,7 @@ def _create_core_feature_view(
222
233
  CAST(floorlevel AS VARCHAR) AS floorlevel,
223
234
  CAST(lowestfloorlevel AS DOUBLE) AS lowestfloorlevel,
224
235
  CAST(highestfloorlevel AS DOUBLE) AS highestfloorlevel,
236
+ CAST(NULL AS VARCHAR) AS lowertierlocalauthoritygsscode,
225
237
  -- Internal columns for deduplication (not in final output)
226
238
  CAST(description AS VARCHAR) AS feature_type,
227
239
  CAST(addressstatus AS VARCHAR) AS address_status,
@@ -277,6 +289,7 @@ def _create_altadd_view(
277
289
  CAST(floorlevel AS VARCHAR) AS floorlevel,
278
290
  CAST(lowestfloorlevel AS DOUBLE) AS lowestfloorlevel,
279
291
  CAST(highestfloorlevel AS DOUBLE) AS highestfloorlevel,
292
+ CAST(NULL AS VARCHAR) AS lowertierlocalauthoritygsscode,
280
293
  -- Internal columns for deduplication (not in final output)
281
294
  '{feature_type}' AS feature_type,
282
295
  CAST(addressstatus AS VARCHAR) AS address_status,
@@ -333,6 +346,7 @@ def _create_royal_mail_view(
333
346
  CAST(NULL AS VARCHAR) AS floorlevel,
334
347
  CAST(NULL AS DOUBLE) AS lowestfloorlevel,
335
348
  CAST(NULL AS DOUBLE) AS highestfloorlevel,
349
+ CAST(NULL AS VARCHAR) AS lowertierlocalauthoritygsscode,
336
350
  -- Internal columns for deduplication (not in final output)
337
351
  'Royal Mail Address' AS feature_type,
338
352
  CAST(NULL AS VARCHAR) AS address_status,
@@ -363,6 +377,7 @@ def _create_royal_mail_view(
363
377
  CAST(NULL AS VARCHAR) AS floorlevel,
364
378
  CAST(NULL AS DOUBLE) AS lowestfloorlevel,
365
379
  CAST(NULL AS DOUBLE) AS highestfloorlevel,
380
+ CAST(NULL AS VARCHAR) AS lowertierlocalauthoritygsscode,
366
381
  -- Internal columns for deduplication (not in final output)
367
382
  'Royal Mail Address' AS feature_type,
368
383
  CAST(NULL AS VARCHAR) AS address_status,
@@ -398,12 +413,14 @@ def _enrich_with_metadata(con: duckdb.DuckDBPyConnection) -> None:
398
413
  COALESCE(a.floorlevel, m.floorlevel) AS floorlevel,
399
414
  COALESCE(a.lowestfloorlevel, m.lowestfloorlevel) AS lowestfloorlevel,
400
415
  COALESCE(a.highestfloorlevel, m.highestfloorlevel) AS highestfloorlevel,
416
+ b.lowertierlocalauthoritygsscode AS lowertierlocalauthoritygsscode,
401
417
  -- Internal columns for deduplication
402
418
  a.feature_type,
403
419
  a.address_status,
404
420
  a.build_status
405
421
  FROM all_full_addresses a
406
- LEFT JOIN uprn_metadata_lookup m ON a.uprn = m.uprn;
422
+ LEFT JOIN uprn_metadata_lookup m ON a.uprn = m.uprn
423
+ LEFT JOIN builtaddress_ltla_lookup b ON a.uprn = b.uprn;
407
424
  """
408
425
  con.execute(sql)
409
426
 
@@ -419,11 +436,28 @@ def _create_custom_level_rows(con: duckdb.DuckDBPyConnection) -> None:
419
436
  dedup priority and never override official address data.
420
437
  """
421
438
  sql = """
422
- INSERT INTO all_full_addresses_enriched
439
+ INSERT INTO all_full_addresses_enriched (
440
+ uprn,
441
+ address_concat,
442
+ postcode,
443
+ filename,
444
+ classificationcode,
445
+ parentuprn,
446
+ rootuprn,
447
+ hierarchylevel,
448
+ floorlevel,
449
+ lowestfloorlevel,
450
+ highestfloorlevel,
451
+ lowertierlocalauthoritygsscode,
452
+ feature_type,
453
+ address_status,
454
+ build_status
455
+ )
423
456
  WITH level_parsed AS (
424
457
  SELECT
425
458
  uprn, address_concat, postcode, filename,
426
459
  classificationcode, parentuprn, rootuprn,
460
+ lowertierlocalauthoritygsscode,
427
461
  hierarchylevel, floorlevel, lowestfloorlevel, highestfloorlevel,
428
462
  address_status, build_status,
429
463
  CASE
@@ -464,6 +498,7 @@ def _create_custom_level_rows(con: duckdb.DuckDBPyConnection) -> None:
464
498
  floorlevel,
465
499
  lowestfloorlevel,
466
500
  highestfloorlevel,
501
+ lowertierlocalauthoritygsscode,
467
502
  'Custom Level' AS feature_type,
468
503
  address_status,
469
504
  build_status
@@ -523,17 +558,14 @@ def _create_dedup_view(con: duckdb.DuckDBPyConnection) -> None:
523
558
  WHERE feature_type NOT IN ('Non-Addressable Object')
524
559
  )
525
560
  SELECT
526
- uprn,
561
+ uprn AS unique_id,
527
562
  address_concat,
528
563
  postcode,
529
564
  filename,
530
565
  classificationcode,
531
566
  parentuprn,
532
- rootuprn,
533
- hierarchylevel,
534
- floorlevel,
535
- lowestfloorlevel,
536
- highestfloorlevel
567
+ lowertierlocalauthoritygsscode,
568
+ floorlevel
537
569
  FROM ranked
538
570
  WHERE rn = 1;
539
571
  """
@@ -12,7 +12,7 @@ SourceType = Literal["ngd", "abp"]
12
12
  logger = logging.getLogger(__name__)
13
13
 
14
14
  _DEFAULT_SELECT_COLUMNS = [
15
- "uprn",
15
+ "unique_id",
16
16
  "address_concat",
17
17
  "postcode",
18
18
  "source",
@@ -128,9 +128,9 @@ def get_variant_statistics(
128
128
 
129
129
  stats = con.sql(f"""
130
130
  WITH variant_counts AS (
131
- SELECT uprn, COUNT(*) AS variant_count
131
+ SELECT unique_id, COUNT(*) AS variant_count
132
132
  FROM read_parquet('{files_sql}')
133
- GROUP BY uprn
133
+ GROUP BY unique_id
134
134
  )
135
135
  SELECT
136
136
  COUNT(*) AS total_uprns,
@@ -179,7 +179,7 @@ def get_random_uprn(
179
179
 
180
180
  select_columns = _choose_select_columns(con, files_sql, columns)
181
181
  random_uprn = con.sql(f"""
182
- SELECT DISTINCT uprn
182
+ SELECT DISTINCT unique_id
183
183
  FROM read_parquet('{files_sql}')
184
184
  ORDER BY RANDOM()
185
185
  LIMIT 1
@@ -192,7 +192,7 @@ def get_random_uprn(
192
192
  SELECT
193
193
  {select_columns}
194
194
  FROM read_parquet('{files_sql}')
195
- WHERE uprn = {int(random_uprn[0])}
195
+ WHERE unique_id = {int(random_uprn[0])}
196
196
  ORDER BY is_primary DESC NULLS LAST, source NULLS LAST, variant_label NULLS LAST
197
197
  """)
198
198
 
@@ -220,14 +220,14 @@ def get_random_large_uprn(
220
220
 
221
221
  selected = con.sql(f"""
222
222
  WITH variant_counts AS (
223
- SELECT uprn, COUNT(*) AS variant_count
223
+ SELECT unique_id, COUNT(*) AS variant_count
224
224
  FROM read_parquet('{files_sql}')
225
225
  {where_filter}
226
- GROUP BY uprn
227
- ORDER BY variant_count DESC, uprn ASC
226
+ GROUP BY unique_id
227
+ ORDER BY variant_count DESC, unique_id ASC
228
228
  LIMIT {int(top_n)}
229
229
  )
230
- SELECT uprn
230
+ SELECT unique_id
231
231
  FROM variant_counts
232
232
  ORDER BY RANDOM()
233
233
  LIMIT 1
@@ -240,7 +240,7 @@ def get_random_large_uprn(
240
240
  SELECT
241
241
  {select_columns}
242
242
  FROM read_parquet('{files_sql}')
243
- WHERE uprn = {int(selected[0])}
243
+ WHERE unique_id = {int(selected[0])}
244
244
  {and_filter}
245
245
  ORDER BY is_primary DESC NULLS LAST, source NULLS LAST, variant_label NULLS LAST
246
246
  """)
@@ -269,7 +269,7 @@ def get_uprn_variants(
269
269
  SELECT
270
270
  {select_columns}
271
271
  FROM read_parquet('{files_sql}')
272
- WHERE uprn = {int(uprn)}
272
+ WHERE unique_id = {int(uprn)}
273
273
  {and_filter}
274
274
  ORDER BY is_primary DESC NULLS LAST, source NULLS LAST, variant_label NULLS LAST
275
275
  """)
@@ -317,10 +317,10 @@ def inspect_flatfile_variants(
317
317
  WITH data AS (
318
318
  SELECT * FROM read_parquet('{files_sql}')
319
319
  )
320
- SELECT uprn, COUNT(*) AS variant_count
320
+ SELECT unique_id, COUNT(*) AS variant_count
321
321
  FROM data
322
- GROUP BY uprn
323
- ORDER BY variant_count DESC, uprn ASC
322
+ GROUP BY unique_id
323
+ ORDER BY variant_count DESC, unique_id ASC
324
324
  LIMIT 1 OFFSET {top_offset}
325
325
  """
326
326
  ).fetchone()
@@ -333,7 +333,7 @@ def inspect_flatfile_variants(
333
333
  f"""
334
334
  SELECT COUNT(*)
335
335
  FROM read_parquet('{files_sql}')
336
- WHERE uprn = ?
336
+ WHERE unique_id = ?
337
337
  """,
338
338
  [target_uprn],
339
339
  ).fetchone()
@@ -343,7 +343,7 @@ def inspect_flatfile_variants(
343
343
  f"""
344
344
  SELECT *
345
345
  FROM read_parquet('{files_sql}')
346
- WHERE uprn = ?
346
+ WHERE unique_id = ?
347
347
  ORDER BY 1
348
348
  """,
349
349
  [target_uprn],
@@ -358,7 +358,7 @@ def inspect_flatfile_variants(
358
358
  max_width=10_000
359
359
  )
360
360
  logger.info("Selected UPRN rows:")
361
- con.sql(f"SELECT * FROM read_parquet('{files_sql}') WHERE uprn = {target_uprn}").show(
361
+ con.sql(f"SELECT * FROM read_parquet('{files_sql}') WHERE unique_id = {target_uprn}").show(
362
362
  max_width=10_000
363
363
  )
364
364
 
@@ -1421,7 +1421,7 @@ wheels = [
1421
1421
 
1422
1422
  [[package]]
1423
1423
  name = "ukam-os-builder"
1424
- version = "0.1.0.dev5"
1424
+ version = "0.1.0.dev6"
1425
1425
  source = { editable = "." }
1426
1426
  dependencies = [
1427
1427
  { name = "duckdb" },