ukam-os-builder 0.1.0.dev5__tar.gz → 0.1.0.dev6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/PKG-INFO +3 -6
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/README.md +2 -5
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/pyproject.toml +1 -1
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/tests/test_inspect_results.py +2 -2
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/tests/test_public_api_integration.py +1 -1
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/tests/test_smoke.py +4 -7
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/__init__.py +1 -1
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/transform/runner.py +2 -2
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/transform/stages/combine.py +1 -1
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/ngd/to_flatfile.py +68 -36
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/os_builder/inspect_results.py +17 -17
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/uv.lock +1 -1
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/.env.example +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/.github/workflows/ci.yml +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/.github/workflows/e2e.yml +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/.github/workflows/release-pypi.yml +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/.gitignore +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/AGENTS.md +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/config.example.yaml +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/prompt.md +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/shell/test_release_locally.sh +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/tests/data/README.md +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/tests/data/add_gb_builtaddress.csv +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/tests/data/add_gb_builtaddress_altadd.csv +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/tests/data/add_gb_historicaddress.csv +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/tests/data/add_gb_prebuildaddress.csv +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/tests/data/add_gb_royalmailaddress.csv +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/tests/test_api.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/tests/test_cli.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/tests/test_cli_errors.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/tests/test_extract_source_filtering.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/tests/test_settings.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/tests/test_setup_wizard.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/_exceptions.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/api/api.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/api/cli_errors.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/api/settings.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/cli.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/schemas/abp_schema.yaml +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/split_raw.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/transform/__init__.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/transform/common.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/transform/stages/__init__.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/transform/stages/business.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/transform/stages/lpi.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/transform/stages/misc.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/data_sources/abp/transform/stages/postal.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/os_builder/__init__.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/os_builder/extract.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/os_builder/os_hub.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/os_builder/pipeline_factory.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/pipeline.py +0 -0
- {ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/setup_wizard.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ukam-os-builder
|
|
3
|
-
Version: 0.1.0.dev5
|
|
3
|
+
Version: 0.1.0.dev6
|
|
4
4
|
Summary: Download, process and transform OS address data (NGD or ABP) for UK address matching
|
|
5
5
|
Project-URL: Homepage, https://github.com/moj-analytical-services/prepare_ngd_for_address_matching
|
|
6
6
|
Project-URL: Repository, https://github.com/moj-analytical-services/prepare_ngd_for_address_matching
|
|
@@ -221,13 +221,10 @@ Each file contains:
|
|
|
221
221
|
| `filename` | VARCHAR | Source file name (for example `add_gb_builtaddress.parquet`) |
|
|
222
222
|
| `classificationcode` | VARCHAR | Property classification code (for example RD06 for residential) |
|
|
223
223
|
| `parentuprn` | BIGINT | Parent UPRN for hierarchical addresses |
|
|
224
|
-
| `
|
|
225
|
-
| `hierarchylevel` | INTEGER | Level in the address hierarchy (1 = root) |
|
|
224
|
+
| `lowertierlocalauthoritygsscode` | VARCHAR | Lower-tier local authority GSS code |
|
|
226
225
|
| `floorlevel` | VARCHAR | Floor level identifier |
|
|
227
|
-
| `lowestfloorlevel` | DOUBLE | Lowest floor number |
|
|
228
|
-
| `highestfloorlevel` | DOUBLE | Highest floor number |
|
|
229
226
|
|
|
230
|
-
Metadata
|
|
227
|
+
Metadata used in output (`classificationcode`, `parentuprn`, `lowertierlocalauthoritygsscode`, `floorlevel`) is enriched via UPRN lookup from core address files. This means Royal Mail addresses and alternate address records receive metadata from their corresponding Built, Historic, or Pre-Build records. `lowertierlocalauthoritygsscode` is always sourced from Built Address via UPRN lookup.
|
|
231
228
|
|
|
232
229
|
</details>
|
|
233
230
|
|
|
@@ -195,13 +195,10 @@ Each file contains:
|
|
|
195
195
|
| `filename` | VARCHAR | Source file name (for example `add_gb_builtaddress.parquet`) |
|
|
196
196
|
| `classificationcode` | VARCHAR | Property classification code (for example RD06 for residential) |
|
|
197
197
|
| `parentuprn` | BIGINT | Parent UPRN for hierarchical addresses |
|
|
198
|
-
| `
|
|
199
|
-
| `hierarchylevel` | INTEGER | Level in the address hierarchy (1 = root) |
|
|
198
|
+
| `lowertierlocalauthoritygsscode` | VARCHAR | Lower-tier local authority GSS code |
|
|
200
199
|
| `floorlevel` | VARCHAR | Floor level identifier |
|
|
201
|
-
| `lowestfloorlevel` | DOUBLE | Lowest floor number |
|
|
202
|
-
| `highestfloorlevel` | DOUBLE | Highest floor number |
|
|
203
200
|
|
|
204
|
-
Metadata
|
|
201
|
+
Metadata used in output (`classificationcode`, `parentuprn`, `lowertierlocalauthoritygsscode`, `floorlevel`) is enriched via UPRN lookup from core address files. This means Royal Mail addresses and alternate address records receive metadata from their corresponding Built, Historic, or Pre-Build records. `lowertierlocalauthoritygsscode` is always sourced from Built Address via UPRN lookup.
|
|
205
202
|
|
|
206
203
|
</details>
|
|
207
204
|
|
|
@@ -26,7 +26,7 @@ def test_inspect_flatfile_variants_uses_config_defaults(tmp_path: Path) -> None:
|
|
|
26
26
|
(1001::BIGINT, 'A'::VARCHAR),
|
|
27
27
|
(1001::BIGINT, 'B'::VARCHAR),
|
|
28
28
|
(1002::BIGINT, 'C'::VARCHAR)
|
|
29
|
-
) AS t(
|
|
29
|
+
) AS t(unique_id, address_concat)
|
|
30
30
|
) TO '{parquet_path.as_posix()}' (FORMAT PARQUET)
|
|
31
31
|
"""
|
|
32
32
|
)
|
|
@@ -61,7 +61,7 @@ def test_inspect_flatfile_variants_supports_abp_pattern(tmp_path: Path) -> None:
|
|
|
61
61
|
(2001::BIGINT, 'A'::VARCHAR),
|
|
62
62
|
(2002::BIGINT, 'B'::VARCHAR),
|
|
63
63
|
(2002::BIGINT, 'C'::VARCHAR)
|
|
64
|
-
) AS t(
|
|
64
|
+
) AS t(unique_id, address_concat)
|
|
65
65
|
) TO '{parquet_path.as_posix()}' (FORMAT PARQUET)
|
|
66
66
|
"""
|
|
67
67
|
)
|
{ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/tests/test_public_api_integration.py
RENAMED
|
@@ -71,7 +71,7 @@ def test_package_root_inspect_flatfile_variants(tmp_path: Path) -> None:
|
|
|
71
71
|
(4001::BIGINT, 'A'::VARCHAR),
|
|
72
72
|
(4001::BIGINT, 'B'::VARCHAR),
|
|
73
73
|
(4002::BIGINT, 'C'::VARCHAR)
|
|
74
|
-
) AS t(
|
|
74
|
+
) AS t(unique_id, address_concat)
|
|
75
75
|
) TO '{parquet_path.as_posix()}' (FORMAT PARQUET)
|
|
76
76
|
"""
|
|
77
77
|
)
|
|
@@ -172,17 +172,14 @@ def test_flatfile_single_chunk(temp_settings: Settings) -> None:
|
|
|
172
172
|
column_names = [row[0] for row in schema]
|
|
173
173
|
|
|
174
174
|
expected_columns = [
|
|
175
|
-
"uprn",
|
|
175
|
+
"unique_id",
|
|
176
176
|
"address_concat",
|
|
177
177
|
"postcode",
|
|
178
178
|
"filename",
|
|
179
179
|
"classificationcode",
|
|
180
180
|
"parentuprn",
|
|
181
|
-
"rootuprn",
|
|
182
|
-
"hierarchylevel",
|
|
181
|
+
"lowertierlocalauthoritygsscode",
|
|
183
182
|
"floorlevel",
|
|
184
|
-
"lowestfloorlevel",
|
|
185
|
-
"highestfloorlevel",
|
|
186
183
|
]
|
|
187
184
|
for col in expected_columns:
|
|
188
185
|
assert col in column_names, f"Column {col} should exist in output"
|
|
@@ -231,9 +228,9 @@ def test_deduplication(temp_settings: Settings) -> None:
|
|
|
231
228
|
# Verify no exact duplicates
|
|
232
229
|
con = duckdb.connect()
|
|
233
230
|
result = con.execute(f"""
|
|
234
|
-
SELECT uprn, address_concat, COUNT(*) as cnt
|
|
231
|
+
SELECT unique_id, address_concat, COUNT(*) as cnt
|
|
235
232
|
FROM read_parquet('{output_files[0].as_posix()}')
|
|
236
|
-
GROUP BY uprn, address_concat
|
|
233
|
+
GROUP BY unique_id, address_concat
|
|
237
234
|
HAVING COUNT(*) > 1
|
|
238
235
|
""").fetchall()
|
|
239
236
|
|
|
@@ -170,7 +170,7 @@ def _transform_to_flatfile_chunk(
|
|
|
170
170
|
logger.debug("Combination and deduplication in %.2f seconds", perf_counter() - t0)
|
|
171
171
|
|
|
172
172
|
# Get chunk metrics
|
|
173
|
-
chunk_metrics = con.execute("SELECT COUNT(DISTINCT
|
|
173
|
+
chunk_metrics = con.execute("SELECT COUNT(DISTINCT unique_id), COUNT(*) FROM result").fetchone()
|
|
174
174
|
chunk_uprns = chunk_metrics[0]
|
|
175
175
|
chunk_rows = chunk_metrics[1]
|
|
176
176
|
|
|
@@ -244,7 +244,7 @@ def transform_to_flatfile(
|
|
|
244
244
|
con = create_duckdb_connection(settings)
|
|
245
245
|
output_path = output_paths[0]
|
|
246
246
|
stats = con.execute(f"""
|
|
247
|
-
SELECT COUNT(DISTINCT
|
|
247
|
+
SELECT COUNT(DISTINCT unique_id), COUNT(*)
|
|
248
248
|
FROM read_parquet('{output_path.as_posix()}')
|
|
249
249
|
""").fetchone()
|
|
250
250
|
total_uprns = stats[0]
|
|
@@ -97,7 +97,6 @@ def _create_metadata_lookup_view(
|
|
|
97
97
|
""")
|
|
98
98
|
|
|
99
99
|
if not union_parts:
|
|
100
|
-
# No core files found - create empty lookup
|
|
101
100
|
logger.warning("No core feature files found. Metadata lookup will be empty.")
|
|
102
101
|
con.execute("""
|
|
103
102
|
CREATE OR REPLACE TEMP VIEW uprn_metadata_lookup AS
|
|
@@ -112,37 +111,48 @@ def _create_metadata_lookup_view(
|
|
|
112
111
|
CAST(NULL AS DOUBLE) AS highestfloorlevel
|
|
113
112
|
WHERE 1=0
|
|
114
113
|
""")
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
union_sql = "\nUNION ALL\n".join(union_parts)
|
|
114
|
+
else:
|
|
115
|
+
union_sql = "\nUNION ALL\n".join(union_parts)
|
|
118
116
|
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
117
|
+
sql = f"""
|
|
118
|
+
CREATE OR REPLACE TEMP VIEW uprn_metadata_lookup AS
|
|
119
|
+
WITH core_data AS (
|
|
120
|
+
{union_sql}
|
|
121
|
+
),
|
|
122
|
+
ranked AS (
|
|
123
|
+
SELECT
|
|
124
|
+
*,
|
|
125
|
+
ROW_NUMBER() OVER (
|
|
126
|
+
PARTITION BY uprn
|
|
127
|
+
ORDER BY source_priority
|
|
128
|
+
) AS rn
|
|
129
|
+
FROM core_data
|
|
130
|
+
)
|
|
125
131
|
SELECT
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
132
|
+
uprn,
|
|
133
|
+
classificationcode,
|
|
134
|
+
parentuprn,
|
|
135
|
+
rootuprn,
|
|
136
|
+
hierarchylevel,
|
|
137
|
+
floorlevel,
|
|
138
|
+
lowestfloorlevel,
|
|
139
|
+
highestfloorlevel
|
|
140
|
+
FROM ranked
|
|
141
|
+
WHERE rn = 1;
|
|
142
|
+
"""
|
|
143
|
+
con.execute(sql)
|
|
144
|
+
|
|
145
|
+
built_path = parquet_dir / "add_gb_builtaddress.parquet"
|
|
146
|
+
built_sql = f"""
|
|
147
|
+
CREATE OR REPLACE TEMP VIEW builtaddress_ltla_lookup AS
|
|
133
148
|
SELECT
|
|
134
|
-
uprn,
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
floorlevel,
|
|
140
|
-
lowestfloorlevel,
|
|
141
|
-
highestfloorlevel
|
|
142
|
-
FROM ranked
|
|
143
|
-
WHERE rn = 1;
|
|
149
|
+
CAST(uprn AS BIGINT) AS uprn,
|
|
150
|
+
MAX(CAST(lowertierlocalauthoritygsscode AS VARCHAR)) AS lowertierlocalauthoritygsscode
|
|
151
|
+
FROM read_parquet('{built_path.as_posix()}')
|
|
152
|
+
{where_clause}
|
|
153
|
+
GROUP BY CAST(uprn AS BIGINT)
|
|
144
154
|
"""
|
|
145
|
-
con.execute(
|
|
155
|
+
con.execute(built_sql)
|
|
146
156
|
|
|
147
157
|
|
|
148
158
|
def _create_core_feature_view(
|
|
@@ -183,6 +193,7 @@ def _create_core_feature_view(
|
|
|
183
193
|
CAST(floorlevel AS VARCHAR) AS floorlevel,
|
|
184
194
|
CAST(lowestfloorlevel AS DOUBLE) AS lowestfloorlevel,
|
|
185
195
|
CAST(highestfloorlevel AS DOUBLE) AS highestfloorlevel,
|
|
196
|
+
CAST(NULL AS VARCHAR) AS lowertierlocalauthoritygsscode,
|
|
186
197
|
-- Internal columns for deduplication (not in final output)
|
|
187
198
|
CAST(description AS VARCHAR) AS feature_type,
|
|
188
199
|
CAST(addressstatus AS VARCHAR) AS address_status,
|
|
@@ -222,6 +233,7 @@ def _create_core_feature_view(
|
|
|
222
233
|
CAST(floorlevel AS VARCHAR) AS floorlevel,
|
|
223
234
|
CAST(lowestfloorlevel AS DOUBLE) AS lowestfloorlevel,
|
|
224
235
|
CAST(highestfloorlevel AS DOUBLE) AS highestfloorlevel,
|
|
236
|
+
CAST(NULL AS VARCHAR) AS lowertierlocalauthoritygsscode,
|
|
225
237
|
-- Internal columns for deduplication (not in final output)
|
|
226
238
|
CAST(description AS VARCHAR) AS feature_type,
|
|
227
239
|
CAST(addressstatus AS VARCHAR) AS address_status,
|
|
@@ -277,6 +289,7 @@ def _create_altadd_view(
|
|
|
277
289
|
CAST(floorlevel AS VARCHAR) AS floorlevel,
|
|
278
290
|
CAST(lowestfloorlevel AS DOUBLE) AS lowestfloorlevel,
|
|
279
291
|
CAST(highestfloorlevel AS DOUBLE) AS highestfloorlevel,
|
|
292
|
+
CAST(NULL AS VARCHAR) AS lowertierlocalauthoritygsscode,
|
|
280
293
|
-- Internal columns for deduplication (not in final output)
|
|
281
294
|
'{feature_type}' AS feature_type,
|
|
282
295
|
CAST(addressstatus AS VARCHAR) AS address_status,
|
|
@@ -333,6 +346,7 @@ def _create_royal_mail_view(
|
|
|
333
346
|
CAST(NULL AS VARCHAR) AS floorlevel,
|
|
334
347
|
CAST(NULL AS DOUBLE) AS lowestfloorlevel,
|
|
335
348
|
CAST(NULL AS DOUBLE) AS highestfloorlevel,
|
|
349
|
+
CAST(NULL AS VARCHAR) AS lowertierlocalauthoritygsscode,
|
|
336
350
|
-- Internal columns for deduplication (not in final output)
|
|
337
351
|
'Royal Mail Address' AS feature_type,
|
|
338
352
|
CAST(NULL AS VARCHAR) AS address_status,
|
|
@@ -363,6 +377,7 @@ def _create_royal_mail_view(
|
|
|
363
377
|
CAST(NULL AS VARCHAR) AS floorlevel,
|
|
364
378
|
CAST(NULL AS DOUBLE) AS lowestfloorlevel,
|
|
365
379
|
CAST(NULL AS DOUBLE) AS highestfloorlevel,
|
|
380
|
+
CAST(NULL AS VARCHAR) AS lowertierlocalauthoritygsscode,
|
|
366
381
|
-- Internal columns for deduplication (not in final output)
|
|
367
382
|
'Royal Mail Address' AS feature_type,
|
|
368
383
|
CAST(NULL AS VARCHAR) AS address_status,
|
|
@@ -398,12 +413,14 @@ def _enrich_with_metadata(con: duckdb.DuckDBPyConnection) -> None:
|
|
|
398
413
|
COALESCE(a.floorlevel, m.floorlevel) AS floorlevel,
|
|
399
414
|
COALESCE(a.lowestfloorlevel, m.lowestfloorlevel) AS lowestfloorlevel,
|
|
400
415
|
COALESCE(a.highestfloorlevel, m.highestfloorlevel) AS highestfloorlevel,
|
|
416
|
+
b.lowertierlocalauthoritygsscode AS lowertierlocalauthoritygsscode,
|
|
401
417
|
-- Internal columns for deduplication
|
|
402
418
|
a.feature_type,
|
|
403
419
|
a.address_status,
|
|
404
420
|
a.build_status
|
|
405
421
|
FROM all_full_addresses a
|
|
406
|
-
LEFT JOIN uprn_metadata_lookup m ON a.uprn = m.uprn
|
|
422
|
+
LEFT JOIN uprn_metadata_lookup m ON a.uprn = m.uprn
|
|
423
|
+
LEFT JOIN builtaddress_ltla_lookup b ON a.uprn = b.uprn;
|
|
407
424
|
"""
|
|
408
425
|
con.execute(sql)
|
|
409
426
|
|
|
@@ -419,11 +436,28 @@ def _create_custom_level_rows(con: duckdb.DuckDBPyConnection) -> None:
|
|
|
419
436
|
dedup priority and never override official address data.
|
|
420
437
|
"""
|
|
421
438
|
sql = """
|
|
422
|
-
INSERT INTO all_full_addresses_enriched
|
|
439
|
+
INSERT INTO all_full_addresses_enriched (
|
|
440
|
+
uprn,
|
|
441
|
+
address_concat,
|
|
442
|
+
postcode,
|
|
443
|
+
filename,
|
|
444
|
+
classificationcode,
|
|
445
|
+
parentuprn,
|
|
446
|
+
rootuprn,
|
|
447
|
+
hierarchylevel,
|
|
448
|
+
floorlevel,
|
|
449
|
+
lowestfloorlevel,
|
|
450
|
+
highestfloorlevel,
|
|
451
|
+
lowertierlocalauthoritygsscode,
|
|
452
|
+
feature_type,
|
|
453
|
+
address_status,
|
|
454
|
+
build_status
|
|
455
|
+
)
|
|
423
456
|
WITH level_parsed AS (
|
|
424
457
|
SELECT
|
|
425
458
|
uprn, address_concat, postcode, filename,
|
|
426
459
|
classificationcode, parentuprn, rootuprn,
|
|
460
|
+
lowertierlocalauthoritygsscode,
|
|
427
461
|
hierarchylevel, floorlevel, lowestfloorlevel, highestfloorlevel,
|
|
428
462
|
address_status, build_status,
|
|
429
463
|
CASE
|
|
@@ -464,6 +498,7 @@ def _create_custom_level_rows(con: duckdb.DuckDBPyConnection) -> None:
|
|
|
464
498
|
floorlevel,
|
|
465
499
|
lowestfloorlevel,
|
|
466
500
|
highestfloorlevel,
|
|
501
|
+
lowertierlocalauthoritygsscode,
|
|
467
502
|
'Custom Level' AS feature_type,
|
|
468
503
|
address_status,
|
|
469
504
|
build_status
|
|
@@ -523,17 +558,14 @@ def _create_dedup_view(con: duckdb.DuckDBPyConnection) -> None:
|
|
|
523
558
|
WHERE feature_type NOT IN ('Non-Addressable Object')
|
|
524
559
|
)
|
|
525
560
|
SELECT
|
|
526
|
-
uprn,
|
|
561
|
+
uprn AS unique_id,
|
|
527
562
|
address_concat,
|
|
528
563
|
postcode,
|
|
529
564
|
filename,
|
|
530
565
|
classificationcode,
|
|
531
566
|
parentuprn,
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
floorlevel,
|
|
535
|
-
lowestfloorlevel,
|
|
536
|
-
highestfloorlevel
|
|
567
|
+
lowertierlocalauthoritygsscode,
|
|
568
|
+
floorlevel
|
|
537
569
|
FROM ranked
|
|
538
570
|
WHERE rn = 1;
|
|
539
571
|
"""
|
|
@@ -12,7 +12,7 @@ SourceType = Literal["ngd", "abp"]
|
|
|
12
12
|
logger = logging.getLogger(__name__)
|
|
13
13
|
|
|
14
14
|
_DEFAULT_SELECT_COLUMNS = [
|
|
15
|
-
"
|
|
15
|
+
"unique_id",
|
|
16
16
|
"address_concat",
|
|
17
17
|
"postcode",
|
|
18
18
|
"source",
|
|
@@ -128,9 +128,9 @@ def get_variant_statistics(
|
|
|
128
128
|
|
|
129
129
|
stats = con.sql(f"""
|
|
130
130
|
WITH variant_counts AS (
|
|
131
|
-
SELECT
|
|
131
|
+
SELECT unique_id, COUNT(*) AS variant_count
|
|
132
132
|
FROM read_parquet('{files_sql}')
|
|
133
|
-
GROUP BY
|
|
133
|
+
GROUP BY unique_id
|
|
134
134
|
)
|
|
135
135
|
SELECT
|
|
136
136
|
COUNT(*) AS total_uprns,
|
|
@@ -179,7 +179,7 @@ def get_random_uprn(
|
|
|
179
179
|
|
|
180
180
|
select_columns = _choose_select_columns(con, files_sql, columns)
|
|
181
181
|
random_uprn = con.sql(f"""
|
|
182
|
-
SELECT DISTINCT
|
|
182
|
+
SELECT DISTINCT unique_id
|
|
183
183
|
FROM read_parquet('{files_sql}')
|
|
184
184
|
ORDER BY RANDOM()
|
|
185
185
|
LIMIT 1
|
|
@@ -192,7 +192,7 @@ def get_random_uprn(
|
|
|
192
192
|
SELECT
|
|
193
193
|
{select_columns}
|
|
194
194
|
FROM read_parquet('{files_sql}')
|
|
195
|
-
WHERE
|
|
195
|
+
WHERE unique_id = {int(random_uprn[0])}
|
|
196
196
|
ORDER BY is_primary DESC NULLS LAST, source NULLS LAST, variant_label NULLS LAST
|
|
197
197
|
""")
|
|
198
198
|
|
|
@@ -220,14 +220,14 @@ def get_random_large_uprn(
|
|
|
220
220
|
|
|
221
221
|
selected = con.sql(f"""
|
|
222
222
|
WITH variant_counts AS (
|
|
223
|
-
SELECT
|
|
223
|
+
SELECT unique_id, COUNT(*) AS variant_count
|
|
224
224
|
FROM read_parquet('{files_sql}')
|
|
225
225
|
{where_filter}
|
|
226
|
-
GROUP BY
|
|
227
|
-
ORDER BY variant_count DESC,
|
|
226
|
+
GROUP BY unique_id
|
|
227
|
+
ORDER BY variant_count DESC, unique_id ASC
|
|
228
228
|
LIMIT {int(top_n)}
|
|
229
229
|
)
|
|
230
|
-
SELECT
|
|
230
|
+
SELECT unique_id
|
|
231
231
|
FROM variant_counts
|
|
232
232
|
ORDER BY RANDOM()
|
|
233
233
|
LIMIT 1
|
|
@@ -240,7 +240,7 @@ def get_random_large_uprn(
|
|
|
240
240
|
SELECT
|
|
241
241
|
{select_columns}
|
|
242
242
|
FROM read_parquet('{files_sql}')
|
|
243
|
-
WHERE
|
|
243
|
+
WHERE unique_id = {int(selected[0])}
|
|
244
244
|
{and_filter}
|
|
245
245
|
ORDER BY is_primary DESC NULLS LAST, source NULLS LAST, variant_label NULLS LAST
|
|
246
246
|
""")
|
|
@@ -269,7 +269,7 @@ def get_uprn_variants(
|
|
|
269
269
|
SELECT
|
|
270
270
|
{select_columns}
|
|
271
271
|
FROM read_parquet('{files_sql}')
|
|
272
|
-
WHERE
|
|
272
|
+
WHERE unique_id = {int(uprn)}
|
|
273
273
|
{and_filter}
|
|
274
274
|
ORDER BY is_primary DESC NULLS LAST, source NULLS LAST, variant_label NULLS LAST
|
|
275
275
|
""")
|
|
@@ -317,10 +317,10 @@ def inspect_flatfile_variants(
|
|
|
317
317
|
WITH data AS (
|
|
318
318
|
SELECT * FROM read_parquet('{files_sql}')
|
|
319
319
|
)
|
|
320
|
-
SELECT
|
|
320
|
+
SELECT unique_id, COUNT(*) AS variant_count
|
|
321
321
|
FROM data
|
|
322
|
-
GROUP BY
|
|
323
|
-
ORDER BY variant_count DESC,
|
|
322
|
+
GROUP BY unique_id
|
|
323
|
+
ORDER BY variant_count DESC, unique_id ASC
|
|
324
324
|
LIMIT 1 OFFSET {top_offset}
|
|
325
325
|
"""
|
|
326
326
|
).fetchone()
|
|
@@ -333,7 +333,7 @@ def inspect_flatfile_variants(
|
|
|
333
333
|
f"""
|
|
334
334
|
SELECT COUNT(*)
|
|
335
335
|
FROM read_parquet('{files_sql}')
|
|
336
|
-
WHERE
|
|
336
|
+
WHERE unique_id = ?
|
|
337
337
|
""",
|
|
338
338
|
[target_uprn],
|
|
339
339
|
).fetchone()
|
|
@@ -343,7 +343,7 @@ def inspect_flatfile_variants(
|
|
|
343
343
|
f"""
|
|
344
344
|
SELECT *
|
|
345
345
|
FROM read_parquet('{files_sql}')
|
|
346
|
-
WHERE
|
|
346
|
+
WHERE unique_id = ?
|
|
347
347
|
ORDER BY 1
|
|
348
348
|
""",
|
|
349
349
|
[target_uprn],
|
|
@@ -358,7 +358,7 @@ def inspect_flatfile_variants(
|
|
|
358
358
|
max_width=10_000
|
|
359
359
|
)
|
|
360
360
|
logger.info("Selected UPRN rows:")
|
|
361
|
-
con.sql(f"SELECT * FROM read_parquet('{files_sql}') WHERE
|
|
361
|
+
con.sql(f"SELECT * FROM read_parquet('{files_sql}') WHERE unique_id = {target_uprn}").show(
|
|
362
362
|
max_width=10_000
|
|
363
363
|
)
|
|
364
364
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/.github/workflows/release-pypi.yml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/tests/data/add_gb_builtaddress.csv
RENAMED
|
File without changes
|
{ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/tests/data/add_gb_builtaddress_altadd.csv
RENAMED
|
File without changes
|
{ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/tests/data/add_gb_historicaddress.csv
RENAMED
|
File without changes
|
{ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/tests/data/add_gb_prebuildaddress.csv
RENAMED
|
File without changes
|
{ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/tests/data/add_gb_royalmailaddress.csv
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/tests/test_extract_source_filtering.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/os_builder/__init__.py
RENAMED
|
File without changes
|
{ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/os_builder/extract.py
RENAMED
|
File without changes
|
{ukam_os_builder-0.1.0.dev5 → ukam_os_builder-0.1.0.dev6}/ukam_os_builder/os_builder/os_hub.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|