ukam-os-builder 0.1.0.dev3__tar.gz → 0.1.0.dev5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ukam_os_builder-0.1.0.dev5/.github/workflows/e2e.yml +147 -0
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/.github/workflows/release-pypi.yml +40 -22
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/PKG-INFO +2 -2
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/README.md +1 -1
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/pyproject.toml +1 -1
- ukam_os_builder-0.1.0.dev5/tests/test_extract_source_filtering.py +49 -0
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/tests/test_settings.py +7 -3
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/tests/test_smoke.py +0 -1
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/__init__.py +1 -1
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/api/api.py +17 -7
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/api/settings.py +9 -19
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/cli.py +1 -3
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/split_raw.py +28 -7
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/transform/stages/combine.py +1 -1
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/transform/stages/lpi.py +4 -6
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/ngd/to_flatfile.py +75 -11
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/os_builder/extract.py +15 -2
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/os_builder/os_hub.py +98 -1
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/uv.lock +1 -1
- ukam_os_builder-0.1.0.dev3/tests/test_extract_source_filtering.py +0 -27
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/.env.example +0 -0
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/.github/workflows/ci.yml +0 -0
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/.gitignore +0 -0
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/AGENTS.md +0 -0
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/config.example.yaml +0 -0
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/prompt.md +0 -0
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/shell/test_release_locally.sh +0 -0
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/tests/data/README.md +0 -0
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/tests/data/add_gb_builtaddress.csv +0 -0
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/tests/data/add_gb_builtaddress_altadd.csv +0 -0
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/tests/data/add_gb_historicaddress.csv +0 -0
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/tests/data/add_gb_prebuildaddress.csv +0 -0
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/tests/data/add_gb_royalmailaddress.csv +0 -0
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/tests/test_api.py +0 -0
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/tests/test_cli.py +0 -0
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/tests/test_cli_errors.py +0 -0
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/tests/test_inspect_results.py +0 -0
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/tests/test_public_api_integration.py +0 -0
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/tests/test_setup_wizard.py +0 -0
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/_exceptions.py +0 -0
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/api/cli_errors.py +0 -0
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/schemas/abp_schema.yaml +0 -0
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/transform/__init__.py +0 -0
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/transform/common.py +0 -0
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/transform/runner.py +0 -0
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/transform/stages/__init__.py +0 -0
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/transform/stages/business.py +0 -0
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/transform/stages/misc.py +0 -0
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/transform/stages/postal.py +0 -0
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/os_builder/__init__.py +0 -0
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/os_builder/inspect_results.py +0 -0
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/os_builder/pipeline_factory.py +0 -0
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/pipeline.py +0 -0
- {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/setup_wizard.py +0 -0
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
name: End-to-end tests
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
pull_request:
|
|
5
|
+
branches: [main]
|
|
6
|
+
|
|
7
|
+
permissions:
|
|
8
|
+
contents: read
|
|
9
|
+
|
|
10
|
+
concurrency:
|
|
11
|
+
group: e2e-${{ github.head_ref || github.ref }}
|
|
12
|
+
cancel-in-progress: true
|
|
13
|
+
|
|
14
|
+
jobs:
|
|
15
|
+
e2e:
|
|
16
|
+
runs-on: ubuntu-latest
|
|
17
|
+
strategy:
|
|
18
|
+
fail-fast: false
|
|
19
|
+
matrix:
|
|
20
|
+
include:
|
|
21
|
+
- source: ngd
|
|
22
|
+
package_id: "18296"
|
|
23
|
+
version_id: "118120"
|
|
24
|
+
- source: abp
|
|
25
|
+
package_id: "0040206240"
|
|
26
|
+
version_id: "6777574"
|
|
27
|
+
|
|
28
|
+
name: E2E – ${{ matrix.source }}
|
|
29
|
+
|
|
30
|
+
steps:
|
|
31
|
+
- uses: actions/checkout@v5
|
|
32
|
+
|
|
33
|
+
- name: Mask API credentials
|
|
34
|
+
run: |
|
|
35
|
+
echo "::add-mask::${{ secrets.OS_PROJECT_API_KEY }}"
|
|
36
|
+
echo "::add-mask::${{ secrets.OS_PROJECT_API_SECRET }}"
|
|
37
|
+
|
|
38
|
+
- name: Set up Python
|
|
39
|
+
uses: actions/setup-python@v5
|
|
40
|
+
with:
|
|
41
|
+
python-version: "3.10"
|
|
42
|
+
|
|
43
|
+
- name: Set up uv
|
|
44
|
+
uses: astral-sh/setup-uv@v7
|
|
45
|
+
with:
|
|
46
|
+
enable-cache: true
|
|
47
|
+
cache-dependency-glob: "uv.lock"
|
|
48
|
+
|
|
49
|
+
- name: Install dependencies
|
|
50
|
+
run: uv sync --all-extras --all-groups
|
|
51
|
+
|
|
52
|
+
- name: Write config.yaml
|
|
53
|
+
run: |
|
|
54
|
+
printf '%s\n' \
|
|
55
|
+
'paths:' \
|
|
56
|
+
' work_dir: ./data' \
|
|
57
|
+
'' \
|
|
58
|
+
'source:' \
|
|
59
|
+
' type: ${{ matrix.source }}' \
|
|
60
|
+
'' \
|
|
61
|
+
'os_downloads:' \
|
|
62
|
+
' package_id: "${{ matrix.package_id }}"' \
|
|
63
|
+
' version_id: "${{ matrix.version_id }}"' \
|
|
64
|
+
'' \
|
|
65
|
+
'processing:' \
|
|
66
|
+
' parquet_compression: zstd' \
|
|
67
|
+
' parquet_compression_level: 9' \
|
|
68
|
+
' num_chunks: 1' \
|
|
69
|
+
> config.yaml
|
|
70
|
+
|
|
71
|
+
- name: Run full pipeline
|
|
72
|
+
env:
|
|
73
|
+
OS_PROJECT_API_KEY: ${{ secrets.OS_PROJECT_API_KEY }}
|
|
74
|
+
OS_PROJECT_API_SECRET: ${{ secrets.OS_PROJECT_API_SECRET }}
|
|
75
|
+
run: uv run ukam-os-build --verbose
|
|
76
|
+
|
|
77
|
+
- name: Verify output files exist
|
|
78
|
+
run: |
|
|
79
|
+
echo "=== Output directory ==="
|
|
80
|
+
ls -lhR data/output/
|
|
81
|
+
echo ""
|
|
82
|
+
echo "=== Checking for parquet files ==="
|
|
83
|
+
count=$(find data/output -name '*.parquet' | wc -l)
|
|
84
|
+
echo "Found $count parquet file(s) in data/output/"
|
|
85
|
+
if [ "$count" -eq 0 ]; then
|
|
86
|
+
echo "::error::No parquet output files found!"
|
|
87
|
+
exit 1
|
|
88
|
+
fi
|
|
89
|
+
|
|
90
|
+
- name: Preview first output row
|
|
91
|
+
run: |
|
|
92
|
+
uv run python -c "
|
|
93
|
+
import duckdb
|
|
94
|
+
con = duckdb.connect()
|
|
95
|
+
con.sql(\"SELECT * FROM read_parquet('data/output/*.parquet')\").show(max_rows=1, max_width=10000)
|
|
96
|
+
"
|
|
97
|
+
|
|
98
|
+
# ── Second run: offline (no API credentials) ──────────────
|
|
99
|
+
- name: Record download file timestamps
|
|
100
|
+
run: |
|
|
101
|
+
stat -c '%n %Y' data/downloads/* | sort > /tmp/downloads_before.txt
|
|
102
|
+
echo "=== Download file timestamps ==="
|
|
103
|
+
cat /tmp/downloads_before.txt
|
|
104
|
+
|
|
105
|
+
- name: Remove everything except downloads and block API access
|
|
106
|
+
run: |
|
|
107
|
+
find data -mindepth 1 -maxdepth 1 ! -name downloads -exec rm -rf {} +
|
|
108
|
+
echo "=== Remaining data tree ==="
|
|
109
|
+
find data -type f | sort
|
|
110
|
+
|
|
111
|
+
- name: Re-run pipeline without API credentials
|
|
112
|
+
run: |
|
|
113
|
+
unset OS_PROJECT_API_KEY OS_PROJECT_API_SECRET
|
|
114
|
+
uv run ukam-os-build --verbose --overwrite
|
|
115
|
+
|
|
116
|
+
- name: Verify output files exist (offline run)
|
|
117
|
+
run: |
|
|
118
|
+
echo "=== Output directory ==="
|
|
119
|
+
ls -lhR data/output/
|
|
120
|
+
echo ""
|
|
121
|
+
echo "=== Checking for parquet files ==="
|
|
122
|
+
count=$(find data/output -name '*.parquet' | wc -l)
|
|
123
|
+
echo "Found $count parquet file(s) in data/output/"
|
|
124
|
+
if [ "$count" -eq 0 ]; then
|
|
125
|
+
echo "::error::No parquet output files found on offline run!"
|
|
126
|
+
exit 1
|
|
127
|
+
fi
|
|
128
|
+
|
|
129
|
+
- name: Preview first output row (offline run)
|
|
130
|
+
run: |
|
|
131
|
+
uv run python -c "
|
|
132
|
+
import duckdb
|
|
133
|
+
con = duckdb.connect()
|
|
134
|
+
con.sql(\"SELECT * FROM read_parquet('data/output/*.parquet')\").show(max_rows=1, max_width=10000)
|
|
135
|
+
"
|
|
136
|
+
|
|
137
|
+
- name: Verify downloads were not modified
|
|
138
|
+
run: |
|
|
139
|
+
stat -c '%n %Y' data/downloads/* | sort > /tmp/downloads_after.txt
|
|
140
|
+
echo "=== Download file timestamps after offline run ==="
|
|
141
|
+
cat /tmp/downloads_after.txt
|
|
142
|
+
if ! diff -q /tmp/downloads_before.txt /tmp/downloads_after.txt; then
|
|
143
|
+
echo "::error::Download file timestamps changed – files were unexpectedly modified!"
|
|
144
|
+
diff /tmp/downloads_before.txt /tmp/downloads_after.txt
|
|
145
|
+
exit 1
|
|
146
|
+
fi
|
|
147
|
+
echo "Download timestamps unchanged – existing archives were reused as expected."
|
{ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/.github/workflows/release-pypi.yml
RENAMED
|
@@ -12,6 +12,7 @@ permissions:
|
|
|
12
12
|
jobs:
|
|
13
13
|
publish:
|
|
14
14
|
runs-on: ubuntu-latest
|
|
15
|
+
environment: pypi
|
|
15
16
|
|
|
16
17
|
# Set up such that PyPI Trusted Publishing (OIDC) can work.
|
|
17
18
|
permissions:
|
|
@@ -51,36 +52,53 @@ jobs:
|
|
|
51
52
|
|
|
52
53
|
core.setOutput('release_sha', tagSha);
|
|
53
54
|
|
|
54
|
-
- name:
|
|
55
|
+
- name: Wait for successful CI build artifact
|
|
55
56
|
id: find_build
|
|
56
57
|
uses: actions/github-script@v7
|
|
57
58
|
with:
|
|
58
59
|
script: |
|
|
59
60
|
const { owner, repo } = context.repo;
|
|
60
61
|
const sha = '${{ steps.main_guard.outputs.release_sha }}';
|
|
61
|
-
|
|
62
|
-
const
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
62
|
+
const maxAttempts = 30; // 30 × 20 s = 10 minutes
|
|
63
|
+
const delayMs = 20_000; // 20 seconds between polls
|
|
64
|
+
|
|
65
|
+
for (let attempt = 1; attempt <= maxAttempts; attempt++) {
|
|
66
|
+
const runs = await github.rest.actions.listWorkflowRuns({
|
|
67
|
+
owner,
|
|
68
|
+
repo,
|
|
69
|
+
workflow_id: 'ci.yml',
|
|
70
|
+
head_sha: sha,
|
|
71
|
+
event: 'push',
|
|
72
|
+
status: 'completed',
|
|
73
|
+
per_page: 50,
|
|
74
|
+
});
|
|
75
|
+
|
|
76
|
+
const success = runs.data.workflow_runs.find(r => r.conclusion === 'success');
|
|
77
|
+
if (success) {
|
|
78
|
+
core.info(`Found successful CI run ${success.id} (${success.html_url})`);
|
|
79
|
+
core.setOutput('run_id', String(success.id));
|
|
80
|
+
return;
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
const failed = runs.data.workflow_runs.find(r => r.conclusion === 'failure');
|
|
84
|
+
if (failed) {
|
|
85
|
+
core.setFailed(
|
|
86
|
+
`CI run ${failed.id} failed for commit ${sha}. ` +
|
|
87
|
+
'Fix CI before releasing.'
|
|
88
|
+
);
|
|
89
|
+
return;
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
if (attempt < maxAttempts) {
|
|
93
|
+
core.info(`Attempt ${attempt}/${maxAttempts}: CI not finished yet — waiting ${delayMs / 1000}s …`);
|
|
94
|
+
await new Promise(r => setTimeout(r, delayMs));
|
|
95
|
+
}
|
|
80
96
|
}
|
|
81
97
|
|
|
82
|
-
core.
|
|
83
|
-
|
|
98
|
+
core.setFailed(
|
|
99
|
+
`No successful CI run found for commit ${sha} after ${maxAttempts} attempts (≈10 min). ` +
|
|
100
|
+
'Check whether the CI workflow was triggered for this commit.'
|
|
101
|
+
);
|
|
84
102
|
|
|
85
103
|
- name: Download built dist artifact
|
|
86
104
|
uses: actions/download-artifact@v4
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ukam-os-builder
|
|
3
|
-
Version: 0.1.0.dev3
|
|
3
|
+
Version: 0.1.0.dev5
|
|
4
4
|
Summary: Download, process and transform OS address data (NGD or ABP) for UK address matching
|
|
5
5
|
Project-URL: Homepage, https://github.com/moj-analytical-services/prepare_ngd_for_address_matching
|
|
6
6
|
Project-URL: Repository, https://github.com/moj-analytical-services/prepare_ngd_for_address_matching
|
|
@@ -183,7 +183,7 @@ ukam-os-build --config config.yaml
|
|
|
183
183
|
|
|
184
184
|
1. `download` - fetch package metadata and zip files from OS Data Hub.
|
|
185
185
|
2. `extract` - extract CSVs from downloaded zip files and convert to parquet.
|
|
186
|
-
3. `split` - ABP only: split raw records
|
|
186
|
+
3. `split` - ABP only: split raw records and write only parquet staging files used by flatfile generation (`street_descriptor`, `blpu`, `lpi`, `delivery_point`, `organisation`, `classification`).
|
|
187
187
|
4. `flatfile` - transform and deduplicate into final output parquet file(s).
|
|
188
188
|
|
|
189
189
|
All stages are idempotent. Use `--overwrite` to regenerate outputs (`--force` is accepted as a backward-compatible alias).
|
|
@@ -157,7 +157,7 @@ ukam-os-build --config config.yaml
|
|
|
157
157
|
|
|
158
158
|
1. `download` - fetch package metadata and zip files from OS Data Hub.
|
|
159
159
|
2. `extract` - extract CSVs from downloaded zip files and convert to parquet.
|
|
160
|
-
3. `split` - ABP only: split raw records
|
|
160
|
+
3. `split` - ABP only: split raw records and write only parquet staging files used by flatfile generation (`street_descriptor`, `blpu`, `lpi`, `delivery_point`, `organisation`, `classification`).
|
|
161
161
|
4. `flatfile` - transform and deduplicate into final output parquet file(s).
|
|
162
162
|
|
|
163
163
|
All stages are idempotent. Use `--overwrite` to regenerate outputs (`--force` is accepted as a backward-compatible alias).
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from ukam_os_builder.os_builder.extract import (
|
|
6
|
+
_filter_zips_for_source,
|
|
7
|
+
_should_convert_csv_to_parquet,
|
|
8
|
+
)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def test_filter_zips_for_source_prefers_ngd_named_zips() -> None:
|
|
12
|
+
zip_files = [
|
|
13
|
+
Path("add_gb_builtaddress.zip"),
|
|
14
|
+
Path("AddressBasePremium_FULL_2025-12-15_002.zip"),
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
filtered = _filter_zips_for_source(zip_files, "ngd")
|
|
18
|
+
|
|
19
|
+
assert filtered == [Path("add_gb_builtaddress.zip")]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def test_should_convert_csv_to_parquet_skips_non_ngd_for_ngd_source() -> None:
|
|
23
|
+
ngd_csv = Path("add_gb_builtaddress.csv")
|
|
24
|
+
abp_csv = Path("AddressBasePremium_FULL_2025-12-15_002.csv")
|
|
25
|
+
|
|
26
|
+
assert _should_convert_csv_to_parquet(ngd_csv, "ngd") is True
|
|
27
|
+
assert _should_convert_csv_to_parquet(abp_csv, "ngd") is False
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def test_filter_zips_for_source_excludes_ngd_historicaddress() -> None:
|
|
31
|
+
zip_files = [
|
|
32
|
+
Path("add_gb_builtaddress.zip"),
|
|
33
|
+
Path("add_gb_historicaddress.zip"),
|
|
34
|
+
Path("add_gb_historicaddress_altadd.zip"),
|
|
35
|
+
Path("add_gb_prebuildaddress.zip"),
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
filtered = _filter_zips_for_source(zip_files, "ngd")
|
|
39
|
+
|
|
40
|
+
assert Path("add_gb_builtaddress.zip") in filtered
|
|
41
|
+
assert Path("add_gb_prebuildaddress.zip") in filtered
|
|
42
|
+
assert Path("add_gb_historicaddress.zip") not in filtered
|
|
43
|
+
assert Path("add_gb_historicaddress_altadd.zip") not in filtered
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def test_should_convert_csv_to_parquet_skips_ngd_historicaddress() -> None:
|
|
47
|
+
assert _should_convert_csv_to_parquet(Path("add_gb_builtaddress.csv"), "ngd") is True
|
|
48
|
+
assert _should_convert_csv_to_parquet(Path("add_gb_historicaddress.csv"), "ngd") is False
|
|
49
|
+
assert _should_convert_csv_to_parquet(Path("add_gb_historicaddress_altadd.csv"), "ngd") is False
|
|
@@ -122,7 +122,9 @@ def test_load_settings_uses_work_dir_for_default_subpaths(
|
|
|
122
122
|
assert settings.paths.output_dir == (tmp_path / "custom_data/output").resolve()
|
|
123
123
|
|
|
124
124
|
|
|
125
|
-
def test_load_settings_requires_env_vars(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
|
|
125
|
+
def test_load_settings_allows_missing_env_vars(
|
|
126
|
+
tmp_path: Path, monkeypatch: pytest.MonkeyPatch
|
|
127
|
+
) -> None:
|
|
126
128
|
monkeypatch.delenv("OS_PROJECT_API_KEY", raising=False)
|
|
127
129
|
monkeypatch.delenv("OS_PROJECT_API_SECRET", raising=False)
|
|
128
130
|
|
|
@@ -139,8 +141,10 @@ def test_load_settings_requires_env_vars(tmp_path: Path, monkeypatch: pytest.Mon
|
|
|
139
141
|
""",
|
|
140
142
|
)
|
|
141
143
|
|
|
142
|
-
|
|
143
|
-
|
|
144
|
+
settings = load_settings(config_path, load_env=False)
|
|
145
|
+
|
|
146
|
+
assert settings.os_downloads.api_key is None
|
|
147
|
+
assert settings.os_downloads.api_secret is None
|
|
144
148
|
|
|
145
149
|
|
|
146
150
|
def test_load_settings_validates_positive_read_timeout(
|
|
@@ -8,7 +8,7 @@ from typing import Any, Literal
|
|
|
8
8
|
import yaml
|
|
9
9
|
|
|
10
10
|
from ukam_os_builder.api.settings import Settings, SettingsError, load_settings
|
|
11
|
-
from ukam_os_builder.os_builder.os_hub import get_package_version
|
|
11
|
+
from ukam_os_builder.os_builder.os_hub import _get_manifest_path, get_package_version
|
|
12
12
|
from ukam_os_builder.pipeline import run as run_pipeline
|
|
13
13
|
from ukam_os_builder.pipeline import supported_steps_for_source
|
|
14
14
|
|
|
@@ -333,11 +333,6 @@ def run_from_config(
|
|
|
333
333
|
parquet_compression_level=parquet_compression_level,
|
|
334
334
|
)
|
|
335
335
|
logger.info("Resolved work_dir: %s", settings.paths.work_dir)
|
|
336
|
-
logger.info("Resolved downloads_dir: %s", settings.paths.downloads_dir)
|
|
337
|
-
logger.info("Resolved extracted_dir: %s", settings.paths.extracted_dir)
|
|
338
|
-
logger.info("Resolved parquet_dir: %s", settings.paths.parquet_dir)
|
|
339
|
-
logger.info("Resolved output_dir: %s", settings.paths.output_dir)
|
|
340
|
-
|
|
341
336
|
source_type = settings.source.type
|
|
342
337
|
if step != "all":
|
|
343
338
|
supported_steps = supported_steps_for_source(source_type)
|
|
@@ -347,9 +342,24 @@ def run_from_config(
|
|
|
347
342
|
f"--step {step} is not valid for source {source_type}. Valid steps: {valid_steps}"
|
|
348
343
|
)
|
|
349
344
|
|
|
350
|
-
|
|
345
|
+
has_api_key = bool(os.environ.get("OS_PROJECT_API_KEY"))
|
|
346
|
+
if check_api and has_api_key:
|
|
351
347
|
get_package_version(settings)
|
|
352
348
|
|
|
353
349
|
overwrite_effective = overwrite if overwrite is not None else bool(force)
|
|
354
350
|
run_pipeline(step=step, settings=settings, force=overwrite_effective, list_only=list_only)
|
|
351
|
+
|
|
352
|
+
logger.info(
|
|
353
|
+
"✅ Pipeline run completed\n\n"
|
|
354
|
+
"Where you need to look:\n"
|
|
355
|
+
" • downloads_dir (raw OS Hub extracts): %s%s\n"
|
|
356
|
+
" • output_dir (final files for address matcher): %s%s\n",
|
|
357
|
+
str(settings.paths.downloads_dir),
|
|
358
|
+
"",
|
|
359
|
+
str(settings.paths.output_dir),
|
|
360
|
+
"",
|
|
361
|
+
)
|
|
362
|
+
|
|
363
|
+
_get_manifest_path(settings)
|
|
364
|
+
|
|
355
365
|
return settings
|
|
@@ -41,8 +41,8 @@ class OSDownloadSettings(StrictBaseModel):
|
|
|
41
41
|
|
|
42
42
|
package_id: str
|
|
43
43
|
version_id: str
|
|
44
|
-
api_key: SecretStr
|
|
45
|
-
api_secret: SecretStr
|
|
44
|
+
api_key: SecretStr | None = None
|
|
45
|
+
api_secret: SecretStr | None = None
|
|
46
46
|
connect_timeout_seconds: int = 30
|
|
47
47
|
read_timeout_seconds: int = 300
|
|
48
48
|
|
|
@@ -57,6 +57,8 @@ class OSDownloadSettings(StrictBaseModel):
|
|
|
57
57
|
@field_validator("api_key", "api_secret", mode="before")
|
|
58
58
|
@classmethod
|
|
59
59
|
def _validate_secret(cls, value: Any) -> Any:
|
|
60
|
+
if value is None:
|
|
61
|
+
return value
|
|
60
62
|
if isinstance(value, str) and not value.strip():
|
|
61
63
|
raise ValueError("must be non-empty")
|
|
62
64
|
return value
|
|
@@ -182,22 +184,11 @@ def _load_yaml(config_path: Path) -> dict[str, Any]:
|
|
|
182
184
|
return config
|
|
183
185
|
|
|
184
186
|
|
|
185
|
-
def
|
|
186
|
-
"""
|
|
187
|
+
def _load_env_vars() -> tuple[str | None, str | None]:
|
|
188
|
+
"""Load API credentials from environment variables if available."""
|
|
187
189
|
api_key = os.environ.get("OS_PROJECT_API_KEY")
|
|
188
190
|
api_secret = os.environ.get("OS_PROJECT_API_SECRET")
|
|
189
191
|
|
|
190
|
-
if not api_key:
|
|
191
|
-
raise SettingsError(
|
|
192
|
-
"OS_PROJECT_API_KEY not found in environment. "
|
|
193
|
-
"Create a .env file with OS_PROJECT_API_KEY=<your-key>"
|
|
194
|
-
)
|
|
195
|
-
if not api_secret:
|
|
196
|
-
raise SettingsError(
|
|
197
|
-
"OS_PROJECT_API_SECRET not found in environment. "
|
|
198
|
-
"Create a .env file with OS_PROJECT_API_SECRET=<your-secret>"
|
|
199
|
-
)
|
|
200
|
-
|
|
201
192
|
return api_key, api_secret
|
|
202
193
|
|
|
203
194
|
|
|
@@ -216,8 +207,7 @@ def load_settings(
|
|
|
216
207
|
Complete Settings object with resolved paths.
|
|
217
208
|
|
|
218
209
|
Raises:
|
|
219
|
-
SettingsError: If config file is missing or invalid
|
|
220
|
-
or if required environment variables are not set.
|
|
210
|
+
SettingsError: If config file is missing or invalid.
|
|
221
211
|
"""
|
|
222
212
|
config_path = Path(config_path).resolve()
|
|
223
213
|
base_dir = config_path.parent
|
|
@@ -232,8 +222,8 @@ def load_settings(
|
|
|
232
222
|
# Load YAML config
|
|
233
223
|
config = _load_yaml(config_path)
|
|
234
224
|
|
|
235
|
-
#
|
|
236
|
-
api_key, api_secret =
|
|
225
|
+
# Load environment variables (optional)
|
|
226
|
+
api_key, api_secret = _load_env_vars()
|
|
237
227
|
|
|
238
228
|
resolved_paths = resolve_paths(config=config, config_dir=base_dir)
|
|
239
229
|
|
|
@@ -122,7 +122,7 @@ def main(argv: list[str] | None = None) -> int:
|
|
|
122
122
|
config_path = Path(args.config).resolve()
|
|
123
123
|
console.print(f"[green]✓[/green] Loaded config: [bold]{config_path}[/bold]")
|
|
124
124
|
console.print(f"[cyan]Step:[/cyan] {args.step}")
|
|
125
|
-
console.print("[cyan]
|
|
125
|
+
console.print("[cyan]Starting pipeline...[/cyan]")
|
|
126
126
|
|
|
127
127
|
run_from_config(
|
|
128
128
|
config_path=config_path,
|
|
@@ -145,8 +145,6 @@ def main(argv: list[str] | None = None) -> int:
|
|
|
145
145
|
parquet_compression=args.parquet_compression,
|
|
146
146
|
parquet_compression_level=args.parquet_compression_level,
|
|
147
147
|
)
|
|
148
|
-
logger.info("Pipeline run completed")
|
|
149
|
-
console.print("[green]✓[/green] API connectivity check passed")
|
|
150
148
|
console.print("[bold green]Build completed successfully[/bold green]")
|
|
151
149
|
return 0
|
|
152
150
|
except (SettingsError, ValueError) as exc:
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
"""Split raw ABP data module.
|
|
2
2
|
|
|
3
3
|
Reads raw ABP CSV files (which contain all record types mixed together),
|
|
4
|
-
|
|
5
|
-
|
|
4
|
+
filters to the record types needed for flatfile creation, and writes
|
|
5
|
+
one parquet file per required record type.
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
8
|
from __future__ import annotations
|
|
@@ -19,8 +19,8 @@ from ukam_os_builder.api.settings import Settings, create_duckdb_connection
|
|
|
19
19
|
|
|
20
20
|
logger = logging.getLogger(__name__)
|
|
21
21
|
|
|
22
|
-
#
|
|
23
|
-
|
|
22
|
+
# All known ABP record identifiers
|
|
23
|
+
ALL_RECORD_TYPE_MAP = {
|
|
24
24
|
"10": "header",
|
|
25
25
|
"11": "street",
|
|
26
26
|
"15": "street_descriptor",
|
|
@@ -35,6 +35,16 @@ RECORD_TYPE_MAP = {
|
|
|
35
35
|
"99": "trailer",
|
|
36
36
|
}
|
|
37
37
|
|
|
38
|
+
# Record identifiers needed for ABP flatfile creation
|
|
39
|
+
RECORD_TYPE_MAP = {
|
|
40
|
+
"15": "street_descriptor",
|
|
41
|
+
"21": "blpu",
|
|
42
|
+
"24": "lpi",
|
|
43
|
+
"28": "delivery_point",
|
|
44
|
+
"31": "organisation",
|
|
45
|
+
"32": "classification",
|
|
46
|
+
}
|
|
47
|
+
|
|
38
48
|
DEFAULT_SCHEMA_PATH = Path(__file__).resolve().parent / "schemas" / "abp_schema.yaml"
|
|
39
49
|
|
|
40
50
|
|
|
@@ -169,12 +179,23 @@ def split_raw_to_parquet(
|
|
|
169
179
|
input_counts[name] = count
|
|
170
180
|
logger.debug("Record type %s (%s): %d lines", rid, name, count)
|
|
171
181
|
|
|
182
|
+
unused_rids = sorted(set(ALL_RECORD_TYPE_MAP) - set(RECORD_TYPE_MAP))
|
|
183
|
+
rid_list_sql = ", ".join([f"'{rid}'" for rid in unused_rids])
|
|
184
|
+
ignored_input = con.execute(f"""
|
|
185
|
+
SELECT COUNT(*)
|
|
186
|
+
FROM lines_with_rid
|
|
187
|
+
WHERE rid IN ({rid_list_sql})
|
|
188
|
+
""").fetchone()[0]
|
|
189
|
+
|
|
172
190
|
total_input = sum(input_counts.values())
|
|
173
|
-
logger.info("Total input lines (
|
|
191
|
+
logger.info("Total input lines (processed record IDs): %d", total_input)
|
|
192
|
+
if ignored_input > 0:
|
|
193
|
+
logger.info("Ignored input lines (unused record IDs): %d", ignored_input)
|
|
174
194
|
if total_input == 0:
|
|
175
195
|
raise ValueError(
|
|
176
196
|
"No ABP record identifiers found in extracted CSV input. "
|
|
177
|
-
"Ensure --source abp is used with ABP raw extracts
|
|
197
|
+
"Ensure --source abp is used with ABP raw extracts "
|
|
198
|
+
"(required record IDs: 15/21/24/28/31/32)."
|
|
178
199
|
)
|
|
179
200
|
|
|
180
201
|
# 4) Process each record type
|
|
@@ -279,7 +300,7 @@ def split_raw_to_parquet(
|
|
|
279
300
|
total_output = sum(output_counts.values())
|
|
280
301
|
logger.info("")
|
|
281
302
|
logger.info("=== Validation: Line count check ===")
|
|
282
|
-
logger.info("Input lines (
|
|
303
|
+
logger.info("Input lines (processed record IDs): %d", total_input)
|
|
283
304
|
logger.info("Output rows (parquet): %d", total_output)
|
|
284
305
|
|
|
285
306
|
if total_input == total_output:
|
|
@@ -33,7 +33,7 @@ def combine_and_dedupe(con: duckdb.DuckDBPyConnection) -> duckdb.DuckDBPyRelatio
|
|
|
33
33
|
),
|
|
34
34
|
ranked AS (
|
|
35
35
|
SELECT *,
|
|
36
|
-
CASE logical_status WHEN 1 THEN 0 WHEN 3 THEN 1 WHEN 6 THEN 2 WHEN 8 THEN 3 ELSE 9 END AS status_rank,
|
|
36
|
+
CASE logical_status WHEN 1 THEN 0 WHEN 3 THEN 1 WHEN 6 THEN 2 ELSE 9 END AS status_rank,
|
|
37
37
|
CASE source WHEN 'LPI' THEN 0 WHEN 'ORGANISATION' THEN 1 WHEN 'DELIVERY_POINT' THEN 2 WHEN 'CUSTOM_LEVEL' THEN 3 ELSE 4 END AS source_rank
|
|
38
38
|
FROM normalized
|
|
39
39
|
),
|
|
@@ -70,15 +70,15 @@ matching messy user input. We output variants based on **Logical Status**:
|
|
|
70
70
|
locally known as "Rose Cottage").
|
|
71
71
|
3. **Provisional (6):** The address assigned during planning/construction, which
|
|
72
72
|
might change before the house is built.
|
|
73
|
-
|
|
74
|
-
|
|
73
|
+
|
|
74
|
+
Historic addresses (logical_status=8) are excluded from output.
|
|
75
75
|
|
|
76
76
|
------------------------------------------------------------------------------
|
|
77
77
|
Key Columns Explained
|
|
78
78
|
------------------------------------------------------------------------------
|
|
79
79
|
* `uprn`: The "Golden Key". Use this to link this address to other data.
|
|
80
80
|
* `base_address`: The constructed full address string.
|
|
81
|
-
* `logical_status`: 1=Current, 6=Provisional
|
|
81
|
+
* `logical_status`: 1=Current, 6=Provisional.
|
|
82
82
|
* `official_flag`: 'Y' indicates this is the "official" version, 'N' suggests
|
|
83
83
|
it might be an unofficial alias.
|
|
84
84
|
* `language`: 'ENG' (English) or 'CYM' (Welsh). Streets in Wales often have
|
|
@@ -183,7 +183,6 @@ def prepare_lpi_base(con: duckdb.DuckDBPyConnection) -> None:
|
|
|
183
183
|
WHEN 1 THEN 0
|
|
184
184
|
WHEN 3 THEN 1
|
|
185
185
|
WHEN 6 THEN 2
|
|
186
|
-
WHEN 8 THEN 3
|
|
187
186
|
ELSE 9
|
|
188
187
|
END AS status_rank
|
|
189
188
|
FROM lpi l
|
|
@@ -192,7 +191,7 @@ def prepare_lpi_base(con: duckdb.DuckDBPyConnection) -> None:
|
|
|
192
191
|
LEFT JOIN _sd_best_by_lang sd_lang ON sd_lang.usrn = l.usrn AND sd_lang.language = l.language
|
|
193
192
|
LEFT JOIN _sd_best_any sd_any ON sd_any.usrn = l.usrn
|
|
194
193
|
WHERE (b.addressbase_postal != 'N' OR b.addressbase_postal IS NULL)
|
|
195
|
-
AND l.logical_status IN (1, 3, 6, 8)
|
|
194
|
+
AND l.logical_status IN (1, 3, 6)
|
|
196
195
|
""")
|
|
197
196
|
|
|
198
197
|
# Deduplicated distinct addresses
|
|
@@ -266,7 +265,6 @@ def render_variants(con: duckdb.DuckDBPyConnection) -> None:
|
|
|
266
265
|
WHEN 1 THEN 'APPROVED'
|
|
267
266
|
WHEN 3 THEN 'ALTERNATIVE'
|
|
268
267
|
WHEN 6 THEN 'PROVISIONAL'
|
|
269
|
-
WHEN 8 THEN 'HISTORICAL'
|
|
270
268
|
END AS variant_label,
|
|
271
269
|
(logical_status = 1) AS is_primary
|
|
272
270
|
FROM lpi_base_distinct
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
Transforms the extracted parquet files into a single flatfile suitable for
|
|
4
4
|
UK address matching. This includes:
|
|
5
|
-
- Processing core feature types (Built Address,
|
|
5
|
+
- Processing core feature types (Built Address, Pre-Build Address, etc.)
|
|
6
6
|
- Processing alternate address records
|
|
7
7
|
- Processing Royal Mail addresses
|
|
8
8
|
- Handling Welsh language variants
|
|
@@ -27,8 +27,6 @@ logger = logging.getLogger(__name__)
|
|
|
27
27
|
FEATURE_TYPE_BY_STEM = {
|
|
28
28
|
"add_gb_builtaddress": "Built Address",
|
|
29
29
|
"add_gb_builtaddress_altadd": "Built Address",
|
|
30
|
-
"add_gb_historicaddress": "Historic Address",
|
|
31
|
-
"add_gb_historicaddress_altadd": "Historic Address",
|
|
32
30
|
"add_gb_nonaddressableobject": "Non-Addressable Object",
|
|
33
31
|
"add_gb_nonaddressableobject_altadd": "Non-Addressable Object",
|
|
34
32
|
"add_gb_prebuildaddress": "Pre-Build Address",
|
|
@@ -39,7 +37,6 @@ FEATURE_TYPE_BY_STEM = {
|
|
|
39
37
|
# Core feature stems (contain fulladdress and classification fields)
|
|
40
38
|
CORE_FEATURE_STEMS = {
|
|
41
39
|
"add_gb_builtaddress",
|
|
42
|
-
"add_gb_historicaddress",
|
|
43
40
|
"add_gb_nonaddressableobject",
|
|
44
41
|
"add_gb_prebuildaddress",
|
|
45
42
|
}
|
|
@@ -47,7 +44,6 @@ CORE_FEATURE_STEMS = {
|
|
|
47
44
|
# Alternate address stems (no classification fields)
|
|
48
45
|
ALTADD_STEMS = {
|
|
49
46
|
"add_gb_builtaddress_altadd",
|
|
50
|
-
"add_gb_historicaddress_altadd",
|
|
51
47
|
"add_gb_nonaddressableobject_altadd",
|
|
52
48
|
"add_gb_prebuildaddress_altadd",
|
|
53
49
|
}
|
|
@@ -57,7 +53,6 @@ CORE_FEATURE_PRIORITY = {
|
|
|
57
53
|
"add_gb_builtaddress": 1,
|
|
58
54
|
"add_gb_prebuildaddress": 2,
|
|
59
55
|
"add_gb_nonaddressableobject": 3,
|
|
60
|
-
"add_gb_historicaddress": 4,
|
|
61
56
|
}
|
|
62
57
|
|
|
63
58
|
|
|
@@ -71,7 +66,7 @@ def _create_metadata_lookup_view(
|
|
|
71
66
|
This view is used to enrich Royal Mail and alternate address records
|
|
72
67
|
with metadata (classificationcode, parentuprn, etc.) by UPRN lookup.
|
|
73
68
|
|
|
74
|
-
Uses priority ranking (Built > Pre-Build > Non-Addressable
|
|
69
|
+
Uses priority ranking (Built > Pre-Build > Non-Addressable)
|
|
75
70
|
to dedupe when a UPRN exists in multiple core files.
|
|
76
71
|
|
|
77
72
|
Args:
|
|
@@ -156,7 +151,7 @@ def _create_core_feature_view(
|
|
|
156
151
|
parquet_path: Path,
|
|
157
152
|
uprn_predicate: str | None = None,
|
|
158
153
|
) -> None:
|
|
159
|
-
"""Create view for core feature types (Built,
|
|
154
|
+
"""Create view for core feature types (Built, Pre-Build, Non-Addressable).
|
|
160
155
|
|
|
161
156
|
These tables have fulladdress, classification fields, and Welsh language columns.
|
|
162
157
|
Produces both English and Welsh (where available) address records.
|
|
@@ -413,11 +408,76 @@ def _enrich_with_metadata(con: duckdb.DuckDBPyConnection) -> None:
|
|
|
413
408
|
con.execute(sql)
|
|
414
409
|
|
|
415
410
|
|
|
411
|
+
def _create_custom_level_rows(con: duckdb.DuckDBPyConnection) -> None:
|
|
412
|
+
"""Generate custom level-based address variants and insert into enriched table.
|
|
413
|
+
|
|
414
|
+
Parses the ``floorlevel`` column (VARCHAR) from the enriched address table,
|
|
415
|
+
maps integer floor levels to words (-1=BASEMENT … 6=SIXTH), and prepends the
|
|
416
|
+
word to the existing ``address_concat`` to create additional address variants.
|
|
417
|
+
|
|
418
|
+
These rows use ``feature_type='Custom Level'`` so they receive the lowest
|
|
419
|
+
dedup priority and never override official address data.
|
|
420
|
+
"""
|
|
421
|
+
sql = """
|
|
422
|
+
INSERT INTO all_full_addresses_enriched
|
|
423
|
+
WITH level_parsed AS (
|
|
424
|
+
SELECT
|
|
425
|
+
uprn, address_concat, postcode, filename,
|
|
426
|
+
classificationcode, parentuprn, rootuprn,
|
|
427
|
+
hierarchylevel, floorlevel, lowestfloorlevel, highestfloorlevel,
|
|
428
|
+
address_status, build_status,
|
|
429
|
+
CASE
|
|
430
|
+
WHEN split_part(floorlevel, ',', 1) ~ '^-?[0-9]+$'
|
|
431
|
+
THEN CAST(split_part(floorlevel, ',', 1) AS INTEGER)
|
|
432
|
+
ELSE NULL
|
|
433
|
+
END AS level_int
|
|
434
|
+
FROM all_full_addresses_enriched
|
|
435
|
+
WHERE floorlevel IS NOT NULL
|
|
436
|
+
AND address_concat IS NOT NULL
|
|
437
|
+
AND address_concat <> ''
|
|
438
|
+
),
|
|
439
|
+
level_words AS (
|
|
440
|
+
SELECT
|
|
441
|
+
*,
|
|
442
|
+
CASE level_int
|
|
443
|
+
WHEN -1 THEN 'BASEMENT'
|
|
444
|
+
WHEN 0 THEN 'GROUND'
|
|
445
|
+
WHEN 1 THEN 'FIRST'
|
|
446
|
+
WHEN 2 THEN 'SECOND'
|
|
447
|
+
WHEN 3 THEN 'THIRD'
|
|
448
|
+
WHEN 4 THEN 'FOURTH'
|
|
449
|
+
WHEN 5 THEN 'FIFTH'
|
|
450
|
+
WHEN 6 THEN 'SIXTH'
|
|
451
|
+
END AS level_word
|
|
452
|
+
FROM level_parsed
|
|
453
|
+
WHERE level_int BETWEEN -1 AND 6
|
|
454
|
+
)
|
|
455
|
+
SELECT
|
|
456
|
+
uprn,
|
|
457
|
+
TRIM(concat(level_word, ' ', address_concat)) AS address_concat,
|
|
458
|
+
postcode,
|
|
459
|
+
'CUSTOM_LEVEL' AS filename,
|
|
460
|
+
classificationcode,
|
|
461
|
+
parentuprn,
|
|
462
|
+
rootuprn,
|
|
463
|
+
hierarchylevel,
|
|
464
|
+
floorlevel,
|
|
465
|
+
lowestfloorlevel,
|
|
466
|
+
highestfloorlevel,
|
|
467
|
+
'Custom Level' AS feature_type,
|
|
468
|
+
address_status,
|
|
469
|
+
build_status
|
|
470
|
+
FROM level_words
|
|
471
|
+
WHERE level_word IS NOT NULL;
|
|
472
|
+
"""
|
|
473
|
+
con.execute(sql)
|
|
474
|
+
|
|
475
|
+
|
|
416
476
|
def _create_dedup_view(con: duckdb.DuckDBPyConnection) -> None:
|
|
417
477
|
"""Create deduplicated view of all addresses.
|
|
418
478
|
|
|
419
479
|
Priority rules for deduplication:
|
|
420
|
-
- Feature type: Built Address -> Pre-Build -> Royal Mail ->
|
|
480
|
+
- Feature type: Built Address -> Pre-Build -> Royal Mail -> Non-Addressable
|
|
421
481
|
- Address status: Approved -> Provisional -> Alternative -> Historical
|
|
422
482
|
- Build status: Built Complete -> Under Construction -> Prebuild -> Historic -> Demolished
|
|
423
483
|
|
|
@@ -433,8 +493,8 @@ def _create_dedup_view(con: duckdb.DuckDBPyConnection) -> None:
|
|
|
433
493
|
WHEN 'Built Address' THEN 1
|
|
434
494
|
WHEN 'Pre-Build Address' THEN 2
|
|
435
495
|
WHEN 'Royal Mail Address' THEN 3
|
|
436
|
-
WHEN 'Historic Address' THEN 4
|
|
437
496
|
WHEN 'Non-Addressable Object' THEN 5
|
|
497
|
+
WHEN 'Custom Level' THEN 6
|
|
438
498
|
ELSE 9
|
|
439
499
|
END AS feature_type_rank,
|
|
440
500
|
CASE
|
|
@@ -460,7 +520,7 @@ def _create_dedup_view(con: duckdb.DuckDBPyConnection) -> None:
|
|
|
460
520
|
build_status_rank
|
|
461
521
|
) AS rn
|
|
462
522
|
FROM all_full_addresses_enriched
|
|
463
|
-
WHERE feature_type
|
|
523
|
+
WHERE feature_type NOT IN ('Non-Addressable Object')
|
|
464
524
|
)
|
|
465
525
|
SELECT
|
|
466
526
|
uprn,
|
|
@@ -641,6 +701,10 @@ def run_flatfile_step(settings: Settings, force: bool = False) -> list[Path]:
|
|
|
641
701
|
logger.info("Enriching addresses with metadata from core files...")
|
|
642
702
|
_enrich_with_metadata(con)
|
|
643
703
|
|
|
704
|
+
# Generate custom level variants
|
|
705
|
+
logger.info("Generating custom level address variants...")
|
|
706
|
+
_create_custom_level_rows(con)
|
|
707
|
+
|
|
644
708
|
# Create deduplicated view
|
|
645
709
|
logger.info("Creating deduplicated view...")
|
|
646
710
|
_create_dedup_view(con)
|
{ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/os_builder/extract.py
RENAMED
|
@@ -11,6 +11,9 @@ from ukam_os_builder.api.settings import Settings
|
|
|
11
11
|
|
|
12
12
|
logger = logging.getLogger(__name__)
|
|
13
13
|
|
|
14
|
+
# NGD file stems to exclude (historic addresses are not used in output)
|
|
15
|
+
_NGD_EXCLUDED_STEMS = {"historicaddress"}
|
|
16
|
+
|
|
14
17
|
|
|
15
18
|
def find_downloaded_zips(downloads_dir: Path) -> list[Path]:
|
|
16
19
|
"""Find all downloaded zip files in a directory."""
|
|
@@ -22,11 +25,20 @@ def find_downloaded_zips(downloads_dir: Path) -> list[Path]:
|
|
|
22
25
|
return zip_files
|
|
23
26
|
|
|
24
27
|
|
|
28
|
+
def _is_excluded_ngd_file(name: str) -> bool:
|
|
29
|
+
"""Return True if *name* matches an excluded NGD stem (e.g. historicaddress)."""
|
|
30
|
+
name_lower = name.lower()
|
|
31
|
+
return any(stem in name_lower for stem in _NGD_EXCLUDED_STEMS)
|
|
32
|
+
|
|
33
|
+
|
|
25
34
|
def _filter_zips_for_source(zip_files: list[Path], source: str) -> list[Path]:
|
|
26
35
|
source_lower = source.lower()
|
|
27
36
|
if source_lower == "ngd":
|
|
28
37
|
ngd_zips = [
|
|
29
|
-
zip_path
|
|
38
|
+
zip_path
|
|
39
|
+
for zip_path in zip_files
|
|
40
|
+
if zip_path.name.lower().startswith("add_gb_")
|
|
41
|
+
and not _is_excluded_ngd_file(zip_path.name)
|
|
30
42
|
]
|
|
31
43
|
return ngd_zips or zip_files
|
|
32
44
|
if source_lower == "abp":
|
|
@@ -39,7 +51,8 @@ def _filter_zips_for_source(zip_files: list[Path], source: str) -> list[Path]:
|
|
|
39
51
|
|
|
40
52
|
def _should_convert_csv_to_parquet(csv_path: Path, source: str) -> bool:
|
|
41
53
|
if source.lower() == "ngd":
|
|
42
|
-
|
|
54
|
+
name_lower = csv_path.name.lower()
|
|
55
|
+
return name_lower.startswith("add_gb_") and not _is_excluded_ngd_file(name_lower)
|
|
43
56
|
return True
|
|
44
57
|
|
|
45
58
|
|
{ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/os_builder/os_hub.py
RENAMED
|
@@ -9,9 +9,25 @@ from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse
|
|
|
9
9
|
|
|
10
10
|
import requests
|
|
11
11
|
|
|
12
|
+
from ukam_os_builder.api.settings import Settings
|
|
13
|
+
|
|
12
14
|
logger = logging.getLogger(__name__)
|
|
13
15
|
|
|
14
16
|
API_BASE_URL = "https://api.os.uk/downloads/v1"
|
|
17
|
+
|
|
18
|
+
# NGD file stems to exclude (historic addresses are not used in output)
|
|
19
|
+
_NGD_EXCLUDED_STEMS = {"historicaddress"}
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _should_skip_ngd_download(filename: str, settings: object) -> bool:
|
|
23
|
+
"""Return True if *filename* is an NGD historic-address archive."""
|
|
24
|
+
source_type = getattr(getattr(settings, "source", None), "type", "")
|
|
25
|
+
if source_type != "ngd":
|
|
26
|
+
return False
|
|
27
|
+
name_lower = filename.lower()
|
|
28
|
+
return any(stem in name_lower for stem in _NGD_EXCLUDED_STEMS)
|
|
29
|
+
|
|
30
|
+
|
|
15
31
|
DEFAULT_CHUNK_SIZE = 1024 * 1024 * 20 # 20 MiB
|
|
16
32
|
DEFAULT_CONNECT_TIMEOUT_SECONDS = 30
|
|
17
33
|
DEFAULT_READ_TIMEOUT_SECONDS = 300
|
|
@@ -65,6 +81,13 @@ def _require_api_key(settings: Any) -> str:
|
|
|
65
81
|
return api_key
|
|
66
82
|
|
|
67
83
|
|
|
84
|
+
def _find_existing_download_archives(downloads_dir: Path) -> list[Path]:
|
|
85
|
+
"""Find existing local archives that can be used for extract step."""
|
|
86
|
+
if not downloads_dir.exists():
|
|
87
|
+
return []
|
|
88
|
+
return sorted(downloads_dir.glob("*.zip"))
|
|
89
|
+
|
|
90
|
+
|
|
68
91
|
def get_package_version(settings: Any) -> dict:
|
|
69
92
|
"""Fetch package version metadata from the OS Data Hub API."""
|
|
70
93
|
package_id = settings.os_downloads.package_id
|
|
@@ -236,9 +259,27 @@ def run_download_step(
|
|
|
236
259
|
list_only: bool = False,
|
|
237
260
|
) -> list[Path]:
|
|
238
261
|
"""Run the OS Data Hub download step for any compatible settings object."""
|
|
239
|
-
api_key = _require_api_key(settings)
|
|
240
262
|
downloads_dir = settings.paths.downloads_dir
|
|
241
263
|
|
|
264
|
+
try:
|
|
265
|
+
api_key = _require_api_key(settings)
|
|
266
|
+
except ValueError as exc:
|
|
267
|
+
if list_only:
|
|
268
|
+
raise
|
|
269
|
+
|
|
270
|
+
existing_archives = _find_existing_download_archives(downloads_dir)
|
|
271
|
+
if existing_archives:
|
|
272
|
+
logger.warning(
|
|
273
|
+
"No API key found; using %d existing archive(s) in %s and skipping download.",
|
|
274
|
+
len(existing_archives),
|
|
275
|
+
downloads_dir,
|
|
276
|
+
)
|
|
277
|
+
return existing_archives
|
|
278
|
+
|
|
279
|
+
raise ValueError(
|
|
280
|
+
f"{exc} No local zip files were found in {downloads_dir}, so download cannot be skipped."
|
|
281
|
+
) from exc
|
|
282
|
+
|
|
242
283
|
logger.info("Fetching package metadata...")
|
|
243
284
|
metadata = get_package_version(settings)
|
|
244
285
|
items = list_downloads(metadata)
|
|
@@ -268,6 +309,11 @@ def run_download_step(
|
|
|
268
309
|
logger.warning("No URL for %s, skipping", item.filename)
|
|
269
310
|
continue
|
|
270
311
|
|
|
312
|
+
# Skip NGD historic address files — they are excluded from output
|
|
313
|
+
if _should_skip_ngd_download(item.filename, settings):
|
|
314
|
+
logger.info("Skipping historic address file: %s", item.filename)
|
|
315
|
+
continue
|
|
316
|
+
|
|
271
317
|
dest_path = downloads_dir / item.filename
|
|
272
318
|
was_downloaded = download_file(
|
|
273
319
|
url=item.url,
|
|
@@ -287,3 +333,54 @@ def run_download_step(
|
|
|
287
333
|
|
|
288
334
|
logger.info("Download complete: %d file(s)", len(downloaded))
|
|
289
335
|
return downloaded
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def _get_manifest_path(settings: Settings) -> Path | None:
|
|
339
|
+
downloads_dir = settings.paths.downloads_dir.resolve()
|
|
340
|
+
source_type = settings.source.type # "abp" | "ngd"
|
|
341
|
+
|
|
342
|
+
if source_type == "abp":
|
|
343
|
+
candidates = list(downloads_dir.glob("*-Order_Details.txt"))
|
|
344
|
+
if not candidates:
|
|
345
|
+
logger.info("➡️ Manifest (ABP order details) not found. Check: %s", downloads_dir)
|
|
346
|
+
return None
|
|
347
|
+
|
|
348
|
+
manifest = max(candidates, key=lambda p: p.stat().st_mtime).resolve()
|
|
349
|
+
|
|
350
|
+
if len(candidates) > 1:
|
|
351
|
+
logger.warning(
|
|
352
|
+
"Multiple ABP manifests found in %s. Using newest: %s",
|
|
353
|
+
downloads_dir,
|
|
354
|
+
manifest,
|
|
355
|
+
)
|
|
356
|
+
|
|
357
|
+
logger.info("➡️ Manifest (ABP order details): %s", manifest)
|
|
358
|
+
return manifest
|
|
359
|
+
|
|
360
|
+
elif source_type == "ngd":
|
|
361
|
+
candidates = list(
|
|
362
|
+
downloads_dir.glob("*_orderSummary.json")
|
|
363
|
+
) # adjust if it's "*.orderSummary.json"
|
|
364
|
+
if not candidates:
|
|
365
|
+
logger.info("➡️ Manifests (NGD order summaries) not found. Check: %s", downloads_dir)
|
|
366
|
+
return None
|
|
367
|
+
|
|
368
|
+
built_candidates = list(downloads_dir.glob("*builtaddress*_orderSummary.json"))
|
|
369
|
+
built_manifest = (
|
|
370
|
+
max(built_candidates, key=lambda p: p.stat().st_mtime).resolve()
|
|
371
|
+
if built_candidates
|
|
372
|
+
else None
|
|
373
|
+
)
|
|
374
|
+
|
|
375
|
+
logger.info(
|
|
376
|
+
"➡️ Manifests (NGD order summaries): %s (%d files)\n"
|
|
377
|
+
" ↳ Built address order summary: %s",
|
|
378
|
+
downloads_dir,
|
|
379
|
+
len(candidates),
|
|
380
|
+
built_manifest if built_manifest else "(not found)",
|
|
381
|
+
)
|
|
382
|
+
|
|
383
|
+
return downloads_dir
|
|
384
|
+
|
|
385
|
+
logger.warning("Unknown source type %r. No manifest lookup performed.", source_type)
|
|
386
|
+
return None
|
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
|
|
5
|
-
from ukam_os_builder.os_builder.extract import (
|
|
6
|
-
_filter_zips_for_source,
|
|
7
|
-
_should_convert_csv_to_parquet,
|
|
8
|
-
)
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
def test_filter_zips_for_source_prefers_ngd_named_zips() -> None:
|
|
12
|
-
zip_files = [
|
|
13
|
-
Path("add_gb_builtaddress.zip"),
|
|
14
|
-
Path("AddressBasePremium_FULL_2025-12-15_002.zip"),
|
|
15
|
-
]
|
|
16
|
-
|
|
17
|
-
filtered = _filter_zips_for_source(zip_files, "ngd")
|
|
18
|
-
|
|
19
|
-
assert filtered == [Path("add_gb_builtaddress.zip")]
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
def test_should_convert_csv_to_parquet_skips_non_ngd_for_ngd_source() -> None:
|
|
23
|
-
ngd_csv = Path("add_gb_builtaddress.csv")
|
|
24
|
-
abp_csv = Path("AddressBasePremium_FULL_2025-12-15_002.csv")
|
|
25
|
-
|
|
26
|
-
assert _should_convert_csv_to_parquet(ngd_csv, "ngd") is True
|
|
27
|
-
assert _should_convert_csv_to_parquet(abp_csv, "ngd") is False
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/tests/data/add_gb_builtaddress.csv
RENAMED
|
File without changes
|
{ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/tests/data/add_gb_builtaddress_altadd.csv
RENAMED
|
File without changes
|
{ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/tests/data/add_gb_historicaddress.csv
RENAMED
|
File without changes
|
{ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/tests/data/add_gb_prebuildaddress.csv
RENAMED
|
File without changes
|
{ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/tests/data/add_gb_royalmailaddress.csv
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/tests/test_public_api_integration.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/os_builder/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|