ukam-os-builder 0.1.0.dev2__tar.gz → 0.1.0.dev4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ukam_os_builder-0.1.0.dev4/.github/workflows/e2e.yml +147 -0
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/.github/workflows/release-pypi.yml +40 -22
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/.gitignore +1 -0
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/AGENTS.md +1 -1
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/PKG-INFO +38 -27
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/README.md +37 -26
- ukam_os_builder-0.1.0.dev2/config.yaml → ukam_os_builder-0.1.0.dev4/config.example.yaml +0 -9
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/pyproject.toml +1 -1
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/tests/test_api.py +80 -9
- ukam_os_builder-0.1.0.dev4/tests/test_cli.py +32 -0
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/tests/test_settings.py +63 -18
- ukam_os_builder-0.1.0.dev4/tests/test_setup_wizard.py +136 -0
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/__init__.py +1 -1
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/api/api.py +67 -22
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/api/settings.py +60 -42
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/cli.py +11 -2
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/data_sources/abp/split_raw.py +28 -7
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/os_builder/os_hub.py +26 -1
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/os_builder/pipeline_factory.py +12 -8
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/pipeline.py +4 -2
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/setup_wizard.py +56 -29
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/uv.lock +860 -858
- ukam_os_builder-0.1.0.dev2/ukam_os_builder/data_sources/abp/to_flatfile.py +0 -677
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/.env.example +0 -0
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/.github/workflows/ci.yml +0 -0
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/prompt.md +0 -0
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/shell/test_release_locally.sh +0 -0
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/tests/data/README.md +0 -0
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/tests/data/add_gb_builtaddress.csv +0 -0
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/tests/data/add_gb_builtaddress_altadd.csv +0 -0
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/tests/data/add_gb_historicaddress.csv +0 -0
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/tests/data/add_gb_prebuildaddress.csv +0 -0
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/tests/data/add_gb_royalmailaddress.csv +0 -0
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/tests/test_cli_errors.py +0 -0
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/tests/test_extract_source_filtering.py +0 -0
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/tests/test_inspect_results.py +0 -0
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/tests/test_public_api_integration.py +0 -0
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/tests/test_smoke.py +0 -0
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/_exceptions.py +0 -0
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/api/cli_errors.py +0 -0
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/data_sources/abp/schemas/abp_schema.yaml +0 -0
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/data_sources/abp/transform/__init__.py +0 -0
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/data_sources/abp/transform/common.py +0 -0
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/data_sources/abp/transform/runner.py +0 -0
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/data_sources/abp/transform/stages/__init__.py +0 -0
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/data_sources/abp/transform/stages/business.py +0 -0
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/data_sources/abp/transform/stages/combine.py +0 -0
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/data_sources/abp/transform/stages/lpi.py +0 -0
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/data_sources/abp/transform/stages/misc.py +0 -0
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/data_sources/abp/transform/stages/postal.py +0 -0
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/data_sources/ngd/to_flatfile.py +0 -0
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/os_builder/__init__.py +0 -0
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/os_builder/extract.py +0 -0
- {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/os_builder/inspect_results.py +0 -0
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
name: End-to-end tests
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
pull_request:
|
|
5
|
+
branches: [main]
|
|
6
|
+
|
|
7
|
+
permissions:
|
|
8
|
+
contents: read
|
|
9
|
+
|
|
10
|
+
concurrency:
|
|
11
|
+
group: e2e-${{ github.head_ref || github.ref }}
|
|
12
|
+
cancel-in-progress: true
|
|
13
|
+
|
|
14
|
+
jobs:
|
|
15
|
+
e2e:
|
|
16
|
+
runs-on: ubuntu-latest
|
|
17
|
+
strategy:
|
|
18
|
+
fail-fast: false
|
|
19
|
+
matrix:
|
|
20
|
+
include:
|
|
21
|
+
- source: ngd
|
|
22
|
+
package_id: "18296"
|
|
23
|
+
version_id: "118120"
|
|
24
|
+
- source: abp
|
|
25
|
+
package_id: "0040206240"
|
|
26
|
+
version_id: "6777574"
|
|
27
|
+
|
|
28
|
+
name: E2E – ${{ matrix.source }}
|
|
29
|
+
|
|
30
|
+
steps:
|
|
31
|
+
- uses: actions/checkout@v5
|
|
32
|
+
|
|
33
|
+
- name: Mask API credentials
|
|
34
|
+
run: |
|
|
35
|
+
echo "::add-mask::${{ secrets.OS_PROJECT_API_KEY }}"
|
|
36
|
+
echo "::add-mask::${{ secrets.OS_PROJECT_API_SECRET }}"
|
|
37
|
+
|
|
38
|
+
- name: Set up Python
|
|
39
|
+
uses: actions/setup-python@v5
|
|
40
|
+
with:
|
|
41
|
+
python-version: "3.10"
|
|
42
|
+
|
|
43
|
+
- name: Set up uv
|
|
44
|
+
uses: astral-sh/setup-uv@v7
|
|
45
|
+
with:
|
|
46
|
+
enable-cache: true
|
|
47
|
+
cache-dependency-glob: "uv.lock"
|
|
48
|
+
|
|
49
|
+
- name: Install dependencies
|
|
50
|
+
run: uv sync --all-extras --all-groups
|
|
51
|
+
|
|
52
|
+
- name: Write config.yaml
|
|
53
|
+
run: |
|
|
54
|
+
printf '%s\n' \
|
|
55
|
+
'paths:' \
|
|
56
|
+
' work_dir: ./data' \
|
|
57
|
+
'' \
|
|
58
|
+
'source:' \
|
|
59
|
+
' type: ${{ matrix.source }}' \
|
|
60
|
+
'' \
|
|
61
|
+
'os_downloads:' \
|
|
62
|
+
' package_id: "${{ matrix.package_id }}"' \
|
|
63
|
+
' version_id: "${{ matrix.version_id }}"' \
|
|
64
|
+
'' \
|
|
65
|
+
'processing:' \
|
|
66
|
+
' parquet_compression: zstd' \
|
|
67
|
+
' parquet_compression_level: 9' \
|
|
68
|
+
' num_chunks: 1' \
|
|
69
|
+
> config.yaml
|
|
70
|
+
|
|
71
|
+
- name: Run full pipeline
|
|
72
|
+
env:
|
|
73
|
+
OS_PROJECT_API_KEY: ${{ secrets.OS_PROJECT_API_KEY }}
|
|
74
|
+
OS_PROJECT_API_SECRET: ${{ secrets.OS_PROJECT_API_SECRET }}
|
|
75
|
+
run: uv run ukam-os-build --verbose
|
|
76
|
+
|
|
77
|
+
- name: Verify output files exist
|
|
78
|
+
run: |
|
|
79
|
+
echo "=== Output directory ==="
|
|
80
|
+
ls -lhR data/output/
|
|
81
|
+
echo ""
|
|
82
|
+
echo "=== Checking for parquet files ==="
|
|
83
|
+
count=$(find data/output -name '*.parquet' | wc -l)
|
|
84
|
+
echo "Found $count parquet file(s) in data/output/"
|
|
85
|
+
if [ "$count" -eq 0 ]; then
|
|
86
|
+
echo "::error::No parquet output files found!"
|
|
87
|
+
exit 1
|
|
88
|
+
fi
|
|
89
|
+
|
|
90
|
+
- name: Preview first output row
|
|
91
|
+
run: |
|
|
92
|
+
uv run python -c "
|
|
93
|
+
import duckdb
|
|
94
|
+
con = duckdb.connect()
|
|
95
|
+
con.sql(\"SELECT * FROM read_parquet('data/output/*.parquet')\").show(max_rows=1, max_width=10000)
|
|
96
|
+
"
|
|
97
|
+
|
|
98
|
+
# ── Second run: offline (no API credentials) ──────────────
|
|
99
|
+
- name: Record download file timestamps
|
|
100
|
+
run: |
|
|
101
|
+
stat -c '%n %Y' data/downloads/* | sort > /tmp/downloads_before.txt
|
|
102
|
+
echo "=== Download file timestamps ==="
|
|
103
|
+
cat /tmp/downloads_before.txt
|
|
104
|
+
|
|
105
|
+
- name: Remove everything except downloads and block API access
|
|
106
|
+
run: |
|
|
107
|
+
find data -mindepth 1 -maxdepth 1 ! -name downloads -exec rm -rf {} +
|
|
108
|
+
echo "=== Remaining data tree ==="
|
|
109
|
+
find data -type f | sort
|
|
110
|
+
|
|
111
|
+
- name: Re-run pipeline without API credentials
|
|
112
|
+
run: |
|
|
113
|
+
unset OS_PROJECT_API_KEY OS_PROJECT_API_SECRET
|
|
114
|
+
uv run ukam-os-build --verbose --overwrite
|
|
115
|
+
|
|
116
|
+
- name: Verify output files exist (offline run)
|
|
117
|
+
run: |
|
|
118
|
+
echo "=== Output directory ==="
|
|
119
|
+
ls -lhR data/output/
|
|
120
|
+
echo ""
|
|
121
|
+
echo "=== Checking for parquet files ==="
|
|
122
|
+
count=$(find data/output -name '*.parquet' | wc -l)
|
|
123
|
+
echo "Found $count parquet file(s) in data/output/"
|
|
124
|
+
if [ "$count" -eq 0 ]; then
|
|
125
|
+
echo "::error::No parquet output files found on offline run!"
|
|
126
|
+
exit 1
|
|
127
|
+
fi
|
|
128
|
+
|
|
129
|
+
- name: Preview first output row (offline run)
|
|
130
|
+
run: |
|
|
131
|
+
uv run python -c "
|
|
132
|
+
import duckdb
|
|
133
|
+
con = duckdb.connect()
|
|
134
|
+
con.sql(\"SELECT * FROM read_parquet('data/output/*.parquet')\").show(max_rows=1, max_width=10000)
|
|
135
|
+
"
|
|
136
|
+
|
|
137
|
+
- name: Verify downloads were not modified
|
|
138
|
+
run: |
|
|
139
|
+
stat -c '%n %Y' data/downloads/* | sort > /tmp/downloads_after.txt
|
|
140
|
+
echo "=== Download file timestamps after offline run ==="
|
|
141
|
+
cat /tmp/downloads_after.txt
|
|
142
|
+
if ! diff -q /tmp/downloads_before.txt /tmp/downloads_after.txt; then
|
|
143
|
+
echo "::error::Download file timestamps changed – files were unexpectedly modified!"
|
|
144
|
+
diff /tmp/downloads_before.txt /tmp/downloads_after.txt
|
|
145
|
+
exit 1
|
|
146
|
+
fi
|
|
147
|
+
echo "Download timestamps unchanged – existing archives were reused as expected."
|
{ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/.github/workflows/release-pypi.yml
RENAMED
|
@@ -12,6 +12,7 @@ permissions:
|
|
|
12
12
|
jobs:
|
|
13
13
|
publish:
|
|
14
14
|
runs-on: ubuntu-latest
|
|
15
|
+
environment: pypi
|
|
15
16
|
|
|
16
17
|
# Set up such that PyPI Trusted Publishing (OIDC) can work.
|
|
17
18
|
permissions:
|
|
@@ -51,36 +52,53 @@ jobs:
|
|
|
51
52
|
|
|
52
53
|
core.setOutput('release_sha', tagSha);
|
|
53
54
|
|
|
54
|
-
- name:
|
|
55
|
+
- name: Wait for successful CI build artifact
|
|
55
56
|
id: find_build
|
|
56
57
|
uses: actions/github-script@v7
|
|
57
58
|
with:
|
|
58
59
|
script: |
|
|
59
60
|
const { owner, repo } = context.repo;
|
|
60
61
|
const sha = '${{ steps.main_guard.outputs.release_sha }}';
|
|
61
|
-
|
|
62
|
-
const
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
62
|
+
const maxAttempts = 30; // 30 × 20 s = 10 minutes
|
|
63
|
+
const delayMs = 20_000; // 20 seconds between polls
|
|
64
|
+
|
|
65
|
+
for (let attempt = 1; attempt <= maxAttempts; attempt++) {
|
|
66
|
+
const runs = await github.rest.actions.listWorkflowRuns({
|
|
67
|
+
owner,
|
|
68
|
+
repo,
|
|
69
|
+
workflow_id: 'ci.yml',
|
|
70
|
+
head_sha: sha,
|
|
71
|
+
event: 'push',
|
|
72
|
+
status: 'completed',
|
|
73
|
+
per_page: 50,
|
|
74
|
+
});
|
|
75
|
+
|
|
76
|
+
const success = runs.data.workflow_runs.find(r => r.conclusion === 'success');
|
|
77
|
+
if (success) {
|
|
78
|
+
core.info(`Found successful CI run ${success.id} (${success.html_url})`);
|
|
79
|
+
core.setOutput('run_id', String(success.id));
|
|
80
|
+
return;
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
const failed = runs.data.workflow_runs.find(r => r.conclusion === 'failure');
|
|
84
|
+
if (failed) {
|
|
85
|
+
core.setFailed(
|
|
86
|
+
`CI run ${failed.id} failed for commit ${sha}. ` +
|
|
87
|
+
'Fix CI before releasing.'
|
|
88
|
+
);
|
|
89
|
+
return;
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
if (attempt < maxAttempts) {
|
|
93
|
+
core.info(`Attempt ${attempt}/${maxAttempts}: CI not finished yet — waiting ${delayMs / 1000}s …`);
|
|
94
|
+
await new Promise(r => setTimeout(r, delayMs));
|
|
95
|
+
}
|
|
80
96
|
}
|
|
81
97
|
|
|
82
|
-
core.
|
|
83
|
-
|
|
98
|
+
core.setFailed(
|
|
99
|
+
`No successful CI run found for commit ${sha} after ${maxAttempts} attempts (≈10 min). ` +
|
|
100
|
+
'Check whether the CI workflow was triggered for this commit.'
|
|
101
|
+
);
|
|
84
102
|
|
|
85
103
|
- name: Download built dist artifact
|
|
86
104
|
uses: actions/download-artifact@v4
|
|
@@ -7,7 +7,7 @@ This project transforms NGD (National Geographic Database) data into a clean fla
|
|
|
7
7
|
## Repository Structure
|
|
8
8
|
|
|
9
9
|
```
|
|
10
|
-
├── config.yaml
|
|
10
|
+
├── config.example.yaml # Pipeline configuration template (copy to config.yaml)
|
|
11
11
|
├── script.py # Main entry point
|
|
12
12
|
├── pyproject.toml # Project metadata and dependencies
|
|
13
13
|
├── README.md # User documentation
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ukam-os-builder
|
|
3
|
-
Version: 0.1.0.dev2
|
|
3
|
+
Version: 0.1.0.dev4
|
|
4
4
|
Summary: Download, process and transform OS address data (NGD or ABP) for UK address matching
|
|
5
5
|
Project-URL: Homepage, https://github.com/moj-analytical-services/prepare_ngd_for_address_matching
|
|
6
6
|
Project-URL: Repository, https://github.com/moj-analytical-services/prepare_ngd_for_address_matching
|
|
@@ -140,7 +140,13 @@ result = inspect_flatfile_variants(config_path="config.yaml", top_offset=0, show
|
|
|
140
140
|
<summary>Configure manually</summary>
|
|
141
141
|
|
|
142
142
|
If you prefer not to use the setup wizard, edit `config.yaml` directly.
|
|
143
|
-
Set `source.type`, `os_downloads.package_id`, and `os_downloads.version_id`, the …
|
|
143
|
+
Set `source.type`, `os_downloads.package_id`, and `os_downloads.version_id`.
|
|
144
|
+
|
|
145
|
+
Most users only need one path setting:
|
|
146
|
+
|
|
147
|
+
- `paths.work_dir` (default `./data`, relative to the config file directory)
|
|
148
|
+
|
|
149
|
+
The tool derives all other directories automatically under `work_dir`.
|
|
144
150
|
|
|
145
151
|
</details>
|
|
146
152
|
|
|
@@ -153,7 +159,7 @@ Set `source.type`, `os_downloads.package_id`, and `os_downloads.version_id`, the
|
|
|
153
159
|
|
|
154
160
|
### Command notes
|
|
155
161
|
|
|
156
|
-
-
|
|
162
|
+
- `step` only supports `download` and `all` to simplify usage. Use `--overwrite` to re-run a step with the same parameters.
|
|
157
163
|
- CLI overrides take precedence over values in `config.yaml`.
|
|
158
164
|
- By default, `ukam-os-build` loads `.env` from the same directory as your config, unless `--env-file` is supplied.
|
|
159
165
|
|
|
@@ -177,7 +183,7 @@ ukam-os-build --config config.yaml
|
|
|
177
183
|
|
|
178
184
|
1. `download` - fetch package metadata and zip files from OS Data Hub.
|
|
179
185
|
2. `extract` - extract CSVs from downloaded zip files and convert to parquet.
|
|
180
|
-
3. `split` - ABP only: split raw records
|
|
186
|
+
3. `split` - ABP only: split raw records and write only parquet staging files used by flatfile generation (`street_descriptor`, `blpu`, `lpi`, `delivery_point`, `organisation`, `classification`).
|
|
181
187
|
4. `flatfile` - transform and deduplicate into final output parquet file(s).
|
|
182
188
|
|
|
183
189
|
All stages are idempotent. Use `--overwrite` to regenerate outputs (`--force` is accepted as a backward-compatible alias).
|
|
@@ -294,25 +300,6 @@ When the same UPRN and address combination appears in multiple sources, records
|
|
|
294
300
|
4. Historic
|
|
295
301
|
5. Demolished
|
|
296
302
|
|
|
297
|
-
## Manual Download
|
|
298
|
-
|
|
299
|
-
If you prefer to download manually:
|
|
300
|
-
- Sign in to https://osdatahub.os.uk/
|
|
301
|
-
- Create a datapackage with NGD address features
|
|
302
|
-
- Download the zip file
|
|
303
|
-
|
|
304
|
-
To run the pipeline from a manual download:
|
|
305
|
-
|
|
306
|
-
1. Place the zip in the downloads directory configured in `config.yaml`
|
|
307
|
-
- By default this is `data/downloads/`
|
|
308
|
-
- The extract step looks for `*.zip` files in this folder
|
|
309
|
-
|
|
310
|
-
2. Run the pipeline starting from extract:
|
|
311
|
-
|
|
312
|
-
```bash
|
|
313
|
-
ukam-os-build --config config.yaml --step extract
|
|
314
|
-
ukam-os-build --config config.yaml --step flatfile
|
|
315
|
-
```
|
|
316
303
|
|
|
317
304
|
## OS Downloads API
|
|
318
305
|
|
|
@@ -348,10 +335,6 @@ source:
|
|
|
348
335
|
|
|
349
336
|
paths:
|
|
350
337
|
work_dir: ./data
|
|
351
|
-
downloads_dir: ./data/downloads
|
|
352
|
-
extracted_dir: ./data/extracted
|
|
353
|
-
parquet_dir: ./data/parquet
|
|
354
|
-
output_dir: ./data/output
|
|
355
338
|
|
|
356
339
|
os_downloads:
|
|
357
340
|
package_id: "<your_package_id>"
|
|
@@ -366,6 +349,34 @@ processing:
|
|
|
366
349
|
# duckdb_memory_limit: "8GB"
|
|
367
350
|
```
|
|
368
351
|
|
|
352
|
+
By default, the tool creates these directories under `paths.work_dir`:
|
|
353
|
+
|
|
354
|
+
- downloads: `<work_dir>/downloads`
|
|
355
|
+
- extracted: `<work_dir>/extracted`
|
|
356
|
+
- parquet: `<work_dir>/parquet`
|
|
357
|
+
- output: `<work_dir>/output`
|
|
358
|
+
|
|
359
|
+
<details>
|
|
360
|
+
<summary>Advanced: override default directories</summary>
|
|
361
|
+
|
|
362
|
+
Most users won’t need this.
|
|
363
|
+
|
|
364
|
+
If you need to customize locations, use `paths.overrides`:
|
|
365
|
+
|
|
366
|
+
```yaml
|
|
367
|
+
paths:
|
|
368
|
+
work_dir: ./data
|
|
369
|
+
overrides:
|
|
370
|
+
downloads_dir: ./somewhere/downloads
|
|
371
|
+
extracted_dir: /mnt/fast/extracted
|
|
372
|
+
parquet_dir: ./data/parquet
|
|
373
|
+
output_dir: ./output
|
|
374
|
+
```
|
|
375
|
+
|
|
376
|
+
Override keys replace derived defaults. Relative paths are resolved relative to the directory containing `config.yaml`.
|
|
377
|
+
|
|
378
|
+
</details>
|
|
379
|
+
|
|
369
380
|
## Smoke test
|
|
370
381
|
|
|
371
382
|
```bash
|
|
@@ -114,7 +114,13 @@ result = inspect_flatfile_variants(config_path="config.yaml", top_offset=0, show
|
|
|
114
114
|
<summary>Configure manually</summary>
|
|
115
115
|
|
|
116
116
|
If you prefer not to use the setup wizard, edit `config.yaml` directly.
|
|
117
|
-
Set `source.type`, `os_downloads.package_id`, and `os_downloads.version_id`, the …
|
|
117
|
+
Set `source.type`, `os_downloads.package_id`, and `os_downloads.version_id`.
|
|
118
|
+
|
|
119
|
+
Most users only need one path setting:
|
|
120
|
+
|
|
121
|
+
- `paths.work_dir` (default `./data`, relative to the config file directory)
|
|
122
|
+
|
|
123
|
+
The tool derives all other directories automatically under `work_dir`.
|
|
118
124
|
|
|
119
125
|
</details>
|
|
120
126
|
|
|
@@ -127,7 +133,7 @@ Set `source.type`, `os_downloads.package_id`, and `os_downloads.version_id`, the
|
|
|
127
133
|
|
|
128
134
|
### Command notes
|
|
129
135
|
|
|
130
|
-
-
|
|
136
|
+
- `step` only supports `download` and `all` to simplify usage. Use `--overwrite` to re-run a step with the same parameters.
|
|
131
137
|
- CLI overrides take precedence over values in `config.yaml`.
|
|
132
138
|
- By default, `ukam-os-build` loads `.env` from the same directory as your config, unless `--env-file` is supplied.
|
|
133
139
|
|
|
@@ -151,7 +157,7 @@ ukam-os-build --config config.yaml
|
|
|
151
157
|
|
|
152
158
|
1. `download` - fetch package metadata and zip files from OS Data Hub.
|
|
153
159
|
2. `extract` - extract CSVs from downloaded zip files and convert to parquet.
|
|
154
|
-
3. `split` - ABP only: split raw records
|
|
160
|
+
3. `split` - ABP only: split raw records and write only parquet staging files used by flatfile generation (`street_descriptor`, `blpu`, `lpi`, `delivery_point`, `organisation`, `classification`).
|
|
155
161
|
4. `flatfile` - transform and deduplicate into final output parquet file(s).
|
|
156
162
|
|
|
157
163
|
All stages are idempotent. Use `--overwrite` to regenerate outputs (`--force` is accepted as a backward-compatible alias).
|
|
@@ -268,25 +274,6 @@ When the same UPRN and address combination appears in multiple sources, records
|
|
|
268
274
|
4. Historic
|
|
269
275
|
5. Demolished
|
|
270
276
|
|
|
271
|
-
## Manual Download
|
|
272
|
-
|
|
273
|
-
If you prefer to download manually:
|
|
274
|
-
- Sign in to https://osdatahub.os.uk/
|
|
275
|
-
- Create a datapackage with NGD address features
|
|
276
|
-
- Download the zip file
|
|
277
|
-
|
|
278
|
-
To run the pipeline from a manual download:
|
|
279
|
-
|
|
280
|
-
1. Place the zip in the downloads directory configured in `config.yaml`
|
|
281
|
-
- By default this is `data/downloads/`
|
|
282
|
-
- The extract step looks for `*.zip` files in this folder
|
|
283
|
-
|
|
284
|
-
2. Run the pipeline starting from extract:
|
|
285
|
-
|
|
286
|
-
```bash
|
|
287
|
-
ukam-os-build --config config.yaml --step extract
|
|
288
|
-
ukam-os-build --config config.yaml --step flatfile
|
|
289
|
-
```
|
|
290
277
|
|
|
291
278
|
## OS Downloads API
|
|
292
279
|
|
|
@@ -322,10 +309,6 @@ source:
|
|
|
322
309
|
|
|
323
310
|
paths:
|
|
324
311
|
work_dir: ./data
|
|
325
|
-
downloads_dir: ./data/downloads
|
|
326
|
-
extracted_dir: ./data/extracted
|
|
327
|
-
parquet_dir: ./data/parquet
|
|
328
|
-
output_dir: ./data/output
|
|
329
312
|
|
|
330
313
|
os_downloads:
|
|
331
314
|
package_id: "<your_package_id>"
|
|
@@ -340,6 +323,34 @@ processing:
|
|
|
340
323
|
# duckdb_memory_limit: "8GB"
|
|
341
324
|
```
|
|
342
325
|
|
|
326
|
+
By default, the tool creates these directories under `paths.work_dir`:
|
|
327
|
+
|
|
328
|
+
- downloads: `<work_dir>/downloads`
|
|
329
|
+
- extracted: `<work_dir>/extracted`
|
|
330
|
+
- parquet: `<work_dir>/parquet`
|
|
331
|
+
- output: `<work_dir>/output`
|
|
332
|
+
|
|
333
|
+
<details>
|
|
334
|
+
<summary>Advanced: override default directories</summary>
|
|
335
|
+
|
|
336
|
+
Most users won’t need this.
|
|
337
|
+
|
|
338
|
+
If you need to customize locations, use `paths.overrides`:
|
|
339
|
+
|
|
340
|
+
```yaml
|
|
341
|
+
paths:
|
|
342
|
+
work_dir: ./data
|
|
343
|
+
overrides:
|
|
344
|
+
downloads_dir: ./somewhere/downloads
|
|
345
|
+
extracted_dir: /mnt/fast/extracted
|
|
346
|
+
parquet_dir: ./data/parquet
|
|
347
|
+
output_dir: ./output
|
|
348
|
+
```
|
|
349
|
+
|
|
350
|
+
Override keys replace derived defaults. Relative paths are resolved relative to the directory containing `config.yaml`.
|
|
351
|
+
|
|
352
|
+
</details>
|
|
353
|
+
|
|
343
354
|
## Smoke test
|
|
344
355
|
|
|
345
356
|
```bash
|
|
@@ -5,15 +5,6 @@ paths:
|
|
|
5
5
|
# Base working directory for all data
|
|
6
6
|
work_dir: ./data
|
|
7
7
|
|
|
8
|
-
# Downloaded zip files from OS
|
|
9
|
-
downloads_dir: ./data/downloads
|
|
10
|
-
|
|
11
|
-
# Extracted CSV files and intermediate parquet
|
|
12
|
-
extracted_dir: ./data/extracted
|
|
13
|
-
|
|
14
|
-
# Final output parquet files
|
|
15
|
-
output_dir: ./data/output
|
|
16
|
-
|
|
17
8
|
# OS Data Hub download settings
|
|
18
9
|
# Given a datapackage at: https://osdatahub.os.uk/data/downloads/data-packages/16331
|
|
19
10
|
# You can get versions from:
|
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import os
|
|
3
4
|
from pathlib import Path
|
|
4
5
|
from textwrap import dedent
|
|
6
|
+
from typing import Literal
|
|
5
7
|
|
|
6
8
|
import pytest
|
|
7
9
|
|
|
@@ -39,6 +41,37 @@ def test_create_config_and_env_writes_expected_files(tmp_path: Path) -> None:
|
|
|
39
41
|
assert "OS_PROJECT_API_SECRET=your_api_secret_here" in env_text
|
|
40
42
|
|
|
41
43
|
|
|
44
|
+
def test_create_config_and_env_writes_supplied_api_credentials(tmp_path: Path) -> None:
|
|
45
|
+
config_path = tmp_path / "config.yaml"
|
|
46
|
+
env_path = tmp_path / ".env"
|
|
47
|
+
|
|
48
|
+
create_config_and_env(
|
|
49
|
+
config_out=config_path,
|
|
50
|
+
env_out=env_path,
|
|
51
|
+
source="ngd",
|
|
52
|
+
package_id="16331",
|
|
53
|
+
version_id="104444",
|
|
54
|
+
api_key="my-key",
|
|
55
|
+
api_secret="my-secret",
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
env_text = env_path.read_text()
|
|
59
|
+
assert "OS_PROJECT_API_KEY=my-key" in env_text
|
|
60
|
+
assert "OS_PROJECT_API_SECRET=my-secret" in env_text
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def test_create_config_and_env_rejects_partial_api_credentials(tmp_path: Path) -> None:
|
|
64
|
+
with pytest.raises(ValueError, match="must be provided together"):
|
|
65
|
+
create_config_and_env(
|
|
66
|
+
config_out=tmp_path / "config.yaml",
|
|
67
|
+
env_out=tmp_path / ".env",
|
|
68
|
+
source="ngd",
|
|
69
|
+
package_id="16331",
|
|
70
|
+
version_id="104444",
|
|
71
|
+
api_key="my-key",
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
|
|
42
75
|
def test_run_from_config_applies_overrides(
|
|
43
76
|
monkeypatch: pytest.MonkeyPatch,
|
|
44
77
|
tmp_path: Path,
|
|
@@ -52,9 +85,6 @@ def test_run_from_config_applies_overrides(
|
|
|
52
85
|
"""
|
|
53
86
|
paths:
|
|
54
87
|
work_dir: ./data
|
|
55
|
-
downloads_dir: ./data/downloads
|
|
56
|
-
extracted_dir: ./data/extracted
|
|
57
|
-
output_dir: ./data/output
|
|
58
88
|
|
|
59
89
|
os_downloads:
|
|
60
90
|
package_id: "16465"
|
|
@@ -70,7 +100,9 @@ def test_run_from_config_applies_overrides(
|
|
|
70
100
|
def fake_check_api(_settings: object) -> None:
|
|
71
101
|
calls["checked_api"] = True
|
|
72
102
|
|
|
73
|
-
def fake_run_pipeline(
|
|
103
|
+
def fake_run_pipeline(
|
|
104
|
+
step: Literal["all", "download"], settings: object, force: bool, list_only: bool
|
|
105
|
+
) -> None:
|
|
74
106
|
calls["step"] = step
|
|
75
107
|
calls["force"] = force
|
|
76
108
|
calls["list_only"] = list_only
|
|
@@ -94,6 +126,47 @@ def test_run_from_config_applies_overrides(
|
|
|
94
126
|
assert calls["num_chunks"] == 5
|
|
95
127
|
|
|
96
128
|
|
|
129
|
+
def test_run_from_config_accepts_api_key_secret_overrides(
|
|
130
|
+
monkeypatch: pytest.MonkeyPatch,
|
|
131
|
+
tmp_path: Path,
|
|
132
|
+
) -> None:
|
|
133
|
+
monkeypatch.delenv("OS_PROJECT_API_KEY", raising=False)
|
|
134
|
+
monkeypatch.delenv("OS_PROJECT_API_SECRET", raising=False)
|
|
135
|
+
|
|
136
|
+
config_path = tmp_path / "config.yaml"
|
|
137
|
+
_write_config(
|
|
138
|
+
config_path,
|
|
139
|
+
"""
|
|
140
|
+
source:
|
|
141
|
+
type: ngd
|
|
142
|
+
|
|
143
|
+
os_downloads:
|
|
144
|
+
package_id: "16465"
|
|
145
|
+
version_id: "104444"
|
|
146
|
+
""",
|
|
147
|
+
)
|
|
148
|
+
|
|
149
|
+
monkeypatch.setattr("ukam_os_builder.api.api.get_package_version", lambda _settings: None)
|
|
150
|
+
monkeypatch.setattr("ukam_os_builder.api.api.run_pipeline", lambda **_kwargs: None)
|
|
151
|
+
|
|
152
|
+
run_from_config(
|
|
153
|
+
config_path=config_path,
|
|
154
|
+
api_key="runtime-key",
|
|
155
|
+
api_secret="runtime-secret",
|
|
156
|
+
)
|
|
157
|
+
|
|
158
|
+
assert os.environ["OS_PROJECT_API_KEY"] == "runtime-key"
|
|
159
|
+
assert os.environ["OS_PROJECT_API_SECRET"] == "runtime-secret"
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def test_run_from_config_rejects_partial_api_credentials(tmp_path: Path) -> None:
|
|
163
|
+
with pytest.raises(ValueError, match="must be provided together"):
|
|
164
|
+
run_from_config(
|
|
165
|
+
config_path=tmp_path / "config.yaml",
|
|
166
|
+
api_key="runtime-key",
|
|
167
|
+
)
|
|
168
|
+
|
|
169
|
+
|
|
97
170
|
def test_run_from_config_validates_list_only_step(tmp_path: Path) -> None:
|
|
98
171
|
with pytest.raises(ValueError, match="--list-only can only be used"):
|
|
99
172
|
run_from_config(config_path=tmp_path / "config.yaml", step="extract", list_only=True)
|
|
@@ -126,7 +199,9 @@ def test_run_from_config_uses_source_override_for_pipeline_validation(
|
|
|
126
199
|
|
|
127
200
|
monkeypatch.setattr("ukam_os_builder.api.api.get_package_version", lambda _settings: None)
|
|
128
201
|
|
|
129
|
-
def fake_run_pipeline(
|
|
202
|
+
def fake_run_pipeline(
|
|
203
|
+
step: Literal["all", "download"], settings: object, force: bool, list_only: bool
|
|
204
|
+
) -> None:
|
|
130
205
|
calls["step"] = step
|
|
131
206
|
calls["source"] = settings.source.type
|
|
132
207
|
calls["force"] = force
|
|
@@ -195,10 +270,6 @@ def test_run_from_config_applies_schema_path_override(
|
|
|
195
270
|
|
|
196
271
|
paths:
|
|
197
272
|
work_dir: ./data
|
|
198
|
-
downloads_dir: ./data/downloads
|
|
199
|
-
extracted_dir: ./data/extracted
|
|
200
|
-
output_dir: ./data/output
|
|
201
|
-
parquet_dir: ./data/parquet
|
|
202
273
|
|
|
203
274
|
os_downloads:
|
|
204
275
|
package_id: "16465"
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from ukam_os_builder import cli
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def test_build_cli_passes_api_credentials_to_run_from_config(monkeypatch) -> None:
|
|
7
|
+
captured: dict[str, object] = {}
|
|
8
|
+
|
|
9
|
+
def fake_run_from_config(**kwargs):
|
|
10
|
+
captured.update(kwargs)
|
|
11
|
+
return None
|
|
12
|
+
|
|
13
|
+
monkeypatch.setattr(cli, "run_from_config", fake_run_from_config)
|
|
14
|
+
monkeypatch.setattr(cli, "_configure_logging", lambda _verbose: None)
|
|
15
|
+
|
|
16
|
+
exit_code = cli.main(
|
|
17
|
+
[
|
|
18
|
+
"--config",
|
|
19
|
+
"config.yaml",
|
|
20
|
+
"--step",
|
|
21
|
+
"download",
|
|
22
|
+
"--list-only",
|
|
23
|
+
"--api-key",
|
|
24
|
+
"runtime-key",
|
|
25
|
+
"--api-secret",
|
|
26
|
+
"runtime-secret",
|
|
27
|
+
]
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
assert exit_code == 0
|
|
31
|
+
assert captured["api_key"] == "runtime-key"
|
|
32
|
+
assert captured["api_secret"] == "runtime-secret"
|