ukam-os-builder 0.1.0.dev2__tar.gz → 0.1.0.dev4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. ukam_os_builder-0.1.0.dev4/.github/workflows/e2e.yml +147 -0
  2. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/.github/workflows/release-pypi.yml +40 -22
  3. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/.gitignore +1 -0
  4. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/AGENTS.md +1 -1
  5. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/PKG-INFO +38 -27
  6. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/README.md +37 -26
  7. ukam_os_builder-0.1.0.dev2/config.yaml → ukam_os_builder-0.1.0.dev4/config.example.yaml +0 -9
  8. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/pyproject.toml +1 -1
  9. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/tests/test_api.py +80 -9
  10. ukam_os_builder-0.1.0.dev4/tests/test_cli.py +32 -0
  11. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/tests/test_settings.py +63 -18
  12. ukam_os_builder-0.1.0.dev4/tests/test_setup_wizard.py +136 -0
  13. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/__init__.py +1 -1
  14. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/api/api.py +67 -22
  15. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/api/settings.py +60 -42
  16. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/cli.py +11 -2
  17. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/data_sources/abp/split_raw.py +28 -7
  18. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/os_builder/os_hub.py +26 -1
  19. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/os_builder/pipeline_factory.py +12 -8
  20. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/pipeline.py +4 -2
  21. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/setup_wizard.py +56 -29
  22. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/uv.lock +860 -858
  23. ukam_os_builder-0.1.0.dev2/ukam_os_builder/data_sources/abp/to_flatfile.py +0 -677
  24. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/.env.example +0 -0
  25. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/.github/workflows/ci.yml +0 -0
  26. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/prompt.md +0 -0
  27. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/shell/test_release_locally.sh +0 -0
  28. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/tests/data/README.md +0 -0
  29. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/tests/data/add_gb_builtaddress.csv +0 -0
  30. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/tests/data/add_gb_builtaddress_altadd.csv +0 -0
  31. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/tests/data/add_gb_historicaddress.csv +0 -0
  32. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/tests/data/add_gb_prebuildaddress.csv +0 -0
  33. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/tests/data/add_gb_royalmailaddress.csv +0 -0
  34. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/tests/test_cli_errors.py +0 -0
  35. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/tests/test_extract_source_filtering.py +0 -0
  36. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/tests/test_inspect_results.py +0 -0
  37. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/tests/test_public_api_integration.py +0 -0
  38. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/tests/test_smoke.py +0 -0
  39. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/_exceptions.py +0 -0
  40. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/api/cli_errors.py +0 -0
  41. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/data_sources/abp/schemas/abp_schema.yaml +0 -0
  42. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/data_sources/abp/transform/__init__.py +0 -0
  43. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/data_sources/abp/transform/common.py +0 -0
  44. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/data_sources/abp/transform/runner.py +0 -0
  45. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/data_sources/abp/transform/stages/__init__.py +0 -0
  46. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/data_sources/abp/transform/stages/business.py +0 -0
  47. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/data_sources/abp/transform/stages/combine.py +0 -0
  48. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/data_sources/abp/transform/stages/lpi.py +0 -0
  49. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/data_sources/abp/transform/stages/misc.py +0 -0
  50. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/data_sources/abp/transform/stages/postal.py +0 -0
  51. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/data_sources/ngd/to_flatfile.py +0 -0
  52. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/os_builder/__init__.py +0 -0
  53. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/os_builder/extract.py +0 -0
  54. {ukam_os_builder-0.1.0.dev2 → ukam_os_builder-0.1.0.dev4}/ukam_os_builder/os_builder/inspect_results.py +0 -0
@@ -0,0 +1,147 @@
1
+ name: End-to-end tests
2
+
3
+ on:
4
+ pull_request:
5
+ branches: [main]
6
+
7
+ permissions:
8
+ contents: read
9
+
10
+ concurrency:
11
+ group: e2e-${{ github.head_ref || github.ref }}
12
+ cancel-in-progress: true
13
+
14
+ jobs:
15
+ e2e:
16
+ runs-on: ubuntu-latest
17
+ strategy:
18
+ fail-fast: false
19
+ matrix:
20
+ include:
21
+ - source: ngd
22
+ package_id: "18296"
23
+ version_id: "118120"
24
+ - source: abp
25
+ package_id: "0040206240"
26
+ version_id: "6777574"
27
+
28
+ name: E2E – ${{ matrix.source }}
29
+
30
+ steps:
31
+ - uses: actions/checkout@v5
32
+
33
+ - name: Mask API credentials
34
+ run: |
35
+ echo "::add-mask::${{ secrets.OS_PROJECT_API_KEY }}"
36
+ echo "::add-mask::${{ secrets.OS_PROJECT_API_SECRET }}"
37
+
38
+ - name: Set up Python
39
+ uses: actions/setup-python@v5
40
+ with:
41
+ python-version: "3.10"
42
+
43
+ - name: Set up uv
44
+ uses: astral-sh/setup-uv@v7
45
+ with:
46
+ enable-cache: true
47
+ cache-dependency-glob: "uv.lock"
48
+
49
+ - name: Install dependencies
50
+ run: uv sync --all-extras --all-groups
51
+
52
+ - name: Write config.yaml
53
+ run: |
54
+ printf '%s\n' \
55
+ 'paths:' \
56
+ ' work_dir: ./data' \
57
+ '' \
58
+ 'source:' \
59
+ ' type: ${{ matrix.source }}' \
60
+ '' \
61
+ 'os_downloads:' \
62
+ ' package_id: "${{ matrix.package_id }}"' \
63
+ ' version_id: "${{ matrix.version_id }}"' \
64
+ '' \
65
+ 'processing:' \
66
+ ' parquet_compression: zstd' \
67
+ ' parquet_compression_level: 9' \
68
+ ' num_chunks: 1' \
69
+ > config.yaml
70
+
71
+ - name: Run full pipeline
72
+ env:
73
+ OS_PROJECT_API_KEY: ${{ secrets.OS_PROJECT_API_KEY }}
74
+ OS_PROJECT_API_SECRET: ${{ secrets.OS_PROJECT_API_SECRET }}
75
+ run: uv run ukam-os-build --verbose
76
+
77
+ - name: Verify output files exist
78
+ run: |
79
+ echo "=== Output directory ==="
80
+ ls -lhR data/output/
81
+ echo ""
82
+ echo "=== Checking for parquet files ==="
83
+ count=$(find data/output -name '*.parquet' | wc -l)
84
+ echo "Found $count parquet file(s) in data/output/"
85
+ if [ "$count" -eq 0 ]; then
86
+ echo "::error::No parquet output files found!"
87
+ exit 1
88
+ fi
89
+
90
+ - name: Preview first output row
91
+ run: |
92
+ uv run python -c "
93
+ import duckdb
94
+ con = duckdb.connect()
95
+ con.sql(\"SELECT * FROM read_parquet('data/output/*.parquet')\").show(max_rows=1, max_width=10000)
96
+ "
97
+
98
+ # ── Second run: offline (no API credentials) ──────────────
99
+ - name: Record download file timestamps
100
+ run: |
101
+ stat -c '%n %Y' data/downloads/* | sort > /tmp/downloads_before.txt
102
+ echo "=== Download file timestamps ==="
103
+ cat /tmp/downloads_before.txt
104
+
105
+ - name: Remove everything except downloads and block API access
106
+ run: |
107
+ find data -mindepth 1 -maxdepth 1 ! -name downloads -exec rm -rf {} +
108
+ echo "=== Remaining data tree ==="
109
+ find data -type f | sort
110
+
111
+ - name: Re-run pipeline without API credentials
112
+ run: |
113
+ unset OS_PROJECT_API_KEY OS_PROJECT_API_SECRET
114
+ uv run ukam-os-build --verbose --overwrite
115
+
116
+ - name: Verify output files exist (offline run)
117
+ run: |
118
+ echo "=== Output directory ==="
119
+ ls -lhR data/output/
120
+ echo ""
121
+ echo "=== Checking for parquet files ==="
122
+ count=$(find data/output -name '*.parquet' | wc -l)
123
+ echo "Found $count parquet file(s) in data/output/"
124
+ if [ "$count" -eq 0 ]; then
125
+ echo "::error::No parquet output files found on offline run!"
126
+ exit 1
127
+ fi
128
+
129
+ - name: Preview first output row (offline run)
130
+ run: |
131
+ uv run python -c "
132
+ import duckdb
133
+ con = duckdb.connect()
134
+ con.sql(\"SELECT * FROM read_parquet('data/output/*.parquet')\").show(max_rows=1, max_width=10000)
135
+ "
136
+
137
+ - name: Verify downloads were not modified
138
+ run: |
139
+ stat -c '%n %Y' data/downloads/* | sort > /tmp/downloads_after.txt
140
+ echo "=== Download file timestamps after offline run ==="
141
+ cat /tmp/downloads_after.txt
142
+ if ! diff -q /tmp/downloads_before.txt /tmp/downloads_after.txt; then
143
+ echo "::error::Download file timestamps changed – files were unexpectedly modified!"
144
+ diff /tmp/downloads_before.txt /tmp/downloads_after.txt
145
+ exit 1
146
+ fi
147
+ echo "Download timestamps unchanged – existing archives were reused as expected."
@@ -12,6 +12,7 @@ permissions:
12
12
  jobs:
13
13
  publish:
14
14
  runs-on: ubuntu-latest
15
+ environment: pypi
15
16
 
16
17
  # Set up such that PyPI Trusted Publishing (OIDC) can work.
17
18
  permissions:
@@ -51,36 +52,53 @@ jobs:
51
52
 
52
53
  core.setOutput('release_sha', tagSha);
53
54
 
54
- - name: Find successful build artifact run
55
+ - name: Wait for successful CI build artifact
55
56
  id: find_build
56
57
  uses: actions/github-script@v7
57
58
  with:
58
59
  script: |
59
60
  const { owner, repo } = context.repo;
60
61
  const sha = '${{ steps.main_guard.outputs.release_sha }}';
61
-
62
- const runs = await github.rest.actions.listWorkflowRuns({
63
- owner,
64
- repo,
65
- workflow_id: 'ci.yml',
66
- head_sha: sha,
67
- event: 'push',
68
- status: 'completed',
69
- per_page: 50,
70
- });
71
-
72
- const run = runs.data.workflow_runs.find((r) => r.conclusion === 'success');
73
-
74
- if (!run) {
75
- core.setFailed(
76
- `No successful Build & package run found for commit ${sha}. ` +
77
- 'Wait for the main build to pass, then re-run this release workflow.'
78
- );
79
- return;
62
+ const maxAttempts = 30; // 30 × 20 s = 10 minutes
63
+ const delayMs = 20_000; // 20 seconds between polls
64
+
65
+ for (let attempt = 1; attempt <= maxAttempts; attempt++) {
66
+ const runs = await github.rest.actions.listWorkflowRuns({
67
+ owner,
68
+ repo,
69
+ workflow_id: 'ci.yml',
70
+ head_sha: sha,
71
+ event: 'push',
72
+ status: 'completed',
73
+ per_page: 50,
74
+ });
75
+
76
+ const success = runs.data.workflow_runs.find(r => r.conclusion === 'success');
77
+ if (success) {
78
+ core.info(`Found successful CI run ${success.id} (${success.html_url})`);
79
+ core.setOutput('run_id', String(success.id));
80
+ return;
81
+ }
82
+
83
+ const failed = runs.data.workflow_runs.find(r => r.conclusion === 'failure');
84
+ if (failed) {
85
+ core.setFailed(
86
+ `CI run ${failed.id} failed for commit ${sha}. ` +
87
+ 'Fix CI before releasing.'
88
+ );
89
+ return;
90
+ }
91
+
92
+ if (attempt < maxAttempts) {
93
+ core.info(`Attempt ${attempt}/${maxAttempts}: CI not finished yet — waiting ${delayMs / 1000}s …`);
94
+ await new Promise(r => setTimeout(r, delayMs));
95
+ }
80
96
  }
81
97
 
82
- core.info(`Using build run id ${run.id} from ${run.html_url}`);
83
- core.setOutput('run_id', String(run.id));
98
+ core.setFailed(
99
+ `No successful CI run found for commit ${sha} after ${maxAttempts} attempts (≈10 min). ` +
100
+ 'Check whether the CI workflow was triggered for this commit.'
101
+ );
84
102
 
85
103
  - name: Download built dist artifact
86
104
  uses: actions/download-artifact@v4
@@ -5,6 +5,7 @@ data/
5
5
  !tests/data/**
6
6
  scripts/os_docs.md
7
7
  .env
8
+ config.yaml
8
9
  # Byte-compiled / optimized / DLL files
9
10
  __pycache__/
10
11
  *.py[codz]
@@ -7,7 +7,7 @@ This project transforms NGD (National Geographic Database) data into a clean fla
7
7
  ## Repository Structure
8
8
 
9
9
  ```
10
- ├── config.yaml # Pipeline configuration
10
+ ├── config.example.yaml # Pipeline configuration template (copy to config.yaml)
11
11
  ├── script.py # Main entry point
12
12
  ├── pyproject.toml # Project metadata and dependencies
13
13
  ├── README.md # User documentation
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ukam-os-builder
3
- Version: 0.1.0.dev2
3
+ Version: 0.1.0.dev4
4
4
  Summary: Download, process and transform OS address data (NGD or ABP) for UK address matching
5
5
  Project-URL: Homepage, https://github.com/moj-analytical-services/prepare_ngd_for_address_matching
6
6
  Project-URL: Repository, https://github.com/moj-analytical-services/prepare_ngd_for_address_matching
@@ -140,7 +140,13 @@ result = inspect_flatfile_variants(config_path="config.yaml", top_offset=0, show
140
140
  <summary>Configure manually</summary>
141
141
 
142
142
  If you prefer not to use the setup wizard, edit `config.yaml` directly.
143
- Set `source.type`, `os_downloads.package_id`, and `os_downloads.version_id`, then adjust `paths` and `processing` as needed.
143
+ Set `source.type`, `os_downloads.package_id`, and `os_downloads.version_id`.
144
+
145
+ Most users only need one path setting:
146
+
147
+ - `paths.work_dir` (default `./data`, relative to the config file directory)
148
+
149
+ The tool derives all other directories automatically under `work_dir`.
144
150
 
145
151
  </details>
146
152
 
@@ -153,7 +159,7 @@ Set `source.type`, `os_downloads.package_id`, and `os_downloads.version_id`, the
153
159
 
154
160
  ### Command notes
155
161
 
156
- - `--list-only` is only valid with `--step download` or `--step all`.
162
+ - `step` only supports `download` and `all` to simplify usage. Use `--overwrite` to re-run a step with the same parameters.
157
163
  - CLI overrides take precedence over values in `config.yaml`.
158
164
  - By default, `ukam-os-build` loads `.env` from the same directory as your config, unless `--env-file` is supplied.
159
165
 
@@ -177,7 +183,7 @@ ukam-os-build --config config.yaml
177
183
 
178
184
  1. `download` - fetch package metadata and zip files from OS Data Hub.
179
185
  2. `extract` - extract CSVs from downloaded zip files and convert to parquet.
180
- 3. `split` - ABP only: split raw records into parquet staging files.
186
+ 3. `split` - ABP only: split raw records and write only parquet staging files used by flatfile generation (`street_descriptor`, `blpu`, `lpi`, `delivery_point`, `organisation`, `classification`).
181
187
  4. `flatfile` - transform and deduplicate into final output parquet file(s).
182
188
 
183
189
  All stages are idempotent. Use `--overwrite` to regenerate outputs (`--force` is accepted as a backward-compatible alias).
@@ -294,25 +300,6 @@ When the same UPRN and address combination appears in multiple sources, records
294
300
  4. Historic
295
301
  5. Demolished
296
302
 
297
- ## Manual Download
298
-
299
- If you prefer to download manually:
300
- - Sign in to https://osdatahub.os.uk/
301
- - Create a datapackage with NGD address features
302
- - Download the zip file
303
-
304
- To run the pipeline from a manual download:
305
-
306
- 1. Place the zip in the downloads directory configured in `config.yaml`
307
- - By default this is `data/downloads/`
308
- - The extract step looks for `*.zip` files in this folder
309
-
310
- 2. Run the pipeline starting from extract:
311
-
312
- ```bash
313
- ukam-os-build --config config.yaml --step extract
314
- ukam-os-build --config config.yaml --step flatfile
315
- ```
316
303
 
317
304
  ## OS Downloads API
318
305
 
@@ -348,10 +335,6 @@ source:
348
335
 
349
336
  paths:
350
337
  work_dir: ./data
351
- downloads_dir: ./data/downloads
352
- extracted_dir: ./data/extracted
353
- parquet_dir: ./data/parquet
354
- output_dir: ./data/output
355
338
 
356
339
  os_downloads:
357
340
  package_id: "<your_package_id>"
@@ -366,6 +349,34 @@ processing:
366
349
  # duckdb_memory_limit: "8GB"
367
350
  ```
368
351
 
352
+ By default, the tool creates these directories under `paths.work_dir`:
353
+
354
+ - downloads: `<work_dir>/downloads`
355
+ - extracted: `<work_dir>/extracted`
356
+ - parquet: `<work_dir>/parquet`
357
+ - output: `<work_dir>/output`
358
+
359
+ <details>
360
+ <summary>Advanced: override default directories</summary>
361
+
362
+ Most users won’t need this.
363
+
364
+ If you need to customize locations, use `paths.overrides`:
365
+
366
+ ```yaml
367
+ paths:
368
+ work_dir: ./data
369
+ overrides:
370
+ downloads_dir: ./somewhere/downloads
371
+ extracted_dir: /mnt/fast/extracted
372
+ parquet_dir: ./data/parquet
373
+ output_dir: ./output
374
+ ```
375
+
376
+ Override keys replace derived defaults. Relative paths are resolved relative to the directory containing `config.yaml`.
377
+
378
+ </details>
379
+
369
380
  ## Smoke test
370
381
 
371
382
  ```bash
@@ -114,7 +114,13 @@ result = inspect_flatfile_variants(config_path="config.yaml", top_offset=0, show
114
114
  <summary>Configure manually</summary>
115
115
 
116
116
  If you prefer not to use the setup wizard, edit `config.yaml` directly.
117
- Set `source.type`, `os_downloads.package_id`, and `os_downloads.version_id`, then adjust `paths` and `processing` as needed.
117
+ Set `source.type`, `os_downloads.package_id`, and `os_downloads.version_id`.
118
+
119
+ Most users only need one path setting:
120
+
121
+ - `paths.work_dir` (default `./data`, relative to the config file directory)
122
+
123
+ The tool derives all other directories automatically under `work_dir`.
118
124
 
119
125
  </details>
120
126
 
@@ -127,7 +133,7 @@ Set `source.type`, `os_downloads.package_id`, and `os_downloads.version_id`, the
127
133
 
128
134
  ### Command notes
129
135
 
130
- - `--list-only` is only valid with `--step download` or `--step all`.
136
+ - `step` only supports `download` and `all` to simplify usage. Use `--overwrite` to re-run a step with the same parameters.
131
137
  - CLI overrides take precedence over values in `config.yaml`.
132
138
  - By default, `ukam-os-build` loads `.env` from the same directory as your config, unless `--env-file` is supplied.
133
139
 
@@ -151,7 +157,7 @@ ukam-os-build --config config.yaml
151
157
 
152
158
  1. `download` - fetch package metadata and zip files from OS Data Hub.
153
159
  2. `extract` - extract CSVs from downloaded zip files and convert to parquet.
154
- 3. `split` - ABP only: split raw records into parquet staging files.
160
+ 3. `split` - ABP only: split raw records and write only parquet staging files used by flatfile generation (`street_descriptor`, `blpu`, `lpi`, `delivery_point`, `organisation`, `classification`).
155
161
  4. `flatfile` - transform and deduplicate into final output parquet file(s).
156
162
 
157
163
  All stages are idempotent. Use `--overwrite` to regenerate outputs (`--force` is accepted as a backward-compatible alias).
@@ -268,25 +274,6 @@ When the same UPRN and address combination appears in multiple sources, records
268
274
  4. Historic
269
275
  5. Demolished
270
276
 
271
- ## Manual Download
272
-
273
- If you prefer to download manually:
274
- - Sign in to https://osdatahub.os.uk/
275
- - Create a datapackage with NGD address features
276
- - Download the zip file
277
-
278
- To run the pipeline from a manual download:
279
-
280
- 1. Place the zip in the downloads directory configured in `config.yaml`
281
- - By default this is `data/downloads/`
282
- - The extract step looks for `*.zip` files in this folder
283
-
284
- 2. Run the pipeline starting from extract:
285
-
286
- ```bash
287
- ukam-os-build --config config.yaml --step extract
288
- ukam-os-build --config config.yaml --step flatfile
289
- ```
290
277
 
291
278
  ## OS Downloads API
292
279
 
@@ -322,10 +309,6 @@ source:
322
309
 
323
310
  paths:
324
311
  work_dir: ./data
325
- downloads_dir: ./data/downloads
326
- extracted_dir: ./data/extracted
327
- parquet_dir: ./data/parquet
328
- output_dir: ./data/output
329
312
 
330
313
  os_downloads:
331
314
  package_id: "<your_package_id>"
@@ -340,6 +323,34 @@ processing:
340
323
  # duckdb_memory_limit: "8GB"
341
324
  ```
342
325
 
326
+ By default, the tool creates these directories under `paths.work_dir`:
327
+
328
+ - downloads: `<work_dir>/downloads`
329
+ - extracted: `<work_dir>/extracted`
330
+ - parquet: `<work_dir>/parquet`
331
+ - output: `<work_dir>/output`
332
+
333
+ <details>
334
+ <summary>Advanced: override default directories</summary>
335
+
336
+ Most users won’t need this.
337
+
338
+ If you need to customize locations, use `paths.overrides`:
339
+
340
+ ```yaml
341
+ paths:
342
+ work_dir: ./data
343
+ overrides:
344
+ downloads_dir: ./somewhere/downloads
345
+ extracted_dir: /mnt/fast/extracted
346
+ parquet_dir: ./data/parquet
347
+ output_dir: ./output
348
+ ```
349
+
350
+ Override keys replace derived defaults. Relative paths are resolved relative to the directory containing `config.yaml`.
351
+
352
+ </details>
353
+
343
354
  ## Smoke test
344
355
 
345
356
  ```bash
@@ -5,15 +5,6 @@ paths:
5
5
  # Base working directory for all data
6
6
  work_dir: ./data
7
7
 
8
- # Downloaded zip files from OS
9
- downloads_dir: ./data/downloads
10
-
11
- # Extracted CSV files and intermediate parquet
12
- extracted_dir: ./data/extracted
13
-
14
- # Final output parquet files
15
- output_dir: ./data/output
16
-
17
8
  # OS Data Hub download settings
18
9
  # Given a datapackage at: https://osdatahub.os.uk/data/downloads/data-packages/16331
19
10
  # You can get versions from:
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "ukam-os-builder"
3
- version = "0.1.0.dev2"
3
+ version = "0.1.0.dev4"
4
4
  description = "Download, process and transform OS address data (NGD or ABP) for UK address matching"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.10"
@@ -1,7 +1,9 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import os
3
4
  from pathlib import Path
4
5
  from textwrap import dedent
6
+ from typing import Literal
5
7
 
6
8
  import pytest
7
9
 
@@ -39,6 +41,37 @@ def test_create_config_and_env_writes_expected_files(tmp_path: Path) -> None:
39
41
  assert "OS_PROJECT_API_SECRET=your_api_secret_here" in env_text
40
42
 
41
43
 
44
+ def test_create_config_and_env_writes_supplied_api_credentials(tmp_path: Path) -> None:
45
+ config_path = tmp_path / "config.yaml"
46
+ env_path = tmp_path / ".env"
47
+
48
+ create_config_and_env(
49
+ config_out=config_path,
50
+ env_out=env_path,
51
+ source="ngd",
52
+ package_id="16331",
53
+ version_id="104444",
54
+ api_key="my-key",
55
+ api_secret="my-secret",
56
+ )
57
+
58
+ env_text = env_path.read_text()
59
+ assert "OS_PROJECT_API_KEY=my-key" in env_text
60
+ assert "OS_PROJECT_API_SECRET=my-secret" in env_text
61
+
62
+
63
+ def test_create_config_and_env_rejects_partial_api_credentials(tmp_path: Path) -> None:
64
+ with pytest.raises(ValueError, match="must be provided together"):
65
+ create_config_and_env(
66
+ config_out=tmp_path / "config.yaml",
67
+ env_out=tmp_path / ".env",
68
+ source="ngd",
69
+ package_id="16331",
70
+ version_id="104444",
71
+ api_key="my-key",
72
+ )
73
+
74
+
42
75
  def test_run_from_config_applies_overrides(
43
76
  monkeypatch: pytest.MonkeyPatch,
44
77
  tmp_path: Path,
@@ -52,9 +85,6 @@ def test_run_from_config_applies_overrides(
52
85
  """
53
86
  paths:
54
87
  work_dir: ./data
55
- downloads_dir: ./data/downloads
56
- extracted_dir: ./data/extracted
57
- output_dir: ./data/output
58
88
 
59
89
  os_downloads:
60
90
  package_id: "16465"
@@ -70,7 +100,9 @@ def test_run_from_config_applies_overrides(
70
100
  def fake_check_api(_settings: object) -> None:
71
101
  calls["checked_api"] = True
72
102
 
73
- def fake_run_pipeline(step: str, settings: object, force: bool, list_only: bool) -> None:
103
+ def fake_run_pipeline(
104
+ step: Literal["all", "download"], settings: object, force: bool, list_only: bool
105
+ ) -> None:
74
106
  calls["step"] = step
75
107
  calls["force"] = force
76
108
  calls["list_only"] = list_only
@@ -94,6 +126,47 @@ def test_run_from_config_applies_overrides(
94
126
  assert calls["num_chunks"] == 5
95
127
 
96
128
 
129
+ def test_run_from_config_accepts_api_key_secret_overrides(
130
+ monkeypatch: pytest.MonkeyPatch,
131
+ tmp_path: Path,
132
+ ) -> None:
133
+ monkeypatch.delenv("OS_PROJECT_API_KEY", raising=False)
134
+ monkeypatch.delenv("OS_PROJECT_API_SECRET", raising=False)
135
+
136
+ config_path = tmp_path / "config.yaml"
137
+ _write_config(
138
+ config_path,
139
+ """
140
+ source:
141
+ type: ngd
142
+
143
+ os_downloads:
144
+ package_id: "16465"
145
+ version_id: "104444"
146
+ """,
147
+ )
148
+
149
+ monkeypatch.setattr("ukam_os_builder.api.api.get_package_version", lambda _settings: None)
150
+ monkeypatch.setattr("ukam_os_builder.api.api.run_pipeline", lambda **_kwargs: None)
151
+
152
+ run_from_config(
153
+ config_path=config_path,
154
+ api_key="runtime-key",
155
+ api_secret="runtime-secret",
156
+ )
157
+
158
+ assert os.environ["OS_PROJECT_API_KEY"] == "runtime-key"
159
+ assert os.environ["OS_PROJECT_API_SECRET"] == "runtime-secret"
160
+
161
+
162
+ def test_run_from_config_rejects_partial_api_credentials(tmp_path: Path) -> None:
163
+ with pytest.raises(ValueError, match="must be provided together"):
164
+ run_from_config(
165
+ config_path=tmp_path / "config.yaml",
166
+ api_key="runtime-key",
167
+ )
168
+
169
+
97
170
  def test_run_from_config_validates_list_only_step(tmp_path: Path) -> None:
98
171
  with pytest.raises(ValueError, match="--list-only can only be used"):
99
172
  run_from_config(config_path=tmp_path / "config.yaml", step="extract", list_only=True)
@@ -126,7 +199,9 @@ def test_run_from_config_uses_source_override_for_pipeline_validation(
126
199
 
127
200
  monkeypatch.setattr("ukam_os_builder.api.api.get_package_version", lambda _settings: None)
128
201
 
129
- def fake_run_pipeline(step: str, settings: object, force: bool, list_only: bool) -> None:
202
+ def fake_run_pipeline(
203
+ step: Literal["all", "download"], settings: object, force: bool, list_only: bool
204
+ ) -> None:
130
205
  calls["step"] = step
131
206
  calls["source"] = settings.source.type
132
207
  calls["force"] = force
@@ -195,10 +270,6 @@ def test_run_from_config_applies_schema_path_override(
195
270
 
196
271
  paths:
197
272
  work_dir: ./data
198
- downloads_dir: ./data/downloads
199
- extracted_dir: ./data/extracted
200
- output_dir: ./data/output
201
- parquet_dir: ./data/parquet
202
273
 
203
274
  os_downloads:
204
275
  package_id: "16465"
@@ -0,0 +1,32 @@
1
+ from __future__ import annotations
2
+
3
+ from ukam_os_builder import cli
4
+
5
+
6
+ def test_build_cli_passes_api_credentials_to_run_from_config(monkeypatch) -> None:
7
+ captured: dict[str, object] = {}
8
+
9
+ def fake_run_from_config(**kwargs):
10
+ captured.update(kwargs)
11
+ return None
12
+
13
+ monkeypatch.setattr(cli, "run_from_config", fake_run_from_config)
14
+ monkeypatch.setattr(cli, "_configure_logging", lambda _verbose: None)
15
+
16
+ exit_code = cli.main(
17
+ [
18
+ "--config",
19
+ "config.yaml",
20
+ "--step",
21
+ "download",
22
+ "--list-only",
23
+ "--api-key",
24
+ "runtime-key",
25
+ "--api-secret",
26
+ "runtime-secret",
27
+ ]
28
+ )
29
+
30
+ assert exit_code == 0
31
+ assert captured["api_key"] == "runtime-key"
32
+ assert captured["api_secret"] == "runtime-secret"