ukam-os-builder 0.1.0.dev3__tar.gz → 0.1.0.dev5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. ukam_os_builder-0.1.0.dev5/.github/workflows/e2e.yml +147 -0
  2. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/.github/workflows/release-pypi.yml +40 -22
  3. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/PKG-INFO +2 -2
  4. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/README.md +1 -1
  5. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/pyproject.toml +1 -1
  6. ukam_os_builder-0.1.0.dev5/tests/test_extract_source_filtering.py +49 -0
  7. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/tests/test_settings.py +7 -3
  8. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/tests/test_smoke.py +0 -1
  9. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/__init__.py +1 -1
  10. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/api/api.py +17 -7
  11. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/api/settings.py +9 -19
  12. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/cli.py +1 -3
  13. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/split_raw.py +28 -7
  14. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/transform/stages/combine.py +1 -1
  15. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/transform/stages/lpi.py +4 -6
  16. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/ngd/to_flatfile.py +75 -11
  17. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/os_builder/extract.py +15 -2
  18. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/os_builder/os_hub.py +98 -1
  19. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/uv.lock +1 -1
  20. ukam_os_builder-0.1.0.dev3/tests/test_extract_source_filtering.py +0 -27
  21. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/.env.example +0 -0
  22. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/.github/workflows/ci.yml +0 -0
  23. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/.gitignore +0 -0
  24. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/AGENTS.md +0 -0
  25. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/config.example.yaml +0 -0
  26. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/prompt.md +0 -0
  27. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/shell/test_release_locally.sh +0 -0
  28. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/tests/data/README.md +0 -0
  29. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/tests/data/add_gb_builtaddress.csv +0 -0
  30. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/tests/data/add_gb_builtaddress_altadd.csv +0 -0
  31. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/tests/data/add_gb_historicaddress.csv +0 -0
  32. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/tests/data/add_gb_prebuildaddress.csv +0 -0
  33. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/tests/data/add_gb_royalmailaddress.csv +0 -0
  34. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/tests/test_api.py +0 -0
  35. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/tests/test_cli.py +0 -0
  36. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/tests/test_cli_errors.py +0 -0
  37. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/tests/test_inspect_results.py +0 -0
  38. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/tests/test_public_api_integration.py +0 -0
  39. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/tests/test_setup_wizard.py +0 -0
  40. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/_exceptions.py +0 -0
  41. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/api/cli_errors.py +0 -0
  42. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/schemas/abp_schema.yaml +0 -0
  43. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/transform/__init__.py +0 -0
  44. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/transform/common.py +0 -0
  45. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/transform/runner.py +0 -0
  46. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/transform/stages/__init__.py +0 -0
  47. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/transform/stages/business.py +0 -0
  48. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/transform/stages/misc.py +0 -0
  49. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/data_sources/abp/transform/stages/postal.py +0 -0
  50. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/os_builder/__init__.py +0 -0
  51. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/os_builder/inspect_results.py +0 -0
  52. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/os_builder/pipeline_factory.py +0 -0
  53. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/pipeline.py +0 -0
  54. {ukam_os_builder-0.1.0.dev3 → ukam_os_builder-0.1.0.dev5}/ukam_os_builder/setup_wizard.py +0 -0
@@ -0,0 +1,147 @@
1
+ name: End-to-end tests
2
+
3
+ on:
4
+ pull_request:
5
+ branches: [main]
6
+
7
+ permissions:
8
+ contents: read
9
+
10
+ concurrency:
11
+ group: e2e-${{ github.head_ref || github.ref }}
12
+ cancel-in-progress: true
13
+
14
+ jobs:
15
+ e2e:
16
+ runs-on: ubuntu-latest
17
+ strategy:
18
+ fail-fast: false
19
+ matrix:
20
+ include:
21
+ - source: ngd
22
+ package_id: "18296"
23
+ version_id: "118120"
24
+ - source: abp
25
+ package_id: "0040206240"
26
+ version_id: "6777574"
27
+
28
+ name: E2E – ${{ matrix.source }}
29
+
30
+ steps:
31
+ - uses: actions/checkout@v5
32
+
33
+ - name: Mask API credentials
34
+ run: |
35
+ echo "::add-mask::${{ secrets.OS_PROJECT_API_KEY }}"
36
+ echo "::add-mask::${{ secrets.OS_PROJECT_API_SECRET }}"
37
+
38
+ - name: Set up Python
39
+ uses: actions/setup-python@v5
40
+ with:
41
+ python-version: "3.10"
42
+
43
+ - name: Set up uv
44
+ uses: astral-sh/setup-uv@v7
45
+ with:
46
+ enable-cache: true
47
+ cache-dependency-glob: "uv.lock"
48
+
49
+ - name: Install dependencies
50
+ run: uv sync --all-extras --all-groups
51
+
52
+ - name: Write config.yaml
53
+ run: |
54
+ printf '%s\n' \
55
+ 'paths:' \
56
+ ' work_dir: ./data' \
57
+ '' \
58
+ 'source:' \
59
+ ' type: ${{ matrix.source }}' \
60
+ '' \
61
+ 'os_downloads:' \
62
+ ' package_id: "${{ matrix.package_id }}"' \
63
+ ' version_id: "${{ matrix.version_id }}"' \
64
+ '' \
65
+ 'processing:' \
66
+ ' parquet_compression: zstd' \
67
+ ' parquet_compression_level: 9' \
68
+ ' num_chunks: 1' \
69
+ > config.yaml
70
+
71
+ - name: Run full pipeline
72
+ env:
73
+ OS_PROJECT_API_KEY: ${{ secrets.OS_PROJECT_API_KEY }}
74
+ OS_PROJECT_API_SECRET: ${{ secrets.OS_PROJECT_API_SECRET }}
75
+ run: uv run ukam-os-build --verbose
76
+
77
+ - name: Verify output files exist
78
+ run: |
79
+ echo "=== Output directory ==="
80
+ ls -lhR data/output/
81
+ echo ""
82
+ echo "=== Checking for parquet files ==="
83
+ count=$(find data/output -name '*.parquet' | wc -l)
84
+ echo "Found $count parquet file(s) in data/output/"
85
+ if [ "$count" -eq 0 ]; then
86
+ echo "::error::No parquet output files found!"
87
+ exit 1
88
+ fi
89
+
90
+ - name: Preview first output row
91
+ run: |
92
+ uv run python -c "
93
+ import duckdb
94
+ con = duckdb.connect()
95
+ con.sql(\"SELECT * FROM read_parquet('data/output/*.parquet')\").show(max_rows=1, max_width=10000)
96
+ "
97
+
98
+ # ── Second run: offline (no API credentials) ──────────────
99
+ - name: Record download file timestamps
100
+ run: |
101
+ stat -c '%n %Y' data/downloads/* | sort > /tmp/downloads_before.txt
102
+ echo "=== Download file timestamps ==="
103
+ cat /tmp/downloads_before.txt
104
+
105
+ - name: Remove everything except downloads and block API access
106
+ run: |
107
+ find data -mindepth 1 -maxdepth 1 ! -name downloads -exec rm -rf {} +
108
+ echo "=== Remaining data tree ==="
109
+ find data -type f | sort
110
+
111
+ - name: Re-run pipeline without API credentials
112
+ run: |
113
+ unset OS_PROJECT_API_KEY OS_PROJECT_API_SECRET
114
+ uv run ukam-os-build --verbose --overwrite
115
+
116
+ - name: Verify output files exist (offline run)
117
+ run: |
118
+ echo "=== Output directory ==="
119
+ ls -lhR data/output/
120
+ echo ""
121
+ echo "=== Checking for parquet files ==="
122
+ count=$(find data/output -name '*.parquet' | wc -l)
123
+ echo "Found $count parquet file(s) in data/output/"
124
+ if [ "$count" -eq 0 ]; then
125
+ echo "::error::No parquet output files found on offline run!"
126
+ exit 1
127
+ fi
128
+
129
+ - name: Preview first output row (offline run)
130
+ run: |
131
+ uv run python -c "
132
+ import duckdb
133
+ con = duckdb.connect()
134
+ con.sql(\"SELECT * FROM read_parquet('data/output/*.parquet')\").show(max_rows=1, max_width=10000)
135
+ "
136
+
137
+ - name: Verify downloads were not modified
138
+ run: |
139
+ stat -c '%n %Y' data/downloads/* | sort > /tmp/downloads_after.txt
140
+ echo "=== Download file timestamps after offline run ==="
141
+ cat /tmp/downloads_after.txt
142
+ if ! diff -q /tmp/downloads_before.txt /tmp/downloads_after.txt; then
143
+ echo "::error::Download file timestamps changed – files were unexpectedly modified!"
144
+ diff /tmp/downloads_before.txt /tmp/downloads_after.txt
145
+ exit 1
146
+ fi
147
+ echo "Download timestamps unchanged – existing archives were reused as expected."
@@ -12,6 +12,7 @@ permissions:
12
12
  jobs:
13
13
  publish:
14
14
  runs-on: ubuntu-latest
15
+ environment: pypi
15
16
 
16
17
  # Set up such that PyPI Trusted Publishing (OIDC) can work.
17
18
  permissions:
@@ -51,36 +52,53 @@ jobs:
51
52
 
52
53
  core.setOutput('release_sha', tagSha);
53
54
 
54
- - name: Find successful build artifact run
55
+ - name: Wait for successful CI build artifact
55
56
  id: find_build
56
57
  uses: actions/github-script@v7
57
58
  with:
58
59
  script: |
59
60
  const { owner, repo } = context.repo;
60
61
  const sha = '${{ steps.main_guard.outputs.release_sha }}';
61
-
62
- const runs = await github.rest.actions.listWorkflowRuns({
63
- owner,
64
- repo,
65
- workflow_id: 'ci.yml',
66
- head_sha: sha,
67
- event: 'push',
68
- status: 'completed',
69
- per_page: 50,
70
- });
71
-
72
- const run = runs.data.workflow_runs.find((r) => r.conclusion === 'success');
73
-
74
- if (!run) {
75
- core.setFailed(
76
- `No successful Build & package run found for commit ${sha}. ` +
77
- 'Wait for the main build to pass, then re-run this release workflow.'
78
- );
79
- return;
62
+ const maxAttempts = 30; // 30 × 20 s = 10 minutes
63
+ const delayMs = 20_000; // 20 seconds between polls
64
+
65
+ for (let attempt = 1; attempt <= maxAttempts; attempt++) {
66
+ const runs = await github.rest.actions.listWorkflowRuns({
67
+ owner,
68
+ repo,
69
+ workflow_id: 'ci.yml',
70
+ head_sha: sha,
71
+ event: 'push',
72
+ status: 'completed',
73
+ per_page: 50,
74
+ });
75
+
76
+ const success = runs.data.workflow_runs.find(r => r.conclusion === 'success');
77
+ if (success) {
78
+ core.info(`Found successful CI run ${success.id} (${success.html_url})`);
79
+ core.setOutput('run_id', String(success.id));
80
+ return;
81
+ }
82
+
83
+ const failed = runs.data.workflow_runs.find(r => r.conclusion === 'failure');
84
+ if (failed) {
85
+ core.setFailed(
86
+ `CI run ${failed.id} failed for commit ${sha}. ` +
87
+ 'Fix CI before releasing.'
88
+ );
89
+ return;
90
+ }
91
+
92
+ if (attempt < maxAttempts) {
93
+ core.info(`Attempt ${attempt}/${maxAttempts}: CI not finished yet — waiting ${delayMs / 1000}s …`);
94
+ await new Promise(r => setTimeout(r, delayMs));
95
+ }
80
96
  }
81
97
 
82
- core.info(`Using build run id ${run.id} from ${run.html_url}`);
83
- core.setOutput('run_id', String(run.id));
98
+ core.setFailed(
99
+ `No successful CI run found for commit ${sha} after ${maxAttempts} attempts (≈10 min). ` +
100
+ 'Check whether the CI workflow was triggered for this commit.'
101
+ );
84
102
 
85
103
  - name: Download built dist artifact
86
104
  uses: actions/download-artifact@v4
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ukam-os-builder
3
- Version: 0.1.0.dev3
3
+ Version: 0.1.0.dev5
4
4
  Summary: Download, process and transform OS address data (NGD or ABP) for UK address matching
5
5
  Project-URL: Homepage, https://github.com/moj-analytical-services/prepare_ngd_for_address_matching
6
6
  Project-URL: Repository, https://github.com/moj-analytical-services/prepare_ngd_for_address_matching
@@ -183,7 +183,7 @@ ukam-os-build --config config.yaml
183
183
 
184
184
  1. `download` - fetch package metadata and zip files from OS Data Hub.
185
185
  2. `extract` - extract CSVs from downloaded zip files and convert to parquet.
186
- 3. `split` - ABP only: split raw records into parquet staging files.
186
+ 3. `split` - ABP only: split raw records and write only parquet staging files used by flatfile generation (`street_descriptor`, `blpu`, `lpi`, `delivery_point`, `organisation`, `classification`).
187
187
  4. `flatfile` - transform and deduplicate into final output parquet file(s).
188
188
 
189
189
  All stages are idempotent. Use `--overwrite` to regenerate outputs (`--force` is accepted as a backward-compatible alias).
@@ -157,7 +157,7 @@ ukam-os-build --config config.yaml
157
157
 
158
158
  1. `download` - fetch package metadata and zip files from OS Data Hub.
159
159
  2. `extract` - extract CSVs from downloaded zip files and convert to parquet.
160
- 3. `split` - ABP only: split raw records into parquet staging files.
160
+ 3. `split` - ABP only: split raw records and write only parquet staging files used by flatfile generation (`street_descriptor`, `blpu`, `lpi`, `delivery_point`, `organisation`, `classification`).
161
161
  4. `flatfile` - transform and deduplicate into final output parquet file(s).
162
162
 
163
163
  All stages are idempotent. Use `--overwrite` to regenerate outputs (`--force` is accepted as a backward-compatible alias).
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "ukam-os-builder"
3
- version = "0.1.0.dev3"
3
+ version = "0.1.0.dev5"
4
4
  description = "Download, process and transform OS address data (NGD or ABP) for UK address matching"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.10"
@@ -0,0 +1,49 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+
5
+ from ukam_os_builder.os_builder.extract import (
6
+ _filter_zips_for_source,
7
+ _should_convert_csv_to_parquet,
8
+ )
9
+
10
+
11
+ def test_filter_zips_for_source_prefers_ngd_named_zips() -> None:
12
+ zip_files = [
13
+ Path("add_gb_builtaddress.zip"),
14
+ Path("AddressBasePremium_FULL_2025-12-15_002.zip"),
15
+ ]
16
+
17
+ filtered = _filter_zips_for_source(zip_files, "ngd")
18
+
19
+ assert filtered == [Path("add_gb_builtaddress.zip")]
20
+
21
+
22
+ def test_should_convert_csv_to_parquet_skips_non_ngd_for_ngd_source() -> None:
23
+ ngd_csv = Path("add_gb_builtaddress.csv")
24
+ abp_csv = Path("AddressBasePremium_FULL_2025-12-15_002.csv")
25
+
26
+ assert _should_convert_csv_to_parquet(ngd_csv, "ngd") is True
27
+ assert _should_convert_csv_to_parquet(abp_csv, "ngd") is False
28
+
29
+
30
+ def test_filter_zips_for_source_excludes_ngd_historicaddress() -> None:
31
+ zip_files = [
32
+ Path("add_gb_builtaddress.zip"),
33
+ Path("add_gb_historicaddress.zip"),
34
+ Path("add_gb_historicaddress_altadd.zip"),
35
+ Path("add_gb_prebuildaddress.zip"),
36
+ ]
37
+
38
+ filtered = _filter_zips_for_source(zip_files, "ngd")
39
+
40
+ assert Path("add_gb_builtaddress.zip") in filtered
41
+ assert Path("add_gb_prebuildaddress.zip") in filtered
42
+ assert Path("add_gb_historicaddress.zip") not in filtered
43
+ assert Path("add_gb_historicaddress_altadd.zip") not in filtered
44
+
45
+
46
+ def test_should_convert_csv_to_parquet_skips_ngd_historicaddress() -> None:
47
+ assert _should_convert_csv_to_parquet(Path("add_gb_builtaddress.csv"), "ngd") is True
48
+ assert _should_convert_csv_to_parquet(Path("add_gb_historicaddress.csv"), "ngd") is False
49
+ assert _should_convert_csv_to_parquet(Path("add_gb_historicaddress_altadd.csv"), "ngd") is False
@@ -122,7 +122,9 @@ def test_load_settings_uses_work_dir_for_default_subpaths(
122
122
  assert settings.paths.output_dir == (tmp_path / "custom_data/output").resolve()
123
123
 
124
124
 
125
- def test_load_settings_requires_env_vars(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
125
+ def test_load_settings_allows_missing_env_vars(
126
+ tmp_path: Path, monkeypatch: pytest.MonkeyPatch
127
+ ) -> None:
126
128
  monkeypatch.delenv("OS_PROJECT_API_KEY", raising=False)
127
129
  monkeypatch.delenv("OS_PROJECT_API_SECRET", raising=False)
128
130
 
@@ -139,8 +141,10 @@ def test_load_settings_requires_env_vars(tmp_path: Path, monkeypatch: pytest.Mon
139
141
  """,
140
142
  )
141
143
 
142
- with pytest.raises(SettingsError, match="OS_PROJECT_API_KEY"):
143
- load_settings(config_path, load_env=False)
144
+ settings = load_settings(config_path, load_env=False)
145
+
146
+ assert settings.os_downloads.api_key is None
147
+ assert settings.os_downloads.api_secret is None
144
148
 
145
149
 
146
150
  def test_load_settings_validates_positive_read_timeout(
@@ -121,7 +121,6 @@ def _prepare_test_parquet(settings: Settings) -> None:
121
121
  "add_gb_builtaddress_altadd.csv",
122
122
  "add_gb_royalmailaddress.csv",
123
123
  "add_gb_prebuildaddress.csv",
124
- "add_gb_historicaddress.csv",
125
124
  ]
126
125
 
127
126
  for csv_name in sample_files:
@@ -8,7 +8,7 @@ from ukam_os_builder.os_builder.inspect_results import (
8
8
  inspect_flatfile_variants,
9
9
  )
10
10
 
11
- __version__ = "0.1.0.dev3"
11
+ __version__ = "0.1.0.dev5"
12
12
 
13
13
  __all__ = [
14
14
  "create_config_and_env",
@@ -8,7 +8,7 @@ from typing import Any, Literal
8
8
  import yaml
9
9
 
10
10
  from ukam_os_builder.api.settings import Settings, SettingsError, load_settings
11
- from ukam_os_builder.os_builder.os_hub import get_package_version
11
+ from ukam_os_builder.os_builder.os_hub import _get_manifest_path, get_package_version
12
12
  from ukam_os_builder.pipeline import run as run_pipeline
13
13
  from ukam_os_builder.pipeline import supported_steps_for_source
14
14
 
@@ -333,11 +333,6 @@ def run_from_config(
333
333
  parquet_compression_level=parquet_compression_level,
334
334
  )
335
335
  logger.info("Resolved work_dir: %s", settings.paths.work_dir)
336
- logger.info("Resolved downloads_dir: %s", settings.paths.downloads_dir)
337
- logger.info("Resolved extracted_dir: %s", settings.paths.extracted_dir)
338
- logger.info("Resolved parquet_dir: %s", settings.paths.parquet_dir)
339
- logger.info("Resolved output_dir: %s", settings.paths.output_dir)
340
-
341
336
  source_type = settings.source.type
342
337
  if step != "all":
343
338
  supported_steps = supported_steps_for_source(source_type)
@@ -347,9 +342,24 @@ def run_from_config(
347
342
  f"--step {step} is not valid for source {source_type}. Valid steps: {valid_steps}"
348
343
  )
349
344
 
350
- if check_api:
345
+ has_api_key = bool(os.environ.get("OS_PROJECT_API_KEY"))
346
+ if check_api and has_api_key:
351
347
  get_package_version(settings)
352
348
 
353
349
  overwrite_effective = overwrite if overwrite is not None else bool(force)
354
350
  run_pipeline(step=step, settings=settings, force=overwrite_effective, list_only=list_only)
351
+
352
+ logger.info(
353
+ "✅ Pipeline run completed\n\n"
354
+ "Where you need to look:\n"
355
+ " • downloads_dir (raw OS Hub extracts): %s%s\n"
356
+ " • output_dir (final files for address matcher): %s%s\n",
357
+ str(settings.paths.downloads_dir),
358
+ "",
359
+ str(settings.paths.output_dir),
360
+ "",
361
+ )
362
+
363
+ _get_manifest_path(settings)
364
+
355
365
  return settings
@@ -41,8 +41,8 @@ class OSDownloadSettings(StrictBaseModel):
41
41
 
42
42
  package_id: str
43
43
  version_id: str
44
- api_key: SecretStr
45
- api_secret: SecretStr
44
+ api_key: SecretStr | None = None
45
+ api_secret: SecretStr | None = None
46
46
  connect_timeout_seconds: int = 30
47
47
  read_timeout_seconds: int = 300
48
48
 
@@ -57,6 +57,8 @@ class OSDownloadSettings(StrictBaseModel):
57
57
  @field_validator("api_key", "api_secret", mode="before")
58
58
  @classmethod
59
59
  def _validate_secret(cls, value: Any) -> Any:
60
+ if value is None:
61
+ return value
60
62
  if isinstance(value, str) and not value.strip():
61
63
  raise ValueError("must be non-empty")
62
64
  return value
@@ -182,22 +184,11 @@ def _load_yaml(config_path: Path) -> dict[str, Any]:
182
184
  return config
183
185
 
184
186
 
185
- def _validate_env_vars() -> tuple[str, str]:
186
- """Validate required environment variables exist."""
187
+ def _load_env_vars() -> tuple[str | None, str | None]:
188
+ """Load API credentials from environment variables if available."""
187
189
  api_key = os.environ.get("OS_PROJECT_API_KEY")
188
190
  api_secret = os.environ.get("OS_PROJECT_API_SECRET")
189
191
 
190
- if not api_key:
191
- raise SettingsError(
192
- "OS_PROJECT_API_KEY not found in environment. "
193
- "Create a .env file with OS_PROJECT_API_KEY=<your-key>"
194
- )
195
- if not api_secret:
196
- raise SettingsError(
197
- "OS_PROJECT_API_SECRET not found in environment. "
198
- "Create a .env file with OS_PROJECT_API_SECRET=<your-secret>"
199
- )
200
-
201
192
  return api_key, api_secret
202
193
 
203
194
 
@@ -216,8 +207,7 @@ def load_settings(
216
207
  Complete Settings object with resolved paths.
217
208
 
218
209
  Raises:
219
- SettingsError: If config file is missing or invalid,
220
- or if required environment variables are not set.
210
+ SettingsError: If config file is missing or invalid.
221
211
  """
222
212
  config_path = Path(config_path).resolve()
223
213
  base_dir = config_path.parent
@@ -232,8 +222,8 @@ def load_settings(
232
222
  # Load YAML config
233
223
  config = _load_yaml(config_path)
234
224
 
235
- # Validate environment variables
236
- api_key, api_secret = _validate_env_vars()
225
+ # Load environment variables (optional)
226
+ api_key, api_secret = _load_env_vars()
237
227
 
238
228
  resolved_paths = resolve_paths(config=config, config_dir=base_dir)
239
229
 
@@ -122,7 +122,7 @@ def main(argv: list[str] | None = None) -> int:
122
122
  config_path = Path(args.config).resolve()
123
123
  console.print(f"[green]✓[/green] Loaded config: [bold]{config_path}[/bold]")
124
124
  console.print(f"[cyan]Step:[/cyan] {args.step}")
125
- console.print("[cyan]Checking OS API credentials and connectivity...[/cyan]")
125
+ console.print("[cyan]Starting pipeline...[/cyan]")
126
126
 
127
127
  run_from_config(
128
128
  config_path=config_path,
@@ -145,8 +145,6 @@ def main(argv: list[str] | None = None) -> int:
145
145
  parquet_compression=args.parquet_compression,
146
146
  parquet_compression_level=args.parquet_compression_level,
147
147
  )
148
- logger.info("Pipeline run completed")
149
- console.print("[green]✓[/green] API connectivity check passed")
150
148
  console.print("[bold green]Build completed successfully[/bold green]")
151
149
  return 0
152
150
  except (SettingsError, ValueError) as exc:
@@ -1,8 +1,8 @@
1
1
  """Split raw ABP data module.
2
2
 
3
3
  Reads raw ABP CSV files (which contain all record types mixed together),
4
- splits them by record identifier (10/11/15/21/24/28/31/32/99 etc.),
5
- and writes one parquet file per record type.
4
+ filters to the record types needed for flatfile creation, and writes
5
+ one parquet file per required record type.
6
6
  """
7
7
 
8
8
  from __future__ import annotations
@@ -19,8 +19,8 @@ from ukam_os_builder.api.settings import Settings, create_duckdb_connection
19
19
 
20
20
  logger = logging.getLogger(__name__)
21
21
 
22
- # Record identifier to table name mapping
23
- RECORD_TYPE_MAP = {
22
+ # All known ABP record identifiers
23
+ ALL_RECORD_TYPE_MAP = {
24
24
  "10": "header",
25
25
  "11": "street",
26
26
  "15": "street_descriptor",
@@ -35,6 +35,16 @@ RECORD_TYPE_MAP = {
35
35
  "99": "trailer",
36
36
  }
37
37
 
38
+ # Record identifiers needed for ABP flatfile creation
39
+ RECORD_TYPE_MAP = {
40
+ "15": "street_descriptor",
41
+ "21": "blpu",
42
+ "24": "lpi",
43
+ "28": "delivery_point",
44
+ "31": "organisation",
45
+ "32": "classification",
46
+ }
47
+
38
48
  DEFAULT_SCHEMA_PATH = Path(__file__).resolve().parent / "schemas" / "abp_schema.yaml"
39
49
 
40
50
 
@@ -169,12 +179,23 @@ def split_raw_to_parquet(
169
179
  input_counts[name] = count
170
180
  logger.debug("Record type %s (%s): %d lines", rid, name, count)
171
181
 
182
+ unused_rids = sorted(set(ALL_RECORD_TYPE_MAP) - set(RECORD_TYPE_MAP))
183
+ rid_list_sql = ", ".join([f"'{rid}'" for rid in unused_rids])
184
+ ignored_input = con.execute(f"""
185
+ SELECT COUNT(*)
186
+ FROM lines_with_rid
187
+ WHERE rid IN ({rid_list_sql})
188
+ """).fetchone()[0]
189
+
172
190
  total_input = sum(input_counts.values())
173
- logger.info("Total input lines (with valid record IDs): %d", total_input)
191
+ logger.info("Total input lines (processed record IDs): %d", total_input)
192
+ if ignored_input > 0:
193
+ logger.info("Ignored input lines (unused record IDs): %d", ignored_input)
174
194
  if total_input == 0:
175
195
  raise ValueError(
176
196
  "No ABP record identifiers found in extracted CSV input. "
177
- "Ensure --source abp is used with ABP raw extracts (record IDs 10/11/15/21/24/28/31/32/99)."
197
+ "Ensure --source abp is used with ABP raw extracts "
198
+ "(required record IDs: 15/21/24/28/31/32)."
178
199
  )
179
200
 
180
201
  # 4) Process each record type
@@ -279,7 +300,7 @@ def split_raw_to_parquet(
279
300
  total_output = sum(output_counts.values())
280
301
  logger.info("")
281
302
  logger.info("=== Validation: Line count check ===")
282
- logger.info("Input lines (with valid record IDs): %d", total_input)
303
+ logger.info("Input lines (processed record IDs): %d", total_input)
283
304
  logger.info("Output rows (parquet): %d", total_output)
284
305
 
285
306
  if total_input == total_output:
@@ -33,7 +33,7 @@ def combine_and_dedupe(con: duckdb.DuckDBPyConnection) -> duckdb.DuckDBPyRelatio
33
33
  ),
34
34
  ranked AS (
35
35
  SELECT *,
36
- CASE logical_status WHEN 1 THEN 0 WHEN 3 THEN 1 WHEN 6 THEN 2 WHEN 8 THEN 3 ELSE 9 END AS status_rank,
36
+ CASE logical_status WHEN 1 THEN 0 WHEN 3 THEN 1 WHEN 6 THEN 2 ELSE 9 END AS status_rank,
37
37
  CASE source WHEN 'LPI' THEN 0 WHEN 'ORGANISATION' THEN 1 WHEN 'DELIVERY_POINT' THEN 2 WHEN 'CUSTOM_LEVEL' THEN 3 ELSE 4 END AS source_rank
38
38
  FROM normalized
39
39
  ),
@@ -70,15 +70,15 @@ matching messy user input. We output variants based on **Logical Status**:
70
70
  locally known as "Rose Cottage").
71
71
  3. **Provisional (6):** The address assigned during planning/construction, which
72
72
  might change before the house is built.
73
- 4. **Historic (8):** An old address. If "10 High St" is renumbered to "12 High St",
74
- the old address is kept as Historic. This helps match old datasets.
73
+
74
+ Historic addresses (logical_status=8) are excluded from output.
75
75
 
76
76
  ------------------------------------------------------------------------------
77
77
  Key Columns Explained
78
78
  ------------------------------------------------------------------------------
79
79
  * `uprn`: The "Golden Key". Use this to link this address to other data.
80
80
  * `base_address`: The constructed full address string.
81
- * `logical_status`: 1=Current, 6=Provisional, 8=Historic.
81
+ * `logical_status`: 1=Current, 3=Alternative, 6=Provisional.
82
82
  * `official_flag`: 'Y' indicates this is the "official" version, 'N' suggests
83
83
  it might be an unofficial alias.
84
84
  * `language`: 'ENG' (English) or 'CYM' (Welsh). Streets in Wales often have
@@ -183,7 +183,6 @@ def prepare_lpi_base(con: duckdb.DuckDBPyConnection) -> None:
183
183
  WHEN 1 THEN 0
184
184
  WHEN 3 THEN 1
185
185
  WHEN 6 THEN 2
186
- WHEN 8 THEN 3
187
186
  ELSE 9
188
187
  END AS status_rank
189
188
  FROM lpi l
@@ -192,7 +191,7 @@ def prepare_lpi_base(con: duckdb.DuckDBPyConnection) -> None:
192
191
  LEFT JOIN _sd_best_by_lang sd_lang ON sd_lang.usrn = l.usrn AND sd_lang.language = l.language
193
192
  LEFT JOIN _sd_best_any sd_any ON sd_any.usrn = l.usrn
194
193
  WHERE (b.addressbase_postal != 'N' OR b.addressbase_postal IS NULL)
195
- AND l.logical_status IN (1, 3, 6, 8)
194
+ AND l.logical_status IN (1, 3, 6)
196
195
  """)
197
196
 
198
197
  # Deduplicated distinct addresses
@@ -266,7 +265,6 @@ def render_variants(con: duckdb.DuckDBPyConnection) -> None:
266
265
  WHEN 1 THEN 'APPROVED'
267
266
  WHEN 3 THEN 'ALTERNATIVE'
268
267
  WHEN 6 THEN 'PROVISIONAL'
269
- WHEN 8 THEN 'HISTORICAL'
270
268
  END AS variant_label,
271
269
  (logical_status = 1) AS is_primary
272
270
  FROM lpi_base_distinct
@@ -2,7 +2,7 @@
2
2
 
3
3
  Transforms the extracted parquet files into a single flatfile suitable for
4
4
  UK address matching. This includes:
5
- - Processing core feature types (Built Address, Historic Address, etc.)
5
+ - Processing core feature types (Built Address, Pre-Build Address, etc.)
6
6
  - Processing alternate address records
7
7
  - Processing Royal Mail addresses
8
8
  - Handling Welsh language variants
@@ -27,8 +27,6 @@ logger = logging.getLogger(__name__)
27
27
  FEATURE_TYPE_BY_STEM = {
28
28
  "add_gb_builtaddress": "Built Address",
29
29
  "add_gb_builtaddress_altadd": "Built Address",
30
- "add_gb_historicaddress": "Historic Address",
31
- "add_gb_historicaddress_altadd": "Historic Address",
32
30
  "add_gb_nonaddressableobject": "Non-Addressable Object",
33
31
  "add_gb_nonaddressableobject_altadd": "Non-Addressable Object",
34
32
  "add_gb_prebuildaddress": "Pre-Build Address",
@@ -39,7 +37,6 @@ FEATURE_TYPE_BY_STEM = {
39
37
  # Core feature stems (contain fulladdress and classification fields)
40
38
  CORE_FEATURE_STEMS = {
41
39
  "add_gb_builtaddress",
42
- "add_gb_historicaddress",
43
40
  "add_gb_nonaddressableobject",
44
41
  "add_gb_prebuildaddress",
45
42
  }
@@ -47,7 +44,6 @@ CORE_FEATURE_STEMS = {
47
44
  # Alternate address stems (no classification fields)
48
45
  ALTADD_STEMS = {
49
46
  "add_gb_builtaddress_altadd",
50
- "add_gb_historicaddress_altadd",
51
47
  "add_gb_nonaddressableobject_altadd",
52
48
  "add_gb_prebuildaddress_altadd",
53
49
  }
@@ -57,7 +53,6 @@ CORE_FEATURE_PRIORITY = {
57
53
  "add_gb_builtaddress": 1,
58
54
  "add_gb_prebuildaddress": 2,
59
55
  "add_gb_nonaddressableobject": 3,
60
- "add_gb_historicaddress": 4,
61
56
  }
62
57
 
63
58
 
@@ -71,7 +66,7 @@ def _create_metadata_lookup_view(
71
66
  This view is used to enrich Royal Mail and alternate address records
72
67
  with metadata (classificationcode, parentuprn, etc.) by UPRN lookup.
73
68
 
74
- Uses priority ranking (Built > Pre-Build > Non-Addressable > Historic)
69
+ Uses priority ranking (Built > Pre-Build > Non-Addressable)
75
70
  to dedupe when a UPRN exists in multiple core files.
76
71
 
77
72
  Args:
@@ -156,7 +151,7 @@ def _create_core_feature_view(
156
151
  parquet_path: Path,
157
152
  uprn_predicate: str | None = None,
158
153
  ) -> None:
159
- """Create view for core feature types (Built, Historic, Pre-Build, Non-Addressable).
154
+ """Create view for core feature types (Built, Pre-Build, Non-Addressable).
160
155
 
161
156
  These tables have fulladdress, classification fields, and Welsh language columns.
162
157
  Produces both English and Welsh (where available) address records.
@@ -413,11 +408,76 @@ def _enrich_with_metadata(con: duckdb.DuckDBPyConnection) -> None:
413
408
  con.execute(sql)
414
409
 
415
410
 
411
def _create_custom_level_rows(con: duckdb.DuckDBPyConnection) -> None:
    """Generate custom level-based address variants and insert into enriched table.

    Reads ``all_full_addresses_enriched``, takes the first comma-separated
    value of the ``floorlevel`` column (VARCHAR), maps integer floor levels to
    words (-1=BASEMENT ... 6=SIXTH) and prepends the word to the existing
    ``address_concat`` to create additional address variants.

    Generated rows carry ``filename='CUSTOM_LEVEL'`` and
    ``feature_type='Custom Level'``; every other column is copied from the
    source row.

    Source rows are skipped when:

    * ``floorlevel`` is NULL, or ``address_concat`` is NULL or empty;
    * the first ``floorlevel`` value is not an integer in the range -1..6;
    * ``feature_type`` is 'Non-Addressable Object' - a variant derived from
      such a row would be relabelled 'Custom Level' and so would no longer be
      recognisable by a ``feature_type`` filter on non-addressable objects;
    * ``feature_type`` is already 'Custom Level' - the function reads from the
      table it writes to, so this stops a repeated call from stacking
      prefixes ("FIRST FIRST ...").

    Args:
        con: Open DuckDB connection on which ``all_full_addresses_enriched``
            already exists.
    """
    # The target columns are listed explicitly so the insert does not depend
    # on the physical column order of all_full_addresses_enriched.
    sql = """
    INSERT INTO all_full_addresses_enriched (
        uprn, address_concat, postcode, filename,
        classificationcode, parentuprn, rootuprn,
        hierarchylevel, floorlevel, lowestfloorlevel, highestfloorlevel,
        feature_type, address_status, build_status
    )
    WITH level_parsed AS (
        SELECT
            uprn, address_concat, postcode,
            classificationcode, parentuprn, rootuprn,
            hierarchylevel, floorlevel, lowestfloorlevel, highestfloorlevel,
            address_status, build_status,
            CASE
                WHEN split_part(floorlevel, ',', 1) ~ '^-?[0-9]+$'
                THEN CAST(split_part(floorlevel, ',', 1) AS INTEGER)
                ELSE NULL
            END AS level_int
        FROM all_full_addresses_enriched
        WHERE floorlevel IS NOT NULL
          AND address_concat IS NOT NULL
          AND address_concat <> ''
          AND COALESCE(feature_type, '') NOT IN ('Non-Addressable Object', 'Custom Level')
    ),
    level_words AS (
        SELECT
            *,
            CASE level_int
                WHEN -1 THEN 'BASEMENT'
                WHEN 0 THEN 'GROUND'
                WHEN 1 THEN 'FIRST'
                WHEN 2 THEN 'SECOND'
                WHEN 3 THEN 'THIRD'
                WHEN 4 THEN 'FOURTH'
                WHEN 5 THEN 'FIFTH'
                WHEN 6 THEN 'SIXTH'
            END AS level_word
        FROM level_parsed
        WHERE level_int BETWEEN -1 AND 6
    )
    SELECT
        uprn,
        TRIM(concat(level_word, ' ', address_concat)) AS address_concat,
        postcode,
        'CUSTOM_LEVEL' AS filename,
        classificationcode,
        parentuprn,
        rootuprn,
        hierarchylevel,
        floorlevel,
        lowestfloorlevel,
        highestfloorlevel,
        'Custom Level' AS feature_type,
        address_status,
        build_status
    FROM level_words
    WHERE level_word IS NOT NULL;
    """
    con.execute(sql)
474
+
475
+
416
476
  def _create_dedup_view(con: duckdb.DuckDBPyConnection) -> None:
417
477
  """Create deduplicated view of all addresses.
418
478
 
419
479
  Priority rules for deduplication:
420
- - Feature type: Built Address -> Pre-Build -> Royal Mail -> Historic -> Non-Addressable
480
+ - Feature type: Built Address -> Pre-Build -> Royal Mail -> Non-Addressable -> Custom Level
421
481
  - Address status: Approved -> Provisional -> Alternative -> Historical
422
482
  - Build status: Built Complete -> Under Construction -> Prebuild -> Historic -> Demolished
423
483
 
@@ -433,8 +493,8 @@ def _create_dedup_view(con: duckdb.DuckDBPyConnection) -> None:
433
493
  WHEN 'Built Address' THEN 1
434
494
  WHEN 'Pre-Build Address' THEN 2
435
495
  WHEN 'Royal Mail Address' THEN 3
436
- WHEN 'Historic Address' THEN 4
437
496
  WHEN 'Non-Addressable Object' THEN 5
497
+ WHEN 'Custom Level' THEN 6
438
498
  ELSE 9
439
499
  END AS feature_type_rank,
440
500
  CASE
@@ -460,7 +520,7 @@ def _create_dedup_view(con: duckdb.DuckDBPyConnection) -> None:
460
520
  build_status_rank
461
521
  ) AS rn
462
522
  FROM all_full_addresses_enriched
463
- WHERE feature_type != 'Non-Addressable Object'
523
+ WHERE feature_type NOT IN ('Non-Addressable Object')
464
524
  )
465
525
  SELECT
466
526
  uprn,
@@ -641,6 +701,10 @@ def run_flatfile_step(settings: Settings, force: bool = False) -> list[Path]:
641
701
  logger.info("Enriching addresses with metadata from core files...")
642
702
  _enrich_with_metadata(con)
643
703
 
704
+ # Generate custom level variants
705
+ logger.info("Generating custom level address variants...")
706
+ _create_custom_level_rows(con)
707
+
644
708
  # Create deduplicated view
645
709
  logger.info("Creating deduplicated view...")
646
710
  _create_dedup_view(con)
@@ -11,6 +11,9 @@ from ukam_os_builder.api.settings import Settings
11
11
 
12
12
  logger = logging.getLogger(__name__)
13
13
 
14
+ # NGD file stems to exclude (historic addresses are not used in output)
15
+ _NGD_EXCLUDED_STEMS = {"historicaddress"}
16
+
14
17
 
15
18
  def find_downloaded_zips(downloads_dir: Path) -> list[Path]:
16
19
  """Find all downloaded zip files in a directory."""
@@ -22,11 +25,20 @@ def find_downloaded_zips(downloads_dir: Path) -> list[Path]:
22
25
  return zip_files
23
26
 
24
27
 
28
def _is_excluded_ngd_file(name: str) -> bool:
    """Tell whether *name* contains any excluded NGD stem, ignoring the case of *name*."""
    lowered = name.lower()
    for excluded_stem in _NGD_EXCLUDED_STEMS:
        # Substring match: the stem may appear anywhere in the file name.
        if excluded_stem in lowered:
            return True
    return False
32
+
33
+
25
34
  def _filter_zips_for_source(zip_files: list[Path], source: str) -> list[Path]:
26
35
  source_lower = source.lower()
27
36
  if source_lower == "ngd":
28
37
  ngd_zips = [
29
- zip_path for zip_path in zip_files if zip_path.name.lower().startswith("add_gb_")
38
+ zip_path
39
+ for zip_path in zip_files
40
+ if zip_path.name.lower().startswith("add_gb_")
41
+ and not _is_excluded_ngd_file(zip_path.name)
30
42
  ]
31
43
  return ngd_zips or zip_files
32
44
  if source_lower == "abp":
@@ -39,7 +51,8 @@ def _filter_zips_for_source(zip_files: list[Path], source: str) -> list[Path]:
39
51
 
40
52
def _should_convert_csv_to_parquet(csv_path: Path, source: str) -> bool:
    """Decide whether *csv_path* should be converted to parquet for *source*.

    For the ``ngd`` source (matched case-insensitively) a file qualifies only
    when its lower-cased name starts with ``add_gb_`` and
    ``_is_excluded_ngd_file`` does not flag it. Every other source converts
    all CSV files.
    """
    if source.lower() != "ngd":
        return True

    lowered_name = csv_path.name.lower()
    if not lowered_name.startswith("add_gb_"):
        return False
    return not _is_excluded_ngd_file(lowered_name)
44
57
 
45
58
 
@@ -9,9 +9,25 @@ from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse
9
9
 
10
10
  import requests
11
11
 
12
+ from ukam_os_builder.api.settings import Settings
13
+
12
14
  logger = logging.getLogger(__name__)
13
15
 
14
16
  API_BASE_URL = "https://api.os.uk/downloads/v1"
17
+
18
# NGD file stems to exclude (historic addresses are not used in output)
_NGD_EXCLUDED_STEMS = {"historicaddress"}


def _should_skip_ngd_download(filename: str, settings: object) -> bool:
    """Return True if *filename* is an NGD historic-address archive.

    Args:
        filename: Name of the downloadable file; matched case-insensitively.
        settings: Any object exposing ``source.type``. A missing ``source`` or
            ``type`` attribute, or a non-string type, means "not NGD".

    Returns:
        True only when the configured source type is ``ngd`` (in any letter
        case) and *filename* contains one of ``_NGD_EXCLUDED_STEMS``.
    """
    source_type = getattr(getattr(settings, "source", None), "type", "")
    # Normalise the case so that "NGD" is treated the same as "ngd"; an exact
    # comparison would let historic archives through for an upper-case value.
    if not isinstance(source_type, str) or source_type.lower() != "ngd":
        return False
    name_lower = filename.lower()
    return any(stem in name_lower for stem in _NGD_EXCLUDED_STEMS)
29
+
30
+
15
31
  DEFAULT_CHUNK_SIZE = 1024 * 1024 * 20 # 20 MiB
16
32
  DEFAULT_CONNECT_TIMEOUT_SECONDS = 30
17
33
  DEFAULT_READ_TIMEOUT_SECONDS = 300
@@ -65,6 +81,13 @@ def _require_api_key(settings: Any) -> str:
65
81
  return api_key
66
82
 
67
83
 
84
def _find_existing_download_archives(downloads_dir: Path) -> list[Path]:
    """Collect the ``*.zip`` files already present in *downloads_dir*, sorted.

    Returns an empty list when *downloads_dir* does not exist.
    """
    archives: list[Path] = []
    if downloads_dir.exists():
        archives.extend(downloads_dir.glob("*.zip"))
        archives.sort()
    return archives
89
+
90
+
68
91
  def get_package_version(settings: Any) -> dict:
69
92
  """Fetch package version metadata from the OS Data Hub API."""
70
93
  package_id = settings.os_downloads.package_id
@@ -236,9 +259,27 @@ def run_download_step(
236
259
  list_only: bool = False,
237
260
  ) -> list[Path]:
238
261
  """Run the OS Data Hub download step for any compatible settings object."""
239
- api_key = _require_api_key(settings)
240
262
  downloads_dir = settings.paths.downloads_dir
241
263
 
264
+ try:
265
+ api_key = _require_api_key(settings)
266
+ except ValueError as exc:
267
+ if list_only:
268
+ raise
269
+
270
+ existing_archives = _find_existing_download_archives(downloads_dir)
271
+ if existing_archives:
272
+ logger.warning(
273
+ "No API key found; using %d existing archive(s) in %s and skipping download.",
274
+ len(existing_archives),
275
+ downloads_dir,
276
+ )
277
+ return existing_archives
278
+
279
+ raise ValueError(
280
+ f"{exc} No local zip files were found in {downloads_dir}, so download cannot be skipped."
281
+ ) from exc
282
+
242
283
  logger.info("Fetching package metadata...")
243
284
  metadata = get_package_version(settings)
244
285
  items = list_downloads(metadata)
@@ -268,6 +309,11 @@ def run_download_step(
268
309
  logger.warning("No URL for %s, skipping", item.filename)
269
310
  continue
270
311
 
312
+ # Skip NGD historic address files — they are excluded from output
313
+ if _should_skip_ngd_download(item.filename, settings):
314
+ logger.info("Skipping historic address file: %s", item.filename)
315
+ continue
316
+
271
317
  dest_path = downloads_dir / item.filename
272
318
  was_downloaded = download_file(
273
319
  url=item.url,
@@ -287,3 +333,54 @@ def run_download_step(
287
333
 
288
334
  logger.info("Download complete: %d file(s)", len(downloaded))
289
335
  return downloaded
336
+
337
+
338
def _get_manifest_path(settings: Settings) -> Path | None:
    """Locate and log the order manifest(s) in the configured downloads directory.

    Behaviour depends on ``settings.source.type``:

    * ``"abp"`` - returns the most recently modified ``*-Order_Details.txt``
      (resolved), warning when more than one is present.
    * ``"ngd"`` - returns the resolved downloads directory itself when at
      least one ``*_orderSummary.json`` exists; the newest built-address
      summary is only reported in the log message.

    Returns ``None`` when no matching file exists or the source type is
    neither of the above.
    """
    root = settings.paths.downloads_dir.resolve()
    kind = settings.source.type  # "abp" | "ngd"

    def newest(paths: list[Path]) -> Path:
        # The file with the latest modification time wins.
        return max(paths, key=lambda p: p.stat().st_mtime).resolve()

    if kind == "abp":
        order_files = list(root.glob("*-Order_Details.txt"))
        if not order_files:
            logger.info("➡️ Manifest (ABP order details) not found. Check: %s", root)
            return None

        chosen = newest(order_files)
        if len(order_files) > 1:
            logger.warning(
                "Multiple ABP manifests found in %s. Using newest: %s",
                root,
                chosen,
            )
        logger.info("➡️ Manifest (ABP order details): %s", chosen)
        return chosen

    if kind == "ngd":
        # NOTE(review): pattern assumes "<name>_orderSummary.json"; confirm it
        # is not "<name>.orderSummary.json" in real NGD downloads.
        summaries = list(root.glob("*_orderSummary.json"))
        if not summaries:
            logger.info("➡️ Manifests (NGD order summaries) not found. Check: %s", root)
            return None

        built_summaries = list(root.glob("*builtaddress*_orderSummary.json"))
        built = newest(built_summaries) if built_summaries else None
        logger.info(
            "➡️ Manifests (NGD order summaries): %s (%d files)\n"
            " ↳ Built address order summary: %s",
            root,
            len(summaries),
            built if built else "(not found)",
        )
        return root

    logger.warning("Unknown source type %r. No manifest lookup performed.", kind)
    return None
@@ -1421,7 +1421,7 @@ wheels = [
1421
1421
 
1422
1422
  [[package]]
1423
1423
  name = "ukam-os-builder"
1424
- version = "0.1.0.dev3"
1424
+ version = "0.1.0.dev5"
1425
1425
  source = { editable = "." }
1426
1426
  dependencies = [
1427
1427
  { name = "duckdb" },
@@ -1,27 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from pathlib import Path
4
-
5
- from ukam_os_builder.os_builder.extract import (
6
- _filter_zips_for_source,
7
- _should_convert_csv_to_parquet,
8
- )
9
-
10
-
11
def test_filter_zips_for_source_prefers_ngd_named_zips() -> None:
    """With an NGD source, only the ``add_gb_``-named archive is kept."""
    ngd_archive = Path("add_gb_builtaddress.zip")
    abp_archive = Path("AddressBasePremium_FULL_2025-12-15_002.zip")

    result = _filter_zips_for_source([ngd_archive, abp_archive], "ngd")

    assert result == [Path("add_gb_builtaddress.zip")]
20
-
21
-
22
def test_should_convert_csv_to_parquet_skips_non_ngd_for_ngd_source() -> None:
    """With an NGD source, an ``add_gb_`` CSV converts and an ABP-named CSV does not."""
    expectations = {
        Path("add_gb_builtaddress.csv"): True,
        Path("AddressBasePremium_FULL_2025-12-15_002.csv"): False,
    }

    for csv_path, expected in expectations.items():
        assert _should_convert_csv_to_parquet(csv_path, "ngd") is expected