@mailwoman/corpus 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137) hide show
  1. package/out/src/adapter.d.ts +96 -0
  2. package/out/src/adapter.d.ts.map +1 -0
  3. package/out/src/adapter.js +107 -0
  4. package/out/src/adapter.js.map +1 -0
  5. package/out/src/adapters/ban/adapter.d.ts +32 -0
  6. package/out/src/adapters/ban/adapter.d.ts.map +1 -0
  7. package/out/src/adapters/ban/adapter.js +133 -0
  8. package/out/src/adapters/ban/adapter.js.map +1 -0
  9. package/out/src/adapters/fcc-bdc/adapter.d.ts +61 -0
  10. package/out/src/adapters/fcc-bdc/adapter.d.ts.map +1 -0
  11. package/out/src/adapters/fcc-bdc/adapter.js +153 -0
  12. package/out/src/adapters/fcc-bdc/adapter.js.map +1 -0
  13. package/out/src/adapters/index.d.ts +42 -0
  14. package/out/src/adapters/index.d.ts.map +1 -0
  15. package/out/src/adapters/index.js +76 -0
  16. package/out/src/adapters/index.js.map +1 -0
  17. package/out/src/adapters/openaddresses/adapter.d.ts +60 -0
  18. package/out/src/adapters/openaddresses/adapter.d.ts.map +1 -0
  19. package/out/src/adapters/openaddresses/adapter.js +174 -0
  20. package/out/src/adapters/openaddresses/adapter.js.map +1 -0
  21. package/out/src/adapters/state-ia-contractors/adapter.d.ts +23 -0
  22. package/out/src/adapters/state-ia-contractors/adapter.d.ts.map +1 -0
  23. package/out/src/adapters/state-ia-contractors/adapter.js +113 -0
  24. package/out/src/adapters/state-ia-contractors/adapter.js.map +1 -0
  25. package/out/src/adapters/state-ny-notaries/adapter.d.ts +21 -0
  26. package/out/src/adapters/state-ny-notaries/adapter.d.ts.map +1 -0
  27. package/out/src/adapters/state-ny-notaries/adapter.js +132 -0
  28. package/out/src/adapters/state-ny-notaries/adapter.js.map +1 -0
  29. package/out/src/adapters/state-tx-notaries/adapter.d.ts +22 -0
  30. package/out/src/adapters/state-tx-notaries/adapter.d.ts.map +1 -0
  31. package/out/src/adapters/state-tx-notaries/adapter.js +125 -0
  32. package/out/src/adapters/state-tx-notaries/adapter.js.map +1 -0
  33. package/out/src/adapters/tiger/adapter.d.ts +45 -0
  34. package/out/src/adapters/tiger/adapter.d.ts.map +1 -0
  35. package/out/src/adapters/tiger/adapter.js +179 -0
  36. package/out/src/adapters/tiger/adapter.js.map +1 -0
  37. package/out/src/adapters/usgov-hrsa-fqhc/adapter.d.ts +36 -0
  38. package/out/src/adapters/usgov-hrsa-fqhc/adapter.d.ts.map +1 -0
  39. package/out/src/adapters/usgov-hrsa-fqhc/adapter.js +147 -0
  40. package/out/src/adapters/usgov-hrsa-fqhc/adapter.js.map +1 -0
  41. package/out/src/adapters/usgov-imls-pls/adapter.d.ts +25 -0
  42. package/out/src/adapters/usgov-imls-pls/adapter.d.ts.map +1 -0
  43. package/out/src/adapters/usgov-imls-pls/adapter.js +118 -0
  44. package/out/src/adapters/usgov-imls-pls/adapter.js.map +1 -0
  45. package/out/src/adapters/usgov-nad/adapter.d.ts +37 -0
  46. package/out/src/adapters/usgov-nad/adapter.d.ts.map +1 -0
  47. package/out/src/adapters/usgov-nad/adapter.js +227 -0
  48. package/out/src/adapters/usgov-nad/adapter.js.map +1 -0
  49. package/out/src/adapters/usgov-nppes/adapter.d.ts +28 -0
  50. package/out/src/adapters/usgov-nppes/adapter.d.ts.map +1 -0
  51. package/out/src/adapters/usgov-nppes/adapter.js +123 -0
  52. package/out/src/adapters/usgov-nppes/adapter.js.map +1 -0
  53. package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.d.ts +35 -0
  54. package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.d.ts.map +1 -0
  55. package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.js +162 -0
  56. package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.js.map +1 -0
  57. package/out/src/adapters/wof-admin-json/adapter.d.ts +85 -0
  58. package/out/src/adapters/wof-admin-json/adapter.d.ts.map +1 -0
  59. package/out/src/adapters/wof-admin-json/adapter.js +241 -0
  60. package/out/src/adapters/wof-admin-json/adapter.js.map +1 -0
  61. package/out/src/adapters/wof-postalcode-json/adapter.d.ts +63 -0
  62. package/out/src/adapters/wof-postalcode-json/adapter.d.ts.map +1 -0
  63. package/out/src/adapters/wof-postalcode-json/adapter.js +178 -0
  64. package/out/src/adapters/wof-postalcode-json/adapter.js.map +1 -0
  65. package/out/src/align.d.ts +58 -0
  66. package/out/src/align.d.ts.map +1 -0
  67. package/out/src/align.js +139 -0
  68. package/out/src/align.js.map +1 -0
  69. package/out/src/build.d.ts +104 -0
  70. package/out/src/build.d.ts.map +1 -0
  71. package/out/src/build.js +201 -0
  72. package/out/src/build.js.map +1 -0
  73. package/out/src/codex/us-fips-state.d.ts +44 -0
  74. package/out/src/codex/us-fips-state.d.ts.map +1 -0
  75. package/out/src/codex/us-fips-state.js +105 -0
  76. package/out/src/codex/us-fips-state.js.map +1 -0
  77. package/out/src/codex/us-street-suffix.d.ts +259 -0
  78. package/out/src/codex/us-street-suffix.d.ts.map +1 -0
  79. package/out/src/codex/us-street-suffix.js +285 -0
  80. package/out/src/codex/us-street-suffix.js.map +1 -0
  81. package/out/src/format.d.ts +79 -0
  82. package/out/src/format.d.ts.map +1 -0
  83. package/out/src/format.js +151 -0
  84. package/out/src/format.js.map +1 -0
  85. package/out/src/golden.d.ts +50 -0
  86. package/out/src/golden.d.ts.map +1 -0
  87. package/out/src/golden.js +104 -0
  88. package/out/src/golden.js.map +1 -0
  89. package/out/src/index.d.ts +18 -0
  90. package/out/src/index.d.ts.map +1 -0
  91. package/out/src/index.js +18 -0
  92. package/out/src/index.js.map +1 -0
  93. package/out/src/parquet-wrapper/index.d.ts +12 -0
  94. package/out/src/parquet-wrapper/index.d.ts.map +1 -0
  95. package/out/src/parquet-wrapper/index.js +12 -0
  96. package/out/src/parquet-wrapper/index.js.map +1 -0
  97. package/out/src/parquet-wrapper/reader.d.ts +31 -0
  98. package/out/src/parquet-wrapper/reader.d.ts.map +1 -0
  99. package/out/src/parquet-wrapper/reader.js +54 -0
  100. package/out/src/parquet-wrapper/reader.js.map +1 -0
  101. package/out/src/parquet-wrapper/schema.d.ts +45 -0
  102. package/out/src/parquet-wrapper/schema.d.ts.map +1 -0
  103. package/out/src/parquet-wrapper/schema.js +55 -0
  104. package/out/src/parquet-wrapper/schema.js.map +1 -0
  105. package/out/src/parquet-wrapper/writer.d.ts +41 -0
  106. package/out/src/parquet-wrapper/writer.d.ts.map +1 -0
  107. package/out/src/parquet-wrapper/writer.js +71 -0
  108. package/out/src/parquet-wrapper/writer.js.map +1 -0
  109. package/out/src/parquet.d.ts +122 -0
  110. package/out/src/parquet.d.ts.map +1 -0
  111. package/out/src/parquet.js +220 -0
  112. package/out/src/parquet.js.map +1 -0
  113. package/out/src/runner.d.ts +100 -0
  114. package/out/src/runner.d.ts.map +1 -0
  115. package/out/src/runner.js +183 -0
  116. package/out/src/runner.js.map +1 -0
  117. package/out/src/split.d.ts +108 -0
  118. package/out/src/split.d.ts.map +1 -0
  119. package/out/src/split.js +191 -0
  120. package/out/src/split.js.map +1 -0
  121. package/out/src/synthesize.d.ts +146 -0
  122. package/out/src/synthesize.d.ts.map +1 -0
  123. package/out/src/synthesize.js +472 -0
  124. package/out/src/synthesize.js.map +1 -0
  125. package/out/src/tokenize.d.ts +47 -0
  126. package/out/src/tokenize.d.ts.map +1 -0
  127. package/out/src/tokenize.js +49 -0
  128. package/out/src/tokenize.js.map +1 -0
  129. package/out/src/types.d.ts +168 -0
  130. package/out/src/types.d.ts.map +1 -0
  131. package/out/src/types.js +19 -0
  132. package/out/src/types.js.map +1 -0
  133. package/out/src/wof-json.d.ts +105 -0
  134. package/out/src/wof-json.d.ts.map +1 -0
  135. package/out/src/wof-json.js +174 -0
  136. package/out/src/wof-json.js.map +1 -0
  137. package/package.json +36 -0
@@ -0,0 +1,96 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Adapter framework helpers — the pieces every corpus adapter and the runner share.
7
+ *
8
+ * This file does **not** define `CorpusAdapter` (that lives in `./types.ts`, which is the single
9
+ * canonical schema module). It exposes:
10
+ *
11
+ * - `AdapterRegistry`: a tiny lookup table the CLI + build pipeline use to find adapters by id.
12
+ * - `InMemoryAdapterRegistry`: the default implementation.
13
+ * - `stableSourceId(adapterId, components)`: deterministic content-addressed id for adapters whose
14
+ * source data has no native primary key (CSV, GeoJSON).
15
+ * - `canonicalDedupKey(row)`: normalized signature used to drop near-identical rows during a run.
16
+ * Adapter-internal dedup; cross-adapter dedup is the runner's job.
17
+ * - `streamingSha256()`: thin wrapper around `node:crypto` so the runner can hash JSONL output as it
18
+ * streams (avoids re-reading the shard for the manifest checksum).
19
+ *
20
+ * Everything here is pure (no I/O); side-effecting code goes in `./runner.ts`.
21
+ */
22
+ import type { ComponentTag } from "@mailwoman/core/types";
23
+ import type { CanonicalRow, CorpusAdapter } from "./types.js";
24
+ /**
25
+ * Lookup table for corpus adapters.
26
+ *
27
+ * The CLI's `npx mailwoman corpus run <adapter-id>` resolves `<adapter-id>` against this registry;
28
+ * the same registry is iterated by the `corpus build` pipeline. Adapters do not self-register at
29
+ * module load — they're added explicitly so the dependency graph stays traceable.
30
+ */
31
+ export interface AdapterRegistry {
32
+ /** Add an adapter. Throws if `adapter.id` is already registered. */
33
+ register(adapter: CorpusAdapter): void;
34
+ /** Return the adapter for `id`, or `undefined`. */
35
+ get(id: string): CorpusAdapter | undefined;
36
+ /** All registered adapters, in insertion order. */
37
+ list(): readonly CorpusAdapter[];
38
+ /** Convenience: ids only, in insertion order. */
39
+ ids(): readonly string[];
40
+ }
41
+ /**
42
+ * Default in-memory registry. The runner constructs one per invocation; the CLI re-uses a shared
43
+ * singleton (`defaultAdapterRegistry`) populated by `./adapters/index.ts` as adapters come online.
44
+ */
45
+ export declare class InMemoryAdapterRegistry implements AdapterRegistry {
46
+ #private;
47
+ register(adapter: CorpusAdapter): void;
48
+ get(id: string): CorpusAdapter | undefined;
49
+ list(): readonly CorpusAdapter[];
50
+ ids(): readonly string[];
51
+ }
52
+ /**
53
+ * Process-wide default registry. Populated by `./adapters/index.ts` as adapters are built; imported
54
+ * by the CLI. Tests should construct their own `InMemoryAdapterRegistry` to avoid cross-test
55
+ * pollution.
56
+ */
57
+ export declare const defaultAdapterRegistry: InMemoryAdapterRegistry;
58
+ /**
59
+ * Deterministic content-addressed source id.
60
+ *
61
+ * For adapters whose upstream source has no native primary key (CSV rows, GeoJSON features), the
62
+ * runner expects a stable id so dedup, holdout manifests, and resumability work across reruns. This
63
+ * helper produces one by hashing the adapter id and a canonical serialization of the components
64
+ * dict (keys sorted, values verbatim).
65
+ *
66
+ * Output format: `<adapterId>-<first-12-hex-chars-of-sha256>`. 48 bits keeps the expected number
67
+ * of colliding id pairs at or below ~0.5 up to ~17M (2^24) rows per adapter (birthday bound); adapters
68
+ * with more rows should extend the prefix length.
69
+ */
70
+ export declare function stableSourceId(adapterId: string, components: Partial<Record<ComponentTag, string>>): string;
71
+ /**
72
+ * Canonical dedup key for a row.
73
+ *
74
+ * Two rows that share this key are treated as duplicates and only the first wins. The key is built
75
+ * from `country`, the sorted `components` dict, and a normalized `raw` (lower-cased, whitespace
76
+ * collapsed). License and provenance fields are intentionally excluded so the same address from
77
+ * multiple adapters is recognized as a duplicate.
78
+ *
79
+ * Synthetic rows are never deduplicated against natural rows: `synth.method` is folded into the key
80
+ * when present, ensuring each augmentation variant survives.
81
+ */
82
+ export declare function canonicalDedupKey(row: CanonicalRow): string;
83
+ /**
84
+ * Streaming SHA-256 hasher.
85
+ *
86
+ * The runner feeds every JSONL line into one of these so the per-shard checksum can be recorded in
87
+ * `MANIFEST.json` without a second pass over the shard. Implementation is a one-line wrapper, but
88
+ * giving it a name keeps the runner's hash-tracking intent obvious.
89
+ */
90
+ export interface StreamingHasher {
91
+ update(chunk: string | Uint8Array): void;
92
+ digest(): string;
93
+ }
94
+ /** Default `StreamingHasher` (SHA-256, hex). */
95
+ export declare function streamingSha256(): StreamingHasher;
96
+ //# sourceMappingURL=adapter.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"adapter.d.ts","sourceRoot":"","sources":["../../src/adapter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;GAoBG;AAEH,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,uBAAuB,CAAA;AAEzD,OAAO,KAAK,EAAE,YAAY,EAAE,aAAa,EAAE,MAAM,YAAY,CAAA;AAE7D;;;;;;GAMG;AACH,MAAM,WAAW,eAAe;IAC/B,oEAAoE;IACpE,QAAQ,CAAC,OAAO,EAAE,aAAa,GAAG,IAAI,CAAA;IAEtC,mDAAmD;IACnD,GAAG,CAAC,EAAE,EAAE,MAAM,GAAG,aAAa,GAAG,SAAS,CAAA;IAE1C,mDAAmD;IACnD,IAAI,IAAI,SAAS,aAAa,EAAE,CAAA;IAEhC,iDAAiD;IACjD,GAAG,IAAI,SAAS,MAAM,EAAE,CAAA;CACxB;AAED;;;GAGG;AACH,qBAAa,uBAAwB,YAAW,eAAe;;IAG9D,QAAQ,CAAC,OAAO,EAAE,aAAa,GAAG,IAAI;IAOtC,GAAG,CAAC,EAAE,EAAE,MAAM,GAAG,aAAa,GAAG,SAAS;IAI1C,IAAI,IAAI,SAAS,aAAa,EAAE;IAIhC,GAAG,IAAI,SAAS,MAAM,EAAE;CAGxB;AAED;;;;GAIG;AACH,eAAO,MAAM,sBAAsB,yBAAgC,CAAA;AAEnE;;;;;;;;;;;GAWG;AACH,wBAAgB,cAAc,CAAC,SAAS,EAAE,MAAM,EAAE,UAAU,EAAE,OAAO,CAAC,MAAM,CAAC,YAAY,EAAE,MAAM,CAAC,CAAC,GAAG,MAAM,CAK3G;AAED;;;;;;;;;;GAUG;AACH,wBAAgB,iBAAiB,CAAC,GAAG,EAAE,YAAY,GAAG,MAAM,CAM3D;AAED;;;;;;GAMG;AACH,MAAM,WAAW,eAAe;IAC/B,MAAM,CAAC,KAAK,EAAE,MAAM,GAAG,UAAU,GAAG,IAAI,CAAA;IACxC,MAAM,IAAI,MAAM,CAAA;CAChB;AAED,gDAAgD;AAChD,wBAAgB,eAAe,IAAI,eAAe,CAiBjD"}
@@ -0,0 +1,107 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Adapter framework helpers — the pieces every corpus adapter and the runner share.
7
+ *
8
+ * This file does **not** define `CorpusAdapter` (that lives in `./types.ts`, which is the single
9
+ * canonical schema module). It exposes:
10
+ *
11
+ * - `AdapterRegistry`: a tiny lookup table the CLI + build pipeline use to find adapters by id.
12
+ * - `InMemoryAdapterRegistry`: the default implementation.
13
+ * - `stableSourceId(adapterId, components)`: deterministic content-addressed id for adapters whose
14
+ * source data has no native primary key (CSV, GeoJSON).
15
+ * - `canonicalDedupKey(row)`: normalized signature used to drop near-identical rows during a run.
16
+ * Adapter-internal dedup; cross-adapter dedup is the runner's job.
17
+ * - `streamingSha256()`: thin wrapper around `node:crypto` so the runner can hash JSONL output as it
18
+ * streams (avoids re-reading the shard for the manifest checksum).
19
+ *
20
+ * Everything here is pure (no I/O); side-effecting code goes in `./runner.ts`.
21
+ */
22
+ import { createHash } from "node:crypto";
23
/**
 * Map-backed adapter registry.
 *
 * Adapters are stored under their `id`; because a `Map` iterates in insertion order, `list()` and
 * `ids()` report adapters in the order they were registered.
 */
export class InMemoryAdapterRegistry {
	/** Registered adapters keyed by `adapter.id`. */
	#adapters = new Map();

	/**
	 * Add an adapter.
	 *
	 * @param adapter Adapter object exposing an `id` property.
	 * @throws {Error} When an adapter with the same `id` has already been registered.
	 */
	register(adapter) {
		const { id } = adapter;
		if (!this.#adapters.has(id)) {
			this.#adapters.set(id, adapter);
			return;
		}
		throw new Error(`AdapterRegistry: id ${JSON.stringify(id)} already registered`);
	}

	/** Return the adapter registered under `id`, or `undefined` when there is none. */
	get(id) {
		return this.#adapters.get(id);
	}

	/** Snapshot (fresh array) of all registered adapters, in insertion order. */
	list() {
		return [...this.#adapters.values()];
	}

	/** Snapshot (fresh array) of all registered ids, in insertion order. */
	ids() {
		return [...this.#adapters.keys()];
	}
}

/**
 * Process-wide default registry instance, created empty at module load.
 *
 * Tests should construct their own `InMemoryAdapterRegistry` rather than sharing this one, so
 * registrations do not leak between test cases.
 */
export const defaultAdapterRegistry = new InMemoryAdapterRegistry();
51
/**
 * Deterministic, content-addressed source id for rows that have no native primary key.
 *
 * The id is derived from a SHA-256 over the adapter id, an ASCII record separator (`\x1e`), and
 * the components serialized as `key=value` pairs — keys sorted, values verbatim (a nullish value
 * serializes as the empty string) — joined by the ASCII unit separator (`\x1f`). The same adapter
 * id and components therefore always yield the same id, regardless of key order.
 *
 * Output format: `<adapterId>-<first-12-hex-chars-of-sha256>`. Twelve hex characters are 48 bits:
 * at ~17M (2^24) rows per adapter the expected number of colliding id pairs is still only about
 * 0.5 (birthday bound); adapters with more rows should extend the prefix length.
 *
 * @param adapterId Adapter id; hashed and also used verbatim as the id prefix.
 * @param components Component tag → value dictionary.
 * @returns The id string `<adapterId>-<12 hex chars>`.
 */
export function stableSourceId(adapterId, components) {
	const fields = [];
	for (const tag of Object.keys(components).sort()) {
		fields.push(`${tag}=${components[tag] ?? ""}`);
	}

	const hasher = createHash("sha256");
	hasher.update(adapterId);
	hasher.update("\x1e");
	hasher.update(fields.join("\x1f"));

	const hex = hasher.digest("hex");
	return `${adapterId}-${hex.slice(0, 12)}`;
}
69
/**
 * Canonical dedup key for a row.
 *
 * Rows that produce the same key are treated as duplicates. The key is the concatenation, with
 * `\x1e` between segments, of:
 *
 * 1. `row.country`;
 * 2. `row.raw`, lower-cased, with every whitespace run collapsed to one space and trimmed;
 * 3. the `row.components` entries as `key=value` pairs, keys sorted, joined by `\x1f`;
 * 4. `row.synth.method` — only when `row.synth` is set, so a synthetic variant never shares a
 *    key with the natural row it was derived from.
 *
 * License and provenance fields are not read, so they never influence the key.
 *
 * @param row Row exposing `country`, `raw`, `components` and optionally `synth`.
 * @returns The dedup key string.
 */
export function canonicalDedupKey(row) {
	const { components, country, raw, synth } = row;

	const pairs = [];
	for (const tag of Object.keys(components).sort()) {
		pairs.push(`${tag}=${components[tag] ?? ""}`);
	}
	const componentSegment = pairs.join("\x1f");

	const normalizedRaw = raw.toLowerCase().replace(/\s+/g, " ").trim();

	let key = `${country}\x1e${normalizedRaw}\x1e${componentSegment}`;
	if (synth) {
		key += `\x1e${synth.method}`;
	}
	return key;
}
87
/**
 * Create an incremental SHA-256 hasher that reports its result as lower-case hex.
 *
 * The returned object has two methods:
 *
 * - `update(chunk)`: feed a `string` or `Uint8Array` into the hash. Throws once `digest()` has
 *   been called, because the underlying `node:crypto` hash cannot accept data after finalizing.
 * - `digest()`: finalize on first call and return the hex digest; later calls return the same
 *   cached string instead of throwing.
 *
 * @returns A hasher object with `update` and `digest` methods.
 */
export function streamingSha256() {
	const h = createHash("sha256");
	let finalized = false;
	let digestHex = "";
	return {
		update(chunk) {
			if (finalized) {
				throw new Error("streamingSha256: update() called after digest()");
			}
			// `Hash.update` accepts strings and byte arrays alike; no per-type branching is needed.
			h.update(chunk);
		},
		digest() {
			if (!finalized) {
				// `Hash.digest` may only be called once, so cache the result for repeat callers.
				digestHex = h.digest("hex");
				finalized = true;
			}
			return digestHex;
		},
	};
}
107
+ //# sourceMappingURL=adapter.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"adapter.js","sourceRoot":"","sources":["../../src/adapter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;GAoBG;AAGH,OAAO,EAAE,UAAU,EAAa,MAAM,aAAa,CAAA;AAwBnD;;;GAGG;AACH,MAAM,OAAO,uBAAuB;IACnC,KAAK,GAAG,IAAI,GAAG,EAAyB,CAAA;IAExC,QAAQ,CAAC,OAAsB;QAC9B,IAAI,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,EAAE,CAAC,EAAE,CAAC;YAChC,MAAM,IAAI,KAAK,CAAC,uBAAuB,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC,EAAE,CAAC,qBAAqB,CAAC,CAAA;QACxF,CAAC;QACD,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,EAAE,EAAE,OAAO,CAAC,CAAA;IACpC,CAAC;IAED,GAAG,CAAC,EAAU;QACb,OAAO,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,EAAE,CAAC,CAAA;IAC1B,CAAC;IAED,IAAI;QACH,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,CAAA;IACvC,CAAC;IAED,GAAG;QACF,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC,CAAA;IACrC,CAAC;CACD;AAED;;;;GAIG;AACH,MAAM,CAAC,MAAM,sBAAsB,GAAG,IAAI,uBAAuB,EAAE,CAAA;AAEnE;;;;;;;;;;;GAWG;AACH,MAAM,UAAU,cAAc,CAAC,SAAiB,EAAE,UAAiD;IAClG,MAAM,UAAU,GAAG,MAAM,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC,IAAI,EAAoB,CAAA;IACnE,MAAM,OAAO,GAAG,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,IAAI,UAAU,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;IACjF,MAAM,MAAM,GAAG,UAAU,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAA;IAClG,OAAO,GAAG,SAAS,IAAI,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,EAAE,CAAA;AAC7C,CAAC;AAED;;;;;;;;;;GAUG;AACH,MAAM,UAAU,iBAAiB,CAAC,GAAiB;IAClD,MAAM,UAAU,GAAG,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC,IAAI,EAAoB,CAAA;IACvE,MAAM,QAAQ,GAAG,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,IAAI,GAAG,CAAC,UAAU,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;IACtF,MAAM,OAAO,GAAG,GAAG,CAAC,GAAG,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAA;IACjE,MAAM,SAAS,GAAG,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,OAAO,GAAG,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC,EAAE,CAAA;IAC5D,OAAO,GAAG,GAAG,CAAC,OAAO,OAAO,OAAO,OAAO,QAAQ,GAAG,SAAS,EAAE,CAAA;AACjE,CAAC;AAcD,gDAAgD;AAChD,MA
AM,UAAU,eAAe;IAC9B,MAAM,CAAC,GAAS,UAAU,CAAC,QAAQ,CAAC,CAAA;IACpC,IAAI,SAAS,GAAG,KAAK,CAAA;IACrB,IAAI,SAAS,GAAG,EAAE,CAAA;IAClB,OAAO;QACN,MAAM,CAAC,KAAK;YACX,IAAI,SAAS;gBAAE,MAAM,IAAI,KAAK,CAAC,iDAAiD,CAAC,CAAA;YACjF,CAAC,CAAC,MAAM,CAAC,OAAO,KAAK,KAAK,QAAQ,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAA;QACpD,CAAC;QACD,MAAM;YACL,IAAI,CAAC,SAAS,EAAE,CAAC;gBAChB,SAAS,GAAG,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAA;gBAC3B,SAAS,GAAG,IAAI,CAAA;YACjB,CAAC;YACD,OAAO,SAAS,CAAA;QACjB,CAAC;KACD,CAAA;AACF,CAAC"}
@@ -0,0 +1,32 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * `ban`: Base Adresse Nationale CSV adapter (FR street-level).
7
+ *
8
+ * Input: a CSV dump from `adresse.data.gouv.fr` (semicolon-separated, ~25M rows nationally). The
9
+ * adapter only reads the small set of columns needed for the corpus:
10
+ *
11
+ * - `numero` → `house_number`
12
+ * - `rep` → repetition index ("bis", "ter") appended to house_number
13
+ * - `nom_voie` → `street` (full road name; includes the prefix "Rue", "Avenue", etc.)
14
+ * - `code_postal` → `postcode`
15
+ * - `nom_commune` → `locality`
16
+ *
17
+ * `region` and `country` are not in BAN. The adapter stamps `country: "FR"` on every row; region is
18
+ * left for the wof-postalcode + wof-admin cross-reference at corpus build time (a future pass;
19
+ * for Phase 1 the row's region is simply absent).
20
+ *
21
+ * License: ODbL / Licence Ouverte. Stamped onto every row as `ODbL-1.0` (the OpenStreetMap-
22
+ * compatible label most consumers expect).
23
+ *
24
+ * The adapter is streaming-aware: it uses `csv-parse` in streaming mode so a 25M-row dump never
25
+ * sits in memory. Honors `opts.limit` for fixture / smoke runs, `opts.signal` for cancellation,
26
+ * and `opts.country` for a self-consistency check (errors if country !== FR).
27
+ */
28
+ import type { CorpusAdapter } from "../../types.js";
29
+ export declare const BAN_ADAPTER_ID = "ban";
30
+ export declare function createBanAdapter(): CorpusAdapter;
31
+ export declare const banAdapter: CorpusAdapter;
32
+ //# sourceMappingURL=adapter.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"adapter.d.ts","sourceRoot":"","sources":["../../../../src/adapters/ban/adapter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;GA0BG;AAMH,OAAO,KAAK,EAAgC,aAAa,EAAE,MAAM,gBAAgB,CAAA;AAEjF,eAAO,MAAM,cAAc,QAAQ,CAAA;AA4CnC,wBAAgB,gBAAgB,IAAI,aAAa,CAqEhD;AAED,eAAO,MAAM,UAAU,eAAqB,CAAA"}
@@ -0,0 +1,133 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * `ban`: Base Adresse Nationale CSV adapter (FR street-level).
7
+ *
8
+ * Input: a CSV dump from `adresse.data.gouv.fr` (semicolon-separated, ~25M rows nationally). The
9
+ * adapter only reads the small set of columns needed for the corpus:
10
+ *
11
+ * - `numero` → `house_number`
12
+ * - `rep` → repetition index ("bis", "ter") appended to house_number
13
+ * - `nom_voie` → `street` (full road name; includes the prefix "Rue", "Avenue", etc.)
14
+ * - `code_postal` → `postcode`
15
+ * - `nom_commune` → `locality`
16
+ *
17
+ * `region` and `country` are not in BAN. The adapter stamps `country: "FR"` on every row; region is
18
+ * left for the wof-postalcode + wof-admin cross-reference at corpus build time (a future pass;
19
+ * for Phase 1 the row's region is simply absent).
20
+ *
21
+ * License: ODbL / Licence Ouverte. Stamped onto every row as `ODbL-1.0` (the OpenStreetMap-
22
+ * compatible label most consumers expect).
23
+ *
24
+ * The adapter is streaming-aware: it uses `csv-parse` in streaming mode so a 25M-row dump never
25
+ * sits in memory. Honors `opts.limit` for fixture / smoke runs, `opts.signal` for cancellation,
26
+ * and `opts.country` for a self-consistency check (errors if country !== FR).
27
+ */
28
+ import { parse as csvParse } from "csv-parse";
29
+ import { createReadStream } from "node:fs";
30
+ import { stableSourceId } from "../../adapter.js";
31
+ import { reconcileComponents } from "../../format.js";
32
/** Adapter id; also used as the `source` value and the `source_id` prefix of emitted rows. */
export const BAN_ADAPTER_ID = "ban";

/**
 * Build the `house_number` value from BAN's `numero` and `rep` columns.
 *
 * `rep` carries the repetition index ("bis", "ter", "quater") that follows the number. Both
 * inputs are trimmed; an empty `numero` yields `""` even when `rep` is present.
 *
 * @param numero Raw `numero` cell (string).
 * @param rep Raw `rep` cell (string).
 * @returns `"10 bis"`, `"45"`, or `""` when there is no number.
 */
function composeHouseNumber(numero, rep) {
	const [number, suffix] = [numero, rep].map((cell) => cell.trim());
	if (number.length === 0) {
		return "";
	}
	if (suffix.length === 0) {
		return number;
	}
	return `${number} ${suffix}`;
}
44
/**
 * Compose the raw FR-style address line from already-trimmed parts. Typical results:
 *
 *     "10 bis Avenue des Champs-Élysées, 75008 Paris"
 *     "45 Cours Lafayette, 69003 Lyon"
 *
 * The street line (`house` + `street`) and the city line (`postcode` + `locality`) are each
 * built from their non-empty parts, then joined with `", "`. Empty lines are dropped, and any
 * remaining whitespace runs are collapsed to a single space.
 *
 * @param house House number (may be empty).
 * @param street Street name (may be empty).
 * @param postcode Postcode (may be empty).
 * @param locality Locality name (may be empty).
 * @returns The composed line, or `""` when every part is empty.
 */
function composeRaw(house, street, postcode, locality) {
	const joinPresent = (fields) =>
		fields
			.filter((field) => Boolean(field))
			.join(" ")
			.trim();

	const lines = [joinPresent([house, street]), joinPresent([postcode, locality])];

	return lines
		.filter((line) => line !== "")
		.join(", ")
		.replace(/\s+/g, " ")
		.trim();
}
63
/**
 * Build the `ban` corpus adapter.
 *
 * The adapter's `rows(opts)` async generator streams the semicolon-separated CSV at
 * `opts.inputPath` and yields one row per usable record:
 *
 * - `opts.country`, when set, must be `"FR"`; anything else throws before the file is opened.
 * - `opts.signal` is polled once per parsed record; an aborted signal ends iteration quietly.
 * - `opts.limit`, when defined, caps the number of *emitted* rows (skipped records do not count).
 *
 * Records are skipped when `nom_voie` or `nom_commune` is empty, when both the house number and
 * the postcode are empty, or when `reconcileComponents` returns no components.
 *
 * @returns An adapter object with `id`, `defaultLicense`, `description` and `rows`.
 */
export function createBanAdapter() {
	return {
		id: BAN_ADAPTER_ID,
		defaultLicense: "ODbL-1.0",
		description: "Base Adresse Nationale (FR): house-number-level street addresses (~25M rows).",
		async *rows(opts) {
			if (opts.country && opts.country !== "FR") {
				throw new Error(`ban adapter: only FR supported, got country=${opts.country}`);
			}

			const stream = createReadStream(opts.inputPath, { encoding: "utf8" });
			const parser = csvParse({
				delimiter: ";",
				columns: true,
				skip_empty_lines: true,
				relax_quotes: true,
				relax_column_count: true,
			});
			// `pipe()` does not forward source errors to its destination. Without this listener a
			// missing or unreadable input file would emit an unhandled 'error' event (crashing the
			// process) instead of rejecting the iteration below where the caller can catch it.
			stream.on("error", (err) => parser.destroy(err));
			stream.pipe(parser);

			let emitted = 0;
			try {
				for await (const record of parser) {
					if (opts.signal?.aborted) {
						break;
					}
					if (opts.limit !== undefined && emitted >= opts.limit) {
						break;
					}

					const house = composeHouseNumber(record.numero ?? "", record.rep ?? "");
					const street = (record.nom_voie ?? "").trim();
					const postcode = (record.code_postal ?? "").trim();
					const locality = (record.nom_commune ?? "").trim();

					// A usable row needs a street and a locality, plus at least one of
					// house number / postcode.
					if (!street || !locality) {
						continue;
					}
					if (!house && !postcode) {
						continue;
					}

					// Key insertion order (house_number, street, postcode, locality) is kept as-is;
					// `street` and `locality` are guaranteed non-empty by the guard above.
					const components = {};
					if (house) {
						components.house_number = house;
					}
					components.street = street;
					if (postcode) {
						components.postcode = postcode;
					}
					components.locality = locality;

					const raw = composeRaw(house, street, postcode, locality);
					if (!raw) {
						continue;
					}

					const aligned = reconcileComponents(components, raw);
					if (Object.keys(aligned).length === 0) {
						continue;
					}

					// Prefer BAN's own `id` column; fall back to a content hash when it is blank.
					const nativeId = record.id?.trim();
					const sourceId = nativeId ? `${BAN_ADAPTER_ID}-${nativeId}` : stableSourceId(BAN_ADAPTER_ID, aligned);

					yield {
						raw,
						components: aligned,
						country: "FR",
						locale: "fr-FR",
						source: BAN_ADAPTER_ID,
						source_id: sourceId,
						// Left empty here — presumably stamped by a later pipeline stage; confirm.
						corpus_version: "",
						license: "ODbL-1.0",
					};
					emitted++;
				}
			} finally {
				// Release the file handle on every exit path (completion, break, throw, early return).
				stream.destroy();
			}
		},
	};
}

/** Ready-made adapter instance, created at module load. */
export const banAdapter = createBanAdapter();
133
+ //# sourceMappingURL=adapter.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"adapter.js","sourceRoot":"","sources":["../../../../src/adapters/ban/adapter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;GA0BG;AAEH,OAAO,EAAE,KAAK,IAAI,QAAQ,EAAE,MAAM,WAAW,CAAA;AAC7C,OAAO,EAAE,gBAAgB,EAAE,MAAM,SAAS,CAAA;AAC1C,OAAO,EAAE,cAAc,EAAE,MAAM,kBAAkB,CAAA;AACjD,OAAO,EAAE,mBAAmB,EAAE,MAAM,iBAAiB,CAAA;AAGrD,MAAM,CAAC,MAAM,cAAc,GAAG,KAAK,CAAA;AAenC;;;GAGG;AACH,SAAS,kBAAkB,CAAC,MAAc,EAAE,GAAW;IACtD,MAAM,CAAC,GAAG,MAAM,CAAC,IAAI,EAAE,CAAA;IACvB,MAAM,CAAC,GAAG,GAAG,CAAC,IAAI,EAAE,CAAA;IACpB,IAAI,CAAC,CAAC;QAAE,OAAO,EAAE,CAAA;IACjB,OAAO,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAA;AAC3B,CAAC;AAED;;;;;;;;GAQG;AACH,SAAS,UAAU,CAAC,KAAa,EAAE,MAAc,EAAE,QAAgB,EAAE,QAAgB;IACpF,MAAM,KAAK,GAAa,EAAE,CAAA;IAC1B,MAAM,UAAU,GAAG,CAAC,KAAK,EAAE,MAAM,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAA;IACnE,IAAI,UAAU;QAAE,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAA;IACtC,MAAM,QAAQ,GAAG,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAA;IACtE,IAAI,QAAQ;QAAE,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAA;IAClC,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAA;AACpD,CAAC;AAED,MAAM,UAAU,gBAAgB;IAC/B,OAAO;QACN,EAAE,EAAE,cAAc;QAClB,cAAc,EAAE,UAAU;QAC1B,WAAW,EAAE,+EAA+E;QAE5F,KAAK,CAAC,CAAC,IAAI,CAAC,IAAoB;YAC/B,IAAI,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,OAAO,KAAK,IAAI,EAAE,CAAC;gBAC3C,MAAM,IAAI,KAAK,CAAC,+CAA+C,IAAI,CAAC,OAAO,EAAE,CAAC,CAAA;YAC/E,CAAC;YAED,MAAM,MAAM,GAAG,gBAAgB,CAAC,IAAI,CAAC,SAAS,EAAE,EAAE,QAAQ,EAAE,MAAM,EAAE,CAAC,CAAA;YACrE,MAAM,MAAM,GAAG,MAAM,CAAC,IAAI,CACzB,QAAQ,CAAC;gBACR,SAAS,EAAE,GAAG;gBACd,OAAO,EAAE,IAAI;gBACb,gBAAgB,EAAE,IAAI;gBACtB,YAAY,EAAE,IAAI;gBAClB,kBAAkB,EAAE,IAAI;aACxB,CAAC,CACF,CAAA;YAED,IAAI,OAAO,GAAG,CAAC,CAAA;YACf,IAAI,CAAC;gBACJ,IAAI,KAAK,EAAE,MAAM,MAAM,IAAI,MAA+B,EAAE,CAAC;oBAC5D,IAAI,IAAI,CAAC,MAAM,EAAE,OAAO;wBAAE,MAAK;oBAC/B,IAAI,IAAI,CAAC,KAAK,KAAK,SAAS,IAAI,OAAO,IAAI,IAAI,CAAC,KAAK;wBAAE,MAAK;oBAE5D,MAAM,KA
AK,GAAG,kBAAkB,CAAC,MAAM,CAAC,MAAM,IAAI,EAAE,EAAE,MAAM,CAAC,GAAG,IAAI,EAAE,CAAC,CAAA;oBACvE,MAAM,MAAM,GAAG,CAAC,MAAM,CAAC,QAAQ,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBAC7C,MAAM,QAAQ,GAAG,CAAC,MAAM,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBAClD,MAAM,QAAQ,GAAG,CAAC,MAAM,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAA;oBAElD,IAAI,CAAC,MAAM,IAAI,CAAC,QAAQ;wBAAE,SAAQ;oBAClC,IAAI,CAAC,KAAK,IAAI,CAAC,QAAQ;wBAAE,SAAQ;oBAEjC,MAAM,UAAU,GAA+B,EAAE,CAAA;oBACjD,IAAI,KAAK;wBAAE,UAAU,CAAC,YAAY,GAAG,KAAK,CAAA;oBAC1C,IAAI,MAAM;wBAAE,UAAU,CAAC,MAAM,GAAG,MAAM,CAAA;oBACtC,IAAI,QAAQ;wBAAE,UAAU,CAAC,QAAQ,GAAG,QAAQ,CAAA;oBAC5C,IAAI,QAAQ;wBAAE,UAAU,CAAC,QAAQ,GAAG,QAAQ,CAAA;oBAE5C,MAAM,GAAG,GAAG,UAAU,CAAC,KAAK,EAAE,MAAM,EAAE,QAAQ,EAAE,QAAQ,CAAC,CAAA;oBACzD,IAAI,CAAC,GAAG;wBAAE,SAAQ;oBAElB,MAAM,OAAO,GAAG,mBAAmB,CAAC,UAAU,EAAE,GAAG,CAAC,CAAA;oBACpD,IAAI,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,MAAM,KAAK,CAAC;wBAAE,SAAQ;oBAE/C,MAAM,QAAQ,GAAG,MAAM,CAAC,EAAE,EAAE,IAAI,EAAE;wBACjC,CAAC,CAAC,GAAG,cAAc,IAAI,MAAM,CAAC,EAAE,CAAC,IAAI,EAAE,EAAE;wBACzC,CAAC,CAAC,cAAc,CAAC,cAAc,EAAE,OAAO,CAAC,CAAA;oBAE1C,MAAM;wBACL,GAAG;wBACH,UAAU,EAAE,OAAO;wBACnB,OAAO,EAAE,IAAI;wBACb,MAAM,EAAE,OAAO;wBACf,MAAM,EAAE,cAAc;wBACtB,SAAS,EAAE,QAAQ;wBACnB,cAAc,EAAE,EAAE;wBAClB,OAAO,EAAE,UAAU;qBACnB,CAAA;oBACD,OAAO,EAAE,CAAA;gBACV,CAAC;YACF,CAAC;oBAAS,CAAC;gBACV,MAAM,CAAC,OAAO,EAAE,CAAA;YACjB,CAAC;QACF,CAAC;KACD,CAAA;AACF,CAAC;AAED,MAAM,CAAC,MAAM,UAAU,GAAG,gBAAgB,EAAE,CAAA"}
@@ -0,0 +1,61 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * `fcc-bdc`: FCC Broadband Data Collection (BDC) — Fabric-derived location consumer.
7
+ *
8
+ * The first member of Phase 1.6's "adversarial sources" class. BDC ships the public-domain US
9
+ * broadband-serviceable-location (BSL) fabric: ~120M addresses keyed by stable `location_id`,
10
+ * carrying `address_primary` + `city` + `state` + `zip` + `zip_suffix`. Compared to the clean
11
+ * gazetteer rows from WOF / TIGER / BAN, BDC carries the chaos of address data that has passed
12
+ * through several layers of human entry + automated geocoding + revision: abbreviation drift,
13
+ * inconsistent unit designators, "RR" / "HC" / "PSC" rural-route shapes, embedded apartment /
14
+ * suite numbers that did not survive the address parser cleanly. This is the highest-signal,
15
+ * hardest-to-normalize address corpus in the federal public-domain catalog.
16
+ *
17
+ * Following the `tiger` / `wof-admin` pattern, this adapter consumes a SQLite database the operator
18
+ * pre-builds via the isp-nexus BDC ETL (`/srv/isp-nexus/sync/fcc/bdc/`) or any equivalent
19
+ * host-side pipeline. The mailwoman side does not download or parse the raw CSV/ZIP distribution
20
+ * directly — that keeps the adapter narrow and the BDC ingest pluggable.
21
+ *
22
+ * The SQLite schema is documented in README.md and modeled after `NTIARecord`
23
+ * (`isp-nexus/fcc/bdc/data-collection.ts`): one row per `location_id`. The adapter splits
24
+ * `address_primary` into `house_number` (leading numeric prefix, if any) + `street` (everything
25
+ * after), and combines `zip` + `zip_suffix` into the canonical USPS `postcode` slot.
26
+ *
27
+ * One CanonicalRow per fabric record. Unlike `tiger` (multiple postcode variants per segment) or
28
+ * `wof-admin` (multiple hierarchy variants per place), BDC records already represent fully
29
+ * specified addresses; no fan-out is warranted. Adversarial composition (Phase 1.6 §2.1) is the
30
+ * mechanism for deriving multiple training rows per BDC record.
31
+ *
32
+ * License: stamped `"Public Domain"` per the BDC fabric's US federal-government distribution terms.
33
+ * The CostQuest Fabric source data has its own license; consumers who substitute that path should
34
+ * re-stamp accordingly.
35
+ */
36
+ import type { CorpusAdapter } from "../../types.js";
37
+ export declare const FCC_BDC_ADAPTER_ID = "fcc-bdc"; // identifier of this adapter
38
+ export declare const FCC_BDC_DEFAULT_LICENSE = "Public Domain"; // license string this adapter stamps by default (see the License note in the header comment)
39
+ interface SplitAddress { // result shape of splitAddressPrimary (not exported from this module)
40
+ house_number?: string; // optional: absent when no leading number prefix was recognised
41
+ street: string; // always present
42
+ }
43
+ export declare function splitAddressPrimary(address: string): SplitAddress | null; // splits an address_primary value into house_number + street; null is the no-result case
44
+ /**
45
+ * Combine `zip` + optional `zip_suffix` into the canonical USPS postcode surface form.
46
+ *
47
+ * NTIARecord doc is ambiguous about whether `zip_suffix` is the 4-digit extension alone or the full
48
+ * ZIP+4 string. This handles both:
49
+ *
50
+ * - Bare 4-digit extension (`zip="94103"`, `zip_suffix="1234"`) → `"94103-1234"`
51
+ * - Already-joined form (`zip_suffix="94103-1234"`, detected by a `-` in the suffix) → suffix returned as-is; `zip` is not re-applied
52
+ * - No suffix → bare `zip`
53
+ *
54
+ * Empty / whitespace-only suffix is treated as missing. An empty / whitespace-only `zip` yields `""` regardless of the suffix.
55
+ */
56
+ export declare function buildPostcode(zip: string, suffix: string | null): string;
57
+ /** Build a BDC adapter. Pure factory so multiple instances can be created in tests. */
58
+ export declare function createFccBdcAdapter(): CorpusAdapter;
59
+ export declare const fccBdcAdapter: CorpusAdapter; // ready-made module-level adapter instance
60
+ export {};
61
+ //# sourceMappingURL=adapter.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"adapter.d.ts","sourceRoot":"","sources":["../../../../src/adapters/fcc-bdc/adapter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAkCG;AAKH,OAAO,KAAK,EAAgC,aAAa,EAAE,MAAM,gBAAgB,CAAA;AAEjF,eAAO,MAAM,kBAAkB,YAAY,CAAA;AAC3C,eAAO,MAAM,uBAAuB,kBAAkB,CAAA;AA0BtD,UAAU,YAAY;IACrB,YAAY,CAAC,EAAE,MAAM,CAAA;IACrB,MAAM,EAAE,MAAM,CAAA;CACd;AAED,wBAAgB,mBAAmB,CAAC,OAAO,EAAE,MAAM,GAAG,YAAY,GAAG,IAAI,CAQxE;AAED;;;;;;;;;;;GAWG;AACH,wBAAgB,aAAa,CAAC,GAAG,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,GAAG,IAAI,GAAG,MAAM,CAOxE;AAED,uFAAuF;AACvF,wBAAgB,mBAAmB,IAAI,aAAa,CAgEnD;AAED,eAAO,MAAM,aAAa,eAAwB,CAAA"}
@@ -0,0 +1,153 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * `fcc-bdc`: FCC Broadband Data Collection (BDC) — Fabric-derived location consumer.
7
+ *
8
+ * The first member of Phase 1.6's "adversarial sources" class. BDC ships the public-domain US
9
+ * broadband-serviceable-location (BSL) fabric: ~120M addresses keyed by stable `location_id`,
10
+ * carrying `address_primary` + `city` + `state` + `zip` + `zip_suffix`. Compared to the clean
11
+ * gazetteer rows from WOF / TIGER / BAN, BDC carries the chaos of address data that has passed
12
+ * through several layers of human entry + automated geocoding + revision: abbreviation drift,
13
+ * inconsistent unit designators, "RR" / "HC" / "PSC" rural-route shapes, embedded apartment /
14
+ * suite numbers that did not survive the address parser cleanly. This is the highest-signal,
15
+ * hardest-to-normalize address corpus in the federal public-domain catalog.
16
+ *
17
+ * Following the `tiger` / `wof-admin` pattern, this adapter consumes a SQLite database the operator
18
+ * pre-builds via the isp-nexus BDC ETL (`/srv/isp-nexus/sync/fcc/bdc/`) or any equivalent
19
+ * host-side pipeline. The mailwoman side does not download or parse the raw CSV/ZIP distribution
20
+ * directly — that keeps the adapter narrow and the BDC ingest pluggable.
21
+ *
22
+ * The SQLite schema is documented in README.md and modeled after `NTIARecord`
23
+ * (`isp-nexus/fcc/bdc/data-collection.ts`): one row per `location_id`. The adapter splits
24
+ * `address_primary` into `house_number` (leading numeric prefix, if any) + `street` (everything
25
+ * after), and combines `zip` + `zip_suffix` into the canonical USPS `postcode` slot.
26
+ *
27
+ * One CanonicalRow per fabric record. Unlike `tiger` (multiple postcode variants per segment) or
28
+ * `wof-admin` (multiple hierarchy variants per place), BDC records already represent fully
29
+ * specified addresses; no fan-out is warranted. Adversarial composition (Phase 1.6 §2.1) is the
30
+ * mechanism for deriving multiple training rows per BDC record.
31
+ *
32
+ * License: stamped `"Public Domain"` per the BDC fabric's US federal-government distribution terms.
33
+ * The CostQuest Fabric source data has its own license; consumers who substitute that path should
34
+ * re-stamp accordingly.
35
+ */
36
+ import Database from "better-sqlite3";
37
+ import { lookupStateAbbreviation } from "../../codex/us-fips-state.js";
38
+ import { formatAddress, reconcileComponents } from "../../format.js";
39
+ export const FCC_BDC_ADAPTER_ID = "fcc-bdc"; // adapter id; also written to each emitted row as its source and as the prefix of its source_id
40
+ export const FCC_BDC_DEFAULT_LICENSE = "Public Domain"; // the adapter's defaultLicense and the license value on every emitted row
41
+ /**
42
+ * Split `address_primary` into a `(house_number, street)` pair.
43
+ *
44
+ * BDC's `address_primary` follows USPS Publication 28 conventions but with hand-entry drift. The
45
+ * canonical leading-digit prefix is the house number (`"123 Main St"`, `"6450 W Indian School Rd"`,
46
+ * even hyphenated forms `"40-12 Bell Blvd"`). Anything that doesn't match the prefix shape (`"PO
47
+ * Box 1234"`, `"RR 2 Box 67"`, `"HC 1"`) is left as a single `street` value — the model sees the
48
+ * original surface form, and downstream classifiers/po-box handling can pick it up.
49
+ *
50
+ * The regex tolerates one trailing letter (`"123A Main St"`) and an optional hyphenated half
51
+ * (`"40-12"`) which is common in NYC + suburban garden-apartment numbering.
52
+ */
53
+ const HOUSE_NUMBER_PREFIX = /^(\d+(?:-\d+)?[A-Za-z]?)\s+(.+)$/; // capture 1: digits, optional hyphen+digits, optional single letter; capture 2: everything after the separating whitespace
54
+ export function splitAddressPrimary(address) { // returns { house_number?, street } for a non-blank address string, or null when it is blank
55
+ const trimmed = address.trim();
56
+ if (!trimmed)
57
+ return null; // nothing left after trimming: report no address
58
+ const match = HOUSE_NUMBER_PREFIX.exec(trimmed);
59
+ if (match) {
60
+ return { house_number: match[1], street: match[2].trim() }; // capture 1 is the number prefix, capture 2 the remainder
61
+ }
62
+ return { street: trimmed }; // no number prefix matched: the whole trimmed string becomes the street
63
+ }
64
+ /**
65
+ * Combine `zip` + optional `zip_suffix` into the canonical USPS postcode surface form.
66
+ *
67
+ * NTIARecord doc is ambiguous about whether `zip_suffix` is the 4-digit extension alone or the full
68
+ * ZIP+4 string. This handles both:
69
+ *
70
+ * - Bare 4-digit extension (`zip="94103"`, `zip_suffix="1234"`) → `"94103-1234"`
71
+ * - Already-joined form (`zip_suffix="94103-1234"`, detected by a `-` in the suffix) → suffix returned as-is; `zip` is not re-applied
72
+ * - No suffix → bare `zip`
73
+ *
74
+ * Empty / whitespace-only suffix is treated as missing. An empty / whitespace-only `zip` yields `""` regardless of the suffix.
75
+ */
76
+ export function buildPostcode(zip, suffix) { // joins zip and an optional suffix into one postcode string
77
+ const z = zip.trim();
78
+ if (!z)
79
+ return ""; // blank zip: empty result, checked before the suffix is even looked at
80
+ const s = suffix?.trim() ?? ""; // a null or undefined suffix collapses to the empty string
81
+ if (!s)
82
+ return z; // no usable suffix: bare zip
83
+ if (s.includes("-"))
84
+ return s; // a hyphen in the suffix is taken to mean it already holds the joined form; zip is ignored
85
+ return `${z}-${s}`; // bare extension: zip, hyphen, suffix
86
+ }
87
+ /** Build a BDC adapter. Pure factory so multiple instances can be created in tests. */
88
+ export function createFccBdcAdapter() { // factory: every call returns a new adapter object literal
89
+ return {
90
+ id: FCC_BDC_ADAPTER_ID,
91
+ defaultLicense: FCC_BDC_DEFAULT_LICENSE,
92
+ description: "FCC Broadband Data Collection — Fabric-derived BSL addresses (public-domain); SQLite DB the operator builds via the isp-nexus BDC ETL.",
93
+ async *rows(opts) { // async generator: yields at most one row object per bdc_locations record; reads opts.country, opts.inputPath, opts.signal and opts.limit
94
+ if (opts.country && opts.country !== "US") { // an unset country is accepted; any other value than US is rejected
95
+ throw new Error(`fcc-bdc adapter: only US supported, got country=${opts.country}`);
96
+ }
97
+ const db = new Database(opts.inputPath, { readonly: true, fileMustExist: true }); // opened read-only; NOTE(review): per better-sqlite3 docs fileMustExist makes a missing file an error rather than creating it
98
+ let emitted = 0; // rows yielded so far, compared against opts.limit
99
+ try {
100
+ const stmt = db.prepare(`SELECT location_id, address_primary, city, state, zip, zip_suffix
101
+ FROM bdc_locations
102
+ ORDER BY location_id`); // fixed ordering by location_id
103
+ for (const row of stmt.iterate()) {
104
+ if (opts.signal?.aborted) // stop as soon as the caller's signal reports aborted
105
+ return;
106
+ if (opts.limit !== undefined && emitted >= opts.limit) // limit counts yielded rows, not rows read
107
+ return;
108
+ const split = splitAddressPrimary(row.address_primary ?? "");
109
+ if (!split)
110
+ continue; // blank or missing address_primary: skip the record
111
+ const state = lookupStateAbbreviation(row.state); // NOTE(review): helper defined elsewhere; presumably resolves the stored state value to a record carrying an abbreviation - confirm
112
+ if (!state)
113
+ continue; // state not resolved: skip the record
114
+ const locality = row.city?.trim();
115
+ if (!locality)
116
+ continue; // missing or blank city: skip the record
117
+ const postcode = buildPostcode(row.zip ?? "", row.zip_suffix ?? null);
118
+ if (!postcode)
119
+ continue; // buildPostcode returned an empty string (blank zip): skip the record
120
+ const components = {
121
+ ...(split.house_number ? { house_number: split.house_number } : {}), // key is only present when the split produced a house number
122
+ street: split.street,
123
+ locality,
124
+ region: state.abbreviation,
125
+ postcode,
126
+ };
127
+ const raw = formatAddress(components, "US", { separator: ", " }); // NOTE(review): formatAddress is defined in format.js; presumably renders the components as a single string - confirm
128
+ if (!raw)
129
+ continue;
130
+ const aligned = reconcileComponents(components, raw); // NOTE(review): defined in format.js; presumably keeps only components that line up with the rendered string - confirm
131
+ if (Object.keys(aligned).length === 0)
132
+ continue; // nothing survived reconciliation: skip the record
133
+ yield {
134
+ raw,
135
+ components: aligned,
136
+ country: "US",
137
+ locale: "en-US",
138
+ source: FCC_BDC_ADAPTER_ID,
139
+ source_id: `${FCC_BDC_ADAPTER_ID}-${row.location_id}`, // adapter id, hyphen, then the record's location_id
140
+ corpus_version: "", // always emitted empty here; NOTE(review): presumably filled in downstream - confirm
141
+ license: FCC_BDC_DEFAULT_LICENSE,
142
+ };
143
+ emitted++;
144
+ }
145
+ }
146
+ finally { // runs on completion, on the early returns above and on a thrown error, so the handle is always closed
147
+ db.close();
148
+ }
149
+ },
150
+ };
151
+ }
152
+ export const fccBdcAdapter = createFccBdcAdapter(); // module-level adapter instance, created once when the module is evaluated
153
+ //# sourceMappingURL=adapter.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"adapter.js","sourceRoot":"","sources":["../../../../src/adapters/fcc-bdc/adapter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAkCG;AAEH,OAAO,QAAQ,MAAM,gBAAgB,CAAA;AACrC,OAAO,EAAE,uBAAuB,EAAE,MAAM,8BAA8B,CAAA;AACtE,OAAO,EAAE,aAAa,EAAE,mBAAmB,EAAE,MAAM,iBAAiB,CAAA;AAGpE,MAAM,CAAC,MAAM,kBAAkB,GAAG,SAAS,CAAA;AAC3C,MAAM,CAAC,MAAM,uBAAuB,GAAG,eAAe,CAAA;AAYtD;;;;;;;;;;;GAWG;AACH,MAAM,mBAAmB,GAAG,kCAAkC,CAAA;AAO9D,MAAM,UAAU,mBAAmB,CAAC,OAAe;IAClD,MAAM,OAAO,GAAG,OAAO,CAAC,IAAI,EAAE,CAAA;IAC9B,IAAI,CAAC,OAAO;QAAE,OAAO,IAAI,CAAA;IACzB,MAAM,KAAK,GAAG,mBAAmB,CAAC,IAAI,CAAC,OAAO,CAAC,CAAA;IAC/C,IAAI,KAAK,EAAE,CAAC;QACX,OAAO,EAAE,YAAY,EAAE,KAAK,CAAC,CAAC,CAAC,EAAE,MAAM,EAAE,KAAK,CAAC,CAAC,CAAE,CAAC,IAAI,EAAE,EAAE,CAAA;IAC5D,CAAC;IACD,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,CAAA;AAC3B,CAAC;AAED;;;;;;;;;;;GAWG;AACH,MAAM,UAAU,aAAa,CAAC,GAAW,EAAE,MAAqB;IAC/D,MAAM,CAAC,GAAG,GAAG,CAAC,IAAI,EAAE,CAAA;IACpB,IAAI,CAAC,CAAC;QAAE,OAAO,EAAE,CAAA;IACjB,MAAM,CAAC,GAAG,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,CAAA;IAC9B,IAAI,CAAC,CAAC;QAAE,OAAO,CAAC,CAAA;IAChB,IAAI,CAAC,CAAC,QAAQ,CAAC,GAAG,CAAC;QAAE,OAAO,CAAC,CAAA;IAC7B,OAAO,GAAG,CAAC,IAAI,CAAC,EAAE,CAAA;AACnB,CAAC;AAED,uFAAuF;AACvF,MAAM,UAAU,mBAAmB;IAClC,OAAO;QACN,EAAE,EAAE,kBAAkB;QACtB,cAAc,EAAE,uBAAuB;QACvC,WAAW,EACV,wIAAwI;QAEzI,KAAK,CAAC,CAAC,IAAI,CAAC,IAAoB;YAC/B,IAAI,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC,OAAO,KAAK,IAAI,EAAE,CAAC;gBAC3C,MAAM,IAAI,KAAK,CAAC,mDAAmD,IAAI,CAAC,OAAO,EAAE,CAAC,CAAA;YACnF,CAAC;YAED,MAAM,EAAE,GAAG,IAAI,QAAQ,CAAC,IAAI,CAAC,SAAS,EAAE,EAAE,QAAQ,EAAE,IAAI,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,CAAA;YAChF,IAAI,OAAO,GAAG,CAAC,CAAA;YACf,IAAI,CAAC;gBACJ,MAAM,IAAI,GAAG,EAAE,CAAC,OAAO,CACtB;;2BAEsB,CACtB,CAAA;gBAED,KAAK,MAAM,GAAG,IAAI,IAAI,CAAC,OAAO,EAAE,EAAE,CAAC;oBAClC,IAAI,IAAI,CAAC,MAAM,EAAE,OAAO;wBAAE,OAAM;oBAChC,IAAI,IAAI,CAAC,KAAK,KAAK,SAAS,IAAI,OAAO,IAAI,IAAI,CAAC,KAAK;wBAAE,OAAM;oBAE7D,MAAM,KAAK,GAAG,mBAAmB,CAAC,GAAG,CAAC,eAAe,IAAI,EAAE,CAAC,CAAA;oBAC5D,IAAI,CAAC,KAAK;wBAAE,SAAQ;oBACpB,MAAM,KAAK,GAAG,uBAAuB,
CAAC,GAAG,CAAC,KAAK,CAAC,CAAA;oBAChD,IAAI,CAAC,KAAK;wBAAE,SAAQ;oBACpB,MAAM,QAAQ,GAAG,GAAG,CAAC,IAAI,EAAE,IAAI,EAAE,CAAA;oBACjC,IAAI,CAAC,QAAQ;wBAAE,SAAQ;oBACvB,MAAM,QAAQ,GAAG,aAAa,CAAC,GAAG,CAAC,GAAG,IAAI,EAAE,EAAE,GAAG,CAAC,UAAU,IAAI,IAAI,CAAC,CAAA;oBACrE,IAAI,CAAC,QAAQ;wBAAE,SAAQ;oBAEvB,MAAM,UAAU,GAA+B;wBAC9C,GAAG,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC,CAAC,EAAE,YAAY,EAAE,KAAK,CAAC,YAAY,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;wBACnE,MAAM,EAAE,KAAK,CAAC,MAAM;wBACpB,QAAQ;wBACR,MAAM,EAAE,KAAK,CAAC,YAAY;wBAC1B,QAAQ;qBACR,CAAA;oBAED,MAAM,GAAG,GAAG,aAAa,CAAC,UAAU,EAAE,IAAI,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAA;oBAChE,IAAI,CAAC,GAAG;wBAAE,SAAQ;oBAClB,MAAM,OAAO,GAAG,mBAAmB,CAAC,UAAU,EAAE,GAAG,CAAC,CAAA;oBACpD,IAAI,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,MAAM,KAAK,CAAC;wBAAE,SAAQ;oBAE/C,MAAM;wBACL,GAAG;wBACH,UAAU,EAAE,OAAO;wBACnB,OAAO,EAAE,IAAI;wBACb,MAAM,EAAE,OAAO;wBACf,MAAM,EAAE,kBAAkB;wBAC1B,SAAS,EAAE,GAAG,kBAAkB,IAAI,GAAG,CAAC,WAAW,EAAE;wBACrD,cAAc,EAAE,EAAE;wBAClB,OAAO,EAAE,uBAAuB;qBAChC,CAAA;oBACD,OAAO,EAAE,CAAA;gBACV,CAAC;YACF,CAAC;oBAAS,CAAC;gBACV,EAAE,CAAC,KAAK,EAAE,CAAA;YACX,CAAC;QACF,CAAC;KACD,CAAA;AACF,CAAC;AAED,MAAM,CAAC,MAAM,aAAa,GAAG,mBAAmB,EAAE,CAAA"}