@mailwoman/corpus 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137) hide show
  1. package/out/src/adapter.d.ts +96 -0
  2. package/out/src/adapter.d.ts.map +1 -0
  3. package/out/src/adapter.js +107 -0
  4. package/out/src/adapter.js.map +1 -0
  5. package/out/src/adapters/ban/adapter.d.ts +32 -0
  6. package/out/src/adapters/ban/adapter.d.ts.map +1 -0
  7. package/out/src/adapters/ban/adapter.js +133 -0
  8. package/out/src/adapters/ban/adapter.js.map +1 -0
  9. package/out/src/adapters/fcc-bdc/adapter.d.ts +61 -0
  10. package/out/src/adapters/fcc-bdc/adapter.d.ts.map +1 -0
  11. package/out/src/adapters/fcc-bdc/adapter.js +153 -0
  12. package/out/src/adapters/fcc-bdc/adapter.js.map +1 -0
  13. package/out/src/adapters/index.d.ts +42 -0
  14. package/out/src/adapters/index.d.ts.map +1 -0
  15. package/out/src/adapters/index.js +76 -0
  16. package/out/src/adapters/index.js.map +1 -0
  17. package/out/src/adapters/openaddresses/adapter.d.ts +60 -0
  18. package/out/src/adapters/openaddresses/adapter.d.ts.map +1 -0
  19. package/out/src/adapters/openaddresses/adapter.js +174 -0
  20. package/out/src/adapters/openaddresses/adapter.js.map +1 -0
  21. package/out/src/adapters/state-ia-contractors/adapter.d.ts +23 -0
  22. package/out/src/adapters/state-ia-contractors/adapter.d.ts.map +1 -0
  23. package/out/src/adapters/state-ia-contractors/adapter.js +113 -0
  24. package/out/src/adapters/state-ia-contractors/adapter.js.map +1 -0
  25. package/out/src/adapters/state-ny-notaries/adapter.d.ts +21 -0
  26. package/out/src/adapters/state-ny-notaries/adapter.d.ts.map +1 -0
  27. package/out/src/adapters/state-ny-notaries/adapter.js +132 -0
  28. package/out/src/adapters/state-ny-notaries/adapter.js.map +1 -0
  29. package/out/src/adapters/state-tx-notaries/adapter.d.ts +22 -0
  30. package/out/src/adapters/state-tx-notaries/adapter.d.ts.map +1 -0
  31. package/out/src/adapters/state-tx-notaries/adapter.js +125 -0
  32. package/out/src/adapters/state-tx-notaries/adapter.js.map +1 -0
  33. package/out/src/adapters/tiger/adapter.d.ts +45 -0
  34. package/out/src/adapters/tiger/adapter.d.ts.map +1 -0
  35. package/out/src/adapters/tiger/adapter.js +179 -0
  36. package/out/src/adapters/tiger/adapter.js.map +1 -0
  37. package/out/src/adapters/usgov-hrsa-fqhc/adapter.d.ts +36 -0
  38. package/out/src/adapters/usgov-hrsa-fqhc/adapter.d.ts.map +1 -0
  39. package/out/src/adapters/usgov-hrsa-fqhc/adapter.js +147 -0
  40. package/out/src/adapters/usgov-hrsa-fqhc/adapter.js.map +1 -0
  41. package/out/src/adapters/usgov-imls-pls/adapter.d.ts +25 -0
  42. package/out/src/adapters/usgov-imls-pls/adapter.d.ts.map +1 -0
  43. package/out/src/adapters/usgov-imls-pls/adapter.js +118 -0
  44. package/out/src/adapters/usgov-imls-pls/adapter.js.map +1 -0
  45. package/out/src/adapters/usgov-nad/adapter.d.ts +37 -0
  46. package/out/src/adapters/usgov-nad/adapter.d.ts.map +1 -0
  47. package/out/src/adapters/usgov-nad/adapter.js +227 -0
  48. package/out/src/adapters/usgov-nad/adapter.js.map +1 -0
  49. package/out/src/adapters/usgov-nppes/adapter.d.ts +28 -0
  50. package/out/src/adapters/usgov-nppes/adapter.d.ts.map +1 -0
  51. package/out/src/adapters/usgov-nppes/adapter.js +123 -0
  52. package/out/src/adapters/usgov-nppes/adapter.js.map +1 -0
  53. package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.d.ts +35 -0
  54. package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.d.ts.map +1 -0
  55. package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.js +162 -0
  56. package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.js.map +1 -0
  57. package/out/src/adapters/wof-admin-json/adapter.d.ts +85 -0
  58. package/out/src/adapters/wof-admin-json/adapter.d.ts.map +1 -0
  59. package/out/src/adapters/wof-admin-json/adapter.js +241 -0
  60. package/out/src/adapters/wof-admin-json/adapter.js.map +1 -0
  61. package/out/src/adapters/wof-postalcode-json/adapter.d.ts +63 -0
  62. package/out/src/adapters/wof-postalcode-json/adapter.d.ts.map +1 -0
  63. package/out/src/adapters/wof-postalcode-json/adapter.js +178 -0
  64. package/out/src/adapters/wof-postalcode-json/adapter.js.map +1 -0
  65. package/out/src/align.d.ts +58 -0
  66. package/out/src/align.d.ts.map +1 -0
  67. package/out/src/align.js +139 -0
  68. package/out/src/align.js.map +1 -0
  69. package/out/src/build.d.ts +104 -0
  70. package/out/src/build.d.ts.map +1 -0
  71. package/out/src/build.js +201 -0
  72. package/out/src/build.js.map +1 -0
  73. package/out/src/codex/us-fips-state.d.ts +44 -0
  74. package/out/src/codex/us-fips-state.d.ts.map +1 -0
  75. package/out/src/codex/us-fips-state.js +105 -0
  76. package/out/src/codex/us-fips-state.js.map +1 -0
  77. package/out/src/codex/us-street-suffix.d.ts +259 -0
  78. package/out/src/codex/us-street-suffix.d.ts.map +1 -0
  79. package/out/src/codex/us-street-suffix.js +285 -0
  80. package/out/src/codex/us-street-suffix.js.map +1 -0
  81. package/out/src/format.d.ts +79 -0
  82. package/out/src/format.d.ts.map +1 -0
  83. package/out/src/format.js +151 -0
  84. package/out/src/format.js.map +1 -0
  85. package/out/src/golden.d.ts +50 -0
  86. package/out/src/golden.d.ts.map +1 -0
  87. package/out/src/golden.js +104 -0
  88. package/out/src/golden.js.map +1 -0
  89. package/out/src/index.d.ts +18 -0
  90. package/out/src/index.d.ts.map +1 -0
  91. package/out/src/index.js +18 -0
  92. package/out/src/index.js.map +1 -0
  93. package/out/src/parquet-wrapper/index.d.ts +12 -0
  94. package/out/src/parquet-wrapper/index.d.ts.map +1 -0
  95. package/out/src/parquet-wrapper/index.js +12 -0
  96. package/out/src/parquet-wrapper/index.js.map +1 -0
  97. package/out/src/parquet-wrapper/reader.d.ts +31 -0
  98. package/out/src/parquet-wrapper/reader.d.ts.map +1 -0
  99. package/out/src/parquet-wrapper/reader.js +54 -0
  100. package/out/src/parquet-wrapper/reader.js.map +1 -0
  101. package/out/src/parquet-wrapper/schema.d.ts +45 -0
  102. package/out/src/parquet-wrapper/schema.d.ts.map +1 -0
  103. package/out/src/parquet-wrapper/schema.js +55 -0
  104. package/out/src/parquet-wrapper/schema.js.map +1 -0
  105. package/out/src/parquet-wrapper/writer.d.ts +41 -0
  106. package/out/src/parquet-wrapper/writer.d.ts.map +1 -0
  107. package/out/src/parquet-wrapper/writer.js +71 -0
  108. package/out/src/parquet-wrapper/writer.js.map +1 -0
  109. package/out/src/parquet.d.ts +122 -0
  110. package/out/src/parquet.d.ts.map +1 -0
  111. package/out/src/parquet.js +220 -0
  112. package/out/src/parquet.js.map +1 -0
  113. package/out/src/runner.d.ts +100 -0
  114. package/out/src/runner.d.ts.map +1 -0
  115. package/out/src/runner.js +183 -0
  116. package/out/src/runner.js.map +1 -0
  117. package/out/src/split.d.ts +108 -0
  118. package/out/src/split.d.ts.map +1 -0
  119. package/out/src/split.js +191 -0
  120. package/out/src/split.js.map +1 -0
  121. package/out/src/synthesize.d.ts +146 -0
  122. package/out/src/synthesize.d.ts.map +1 -0
  123. package/out/src/synthesize.js +472 -0
  124. package/out/src/synthesize.js.map +1 -0
  125. package/out/src/tokenize.d.ts +47 -0
  126. package/out/src/tokenize.d.ts.map +1 -0
  127. package/out/src/tokenize.js +49 -0
  128. package/out/src/tokenize.js.map +1 -0
  129. package/out/src/types.d.ts +168 -0
  130. package/out/src/types.d.ts.map +1 -0
  131. package/out/src/types.js +19 -0
  132. package/out/src/types.js.map +1 -0
  133. package/out/src/wof-json.d.ts +105 -0
  134. package/out/src/wof-json.d.ts.map +1 -0
  135. package/out/src/wof-json.js +174 -0
  136. package/out/src/wof-json.js.map +1 -0
  137. package/package.json +36 -0
@@ -0,0 +1,168 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Canonical row schemas for the corpus pipeline (per #6 / Phase 1 plan).
7
+ *
8
+ * The corpus pipeline produces two row shapes:
9
+ *
10
+ * 1. `CanonicalRow`: an adapter's raw output. Carries a free-form `raw` string, a per-component
11
+ * ground-truth dict, provenance, and an optional augmentation marker. Adapters emit these.
12
+ * 2. `LabeledRow`: alignment's output. Adds a SentencePiece token list and a parallel BIO label list,
13
+ * suitable for direct ingestion by the neural training loop.
14
+ *
15
+ * `CorpusAdapter` is the contract every data source implements; `AdapterOptions` is the
16
+ * per-invocation knob set (input path, optional country filter, row cap, abort signal).
17
+ */
18
+ import type { BioLabel, ComponentTag } from "@mailwoman/core/types";
19
+ /**
20
+ * Provenance + augmentation metadata that travels with every corpus row.
21
+ *
22
+ * `synth` is `undefined` for natural (un-augmented) rows; present only when a row was produced by
23
+ * the synthesis pipeline (see `synthesize.ts`).
24
+ */
25
+ export interface SourceProvenance {
26
+ /** Adapter id that emitted this row, e.g. `"wof-admin"`, `"ban"`, `"openaddresses"`. */
27
+ source: string;
28
+ /**
29
+ * Stable id within the adapter's source. For SQLite-backed adapters this is the row's primary
30
+ * key; for CSV/GeoJSON, a hash of the canonical components. Must be stable across reruns so that
31
+ * dedup and holdout manifests are reproducible.
32
+ */
33
+ source_id: string;
34
+ /**
35
+ * Corpus version string. Stamped by the runner, not the adapter. Locked together with the
36
+ * tokenizer version: `corpus-v0.1.0` ships with `tokenizer-v0.1.0`.
37
+ */
38
+ corpus_version: string;
39
+ /**
40
+ * Short license label or SPDX id for _this_ row. Defaults to the adapter's `defaultLicense`, but
41
+ * per-row sources (OpenAddresses) override.
42
+ */
43
+ license: string;
44
+ }
45
+ /**
46
+ * Marker placed on rows produced by `synthesize.ts`. Allows downstream code to weight, stratify, or
47
+ * exclude augmentations.
48
+ */
49
+ export interface SynthMarker {
50
+ /**
51
+ * Pipeline id describing what augmentation produced this row. Free-form but stable — e.g.
52
+ * `"case-perturb"`, `"accent-strip"`, `"abbrev-swap"`, `"compose:case-perturb+typo"`.
53
+ */
54
+ method: string;
55
+ /**
56
+ * `source_id` of the un-augmented row this was derived from. Allows tracing every synthetic row
57
+ * back to its natural ancestor.
58
+ */
59
+ base_source_id: string;
60
+ }
61
+ /**
62
+ * One address row, before tokenization + BIO labeling.
63
+ *
64
+ * `raw` is what a parser would see in the wild — possibly multi-line, with arbitrary whitespace.
65
+ * `components` is the ground-truth tagging: every `ComponentTag` present in the source data, mapped
66
+ * to its surface form _as it appears in `raw`_. Alignment uses this to assign BIO labels.
67
+ *
68
+ * Country is ISO 3166-1 alpha-2 (`"US"`, `"FR"`). Locale is BCP-47 (`"en-US"`, `"fr-FR"`) and is
69
+ * optional; adapters that can't be sure leave it empty and let the runner default by country.
70
+ */
71
+ export interface CanonicalRow extends SourceProvenance {
72
+ /** Address string as it might appear in source data. */
73
+ raw: string;
74
+ /**
75
+ * Component-by-tag ground truth. Surface forms must occur in `raw` (within the alignment edit
76
+ * distance threshold) or the row will land in the quarantine pile.
77
+ */
78
+ components: Partial<Record<ComponentTag, string>>;
79
+ /** ISO 3166-1 alpha-2 country code. */
80
+ country: string;
81
+ /** Optional BCP-47 locale. Defaulted by country if absent. */
82
+ locale?: string;
83
+ /** Present only on synthetic rows. */
84
+ synth?: SynthMarker;
85
+ }
86
+ /**
87
+ * Output of `align.ts`. Carries everything `CanonicalRow` does, plus parallel `tokens` and `labels`
88
+ * arrays of identical length. `labels[i]` is the BIO tag for `tokens[i]`.
89
+ */
90
+ export interface LabeledRow extends CanonicalRow {
91
+ /** SentencePiece subword tokens for `raw`. */
92
+ tokens: readonly string[];
93
+ /** BIO labels, one per token. Same length as `tokens`. */
94
+ labels: readonly BioLabel[];
95
+ }
96
+ /**
97
+ * A row that alignment refused to label. Lands in `/data/corpus/quarantine/` for human review.
98
+ *
99
+ * The `reason` is human-readable; common values are `"component-not-found:<tag>"`,
100
+ * `"edit-distance-exceeded:<tag>"`, `"raw-empty"`. Re-running alignment after a fix should re-emit
101
+ * the quarantined rows; the runner keys them by `source_id`.
102
+ */
103
+ export interface QuarantinedRow {
104
+ row: CanonicalRow;
105
+ reason: string;
106
+ }
107
+ /**
108
+ * Per-invocation knobs handed to an adapter by the runner.
109
+ *
110
+ * `inputPath` is interpreted by the adapter — it might be a single file path, a directory of
111
+ * shards, or even an HTTPS URL. Each adapter documents its own expected shape in its README.
112
+ *
113
+ * `country` filters to a single ISO 3166-1 alpha-2 country _at the adapter level_. Adapters that
114
+ * hold multi-country data (OSM PBF, OpenAddresses) MUST honor this; single-country adapters (BAN)
115
+ * need not filter on it, but should reject a requested `country` that is not their own.
116
+ *
117
+ * `limit` is a soft cap on rows emitted; useful for fixture-driven tests and smoke runs.
118
+ *
119
+ * `signal` allows the runner to cancel a long-running scan cleanly.
120
+ */
121
+ export interface AdapterOptions {
122
+ /** Path to the adapter's input data (file, directory, or URL — adapter-specific). */
123
+ inputPath: string;
124
+ /** Optional output directory, available to adapters that maintain side state (rare). */
125
+ outputDir?: string;
126
+ /** ISO 3166-1 alpha-2 country filter. */
127
+ country?: string;
128
+ /** Soft row cap. Adapters should stop iterating once this is reached. */
129
+ limit?: number;
130
+ /** Cancellation hook. Adapters should respect this on every iteration boundary. */
131
+ signal?: AbortSignal;
132
+ }
133
+ /**
134
+ * The contract every data source implements.
135
+ *
136
+ * Adapters are async generators: they yield `CanonicalRow`s one at a time, the runner consumes them
137
+ * (writing JSONL + maintaining checksums + driving alignment). Streaming is mandatory — many
138
+ * sources are tens of millions of rows and cannot be buffered.
139
+ *
140
+ * `defaultLicense` is stamped onto every emitted row's `license` field unless the adapter sets
141
+ * `license` explicitly (e.g. OpenAddresses, which carries per-source licenses).
142
+ */
143
+ export interface CorpusAdapter {
144
+ /** Stable, machine-friendly id used in paths and CLI args. E.g. `"wof-admin"`. */
145
+ readonly id: string;
146
+ /** Default SPDX-ish license label for rows from this adapter. Per-row overrides allowed. */
147
+ readonly defaultLicense: string;
148
+ /** One-sentence description shown by `npx mailwoman corpus list`. */
149
+ readonly description: string;
150
+ /**
151
+ * Async iterable of canonical rows.
152
+ *
153
+ * Implementations MUST:
154
+ *
155
+ * - Honor `opts.country` (filter or reject mismatches).
156
+ * - Honor `opts.limit` (stop after N rows).
157
+ * - Respect `opts.signal` on every iteration.
158
+ * - Set `source` to `this.id` on every emitted row.
159
+ * - Set `license` to `this.defaultLicense` unless overriding per-row.
160
+ *
161
+ * Implementations MUST NOT:
162
+ *
163
+ * - Set `corpus_version` (the runner stamps it).
164
+ * - Mutate previously-yielded rows.
165
+ */
166
+ rows(opts: AdapterOptions): AsyncIterable<CanonicalRow>;
167
+ }
168
+ //# sourceMappingURL=types.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/types.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;GAgBG;AAEH,OAAO,KAAK,EAAE,QAAQ,EAAE,YAAY,EAAE,MAAM,uBAAuB,CAAA;AAEnE;;;;;GAKG;AACH,MAAM,WAAW,gBAAgB;IAChC,wFAAwF;IACxF,MAAM,EAAE,MAAM,CAAA;IAEd;;;;OAIG;IACH,SAAS,EAAE,MAAM,CAAA;IAEjB;;;OAGG;IACH,cAAc,EAAE,MAAM,CAAA;IAEtB;;;OAGG;IACH,OAAO,EAAE,MAAM,CAAA;CACf;AAED;;;GAGG;AACH,MAAM,WAAW,WAAW;IAC3B;;;OAGG;IACH,MAAM,EAAE,MAAM,CAAA;IAEd;;;OAGG;IACH,cAAc,EAAE,MAAM,CAAA;CACtB;AAED;;;;;;;;;GASG;AACH,MAAM,WAAW,YAAa,SAAQ,gBAAgB;IACrD,wDAAwD;IACxD,GAAG,EAAE,MAAM,CAAA;IAEX;;;OAGG;IACH,UAAU,EAAE,OAAO,CAAC,MAAM,CAAC,YAAY,EAAE,MAAM,CAAC,CAAC,CAAA;IAEjD,uCAAuC;IACvC,OAAO,EAAE,MAAM,CAAA;IAEf,8DAA8D;IAC9D,MAAM,CAAC,EAAE,MAAM,CAAA;IAEf,sCAAsC;IACtC,KAAK,CAAC,EAAE,WAAW,CAAA;CACnB;AAED;;;GAGG;AACH,MAAM,WAAW,UAAW,SAAQ,YAAY;IAC/C,8CAA8C;IAC9C,MAAM,EAAE,SAAS,MAAM,EAAE,CAAA;IAEzB,0DAA0D;IAC1D,MAAM,EAAE,SAAS,QAAQ,EAAE,CAAA;CAC3B;AAED;;;;;;GAMG;AACH,MAAM,WAAW,cAAc;IAC9B,GAAG,EAAE,YAAY,CAAA;IACjB,MAAM,EAAE,MAAM,CAAA;CACd;AAED;;;;;;;;;;;;;GAaG;AACH,MAAM,WAAW,cAAc;IAC9B,qFAAqF;IACrF,SAAS,EAAE,MAAM,CAAA;IAEjB,wFAAwF;IACxF,SAAS,CAAC,EAAE,MAAM,CAAA;IAElB,yCAAyC;IACzC,OAAO,CAAC,EAAE,MAAM,CAAA;IAEhB,yEAAyE;IACzE,KAAK,CAAC,EAAE,MAAM,CAAA;IAEd,mFAAmF;IACnF,MAAM,CAAC,EAAE,WAAW,CAAA;CACpB;AAED;;;;;;;;;GASG;AACH,MAAM,WAAW,aAAa;IAC7B,kFAAkF;IAClF,QAAQ,CAAC,EAAE,EAAE,MAAM,CAAA;IAEnB,4FAA4F;IAC5F,QAAQ,CAAC,cAAc,EAAE,MAAM,CAAA;IAE/B,qEAAqE;IACrE,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAA;IAE5B;;;;;;;;;;;;;;;OAeG;IACH,IAAI,CAAC,IAAI,EAAE,cAAc,GAAG,aAAa,CAAC,YAAY,CAAC,CAAA;CACvD"}
@@ -0,0 +1,19 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Canonical row schemas for the corpus pipeline (per #6 / Phase 1 plan).
7
+ *
8
+ * The corpus pipeline produces two row shapes:
9
+ *
10
+ * 1. `CanonicalRow`: an adapter's raw output. Carries a free-form `raw` string, a per-component
11
+ * ground-truth dict, provenance, and an optional augmentation marker. Adapters emit these.
12
+ * 2. `LabeledRow`: alignment's output. Adds a SentencePiece token list and a parallel BIO label list,
13
+ * suitable for direct ingestion by the neural training loop.
14
+ *
15
+ * `CorpusAdapter` is the contract every data source implements; `AdapterOptions` is the
16
+ * per-invocation knob set (input path, optional country filter, row cap, abort signal).
17
+ */
18
+ export {};
19
+ //# sourceMappingURL=types.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.js","sourceRoot":"","sources":["../../src/types.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;GAgBG"}
@@ -0,0 +1,105 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Shared utilities for the `wof-admin` / `wof-postalcode` GeoJSON-bundle adapters.
7
+ *
8
+ * The Phase 1.5.1 pivot moved both adapters off the SpatiaLite distribution (dead mirror, empty
9
+ * `names` table) and onto the per-record GeoJSON bundles published as
10
+ * `github.com/whosonfirst-data/whosonfirst-data-{admin,postalcode}-<cc>` repos. Each repo carries
11
+ * a tree of `data/<3>/<3>/<3>/<wof-id>.geojson` files plus alternate-geometry siblings like
12
+ * `<id>-alt-quattroshapes.geojson`. The adapter only consumes the canonical record (no `-alt-`
13
+ * files); the alternate geometries are irrelevant to the name/hierarchy concerns Phase 1 cares
14
+ * about.
15
+ *
16
+ * This module provides:
17
+ *
18
+ * - `WofRecord`: the lightweight per-feature shape both adapters carry in their ancestry index.
19
+ * - `walkFeatures`: streaming directory walk → parsed `WofRecord`s (skips alt files, bad JSON,
20
+ * records whose `mz:is_current` is `0`).
21
+ * - `buildAncestryIndex`: in-memory ancestry chain construction (`Map<id, ancestors[]>`).
22
+ * - `extractNameVariants`: pulls `name:*` localized name lists off a feature's properties.
23
+ * - `normalizeNameKey`: turns `"name:eng_x_colloquial"` into `"name-eng-x-colloquial"` for safe use
24
+ * in `source_id` suffixes.
25
+ *
26
+ * `is_current` semantics follow WOF + Pelias: `mz:is_current` ∈ {`1`, `-1`} are live; `0` is
27
+ * superseded. WOF's official postalcode distribution stamps every row with `-1` ("unknown but
28
+ * treated as active"), which is why the previous SpatiaLite adapter's `is_current = 1` filter
29
+ * silently emitted zero rows from the real corpus.
30
+ */
31
+ /** A WOF GeoJSON feature, as published by the per-record bundles. */
32
+ export interface WofFeature {
33
+ type?: string;
34
+ id?: number | string;
35
+ properties?: Record<string, unknown> | null;
36
+ }
37
+ /**
38
+ * Lightweight in-memory shape both adapters keep per record. Geometry is intentionally dropped —
39
+ * it's 95% of the file weight and the adapters never consult it.
40
+ */
41
+ export interface WofRecord {
42
+ id: number;
43
+ parent_id: number | null;
44
+ /** Canonical `wof:name` of the record. */
45
+ name: string;
46
+ placetype: string;
47
+ /** ISO 3166-1 alpha-2 from `wof:country`. */
48
+ country: string;
49
+ /**
50
+ * Localized name variants from `name:*` properties.
51
+ *
52
+ * Keys are the raw `name:eng_x_preferred` form; values are the first non-empty string from the
53
+ * underlying array (WOF stores variants as arrays even when only one form is present). The
54
+ * canonical `wof:name` is NOT included here — adapters add a synthetic `"default"` slot for it.
55
+ */
56
+ nameVariants: Map<string, string>;
57
+ }
58
+ /**
59
+ * `mz:is_current` ∈ {`1`, `-1`} → keep. `0` → drop.
60
+ *
61
+ * Real WOF postalcode distros tag every row `-1` ("unknown but treated as active"); the Pelias
62
+ * importer accepts `-1` alongside `1`. The previous SpatiaLite-backed adapters filtered on `= 1`
63
+ * only and silently emitted zero rows from the corpus — this loosened predicate is the load-bearing
64
+ * fix.
65
+ */
66
+ export declare function isCurrentFeature(props: Record<string, unknown>): boolean;
67
+ /**
68
+ * Pull `name:*` localized variants off a WOF feature's properties. WOF stores variants as arrays
69
+ * (`["Saint Petersburg"]`); we lift the first non-empty string. Multiple-value variants (rare;
70
+ * usually historical aliases) are not split into separate rows by this helper — adapters can opt in
71
+ * by iterating the underlying array if they need it.
72
+ */
73
+ export declare function extractNameVariants(props: Record<string, unknown>): Map<string, string>;
74
+ /**
75
+ * Turn a `name:*` property key into a hyphen-safe suffix fragment for `source_id`.
76
+ *
77
+ * `"name:eng_x_colloquial"` → `"name-eng-x-colloquial"`. `:` and `_` both become `-` because both
78
+ * collide with the existing source_id separator vocabulary and downstream consumers split on `-`.
79
+ */
80
+ export declare function normalizeNameKey(rawKey: string): string;
81
+ /**
82
+ * Stream every canonical GeoJSON file under `repoDir` and yield parsed `WofRecord`s.
83
+ *
84
+ * `repoDir` may point at a single cloned `whosonfirst-data-*` repo OR at a parent directory holding
85
+ * several such repos (the corpus pipeline clones all four into a shared `wof/repos/` root and runs
86
+ * the adapter against that root). `**\/*.geojson` walks the whole tree; `-alt-` siblings are
87
+ * skipped since they're alternate-geometry exports, not new records.
88
+ *
89
+ * Errors per-file (unreadable, malformed JSON, missing properties) are swallowed so one bad file
90
+ * doesn't poison a 3 GB walk. Adapters can add stricter validation downstream if they need it.
91
+ */
92
+ export declare function walkFeatures(repoDir: string, opts?: {
93
+ signal?: AbortSignal;
94
+ }): AsyncIterable<WofRecord>;
95
+ /**
96
+ * Build an in-memory ancestry index: `Map<wof_id, [parent, grandparent, ...]>` walking `parent_id`
97
+ * upward and stopping at the first missing link. A cycle guard halts at any re-visit (defensive —
98
+ * WOF data is acyclic by construction but corrupt fixtures shouldn't infinite-loop the adapter).
99
+ *
100
+ * Records whose ancestors aren't in `byId` (e.g. an FR locality whose region wasn't included in the
101
+ * cloned repo set) get a shorter chain; the variant emission gracefully degrades.
102
+ */
103
+ export type AncestryIndex = Map<number, WofRecord[]>;
104
+ export declare function buildAncestryIndex(byId: Map<number, WofRecord>): AncestryIndex;
105
+ //# sourceMappingURL=wof-json.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"wof-json.d.ts","sourceRoot":"","sources":["../../src/wof-json.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6BG;AAKH,qEAAqE;AACrE,MAAM,WAAW,UAAU;IAC1B,IAAI,CAAC,EAAE,MAAM,CAAA;IACb,EAAE,CAAC,EAAE,MAAM,GAAG,MAAM,CAAA;IACpB,UAAU,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,IAAI,CAAA;CAC3C;AAED;;;GAGG;AACH,MAAM,WAAW,SAAS;IACzB,EAAE,EAAE,MAAM,CAAA;IACV,SAAS,EAAE,MAAM,GAAG,IAAI,CAAA;IACxB,0CAA0C;IAC1C,IAAI,EAAE,MAAM,CAAA;IACZ,SAAS,EAAE,MAAM,CAAA;IACjB,6CAA6C;IAC7C,OAAO,EAAE,MAAM,CAAA;IACf;;;;;;OAMG;IACH,YAAY,EAAE,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAAA;CACjC;AAED;;;;;;;GAOG;AACH,wBAAgB,gBAAgB,CAAC,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,OAAO,CAIxE;AAED;;;;;GAKG;AACH,wBAAgB,mBAAmB,CAAC,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAYvF;AAED;;;;;GAKG;AACH,wBAAgB,gBAAgB,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,CAEvD;AAwCD;;;;;;;;;;GAUG;AACH,wBAAuB,YAAY,CAAC,OAAO,EAAE,MAAM,EAAE,IAAI,GAAE;IAAE,MAAM,CAAC,EAAE,WAAW,CAAA;CAAO,GAAG,aAAa,CAAC,SAAS,CAAC,CA8BlH;AAED;;;;;;;GAOG;AACH,MAAM,MAAM,aAAa,GAAG,GAAG,CAAC,MAAM,EAAE,SAAS,EAAE,CAAC,CAAA;AAEpD,wBAAgB,kBAAkB,CAAC,IAAI,EAAE,GAAG,CAAC,MAAM,EAAE,SAAS,CAAC,GAAG,aAAa,CAiB9E"}
@@ -0,0 +1,174 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Shared utilities for the `wof-admin` / `wof-postalcode` GeoJSON-bundle adapters.
7
+ *
8
+ * The Phase 1.5.1 pivot moved both adapters off the SpatiaLite distribution (dead mirror, empty
9
+ * `names` table) and onto the per-record GeoJSON bundles published as
10
+ * `github.com/whosonfirst-data/whosonfirst-data-{admin,postalcode}-<cc>` repos. Each repo carries
11
+ * a tree of `data/<3>/<3>/<3>/<wof-id>.geojson` files plus alternate-geometry siblings like
12
+ * `<id>-alt-quattroshapes.geojson`. The adapter only consumes the canonical record (no `-alt-`
13
+ * files); the alternate geometries are irrelevant to the name/hierarchy concerns Phase 1 cares
14
+ * about.
15
+ *
16
+ * This module provides:
17
+ *
18
+ * - `WofRecord`: the lightweight per-feature shape both adapters carry in their ancestry index.
19
+ * - `walkFeatures`: streaming directory walk → parsed `WofRecord`s (skips alt files, bad JSON,
20
+ * records whose `mz:is_current` is `0`).
21
+ * - `buildAncestryIndex`: in-memory ancestry chain construction (`Map<id, ancestors[]>`).
22
+ * - `extractNameVariants`: pulls `name:*` localized name lists off a feature's properties.
23
+ * - `normalizeNameKey`: turns `"name:eng_x_colloquial"` into `"name-eng-x-colloquial"` for safe use
24
+ * in `source_id` suffixes.
25
+ *
26
+ * `is_current` semantics follow WOF + Pelias: `mz:is_current` ∈ {`1`, `-1`} are live; `0` is
27
+ * superseded. WOF's official postalcode distribution stamps every row with `-1` ("unknown but
28
+ * treated as active"), which is why the previous SpatiaLite adapter's `is_current = 1` filter
29
+ * silently emitted zero rows from the real corpus.
30
+ */
31
+ import FastGlob from "fast-glob";
32
+ import { readFile } from "node:fs/promises";
33
/**
 * Decide whether a WOF feature should be kept, based on its `mz:is_current` property.
 *
 * Only an explicit zero (the number `0` or a string that converts to `0`, e.g. `"0"`) drops the
 * record. Everything else is kept: `1`, `-1`, any other number, a non-numeric string, and a
 * missing or non-number/non-string value.
 *
 * A blank or whitespace-only string is treated like a missing value and kept. `Number("")` is `0`,
 * so without this guard an empty field would be read as an explicit "superseded" flag.
 *
 * @param props - The feature's `properties` object.
 * @returns `false` only when `mz:is_current` is an explicit zero; `true` otherwise.
 */
export function isCurrentFeature(props) {
	const raw = props["mz:is_current"];
	if (typeof raw === "number") {
		return raw !== 0;
	}
	if (typeof raw === "string") {
		const text = raw.trim();
		// Blank carries no information; do not let Number("") === 0 masquerade as "superseded".
		if (text === "") {
			return true;
		}
		// Non-numeric strings convert to NaN, which is not 0, so they are kept.
		return Number(text) !== 0;
	}
	// Missing or any other type: fall back to "current".
	return true;
}
46
/**
 * Collect the localized `name:*` entries from a WOF feature's properties.
 *
 * For each property whose key starts with `name:`, the value is resolved as follows:
 *
 * - Array value: the first element that is a string with non-whitespace content.
 * - String value: the string itself, if it has non-whitespace content.
 * - Anything else: ignored.
 *
 * The chosen string is trimmed and stored under the original, unmodified property key. Only one
 * value per key is kept; further array elements are not expanded into separate entries.
 *
 * @param props - The feature's `properties` object.
 * @returns A `Map` from raw `name:*` key to trimmed name, in property enumeration order.
 */
export function extractNameVariants(props) {
	const isUsable = (v) => typeof v === "string" && v.trim().length > 0;
	const variants = new Map();
	for (const key of Object.keys(props)) {
		if (!key.startsWith("name:")) {
			continue;
		}
		const value = props[key];
		let picked;
		if (Array.isArray(value)) {
			picked = value.find((v) => isUsable(v));
		} else if (isUsable(value)) {
			picked = value;
		}
		// `picked` is either undefined or a string with visible content.
		if (picked !== undefined) {
			variants.set(key, picked.trim());
		}
	}
	return variants;
}
67
/**
 * Convert a `name:*` property key into a fragment suitable for a `source_id` suffix by replacing
 * every `:` and every `_` with `-`.
 *
 * Example: `"name:eng_x_colloquial"` becomes `"name-eng-x-colloquial"`. All other characters are
 * left untouched.
 *
 * @param rawKey - The raw property key.
 * @returns The key with `:` and `_` replaced by `-`.
 */
export function normalizeNameKey(rawKey) {
	const pieces = rawKey.split(/[:_]/);
	return pieces.join("-");
}
76
/**
 * Coerce a number or numeric string to a finite number.
 *
 * Returns `null` for non-finite numbers, blank strings (which `Number()` would turn into `0`),
 * strings that do not convert to a finite number, and every other type.
 */
function coerceFiniteNumber(value) {
	if (typeof value === "number") {
		return Number.isFinite(value) ? value : null;
	}
	if (typeof value === "string" && value.trim() !== "") {
		const n = Number(value);
		return Number.isFinite(n) ? n : null;
	}
	return null;
}
/**
 * Reduce one parsed GeoJSON value to the lightweight record shape, or return `null` to skip it.
 *
 * A record is produced only when all of the following hold:
 *
 * - The value is an object with `type === "Feature"` and a truthy `properties`.
 * - A finite id can be resolved. A numeric `feature.id` wins, then `properties["wof:id"]`, then a
 *   numeric-string `feature.id` as a last resort. Blank strings are never accepted as an id.
 * - `wof:name` is a string with non-whitespace content (stored trimmed).
 * - `wof:placetype` and `wof:country` are non-empty strings.
 * - `isCurrentFeature(properties)` is true.
 *
 * `wof:parent_id` is coerced the same way as the id; anything that is not a finite number or
 * numeric string becomes `null`.
 *
 * @param feature - The parsed JSON value; may be any type.
 * @returns `{ id, parent_id, name, placetype, country, nameVariants }`, or `null` to skip.
 */
function recordFromFeature(feature) {
	if (!feature || feature.type !== "Feature" || !feature.properties) {
		return null;
	}
	const props = feature.properties;

	// Keep the established precedence (numeric feature.id, then wof:id) and only then fall back
	// to a string-typed feature.id, which previously caused the record to be dropped.
	const primaryId = typeof feature.id === "number" ? feature.id : props["wof:id"];
	const id = coerceFiniteNumber(primaryId) ?? coerceFiniteNumber(feature.id);
	if (id === null) {
		return null;
	}

	const name = props["wof:name"];
	if (typeof name !== "string" || !name.trim()) {
		return null;
	}
	const placetype = props["wof:placetype"];
	if (typeof placetype !== "string" || !placetype) {
		return null;
	}
	const country = props["wof:country"];
	if (typeof country !== "string" || !country) {
		return null;
	}
	if (!isCurrentFeature(props)) {
		return null;
	}

	return {
		id,
		parent_id: coerceFiniteNumber(props["wof:parent_id"]),
		name: name.trim(),
		placetype,
		country,
		nameVariants: extractNameVariants(props),
	};
}
111
/**
 * Stream every canonical GeoJSON file under `repoDir` and yield the records parsed from them.
 *
 * Files are discovered with a recursive `*.geojson` glob rooted at `repoDir`, so the directory may
 * be a single repo or a parent directory holding several. Files whose *file name* contains
 * `-alt-` are skipped. The check is made against the file name only, not the full path, so a
 * checkout directory whose own path happens to contain `-alt-` does not cause every file to be
 * skipped.
 *
 * Per-file failures (unreadable file, malformed JSON) are deliberately swallowed and the file is
 * skipped, as are files for which `recordFromFeature` returns `null`. This keeps one bad file
 * from aborting a long walk.
 *
 * @param repoDir - Root directory to walk.
 * @param opts - Optional settings. `opts.signal` is checked before each file; once it is aborted
 *   the generator returns without yielding further records.
 * @returns An async iterable of parsed records.
 */
export async function* walkFeatures(repoDir, opts = {}) {
	const stream = FastGlob.stream(["**/*.geojson"], {
		cwd: repoDir,
		absolute: true,
		onlyFiles: true,
		suppressErrors: true,
	});
	for await (const entry of stream) {
		if (opts.signal?.aborted) {
			return;
		}
		const filePath = String(entry);
		// Look at the last path segment only; split on either separator to stay platform-neutral.
		const fileName = filePath.split(/[\\/]/).pop();
		if (fileName.includes("-alt-")) {
			continue;
		}
		let parsed;
		try {
			parsed = JSON.parse(await readFile(filePath, "utf8"));
		} catch {
			// Best-effort by design: an unreadable or malformed file is skipped, not fatal.
			continue;
		}
		const rec = recordFromFeature(parsed);
		if (rec) {
			yield rec;
		}
	}
}
154
/**
 * Build an in-memory ancestry index: for every record id in `byId`, the list of ancestor records
 * ordered nearest-first (`[parent, grandparent, ...]`).
 *
 * The chain is built by following `parent_id` upward. It stops when:
 *
 * - `parent_id` is `null` or not greater than zero,
 * - the referenced parent is not present in `byId`, or
 * - the parent's id has already been visited on this walk (cycle guard).
 *
 * Every key in `byId` gets an entry, possibly an empty array.
 *
 * @param byId - Records keyed by id.
 * @returns A `Map` from record id to its ancestor chain.
 */
export function buildAncestryIndex(byId) {
	const ancestry = new Map();
	for (const [recordId, record] of byId) {
		// Seed with the record itself so a chain that loops back to its start is cut off.
		const visited = new Set([recordId]);
		const ancestors = [];
		for (let nextId = record.parent_id; nextId !== null && nextId > 0; ) {
			const ancestor = byId.get(nextId);
			if (!ancestor || visited.has(ancestor.id)) {
				break;
			}
			ancestors.push(ancestor);
			visited.add(ancestor.id);
			nextId = ancestor.parent_id;
		}
		ancestry.set(recordId, ancestors);
	}
	return ancestry;
}
174
+ //# sourceMappingURL=wof-json.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"wof-json.js","sourceRoot":"","sources":["../../src/wof-json.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6BG;AAEH,OAAO,QAAQ,MAAM,WAAW,CAAA;AAChC,OAAO,EAAE,QAAQ,EAAE,MAAM,kBAAkB,CAAA;AA+B3C;;;;;;;GAOG;AACH,MAAM,UAAU,gBAAgB,CAAC,KAA8B;IAC9D,MAAM,GAAG,GAAG,KAAK,CAAC,eAAe,CAAC,CAAA;IAClC,MAAM,CAAC,GAAG,OAAO,GAAG,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,GAAG,KAAK,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;IACnF,OAAO,CAAC,KAAK,CAAC,CAAA;AACf,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,mBAAmB,CAAC,KAA8B;IACjE,MAAM,GAAG,GAAG,IAAI,GAAG,EAAkB,CAAA;IACrC,KAAK,MAAM,CAAC,GAAG,EAAE,KAAK,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC;QAClD,IAAI,CAAC,GAAG,CAAC,UAAU,CAAC,OAAO,CAAC;YAAE,SAAQ;QACtC,MAAM,SAAS,GAAG,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC;YACrC,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAe,EAAE,CAAC,OAAO,CAAC,KAAK,QAAQ,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,CAAC;YAC9E,CAAC,CAAC,OAAO,KAAK,KAAK,QAAQ,IAAI,KAAK,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC;gBACrD,CAAC,CAAC,KAAK;gBACP,CAAC,CAAC,SAAS,CAAA;QACb,IAAI,SAAS;YAAE,GAAG,CAAC,GAAG,CAAC,GAAG,EAAE,SAAS,CAAC,IAAI,EAAE,CAAC,CAAA;IAC9C,CAAC;IACD,OAAO,GAAG,CAAA;AACX,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,gBAAgB,CAAC,MAAc;IAC9C,OAAO,MAAM,CAAC,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC,CAAA;AACpC,CAAC;AAED,0FAA0F;AAC1F,SAAS,iBAAiB,CAAC,OAAmB;IAC7C,IAAI,CAAC,OAAO,IAAI,OAAO,CAAC,IAAI,KAAK,SAAS,IAAI,CAAC,OAAO,CAAC,UAAU;QAAE,OAAO,IAAI,CAAA;IAC9E,MAAM,KAAK,GAAG,OAAO,CAAC,UAAU,CAAA;IAEhC,MAAM,KAAK,GAAG,OAAO,OAAO,CAAC,EAAE,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAA;IAC3E,MAAM,EAAE,GAAG,OAAO,KAAK,KAAK,QAAQ,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,OAAO,KAAK,KAAK,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,GAAG,CAAA;IAC9F,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,EAAE,CAAC;QAAE,OAAO,IAAI,CAAA;IAErC,MAAM,IAAI,GAAG,KAAK,CAAC,UAAU,CAAC,CAAA;IAC9B,IAAI,OAAO,IAAI,KAAK,QAAQ,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE;QAAE,OAAO,IAAI,CAAA;IAEzD,MAAM,SAAS,GAAG,KAAK,CAAC,eAAe,CAAC,CAAA;IACxC,IAAI,OA
AO,SAAS,KAAK,QAAQ,IAAI,CAAC,SAAS;QAAE,OAAO,IAAI,CAAA;IAE5D,MAAM,OAAO,GAAG,KAAK,CAAC,aAAa,CAAC,CAAA;IACpC,IAAI,OAAO,OAAO,KAAK,QAAQ,IAAI,CAAC,OAAO;QAAE,OAAO,IAAI,CAAA;IAExD,IAAI,CAAC,gBAAgB,CAAC,KAAK,CAAC;QAAE,OAAO,IAAI,CAAA;IAEzC,MAAM,SAAS,GAAG,KAAK,CAAC,eAAe,CAAC,CAAA;IACxC,MAAM,SAAS,GACd,OAAO,SAAS,KAAK,QAAQ;QAC5B,CAAC,CAAC,SAAS;QACX,CAAC,CAAC,OAAO,SAAS,KAAK,QAAQ,IAAI,SAAS,CAAC,IAAI,EAAE;YAClD,CAAC,CAAC,MAAM,CAAC,SAAS,CAAC;YACnB,CAAC,CAAC,IAAI,CAAA;IAET,OAAO;QACN,EAAE;QACF,SAAS,EAAE,MAAM,CAAC,QAAQ,CAAC,SAAmB,CAAC,CAAC,CAAC,CAAE,SAAoB,CAAC,CAAC,CAAC,IAAI;QAC9E,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE;QACjB,SAAS;QACT,OAAO;QACP,YAAY,EAAE,mBAAmB,CAAC,KAAK,CAAC;KACxC,CAAA;AACF,CAAC;AAED;;;;;;;;;;GAUG;AACH,MAAM,CAAC,KAAK,SAAS,CAAC,CAAC,YAAY,CAAC,OAAe,EAAE,OAAiC,EAAE;IACvF,MAAM,MAAM,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,cAAc,CAAC,EAAE;QAChD,GAAG,EAAE,OAAO;QACZ,QAAQ,EAAE,IAAI;QACd,SAAS,EAAE,IAAI;QACf,cAAc,EAAE,IAAI;KACpB,CAAC,CAAA;IAEF,IAAI,KAAK,EAAE,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAClC,IAAI,IAAI,CAAC,MAAM,EAAE,OAAO;YAAE,OAAM;QAChC,MAAM,QAAQ,GAAG,MAAM,CAAC,KAAK,CAAC,CAAA;QAC9B,IAAI,QAAQ,CAAC,QAAQ,CAAC,OAAO,CAAC;YAAE,SAAQ;QAExC,IAAI,IAAY,CAAA;QAChB,IAAI,CAAC;YACJ,IAAI,GAAG,MAAM,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAA;QACxC,CAAC;QAAC,MAAM,CAAC;YACR,SAAQ;QACT,CAAC;QAED,IAAI,MAAkB,CAAA;QACtB,IAAI,CAAC;YACJ,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAe,CAAA;QACxC,CAAC;QAAC,MAAM,CAAC;YACR,SAAQ;QACT,CAAC;QAED,MAAM,GAAG,GAAG,iBAAiB,CAAC,MAAM,CAAC,CAAA;QACrC,IAAI,GAAG;YAAE,MAAM,GAAG,CAAA;IACnB,CAAC;AACF,CAAC;AAYD,MAAM,UAAU,kBAAkB,CAAC,IAA4B;IAC9D,MAAM,KAAK,GAAkB,IAAI,GAAG,EAAE,CAAA;IACtC,KAAK,MAAM,CAAC,EAAE,EAAE,GAAG,CAAC,IAAI,IAAI,EAAE,CAAC;QAC9B,MAAM,KAAK,GAAgB,EAAE,CAAA;QAC7B,MAAM,KAAK,GAAG,IAAI,GAAG,CAAS,CAAC,EAAE,CAAC,CAAC,CAAA;QACnC,IAAI,GAAG,GAAkB,GAAG,CAAC,SAAS,CAAA;QACtC,OAAO,GAAG,KAAK,IAAI,IAAI,GAAG,GAAG,CAAC,EAAE,CAAC;YAChC,MAAM,MAAM,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,CAAA;YAC5B,IAAI,CAAC,MAAM;gBAAE,MAAK;YAClB,IAAI,KAAK,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC;gBAAE,MAAK;YAC/B,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,C
AAA;YAClB,KAAK,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAA;YACpB,GAAG,GAAG,MAAM,CAAC,SAAS,CAAA;QACvB,CAAC;QACD,KAAK,CAAC,GAAG,CAAC,EAAE,EAAE,KAAK,CAAC,CAAA;IACrB,CAAC;IACD,OAAO,KAAK,CAAA;AACb,CAAC"}
package/package.json ADDED
@@ -0,0 +1,36 @@
1
+ {
2
+ "name": "@mailwoman/corpus",
3
+ "version": "2.0.0",
4
+ "description": "Mailwoman corpus pipeline: BIO-labeled dataset builder for the neural classifier.",
5
+ "license": "AGPL-3.0-only",
6
+ "repository": {
7
+ "type": "git",
8
+ "url": "https://github.com/sister-software/mailwoman.git",
9
+ "directory": "corpus"
10
+ },
11
+ "type": "module",
12
+ "exports": {
13
+ "./package.json": "./package.json",
14
+ ".": "./out/src/index.js",
15
+ "./types": "./out/src/types.js",
16
+ "./format": "./out/src/format.js"
17
+ },
18
+ "dependencies": {
19
+ "@dsnp/parquetjs": "1.7.0",
20
+ "@fragaria/address-formatter": "^6.3.0",
21
+ "@mailwoman/core": "workspace:*",
22
+ "better-sqlite3": "^11.5.0",
23
+ "csv-parse": "^5.5.6",
24
+ "fastest-levenshtein": "^1.0.16",
25
+ "lru-cache": "^10.0.1"
26
+ },
27
+ "files": [
28
+ "out/**/*.js",
29
+ "out/**/*.js.map",
30
+ "out/**/*.d.ts",
31
+ "out/**/*.d.ts.map"
32
+ ],
33
+ "publishConfig": {
34
+ "access": "public"
35
+ }
36
+ }