@mailwoman/corpus 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137) hide show
  1. package/out/src/adapter.d.ts +96 -0
  2. package/out/src/adapter.d.ts.map +1 -0
  3. package/out/src/adapter.js +107 -0
  4. package/out/src/adapter.js.map +1 -0
  5. package/out/src/adapters/ban/adapter.d.ts +32 -0
  6. package/out/src/adapters/ban/adapter.d.ts.map +1 -0
  7. package/out/src/adapters/ban/adapter.js +133 -0
  8. package/out/src/adapters/ban/adapter.js.map +1 -0
  9. package/out/src/adapters/fcc-bdc/adapter.d.ts +61 -0
  10. package/out/src/adapters/fcc-bdc/adapter.d.ts.map +1 -0
  11. package/out/src/adapters/fcc-bdc/adapter.js +153 -0
  12. package/out/src/adapters/fcc-bdc/adapter.js.map +1 -0
  13. package/out/src/adapters/index.d.ts +42 -0
  14. package/out/src/adapters/index.d.ts.map +1 -0
  15. package/out/src/adapters/index.js +76 -0
  16. package/out/src/adapters/index.js.map +1 -0
  17. package/out/src/adapters/openaddresses/adapter.d.ts +60 -0
  18. package/out/src/adapters/openaddresses/adapter.d.ts.map +1 -0
  19. package/out/src/adapters/openaddresses/adapter.js +174 -0
  20. package/out/src/adapters/openaddresses/adapter.js.map +1 -0
  21. package/out/src/adapters/state-ia-contractors/adapter.d.ts +23 -0
  22. package/out/src/adapters/state-ia-contractors/adapter.d.ts.map +1 -0
  23. package/out/src/adapters/state-ia-contractors/adapter.js +113 -0
  24. package/out/src/adapters/state-ia-contractors/adapter.js.map +1 -0
  25. package/out/src/adapters/state-ny-notaries/adapter.d.ts +21 -0
  26. package/out/src/adapters/state-ny-notaries/adapter.d.ts.map +1 -0
  27. package/out/src/adapters/state-ny-notaries/adapter.js +132 -0
  28. package/out/src/adapters/state-ny-notaries/adapter.js.map +1 -0
  29. package/out/src/adapters/state-tx-notaries/adapter.d.ts +22 -0
  30. package/out/src/adapters/state-tx-notaries/adapter.d.ts.map +1 -0
  31. package/out/src/adapters/state-tx-notaries/adapter.js +125 -0
  32. package/out/src/adapters/state-tx-notaries/adapter.js.map +1 -0
  33. package/out/src/adapters/tiger/adapter.d.ts +45 -0
  34. package/out/src/adapters/tiger/adapter.d.ts.map +1 -0
  35. package/out/src/adapters/tiger/adapter.js +179 -0
  36. package/out/src/adapters/tiger/adapter.js.map +1 -0
  37. package/out/src/adapters/usgov-hrsa-fqhc/adapter.d.ts +36 -0
  38. package/out/src/adapters/usgov-hrsa-fqhc/adapter.d.ts.map +1 -0
  39. package/out/src/adapters/usgov-hrsa-fqhc/adapter.js +147 -0
  40. package/out/src/adapters/usgov-hrsa-fqhc/adapter.js.map +1 -0
  41. package/out/src/adapters/usgov-imls-pls/adapter.d.ts +25 -0
  42. package/out/src/adapters/usgov-imls-pls/adapter.d.ts.map +1 -0
  43. package/out/src/adapters/usgov-imls-pls/adapter.js +118 -0
  44. package/out/src/adapters/usgov-imls-pls/adapter.js.map +1 -0
  45. package/out/src/adapters/usgov-nad/adapter.d.ts +37 -0
  46. package/out/src/adapters/usgov-nad/adapter.d.ts.map +1 -0
  47. package/out/src/adapters/usgov-nad/adapter.js +227 -0
  48. package/out/src/adapters/usgov-nad/adapter.js.map +1 -0
  49. package/out/src/adapters/usgov-nppes/adapter.d.ts +28 -0
  50. package/out/src/adapters/usgov-nppes/adapter.d.ts.map +1 -0
  51. package/out/src/adapters/usgov-nppes/adapter.js +123 -0
  52. package/out/src/adapters/usgov-nppes/adapter.js.map +1 -0
  53. package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.d.ts +35 -0
  54. package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.d.ts.map +1 -0
  55. package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.js +162 -0
  56. package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.js.map +1 -0
  57. package/out/src/adapters/wof-admin-json/adapter.d.ts +85 -0
  58. package/out/src/adapters/wof-admin-json/adapter.d.ts.map +1 -0
  59. package/out/src/adapters/wof-admin-json/adapter.js +241 -0
  60. package/out/src/adapters/wof-admin-json/adapter.js.map +1 -0
  61. package/out/src/adapters/wof-postalcode-json/adapter.d.ts +63 -0
  62. package/out/src/adapters/wof-postalcode-json/adapter.d.ts.map +1 -0
  63. package/out/src/adapters/wof-postalcode-json/adapter.js +178 -0
  64. package/out/src/adapters/wof-postalcode-json/adapter.js.map +1 -0
  65. package/out/src/align.d.ts +58 -0
  66. package/out/src/align.d.ts.map +1 -0
  67. package/out/src/align.js +139 -0
  68. package/out/src/align.js.map +1 -0
  69. package/out/src/build.d.ts +104 -0
  70. package/out/src/build.d.ts.map +1 -0
  71. package/out/src/build.js +201 -0
  72. package/out/src/build.js.map +1 -0
  73. package/out/src/codex/us-fips-state.d.ts +44 -0
  74. package/out/src/codex/us-fips-state.d.ts.map +1 -0
  75. package/out/src/codex/us-fips-state.js +105 -0
  76. package/out/src/codex/us-fips-state.js.map +1 -0
  77. package/out/src/codex/us-street-suffix.d.ts +259 -0
  78. package/out/src/codex/us-street-suffix.d.ts.map +1 -0
  79. package/out/src/codex/us-street-suffix.js +285 -0
  80. package/out/src/codex/us-street-suffix.js.map +1 -0
  81. package/out/src/format.d.ts +79 -0
  82. package/out/src/format.d.ts.map +1 -0
  83. package/out/src/format.js +151 -0
  84. package/out/src/format.js.map +1 -0
  85. package/out/src/golden.d.ts +50 -0
  86. package/out/src/golden.d.ts.map +1 -0
  87. package/out/src/golden.js +104 -0
  88. package/out/src/golden.js.map +1 -0
  89. package/out/src/index.d.ts +18 -0
  90. package/out/src/index.d.ts.map +1 -0
  91. package/out/src/index.js +18 -0
  92. package/out/src/index.js.map +1 -0
  93. package/out/src/parquet-wrapper/index.d.ts +12 -0
  94. package/out/src/parquet-wrapper/index.d.ts.map +1 -0
  95. package/out/src/parquet-wrapper/index.js +12 -0
  96. package/out/src/parquet-wrapper/index.js.map +1 -0
  97. package/out/src/parquet-wrapper/reader.d.ts +31 -0
  98. package/out/src/parquet-wrapper/reader.d.ts.map +1 -0
  99. package/out/src/parquet-wrapper/reader.js +54 -0
  100. package/out/src/parquet-wrapper/reader.js.map +1 -0
  101. package/out/src/parquet-wrapper/schema.d.ts +45 -0
  102. package/out/src/parquet-wrapper/schema.d.ts.map +1 -0
  103. package/out/src/parquet-wrapper/schema.js +55 -0
  104. package/out/src/parquet-wrapper/schema.js.map +1 -0
  105. package/out/src/parquet-wrapper/writer.d.ts +41 -0
  106. package/out/src/parquet-wrapper/writer.d.ts.map +1 -0
  107. package/out/src/parquet-wrapper/writer.js +71 -0
  108. package/out/src/parquet-wrapper/writer.js.map +1 -0
  109. package/out/src/parquet.d.ts +122 -0
  110. package/out/src/parquet.d.ts.map +1 -0
  111. package/out/src/parquet.js +220 -0
  112. package/out/src/parquet.js.map +1 -0
  113. package/out/src/runner.d.ts +100 -0
  114. package/out/src/runner.d.ts.map +1 -0
  115. package/out/src/runner.js +183 -0
  116. package/out/src/runner.js.map +1 -0
  117. package/out/src/split.d.ts +108 -0
  118. package/out/src/split.d.ts.map +1 -0
  119. package/out/src/split.js +191 -0
  120. package/out/src/split.js.map +1 -0
  121. package/out/src/synthesize.d.ts +146 -0
  122. package/out/src/synthesize.d.ts.map +1 -0
  123. package/out/src/synthesize.js +472 -0
  124. package/out/src/synthesize.js.map +1 -0
  125. package/out/src/tokenize.d.ts +47 -0
  126. package/out/src/tokenize.d.ts.map +1 -0
  127. package/out/src/tokenize.js +49 -0
  128. package/out/src/tokenize.js.map +1 -0
  129. package/out/src/types.d.ts +168 -0
  130. package/out/src/types.d.ts.map +1 -0
  131. package/out/src/types.js +19 -0
  132. package/out/src/types.js.map +1 -0
  133. package/out/src/wof-json.d.ts +105 -0
  134. package/out/src/wof-json.d.ts.map +1 -0
  135. package/out/src/wof-json.js +174 -0
  136. package/out/src/wof-json.js.map +1 -0
  137. package/package.json +36 -0
@@ -0,0 +1,41 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Typed Parquet writer. Two static constructors mirror the base class:
7
+ *
8
+ * - `openStream`: wrap an existing writable stream.
9
+ * - `openFile`: open a path on disk, ensuring the parent directory exists first.
10
+ *
11
+ * Implements `AsyncDisposable` so `await using writer = await ParquetWriter.openFile(...)` flushes
12
+ * and closes cleanly. `close()` internally serializes against any in-flight flush so back-to-back
13
+ * dispose calls don't race.
14
+ *
15
+ * Salvaged 2026-05-17 from `isp-nexus/universe@6eeb7bd99643a6d62a8b8abbd50968a1e492b90b`
16
+ * `sdk/parquet/writer.ts` (originally copyright OpenISP, Inc.; both projects are AGPL-3.0). One
17
+ * trim relative to the original: dropped the `@isp.nexus/core/polyfills/promises/withResolvers`
18
+ * import — Node 22 (mailwoman's runtime) has `Promise.withResolvers` natively.
19
+ */
20
+ import { ParquetWriter as BaseParquetWriter } from "@dsnp/parquetjs";
21
+ import type { WriterOptions } from "@dsnp/parquetjs/dist/lib/declare.js";
22
+ import { type WriteStreamMinimal } from "@dsnp/parquetjs/dist/lib/util.js";
23
+ import { type ParquetRecordLike, ParquetSchema, type ParquetSchemaDefinition, ParquetSchemaDefinitionCache } from "./schema.js";
24
/**
 * Typed wrapper over the `@dsnp/parquetjs` writer whose appended rows are constrained to `T`.
 *
 * Instances come from the static `openStream` / `openFile` factories. The class implements
 * `AsyncDisposable`, so it can be bound with `await using`.
 */
export declare class ParquetWriter<T extends ParquetRecordLike>
    extends BaseParquetWriter
    implements AsyncDisposable
{
    #private;
    /** Schema describing the rows accepted by this writer. */
    schema: ParquetSchema<T>;
    /** Cache used to resolve plain schema definitions into `ParquetSchema` instances. */
    protected static readonly SchemaDefinitionCache: ParquetSchemaDefinitionCache;
    /**
     * Create a writer on top of an existing output stream.
     *
     * @param schemaLike A `ParquetSchema`, or a plain definition that is resolved through
     *   `SchemaDefinitionCache`.
     * @param outputStream Destination stream for the encoded Parquet bytes.
     * @param opts Writer options forwarded to the underlying parquetjs writer.
     */
    static openStream<T extends ParquetRecordLike>(
        schemaLike: ParquetSchema<T> | ParquetSchemaDefinition<T>,
        outputStream: WriteStreamMinimal,
        opts?: WriterOptions,
    ): Promise<ParquetWriter<T>>;
    /**
     * Create a buffered writer that writes to the file at `sourcePath`. When `sourcePath` is a
     * string, its parent directory is created first.
     *
     * @param schemaLike A `ParquetSchema`, or a plain definition that is resolved through
     *   `SchemaDefinitionCache`.
     * @param sourcePath Location of the output file.
     * @param opts Writer options forwarded to the underlying parquetjs writer.
     */
    static openFile<T extends ParquetRecordLike>(
        schemaLike: ParquetSchema<T> | ParquetSchemaDefinition<T>,
        sourcePath: string | Buffer | URL,
        opts?: WriterOptions,
    ): Promise<ParquetWriter<T>>;
    /** Attach a key/value metadata pair to the writer. */
    setMetadata(key: string, value: string): void;
    /** Buffer one row; a full buffer is written out to the destination. */
    appendRow(row: T): Promise<void>;
    /** Flush everything still buffered, close the output, and release resources. */
    close(): Promise<void>;
    /** `await using` hook; delegates to `close()`. */
    [Symbol.asyncDispose](): Promise<void>;
    /** Named alias for `[Symbol.asyncDispose]()`. */
    dispose(): Promise<void>;
}
41
+ //# sourceMappingURL=writer.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"writer.d.ts","sourceRoot":"","sources":["../../../src/parquet-wrapper/writer.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAEH,OAAO,EAAE,aAAa,IAAI,iBAAiB,EAAE,MAAM,iBAAiB,CAAA;AACpE,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,qCAAqC,CAAA;AACxE,OAAO,EAAU,KAAK,kBAAkB,EAAE,MAAM,kCAAkC,CAAA;AAIlF,OAAO,EACN,KAAK,iBAAiB,EACtB,aAAa,EACb,KAAK,uBAAuB,EAC5B,4BAA4B,EAC5B,MAAM,aAAa,CAAA;AAEpB,gEAAgE;AAChE,qBAAa,aAAa,CAAC,CAAC,SAAS,iBAAiB,CAAE,SAAQ,iBAAkB,YAAW,eAAe;;IACnG,MAAM,EAAE,aAAa,CAAC,CAAC,CAAC,CAAA;IAChC,SAAS,CAAC,MAAM,CAAC,QAAQ,CAAC,qBAAqB,+BAAqC;WAG9D,UAAU,CAAC,CAAC,SAAS,iBAAiB,EAC3D,UAAU,EAAE,aAAa,CAAC,CAAC,CAAC,GAAG,uBAAuB,CAAC,CAAC,CAAC,EACzD,YAAY,EAAE,kBAAkB,EAChC,IAAI,GAAE,aAAkB,GACtB,OAAO,CAAC,aAAa,CAAC,CAAC,CAAC,CAAC;IAW5B,oGAAoG;WAC9E,QAAQ,CAAC,CAAC,SAAS,iBAAiB,EACzD,UAAU,EAAE,aAAa,CAAC,CAAC,CAAC,GAAG,uBAAuB,CAAC,CAAC,CAAC,EACzD,UAAU,EAAE,MAAM,GAAG,MAAM,GAAG,GAAG,EACjC,IAAI,CAAC,EAAE,aAAa,GAClB,OAAO,CAAC,aAAa,CAAC,CAAC,CAAC,CAAC;IAU5B,mDAAmD;IACnC,WAAW,CAAC,GAAG,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,GAAG,IAAI;IAI7D,2FAA2F;IACrE,SAAS,CAAC,GAAG,EAAE,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC;IAItD,8EAA8E;IACxD,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;IAa/B,CAAC,MAAM,CAAC,YAAY,CAAC;IAIrB,OAAO;CAGpB"}
@@ -0,0 +1,71 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Typed Parquet writer. Two static constructors mirror the base class:
7
+ *
8
+ * - `openStream`: wrap an existing writable stream.
9
+ * - `openFile`: open a path on disk, ensuring the parent directory exists first.
10
+ *
11
+ * Implements `AsyncDisposable` so `await using writer = await ParquetWriter.openFile(...)` flushes
12
+ * and closes cleanly. `close()` internally serializes against any in-flight flush so back-to-back
13
+ * dispose calls don't race.
14
+ *
15
+ * Salvaged 2026-05-17 from `isp-nexus/universe@6eeb7bd99643a6d62a8b8abbd50968a1e492b90b`
16
+ * `sdk/parquet/writer.ts` (originally copyright OpenISP, Inc.; both projects are AGPL-3.0). One
17
+ * trim relative to the original: dropped the `@isp.nexus/core/polyfills/promises/withResolvers`
18
+ * import — Node 22 (mailwoman's runtime) has `Promise.withResolvers` natively.
19
+ */
20
+ import { ParquetWriter as BaseParquetWriter } from "@dsnp/parquetjs";
21
+ import { osopen } from "@dsnp/parquetjs/dist/lib/util.js";
22
+ import { ParquetEnvelopeWriter } from "@dsnp/parquetjs/dist/lib/writer.js";
23
+ import * as fs from "node:fs/promises";
24
+ import * as path from "node:path";
25
+ import { ParquetSchema, ParquetSchemaDefinitionCache, } from "./schema.js";
26
/**
 * A typed Parquet writer, wrapping the base `@dsnp/parquetjs` writer.
 *
 * Instances are created through the static `openStream` / `openFile` factories. The class
 * exposes `[Symbol.asyncDispose]`, so `await using writer = await ParquetWriter.openFile(...)`
 * closes the writer when the binding goes out of scope.
 */
export class ParquetWriter extends BaseParquetWriter {
    /** Cache that resolves plain schema definitions into `ParquetSchema` instances. */
    static SchemaDefinitionCache = new ParquetSchemaDefinitionCache();
    /**
     * Promise of the single close operation, or `null` while `close()` has not been called yet.
     * It is assigned synchronously inside `close()` so that every caller, including concurrent
     * ones, awaits the very same close.
     */
    #closing = null;
    /**
     * Create a writer on top of an existing output stream.
     *
     * @param schemaLike A `ParquetSchema` instance (used as-is) or a plain schema definition
     *   (resolved through `SchemaDefinitionCache.findOrCreateSchema`).
     * @param outputStream Destination stream for the encoded Parquet bytes.
     * @param opts Writer options forwarded to the envelope writer and the base constructor.
     * @returns A writer bound to `outputStream`.
     */
    static async openStream(schemaLike, outputStream, opts = {}) {
        const schema = schemaLike instanceof ParquetSchema
            ? schemaLike
            : ParquetWriter.SchemaDefinitionCache.findOrCreateSchema(schemaLike);
        const envelopeWriter = await ParquetEnvelopeWriter.openStream(schema, outputStream, opts);
        return new ParquetWriter(schema, envelopeWriter, opts);
    }
    /**
     * Convenience method to create a new buffered parquet writer that writes to the specified file.
     *
     * The parent directory is created (recursively) only when `sourcePath` is a string; for
     * `Buffer` or `URL` paths the directory must already exist.
     *
     * @param schemaLike A `ParquetSchema` instance or a plain schema definition.
     * @param sourcePath Location of the output file.
     * @param opts Writer options, also passed to `osopen` when opening the file.
     * @returns A writer bound to the opened file.
     */
    static async openFile(schemaLike, sourcePath, opts) {
        if (typeof sourcePath === "string") {
            await fs.mkdir(path.dirname(sourcePath), { recursive: true });
        }
        const outputStream = await osopen(sourcePath, opts);
        return ParquetWriter.openStream(schemaLike, outputStream, opts);
    }
    // @note This fixes invalid Markdown in the base class JSDoc.
    /** Set a metadata key-value pair on the writer. */
    setMetadata(key, value) {
        return super.setMetadata(key, value);
    }
    /** Append a row to the buffer. If the buffer is full, the data will be written to disk. */
    async appendRow(row) {
        return super.appendRow(row);
    }
    /**
     * Flush all buffered data to disk, close the file, and release resources.
     *
     * Safe to call repeatedly and concurrently: the first call starts the close and every later
     * call resolves (or rejects) together with it. No `await` happens before the close promise is
     * recorded, so a second caller can never resolve while the first close is still flushing.
     */
    async close() {
        if (this.#closing === null) {
            // A writer already marked closed through some other path has nothing left to flush.
            this.#closing = this.closed ? Promise.resolve() : super.close();
        }
        return this.#closing;
    }
    /** `await using` hook; delegates to `close()`. */
    async [Symbol.asyncDispose]() {
        return this.close();
    }
    /** Named alias for `[Symbol.asyncDispose]()`. */
    async dispose() {
        return this[Symbol.asyncDispose]();
    }
}
71
+ //# sourceMappingURL=writer.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"writer.js","sourceRoot":"","sources":["../../../src/parquet-wrapper/writer.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAEH,OAAO,EAAE,aAAa,IAAI,iBAAiB,EAAE,MAAM,iBAAiB,CAAA;AAEpE,OAAO,EAAE,MAAM,EAA2B,MAAM,kCAAkC,CAAA;AAClF,OAAO,EAAE,qBAAqB,EAAE,MAAM,oCAAoC,CAAA;AAC1E,OAAO,KAAK,EAAE,MAAM,kBAAkB,CAAA;AACtC,OAAO,KAAK,IAAI,MAAM,WAAW,CAAA;AACjC,OAAO,EAEN,aAAa,EAEb,4BAA4B,GAC5B,MAAM,aAAa,CAAA;AAEpB,gEAAgE;AAChE,MAAM,OAAO,aAA2C,SAAQ,iBAAiB;IAEtE,MAAM,CAAU,qBAAqB,GAAG,IAAI,4BAA4B,EAAE,CAAA;IACpF,SAAS,GAAkB,OAAO,CAAC,OAAO,EAAE,CAAA;IAE5C,MAAM,CAAU,KAAK,CAAC,UAAU,CAC/B,UAAyD,EACzD,YAAgC,EAChC,OAAsB,EAAE;QAExB,MAAM,MAAM,GACX,UAAU,YAAY,aAAa;YAClC,CAAC,CAAC,UAAU;YACZ,CAAC,CAAC,aAAa,CAAC,qBAAqB,CAAC,kBAAkB,CAAC,UAAU,CAAC,CAAA;QAEtE,MAAM,cAAc,GAAG,MAAM,qBAAqB,CAAC,UAAU,CAAC,MAAM,EAAE,YAAY,EAAE,IAAI,CAAC,CAAA;QAEzF,OAAO,IAAI,aAAa,CAAI,MAAM,EAAE,cAAc,EAAE,IAAI,CAAC,CAAA;IAC1D,CAAC;IAED,oGAAoG;IACpG,MAAM,CAAU,KAAK,CAAC,QAAQ,CAC7B,UAAyD,EACzD,UAAiC,EACjC,IAAoB;QAEpB,IAAI,OAAO,UAAU,KAAK,QAAQ,EAAE,CAAC;YACpC,MAAM,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,UAAU,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAA;QAC9D,CAAC;QAED,MAAM,YAAY,GAAG,MAAM,MAAM,CAAC,UAAU,EAAE,IAAI,CAAC,CAAA;QACnD,OAAO,aAAa,CAAC,UAAU,CAAI,UAAU,EAAE,YAAY,EAAE,IAAI,CAAC,CAAA;IACnE,CAAC;IAED,6DAA6D;IAC7D,mDAAmD;IACnC,WAAW,CAAC,GAAW,EAAE,KAAa;QACrD,OAAO,KAAK,CAAC,WAAW,CAAC,GAAG,EAAE,KAAK,CAAC,CAAA;IACrC,CAAC;IAED,2FAA2F;IAC3E,KAAK,CAAC,SAAS,CAAC,GAAM;QACrC,OAAO,KAAK,CAAC,SAAS,CAAC,GAAG,CAAC,CAAA;IAC5B,CAAC;IAED,8EAA8E;IAC9D,KAAK,CAAC,KAAK;QAC1B,MAAM,IAAI,CAAC,SAAS,CAAA;QACpB,IAAI,IAAI,CAAC,MAAM;YAAE,OAAM;QAEvB,MAAM,EAAE,OAAO,EAAE,OAAO,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,aAAa,EAAQ,CAAA;QAElE,KAAK,CAAC,KAAK,EAAE,CAAC,IAAI,CAAC,OAAO,EAAE,MAAM,CAAC,CAAA;QAEnC,IAAI,CAAC,SAAS,GAAG,OAAO,CAAA;QAExB,OAAO,IAAI,CAAC,SAAS,CAAA;IACtB,CAAC;IAEM,KAAK,CAAC,CAAC,MAAM,CAAC,YAAY,CAAC;QACjC,OAAO,IAAI,CAAC,KAAK,EAAE,CAAA;IACpB,CAAC;IAEM,KAAK,CAAC,OAAO;QACnB,OAAO,IAAI,CAAC,MAAM,CAAC,YAAY,CAAC,EAAE,CAAA;IACnC,CAAC"}
@@ -0,0 +1,122 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Final output sharder for the corpus pipeline.
7
+ *
8
+ * Phase 1 (#9) shipped JSONL shards + a Python (PyArrow) converter as the path to binary Parquet —
9
+ * bridging until the JS toolchain caught up. Phase 1.5 (#18 §4) replaced that with a native JS
10
+ * writer based on the salvaged `@dsnp/parquetjs` wrapper from isp-nexus (now in
11
+ * `./parquet-wrapper/`). The build pipeline no longer touches Python at all in its hot path; the
12
+ * only remaining Python is the one-shot `train_tokenizer.py` SentencePiece step.
13
+ *
14
+ * Compression: `SNAPPY`. The plan in #18 §4 specified `zstd`, but `@dsnp/parquetjs` 1.7.0 only
15
+ * supports UNCOMPRESSED / GZIP / SNAPPY / BROTLI (see `node_modules/@dsnp/parquetjs/dist/lib/
16
+ * compression.js`). SNAPPY is the standard ML-corpus default (PyArrow's default too) and is the
17
+ * closest substitute on speed; revisit if @dsnp/parquetjs gains zstd support. Documented in
18
+ * `DECISIONS.md`.
19
+ *
20
+ * Layout under `<outputDir>`:
21
+ *
22
+ * ```
23
+ * corpus-v<version>/
24
+ * MANIFEST.json
25
+ * train/
26
+ * part-0000.parquet
27
+ * part-0001.parquet
28
+ * ...
29
+ * val/
30
+ * part-0000.parquet
31
+ * test/
32
+ * part-0000.parquet
33
+ * ```
34
+ *
35
+ * Each shard caps at `rowsPerShard` (default 1_000_000); within a shard, parquetjs flushes row
36
+ * groups every `ROW_GROUP_SIZE` (50_000) rows per the issue spec. The MANIFEST captures every
37
+ * shard's path, row count, byte size, and SHA-256 (computed by re-reading the shard once after
38
+ * close — cheap relative to writing it).
39
+ */
40
+ import { type ParquetSchemaDefinition } from "./parquet-wrapper/index.js";
41
+ import type { SplitName } from "./split.js";
42
+ import type { LabeledRow } from "./types.js";
43
/** Row-group size: the writer is opened with this value as its `rowGroupSize`. */
export declare const ROW_GROUP_SIZE: 50000;
/**
 * Compression codec applied to every shard column. SNAPPY stands in for zstd, which
 * `@dsnp/parquetjs` 1.7.0 does not provide.
 */
export declare const SHARD_COMPRESSION: "SNAPPY";
47
/**
 * Flat, one-field-per-column shape of a row as it is stored in a Parquet shard.
 *
 * The trailing string index signature keeps this interface compatible with the wrapper's
 * `ParquetRecordLike`, because parquetjs accepts any string key on a row.
 */
export interface ParquetRow {
    /** Raw text of the row. */
    raw: string;
    /** Token strings (repeated UTF8 column). */
    tokens: ReadonlyArray<string>;
    /** Label strings (repeated UTF8 column). */
    labels: ReadonlyArray<string>;
    country: string;
    /** `null` when the labeled row carries no locale. */
    locale: string | null;
    source: string;
    source_id: string;
    corpus_version: string;
    license: string;
    /** Synthesis method, or `null` when the row has no `synth` info. */
    synth_method: string | null;
    /** Source id of the synthesis base row, or `null` when the row has no `synth` info. */
    synth_base_id: string | null;
    [key: string]: unknown;
}
66
/** Ordered tuple of the column names emitted into every shard; mirrors the keys of `ParquetRow`. */
export declare const PARQUET_COLUMNS: readonly [
    "raw",
    "tokens",
    "labels",
    "country",
    "locale",
    "source",
    "source_id",
    "corpus_version",
    "license",
    "synth_method",
    "synth_base_id",
];
/**
 * Parquet schema definition for `LabeledRow` (per #18 §4). Nullable fields are declared with
 * `optional: true`, the tokens/labels arrays are repeated UTF8 columns, and every column is
 * compressed with SNAPPY.
 */
export declare const LABELED_ROW_SCHEMA: ParquetSchemaDefinition<ParquetRow>;
73
/** Metadata recorded in `MANIFEST.json` for one written shard. */
export interface ShardDescriptor {
    /** Split the shard belongs to. */
    split: SplitName;
    /** Path of the shard file as it was opened for writing. */
    path: string;
    format: "parquet";
    compression: typeof SHARD_COMPRESSION;
    /** Number of rows appended to the shard. */
    rows: number;
    /** File size in bytes, measured after the shard was closed. */
    bytes: number;
    /** Hex-encoded SHA-256 digest of the shard file. */
    sha256: string;
    /** `source_id` of the first row appended to the shard. */
    first_source_id: string;
    /** `source_id` of the last row appended to the shard. */
    last_source_id: string;
}
85
/** Contents of `MANIFEST.json`, as written and returned by `writeShards`. */
export interface ShardManifest {
    /** Corpus version the shards were written for. */
    corpus_version: string;
    /** Column names present in every shard. */
    schema: ReadonlyArray<string>;
    /** Shard row cap that was in effect for the run. */
    rows_per_shard: number;
    /** Row-group size the writers were opened with. */
    row_group_size: number;
    /** One descriptor per shard that received at least one row. */
    shards: ShardDescriptor[];
    /** Number of rows written per split. */
    counts: Record<SplitName, number>;
    /** Number of rows written across all splits. */
    total_rows: number;
}
94
/** Options accepted by `writeShards`. */
export interface WriteShardsOptions {
    /** Root output directory; the `corpus-v<version>` directory is created beneath it. */
    outputDir: string;
    /** Corpus version; used in the output directory name and stamped into shard metadata. */
    corpusVersion: string;
    /** Maximum number of rows per `.parquet` shard. Defaults to 1_000_000. */
    rowsPerShard?: number;
}
102
/**
 * Labeled-row streams that are already partitioned by split, keyed by split name.
 *
 * The caller (`buildCorpus`) decides each row's split inline at align time via `splitForRow` and
 * routes the row to the matching stream, which removes the need for the earlier
 * `Map<source_id, SplitName>` lookup table.
 *
 * A split without rows may be left out or supplied as an empty iterable; `writeShards` skips it.
 */
export type PerSplitRows = {
    [Split in SplitName]?: AsyncIterable<LabeledRow>;
};
110
/**
 * Project a labeled row onto the flat `ParquetRow` shape.
 *
 * @param row Labeled row to project.
 * @returns The row with absent `locale` / `synth` values normalized to `null`.
 */
export declare function rowToParquet(row: LabeledRow): ParquetRow;
/**
 * Stream labeled rows into `.parquet` shards, one set of shards per split, and write
 * `MANIFEST.json` next to them.
 *
 * Splits are handled one after another, so at most one shard writer is open at any time. Memory
 * use is therefore bounded by the parquetjs row-group buffer (~`ROW_GROUP_SIZE × row_size`)
 * rather than by the total number of labeled rows.
 *
 * Rows arrive pre-partitioned as per-split `AsyncIterable<LabeledRow>` values (`PerSplitRows`).
 * The earlier `splitFor(sourceId)` callback no longer exists, because partitioning at the caller
 * removes the `Map<source_id, SplitName>` it depended on. See `buildCorpus` for the wire-up.
 *
 * @param perSplit Row streams keyed by split name.
 * @param opts Output location, corpus version and optional shard size.
 * @returns The manifest that was written to disk.
 */
export declare function writeShards(
    perSplit: PerSplitRows,
    opts: WriteShardsOptions,
): Promise<ShardManifest>;
122
+ //# sourceMappingURL=parquet.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"parquet.d.ts","sourceRoot":"","sources":["../../src/parquet.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAsCG;AAMH,OAAO,EAAiB,KAAK,uBAAuB,EAAE,MAAM,4BAA4B,CAAA;AACxF,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,YAAY,CAAA;AAC3C,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,YAAY,CAAA;AAE5C,sFAAsF;AACtF,eAAO,MAAM,cAAc,QAAS,CAAA;AAEpC,mFAAmF;AACnF,eAAO,MAAM,iBAAiB,EAAG,QAAiB,CAAA;AAElD;;;;GAIG;AACH,MAAM,WAAW,UAAU;IAC1B,GAAG,EAAE,MAAM,CAAA;IACX,MAAM,EAAE,SAAS,MAAM,EAAE,CAAA;IACzB,MAAM,EAAE,SAAS,MAAM,EAAE,CAAA;IACzB,OAAO,EAAE,MAAM,CAAA;IACf,MAAM,EAAE,MAAM,GAAG,IAAI,CAAA;IACrB,MAAM,EAAE,MAAM,CAAA;IACd,SAAS,EAAE,MAAM,CAAA;IACjB,cAAc,EAAE,MAAM,CAAA;IACtB,OAAO,EAAE,MAAM,CAAA;IACf,YAAY,EAAE,MAAM,GAAG,IAAI,CAAA;IAC3B,aAAa,EAAE,MAAM,GAAG,IAAI,CAAA;IAC5B,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAA;CACtB;AAED,mEAAmE;AACnE,eAAO,MAAM,eAAe,gJAYlB,CAAA;AAEV;;;GAGG;AACH,eAAO,MAAM,kBAAkB,EAAE,uBAAuB,CAAC,UAAU,CAYlE,CAAA;AAED,sDAAsD;AACtD,MAAM,WAAW,eAAe;IAC/B,KAAK,EAAE,SAAS,CAAA;IAChB,IAAI,EAAE,MAAM,CAAA;IACZ,MAAM,EAAE,SAAS,CAAA;IACjB,WAAW,EAAE,OAAO,iBAAiB,CAAA;IACrC,IAAI,EAAE,MAAM,CAAA;IACZ,KAAK,EAAE,MAAM,CAAA;IACb,MAAM,EAAE,MAAM,CAAA;IACd,eAAe,EAAE,MAAM,CAAA;IACvB,cAAc,EAAE,MAAM,CAAA;CACtB;AAED,MAAM,WAAW,aAAa;IAC7B,cAAc,EAAE,MAAM,CAAA;IACtB,MAAM,EAAE,SAAS,MAAM,EAAE,CAAA;IACzB,cAAc,EAAE,MAAM,CAAA;IACtB,cAAc,EAAE,MAAM,CAAA;IACtB,MAAM,EAAE,eAAe,EAAE,CAAA;IACzB,MAAM,EAAE,MAAM,CAAC,SAAS,EAAE,MAAM,CAAC,CAAA;IACjC,UAAU,EAAE,MAAM,CAAA;CAClB;AAED,MAAM,WAAW,kBAAkB;IAClC,oEAAoE;IACpE,SAAS,EAAE,MAAM,CAAA;IAEjB,yEAAyE;IACzE,aAAa,EAAE,MAAM,CAAA;IAErB,6EAA6E;IAC7E,YAAY,CAAC,EAAE,MAAM,CAAA;CACrB;AAED;;;;;;GAMG;AACH,MAAM,MAAM,YAAY,GAAG,OAAO,CAAC,MAAM,CAAC,SAAS,EAAE,aAAa,CAAC,UAAU,CAAC,CAAC,CAAC,CAAA;AAEhF,mDAAmD;AACnD,wBAAgB,YAAY,CAAC,GAAG,EAAE,UAAU,GAAG,UAAU,CAcxD;AAwBD;;;;;;;;GAQG;AACH,wBAAsB,WAAW,CAAC,QAAQ,EAAE,YAAY,EAAE,IAAI,EAAE,kBAAkB,GAAG,OAAO,CAAC,aAAa,CAAC,CAwF1G"}
@@ -0,0 +1,220 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Final output sharder for the corpus pipeline.
7
+ *
8
+ * Phase 1 (#9) shipped JSONL shards + a Python (PyArrow) converter as the path to binary Parquet —
9
+ * bridging until the JS toolchain caught up. Phase 1.5 (#18 §4) replaced that with a native JS
10
+ * writer based on the salvaged `@dsnp/parquetjs` wrapper from isp-nexus (now in
11
+ * `./parquet-wrapper/`). The build pipeline no longer touches Python at all in its hot path; the
12
+ * only remaining Python is the one-shot `train_tokenizer.py` SentencePiece step.
13
+ *
14
+ * Compression: `SNAPPY`. The plan in #18 §4 specified `zstd`, but `@dsnp/parquetjs` 1.7.0 only
15
+ * supports UNCOMPRESSED / GZIP / SNAPPY / BROTLI (see `node_modules/@dsnp/parquetjs/dist/lib/
16
+ * compression.js`). SNAPPY is the standard ML-corpus default (PyArrow's default too) and is the
17
+ * closest substitute on speed; revisit if @dsnp/parquetjs gains zstd support. Documented in
18
+ * `DECISIONS.md`.
19
+ *
20
+ * Layout under `<outputDir>`:
21
+ *
22
+ * ```
23
+ * corpus-v<version>/
24
+ * MANIFEST.json
25
+ * train/
26
+ * part-0000.parquet
27
+ * part-0001.parquet
28
+ * ...
29
+ * val/
30
+ * part-0000.parquet
31
+ * test/
32
+ * part-0000.parquet
33
+ * ```
34
+ *
35
+ * Each shard caps at `rowsPerShard` (default 1_000_000); within a shard, parquetjs flushes row
36
+ * groups every `ROW_GROUP_SIZE` (50_000) rows per the issue spec. The MANIFEST captures every
37
+ * shard's path, row count, byte size, and SHA-256 (computed by re-reading the shard once after
38
+ * close — cheap relative to writing it).
39
+ */
40
+ import { createHash } from "node:crypto";
41
+ import { createReadStream } from "node:fs";
42
+ import { mkdir, stat, writeFile } from "node:fs/promises";
43
+ import { join } from "node:path";
44
+ import { ParquetWriter } from "./parquet-wrapper/index.js";
45
/** Row-group size: parquetjs flushes a row group every this many rows within a shard. */
export const ROW_GROUP_SIZE = 50000;
/**
 * Compression codec applied to every column. SNAPPY stands in for zstd, which
 * `@dsnp/parquetjs` 1.7.0 does not provide.
 */
export const SHARD_COMPRESSION = "SNAPPY";
/** Names of the columns emitted into every shard, in schema order. Matches `ParquetRow`. */
export const PARQUET_COLUMNS = [
    "raw", "tokens", "labels", "country", "locale", "source",
    "source_id", "corpus_version", "license", "synth_method", "synth_base_id",
];
/**
 * Parquet schema for `LabeledRow` per #18 §4. All columns are UTF8 and SNAPPY-compressed:
 * `tokens` / `labels` are repeated, while `locale`, `synth_method` and `synth_base_id` are
 * optional.
 */
export const LABELED_ROW_SCHEMA = (() => {
    const compression = SHARD_COMPRESSION;
    // Each factory returns a fresh descriptor so that no two columns share an object.
    const required = () => ({ type: "UTF8", compression });
    const repeated = () => ({ type: "UTF8", repeated: true, compression });
    const nullable = () => ({ type: "UTF8", compression, optional: true });
    return {
        raw: required(),
        tokens: repeated(),
        labels: repeated(),
        country: required(),
        locale: nullable(),
        source: required(),
        source_id: required(),
        corpus_version: required(),
        license: required(),
        synth_method: nullable(),
        synth_base_id: nullable(),
    };
})();
80
/**
 * Project a labeled row onto the flat Parquet row shape.
 *
 * A missing `locale` becomes `null`, and the nested `synth` object is flattened into
 * `synth_method` / `synth_base_id` (both `null` when `synth` is absent). Any other property of
 * the input is dropped.
 */
export function rowToParquet(row) {
    const { raw, tokens, labels, country, locale, source, source_id, corpus_version, license, synth } = row;
    return {
        raw,
        tokens,
        labels,
        country,
        locale: locale ?? null,
        source,
        source_id,
        corpus_version,
        license,
        synth_method: synth?.method ?? null,
        synth_base_id: synth?.base_source_id ?? null,
    };
}
96
/**
 * Build the object handed to `appendRow` from a `ParquetRow`.
 *
 * The required columns are always copied. Each optional column (`locale`, `synth_method`,
 * `synth_base_id`) is copied only when its value is not `null`, so a null optional is omitted
 * from the appended row instead of being passed explicitly.
 */
function appendShape(row) {
    const shaped = {
        raw: row.raw,
        tokens: row.tokens,
        labels: row.labels,
        country: row.country,
        source: row.source,
        source_id: row.source_id,
        corpus_version: row.corpus_version,
        license: row.license,
    };
    for (const column of ["locale", "synth_method", "synth_base_id"]) {
        const value = row[column];
        if (value !== null) {
            shaped[column] = value;
        }
    }
    return shaped;
}
120
/**
 * Stream labeled rows into `.parquet` shards, one set of shards per split, then write
 * `MANIFEST.json` into the corpus directory.
 *
 * Splits are processed sequentially (`train`, `val`, `test`) so that only one shard writer is
 * open at a time. Memory cost is bounded by the parquetjs row-group buffer
 * (~`ROW_GROUP_SIZE × row_size`), not by the labeled-row count.
 *
 * Callers pass per-split `AsyncIterable<LabeledRow>` (`PerSplitRows`); the prior
 * `splitFor(sourceId)` callback is gone because pre-partitioning at the caller eliminates the
 * `Map<source_id, SplitName>` it required. See `buildCorpus` for the wire-up.
 *
 * If a row stream or a write fails, the shard writer that is open at that moment is closed before
 * the error propagates, so no file handle is leaked. The partially written shard file stays on
 * disk and no manifest is written.
 *
 * @param perSplit Row streams keyed by split name; missing or empty splits produce no shards.
 * @param opts Output directory, corpus version and optional `rowsPerShard` (default 1_000_000).
 * @returns The manifest that was serialized to `<outputDir>/corpus-v<version>/MANIFEST.json`.
 * @throws {RangeError} When `rowsPerShard` is not a number greater than or equal to 1.
 */
export async function writeShards(perSplit, opts) {
    const rowsPerShard = opts.rowsPerShard ?? 1_000_000;
    // `!(x >= 1)` also rejects NaN. A zero or negative cap would otherwise produce one-row
    // shards, and NaN would silently disable shard rotation altogether.
    if (!(rowsPerShard >= 1)) {
        throw new RangeError(`rowsPerShard must be a number >= 1 (received ${String(rowsPerShard)})`);
    }
    const corpusDir = join(opts.outputDir, `corpus-v${opts.corpusVersion}`);
    await mkdir(corpusDir, { recursive: true });
    const shards = [];
    const counts = { train: 0, val: 0, test: 0 };
    let totalRows = 0;
    for (const split of ["train", "val", "test"]) {
        const rows = perSplit[split];
        if (!rows)
            continue;
        // State of the shard currently being written for this split.
        let shardIndex = 0;
        let writer = null;
        let path = "";
        let shardRows = 0;
        let firstSourceId = "";
        let lastSourceId = "";
        /** Open `part-<index>.parquet` for the current split and reset the per-shard state. */
        const openShard = async () => {
            const splitDir = join(corpusDir, split);
            await mkdir(splitDir, { recursive: true });
            path = join(splitDir, `part-${String(shardIndex).padStart(4, "0")}.parquet`);
            writer = await ParquetWriter.openFile(LABELED_ROW_SCHEMA, path, {
                rowGroupSize: ROW_GROUP_SIZE,
            });
            writer.setMetadata("mailwoman.corpus_version", opts.corpusVersion);
            writer.setMetadata("mailwoman.split", split);
            writer.setMetadata("mailwoman.shard_index", String(shardIndex));
            shardRows = 0;
            firstSourceId = "";
            lastSourceId = "";
        };
        /** Close the open shard (if any) and record its descriptor once it holds rows. */
        const closeShard = async () => {
            if (!writer)
                return;
            // Detach before closing so a failure below cannot lead to a second finalization.
            const finished = writer;
            writer = null;
            await finished.close();
            if (shardRows > 0) {
                // Size and digest are taken from the file on disk after the writer has closed.
                const fileStat = await stat(path);
                const sha256 = await hashFile(path);
                shards.push({
                    split,
                    path,
                    format: "parquet",
                    compression: SHARD_COMPRESSION,
                    rows: shardRows,
                    bytes: fileStat.size,
                    sha256,
                    first_source_id: firstSourceId,
                    last_source_id: lastSourceId,
                });
            }
        };
        try {
            for await (const row of rows) {
                // Shards are opened lazily, so a split without rows never creates a file.
                if (!writer)
                    await openShard();
                const pq = rowToParquet(row);
                await writer.appendRow(appendShape(pq));
                if (shardRows === 0)
                    firstSourceId = row.source_id;
                lastSourceId = row.source_id;
                shardRows++;
                counts[split]++;
                totalRows++;
                if (shardRows >= rowsPerShard) {
                    await closeShard();
                    shardIndex++;
                }
            }
            await closeShard();
        }
        finally {
            // A writer is still attached here only when something above threw. Release its
            // file handle, and deliberately ignore a secondary close failure so that the
            // original error is the one that reaches the caller.
            if (writer) {
                const abandoned = writer;
                writer = null;
                await abandoned.close().catch(() => undefined);
            }
        }
    }
    shards.sort((a, b) => (a.split === b.split ? a.path.localeCompare(b.path) : a.split.localeCompare(b.split)));
    const manifest = {
        corpus_version: opts.corpusVersion,
        schema: PARQUET_COLUMNS,
        rows_per_shard: rowsPerShard,
        row_group_size: ROW_GROUP_SIZE,
        shards,
        counts,
        total_rows: totalRows,
    };
    await writeFile(join(corpusDir, "MANIFEST.json"), `${JSON.stringify(manifest, null, 2)}\n`, "utf8");
    return manifest;
}
212
/**
 * Compute the hex-encoded SHA-256 digest of the file at `path`, reading it once as a stream.
 *
 * The returned promise rejects if the file cannot be read.
 */
function hashFile(path) {
    return new Promise((resolve, reject) => {
        const digest = createHash("sha256");
        createReadStream(path)
            .on("data", (chunk) => {
                digest.update(chunk);
            })
            .once("error", reject)
            .once("end", () => {
                resolve(digest.digest("hex"));
            });
    });
}
220
+ //# sourceMappingURL=parquet.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"parquet.js","sourceRoot":"","sources":["../../src/parquet.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAsCG;AAEH,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAA;AACxC,OAAO,EAAE,gBAAgB,EAAE,MAAM,SAAS,CAAA;AAC1C,OAAO,EAAE,KAAK,EAAE,IAAI,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAA;AACzD,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAA;AAChC,OAAO,EAAE,aAAa,EAAgC,MAAM,4BAA4B,CAAA;AAIxF,sFAAsF;AACtF,MAAM,CAAC,MAAM,cAAc,GAAG,MAAM,CAAA;AAEpC,mFAAmF;AACnF,MAAM,CAAC,MAAM,iBAAiB,GAAG,QAAiB,CAAA;AAsBlD,mEAAmE;AACnE,MAAM,CAAC,MAAM,eAAe,GAAG;IAC9B,KAAK;IACL,QAAQ;IACR,QAAQ;IACR,SAAS;IACT,QAAQ;IACR,QAAQ;IACR,WAAW;IACX,gBAAgB;IAChB,SAAS;IACT,cAAc;IACd,eAAe;CACN,CAAA;AAEV;;;GAGG;AACH,MAAM,CAAC,MAAM,kBAAkB,GAAwC;IACtE,GAAG,EAAE,EAAE,IAAI,EAAE,MAAM,EAAE,WAAW,EAAE,iBAAiB,EAAE;IACrD,MAAM,EAAE,EAAE,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,IAAI,EAAE,WAAW,EAAE,iBAAiB,EAAE;IACxE,MAAM,EAAE,EAAE,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,IAAI,EAAE,WAAW,EAAE,iBAAiB,EAAE;IACxE,OAAO,EAAE,EAAE,IAAI,EAAE,MAAM,EAAE,WAAW,EAAE,iBAAiB,EAAE;IACzD,MAAM,EAAE,EAAE,IAAI,EAAE,MAAM,EAAE,WAAW,EAAE,iBAAiB,EAAE,QAAQ,EAAE,IAAI,EAAE;IACxE,MAAM,EAAE,EAAE,IAAI,EAAE,MAAM,EAAE,WAAW,EAAE,iBAAiB,EAAE;IACxD,SAAS,EAAE,EAAE,IAAI,EAAE,MAAM,EAAE,WAAW,EAAE,iBAAiB,EAAE;IAC3D,cAAc,EAAE,EAAE,IAAI,EAAE,MAAM,EAAE,WAAW,EAAE,iBAAiB,EAAE;IAChE,OAAO,EAAE,EAAE,IAAI,EAAE,MAAM,EAAE,WAAW,EAAE,iBAAiB,EAAE;IACzD,YAAY,EAAE,EAAE,IAAI,EAAE,MAAM,EAAE,WAAW,EAAE,iBAAiB,EAAE,QAAQ,EAAE,IAAI,EAAE;IAC9E,aAAa,EAAE,EAAE,IAAI,EAAE,MAAM,EAAE,WAAW,EAAE,iBAAiB,EAAE,QAAQ,EAAE,IAAI,EAAE;CAC/E,CAAA;AA6CD,mDAAmD;AACnD,MAAM,UAAU,YAAY,CAAC,GAAe;IAC3C,OAAO;QACN,GAAG,EAAE,GAAG,CAAC,GAAG;QACZ,MAAM,EAAE,GAAG,CAAC,MAAM;QAClB,MAAM,EAAE,GAAG,CAAC,MAAM;QAClB,OAAO,EAAE,GAAG,CAAC,OAAO;QACpB,MAAM,EAAE,GAAG,CAAC,MAAM,IAAI,IAAI;QAC1B,MAAM,EAAE,GAAG,CAAC,MAAM;QAClB,SAAS,EAAE,GAAG,CAAC,SAAS;QACxB,cAAc,EAAE,GAAG,CAAC,cAAc;QAClC,OAAO,EAAE,GAAG,CAAC,OAAO;QACpB,YAAY,EAAE,GAAG,CAAC,KAAK,EAAE,MAAM,IAAI,IAAI;QACvC,aAAa,EAAE,GAAG,CAAC,KAAK,EAAE,cAAc,IAAI,IAAI;KAChD,CAAA;AACF,CAAC;AAED;;;;GAIG;AACH,
SAAS,WAAW,CAAC,GAAe;IACnC,MAAM,GAAG,GAA4B;QACpC,GAAG,EAAE,GAAG,CAAC,GAAG;QACZ,MAAM,EAAE,GAAG,CAAC,MAAM;QAClB,MAAM,EAAE,GAAG,CAAC,MAAM;QAClB,OAAO,EAAE,GAAG,CAAC,OAAO;QACpB,MAAM,EAAE,GAAG,CAAC,MAAM;QAClB,SAAS,EAAE,GAAG,CAAC,SAAS;QACxB,cAAc,EAAE,GAAG,CAAC,cAAc;QAClC,OAAO,EAAE,GAAG,CAAC,OAAO;KACpB,CAAA;IACD,IAAI,GAAG,CAAC,MAAM,KAAK,IAAI;QAAE,GAAG,CAAC,MAAM,GAAG,GAAG,CAAC,MAAM,CAAA;IAChD,IAAI,GAAG,CAAC,YAAY,KAAK,IAAI;QAAE,GAAG,CAAC,YAAY,GAAG,GAAG,CAAC,YAAY,CAAA;IAClE,IAAI,GAAG,CAAC,aAAa,KAAK,IAAI;QAAE,GAAG,CAAC,aAAa,GAAG,GAAG,CAAC,aAAa,CAAA;IACrE,OAAO,GAAG,CAAA;AACX,CAAC;AAED;;;;;;;;GAQG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAAC,QAAsB,EAAE,IAAwB;IACjF,MAAM,YAAY,GAAG,IAAI,CAAC,YAAY,IAAI,SAAS,CAAA;IACnD,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,WAAW,IAAI,CAAC,aAAa,EAAE,CAAC,CAAA;IACvE,MAAM,KAAK,CAAC,SAAS,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAA;IAE3C,MAAM,MAAM,GAAsB,EAAE,CAAA;IACpC,MAAM,MAAM,GAA8B,EAAE,KAAK,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,CAAA;IACvE,IAAI,SAAS,GAAG,CAAC,CAAA;IAEjB,KAAK,MAAM,KAAK,IAAI,CAAC,OAAO,EAAE,KAAK,EAAE,MAAM,CAAU,EAAE,CAAC;QACvD,MAAM,IAAI,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAA;QAC5B,IAAI,CAAC,IAAI;YAAE,SAAQ;QAEnB,IAAI,UAAU,GAAG,CAAC,CAAA;QAClB,IAAI,MAAM,GAAqC,IAAI,CAAA;QACnD,IAAI,IAAI,GAAG,EAAE,CAAA;QACb,IAAI,SAAS,GAAG,CAAC,CAAA;QACjB,IAAI,aAAa,GAAG,EAAE,CAAA;QACtB,IAAI,YAAY,GAAG,EAAE,CAAA;QAErB,MAAM,SAAS,GAAG,KAAK,IAAmB,EAAE;YAC3C,MAAM,QAAQ,GAAG,IAAI,CAAC,SAAS,EAAE,KAAK,CAAC,CAAA;YACvC,MAAM,KAAK,CAAC,QAAQ,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAA;YAC1C,IAAI,GAAG,IAAI,CAAC,QAAQ,EAAE,QAAQ,MAAM,CAAC,UAAU,CAAC,CAAC,QAAQ,CAAC,CAAC,EAAE,GAAG,CAAC,UAAU,CAAC,CAAA;YAC5E,MAAM,GAAG,MAAM,aAAa,CAAC,QAAQ,CAAa,kBAAkB,EAAE,IAAI,EAAE;gBAC3E,YAAY,EAAE,cAAc;aAC5B,CAAC,CAAA;YACF,MAAM,CAAC,WAAW,CAAC,0BAA0B,EAAE,IAAI,CAAC,aAAa,CAAC,CAAA;YAClE,MAAM,CAAC,WAAW,CAAC,iBAAiB,EAAE,KAAK,CAAC,CAAA;YAC5C,MAAM,CAAC,WAAW,CAAC,uBAAuB,EAAE,MAAM,CAAC,UAAU,CAAC,CAAC,CAAA;YAC/D,SAAS,GAAG,CAAC,CAAA;YACb,aAAa,GAAG,EAAE,CAAA;YAClB,YAAY,GAAG,EAAE,CAAA;QAClB,CAAC,CAAA;QAED,MAAM,UAAU,GAAG
,KAAK,IAAmB,EAAE;YAC5C,IAAI,CAAC,MAAM;gBAAE,OAAM;YACnB,MAAM,MAAM,CAAC,KAAK,EAAE,CAAA;YACpB,IAAI,SAAS,GAAG,CAAC,EAAE,CAAC;gBACnB,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,IAAI,CAAC,CAAA;gBACjC,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,IAAI,CAAC,CAAA;gBACnC,MAAM,CAAC,IAAI,CAAC;oBACX,KAAK;oBACL,IAAI;oBACJ,MAAM,EAAE,SAAS;oBACjB,WAAW,EAAE,iBAAiB;oBAC9B,IAAI,EAAE,SAAS;oBACf,KAAK,EAAE,QAAQ,CAAC,IAAI;oBACpB,MAAM;oBACN,eAAe,EAAE,aAAa;oBAC9B,cAAc,EAAE,YAAY;iBAC5B,CAAC,CAAA;YACH,CAAC;YACD,MAAM,GAAG,IAAI,CAAA;QACd,CAAC,CAAA;QAED,IAAI,KAAK,EAAE,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;YAC9B,IAAI,CAAC,MAAM;gBAAE,MAAM,SAAS,EAAE,CAAA;YAC9B,MAAM,EAAE,GAAG,YAAY,CAAC,GAAG,CAAC,CAAA;YAC5B,MAAM,MAAO,CAAC,SAAS,CAAC,WAAW,CAAC,EAAE,CAA0B,CAAC,CAAA;YACjE,IAAI,SAAS,KAAK,CAAC;gBAAE,aAAa,GAAG,GAAG,CAAC,SAAS,CAAA;YAClD,YAAY,GAAG,GAAG,CAAC,SAAS,CAAA;YAC5B,SAAS,EAAE,CAAA;YACX,MAAM,CAAC,KAAK,CAAC,EAAE,CAAA;YACf,SAAS,EAAE,CAAA;YAEX,IAAI,SAAS,IAAI,YAAY,EAAE,CAAC;gBAC/B,MAAM,UAAU,EAAE,CAAA;gBAClB,UAAU,EAAE,CAAA;YACb,CAAC;QACF,CAAC;QAED,MAAM,UAAU,EAAE,CAAA;IACnB,CAAC;IAED,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,KAAK,KAAK,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAA;IAE5G,MAAM,QAAQ,GAAkB;QAC/B,cAAc,EAAE,IAAI,CAAC,aAAa;QAClC,MAAM,EAAE,eAAe;QACvB,cAAc,EAAE,YAAY;QAC5B,cAAc,EAAE,cAAc;QAC9B,MAAM;QACN,MAAM;QACN,UAAU,EAAE,SAAS;KACrB,CAAA;IACD,MAAM,SAAS,CAAC,IAAI,CAAC,SAAS,EAAE,eAAe,CAAC,EAAE,GAAG,IAAI,CAAC,SAAS,CAAC,QAAQ,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,MAAM,CAAC,CAAA;IACnG,OAAO,QAAQ,CAAA;AAChB,CAAC;AAED,+FAA+F;AAC/F,KAAK,UAAU,QAAQ,CAAC,IAAY;IACnC,MAAM,IAAI,GAAG,UAAU,CAAC,QAAQ,CAAC,CAAA;IACjC,MAAM,MAAM,GAAG,gBAAgB,CAAC,IAAI,CAAC,CAAA;IACrC,IAAI,KAAK,EAAE,MAAM,KAAK,IAAI,MAAM;QAAE,IAAI,CAAC,MAAM,CAAC,KAAe,CAAC,CAAA;IAC9D,OAAO,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,CAAA;AAC1B,CAAC"}
@@ -0,0 +1,100 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Adapter runner — drives a `CorpusAdapter` to completion and writes intermediate JSONL + a
7
+ * per-shard manifest.
8
+ *
9
+ * Output layout under `outputDir`:
10
+ *
11
+ * ```
12
+ * <outputDir>/<adapter.id>/
13
+ * canonical.jsonl # one row per line, in emission order
14
+ * MANIFEST.json # adapter id, version, row count, sha256, license, started_at, ended_at
15
+ * ```
16
+ *
17
+ * The runner is responsible for everything an adapter is **not** responsible for:
18
+ *
19
+ * - Stamping `corpus_version` on every row (adapters must NOT set it).
20
+ * - Applying `canonicalDedupKey` and skipping duplicates.
21
+ * - Streaming sha256 over JSONL bytes so the manifest checksum doesn't require a re-read.
22
+ * - Honoring backpressure on the output write stream.
23
+ * - Counting + emitting periodic progress to an optional callback.
24
+ * - Honoring `signal` (delegates to adapter's iteration boundary).
25
+ *
26
+ * The runner does NOT perform alignment, tokenization, synthesis, or sharding into Parquet. Those
27
+ * steps run later, consuming the JSONL shards this writes.
28
+ */
29
+ import { type AdapterRegistry } from "./adapter.js";
30
+ import type { AdapterOptions, CorpusAdapter } from "./types.js";
31
+ /** Snapshot of the runner's state, handed to `onProgress` on every progress tick and once more at the end of the run. */
32
+ export interface RunnerProgress {
33
+ /** Adapter being driven. */
34
+ adapterId: string;
35
+ /** Total rows the adapter has yielded so far (before dedup). */
36
+ yielded: number;
37
+ /** Rows actually written to JSONL so far (after dedup). */
38
+ written: number;
39
+ /** Bytes written to JSONL so far. */
40
+ bytes: number;
41
+ /** Wall-clock milliseconds since the run started. */
42
+ elapsed_ms: number;
43
+ }
44
+ /** Per-invocation options for `runAdapter`; `runAllAdapters` reuses the same shape minus `adapter`. */
45
+ export interface RunAdapterOptions {
46
+ /** Adapter to drive. */
47
+ adapter: CorpusAdapter;
48
+ /** Options handed to the adapter (input path, country filter, limit, signal). */
49
+ adapterOptions: AdapterOptions;
50
+ /** Root output directory; the runner creates `<outputDir>/<adapter.id>/` under it. */
51
+ outputDir: string;
52
+ /** Corpus version stamped onto every row by the runner (adapters must not set it themselves). Locked together with the tokenizer version. */
53
+ corpusVersion: string;
54
+ /**
55
+ * Optional progress callback. Invoked every `progressEvery` rows yielded (default 1000) and once
56
+ * at the end of the run. Errors thrown from this callback abort the run.
57
+ */
58
+ onProgress?: (snapshot: RunnerProgress) => void;
59
+ /**
60
+ * Yielded-row interval at which `onProgress` fires. Defaults to 1000. The terminal tick is always
61
+ * emitted regardless of this value.
62
+ */
63
+ progressEvery?: number;
64
+ }
65
+ /** Return value of `runAdapter`: the same shape as `MANIFEST.json` on disk. */
66
+ export interface AdapterRunManifest {
67
+ /** Id of the adapter that was run. */ adapter_id: string;
68
+ /** Corpus version for the run — presumably the `corpusVersion` option; confirm in the runner. */ corpus_version: string;
69
+ /** License string — presumably the adapter's default license; confirm in the runner. */ default_license: string;
70
+ /** Free-text description — presumably the adapter's own; confirm in the runner. */ description: string;
71
+ /** Row count before dedup — presumably the final `RunnerProgress.yielded`; confirm. */ yielded: number;
72
+ /** Row count after dedup — presumably the final `RunnerProgress.written`; confirm. */ written: number;
73
+ /** Rows skipped as duplicates — presumably `yielded - written`; confirm. */ deduped: number;
74
+ /** Byte count — presumably the total JSONL bytes written; confirm. */ bytes: number;
75
+ /** SHA-256 checksum of the JSONL bytes (string encoding not visible in this declaration file). */ sha256: string;
76
+ /** Path of the JSONL output — presumably `canonical.jsonl`; TODO confirm whether absolute or relative. */ jsonl_path: string;
77
+ /** Run start time as a string; format not visible here (TODO confirm, e.g. ISO 8601). */ started_at: string;
78
+ /** Run end time as a string; format not visible here (TODO confirm, e.g. ISO 8601). */ ended_at: string;
79
+ /** Elapsed time of the run in milliseconds. */ elapsed_ms: number;
80
+ }
81
+ /**
82
+ * Drive a single adapter to completion.
83
+ *
84
+ * @param opts Adapter to drive, the options handed to it, output root, corpus version and optional progress hooks.
85
+ * @returns The manifest describing the run; `canonical.jsonl` + `MANIFEST.json` are written under `outputDir/<adapter.id>/`.
86
+ * @throws If the output directory cannot be created, if a row arrives with a missing required field, or if the abort signal fires.
87
+ */
88
+ export declare function runAdapter(opts: RunAdapterOptions): Promise<AdapterRunManifest>;
89
+ /**
90
+ * Drive every adapter in a registry sequentially. Stops on the first failure (caller can filter the
91
+ * registry before calling if partial-failure is desired).
92
+ *
93
+ * Returns the manifests in registry insertion order. `common` carries the `runAdapter` options shared by every run; NOTE(review): `adapterOptionsFor` presumably supplies per-adapter `AdapterOptions` in place of `common.adapterOptions` — confirm in the runner source.
94
+ */
95
+ export declare function runAllAdapters(registry: AdapterRegistry, common: Omit<RunAdapterOptions, "adapter"> & {
96
+ adapterOptionsFor?: (a: CorpusAdapter) => AdapterOptions;
97
+ }): Promise<AdapterRunManifest[]>;
98
+ /** Convenience: ensure the parent directory of `filePath` exists. Resolves with no value. NOTE(review): presumably creates missing directories recursively — confirm in the runner source. */
99
+ export declare function ensureParentDir(filePath: string): Promise<void>;
100
+ //# sourceMappingURL=runner.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"runner.d.ts","sourceRoot":"","sources":["../../src/runner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;GA2BG;AAKH,OAAO,EAAsC,KAAK,eAAe,EAAwB,MAAM,cAAc,CAAA;AAC7G,OAAO,KAAK,EAAE,cAAc,EAAgB,aAAa,EAAE,MAAM,YAAY,CAAA;AAE7E,sEAAsE;AACtE,MAAM,WAAW,cAAc;IAC9B,4BAA4B;IAC5B,SAAS,EAAE,MAAM,CAAA;IAEjB,yDAAyD;IACzD,OAAO,EAAE,MAAM,CAAA;IAEf,oDAAoD;IACpD,OAAO,EAAE,MAAM,CAAA;IAEf,qCAAqC;IACrC,KAAK,EAAE,MAAM,CAAA;IAEb,qDAAqD;IACrD,UAAU,EAAE,MAAM,CAAA;CAClB;AAED,+CAA+C;AAC/C,MAAM,WAAW,iBAAiB;IACjC,wBAAwB;IACxB,OAAO,EAAE,aAAa,CAAA;IAEtB,iFAAiF;IACjF,cAAc,EAAE,cAAc,CAAA;IAE9B,sFAAsF;IACtF,SAAS,EAAE,MAAM,CAAA;IAEjB,yFAAyF;IACzF,aAAa,EAAE,MAAM,CAAA;IAErB;;;OAGG;IACH,UAAU,CAAC,EAAE,CAAC,QAAQ,EAAE,cAAc,KAAK,IAAI,CAAA;IAE/C;;;OAGG;IACH,aAAa,CAAC,EAAE,MAAM,CAAA;CACtB;AAED,+EAA+E;AAC/E,MAAM,WAAW,kBAAkB;IAClC,UAAU,EAAE,MAAM,CAAA;IAClB,cAAc,EAAE,MAAM,CAAA;IACtB,eAAe,EAAE,MAAM,CAAA;IACvB,WAAW,EAAE,MAAM,CAAA;IACnB,OAAO,EAAE,MAAM,CAAA;IACf,OAAO,EAAE,MAAM,CAAA;IACf,OAAO,EAAE,MAAM,CAAA;IACf,KAAK,EAAE,MAAM,CAAA;IACb,MAAM,EAAE,MAAM,CAAA;IACd,UAAU,EAAE,MAAM,CAAA;IAClB,UAAU,EAAE,MAAM,CAAA;IAClB,QAAQ,EAAE,MAAM,CAAA;IAChB,UAAU,EAAE,MAAM,CAAA;CAClB;AAED;;;;;;GAMG;AACH,wBAAsB,UAAU,CAAC,IAAI,EAAE,iBAAiB,GAAG,OAAO,CAAC,kBAAkB,CAAC,CAkGrF;AAED;;;;;GAKG;AACH,wBAAsB,cAAc,CACnC,QAAQ,EAAE,eAAe,EACzB,MAAM,EAAE,IAAI,CAAC,iBAAiB,EAAE,SAAS,CAAC,GAAG;IAAE,iBAAiB,CAAC,EAAE,CAAC,CAAC,EAAE,aAAa,KAAK,cAAc,CAAA;CAAE,GACvG,OAAO,CAAC,kBAAkB,EAAE,CAAC,CAa/B;AAwCD,qEAAqE;AACrE,wBAAsB,eAAe,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAErE"}