@forwardimpact/libsyntheticgen 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/tools/faker.js ADDED
@@ -0,0 +1,83 @@
1
+ /**
2
+ * Faker tool — generates datasets using @faker-js/faker in-process.
3
+ */
4
+
5
+ import { faker } from "@faker-js/faker";
6
+
7
/**
 * In-process dataset generator backed by the shared `faker` instance
 * imported at the top of this module.
 */
export class FakerTool {
  /**
   * @param {object} deps
   * @param {object} deps.logger - Logger; `info(scope, message)` is called by generate()
   * @throws {Error} When no logger is supplied
   */
  constructor({ logger }) {
    if (!logger) throw new Error("FakerTool requires logger");
    this.logger = logger;
  }

  /**
   * Faker is always available (JS dependency).
   * @returns {Promise<boolean>} Always resolves to `true`
   */
  async checkAvailability() {
    return true;
  }

  /**
   * Generate a dataset from field definitions.
   *
   * Seeds the shared `faker` instance, then builds `config.rows` records by
   * calling the configured provider once per field, row by row.
   *
   * @param {object} config
   * @param {string} config.name - Dataset name from DSL
   * @param {number} config.rows - Number of records to generate
   * @param {Object<string, string>} config.fields - Field name → Faker provider path
   * @param {number} config.seed - RNG seed
   * @returns {Promise<Dataset[]>} Single-element array holding the generated dataset
   * @throws {Error} When a provider path cannot be resolved to a function
   */
  async generate(config) {
    faker.seed(config.seed);
    this.logger.info(
      "faker",
      `Generating ${config.rows} rows for ${config.name}`,
    );

    // The field list does not change between rows, so resolve it once.
    const fieldEntries = Object.entries(config.fields);
    const records = [];
    for (let i = 0; i < config.rows; i++) {
      const record = {};
      for (const [field, provider] of fieldEntries) {
        record[field] = this.callProvider(provider);
      }
      records.push(record);
    }

    return [
      {
        name: config.name,
        schema: null,
        records,
        metadata: { tool: "faker", fields: config.fields },
      },
    ];
  }

  /**
   * Resolve a dotted provider path like "person.fullName" to a Faker call.
   *
   * The resolved function is invoked with the object it was looked up on as
   * `this`, so providers that rely on their receiver keep working.
   *
   * @param {string} provider - Dotted path walked from the `faker` instance
   * @returns {*} Whatever the resolved provider function returns
   * @throws {Error} When the path is not a string, a segment is missing or
   *   falsy, or the final value is not a function
   */
  callProvider(provider) {
    if (typeof provider !== "string") {
      throw new Error(`Unknown Faker provider: ${String(provider)}`);
    }

    let owner;
    let target = faker;
    for (const part of provider.split(".")) {
      owner = target;
      target = target[part];
      if (!target) throw new Error(`Unknown Faker provider: ${provider}`);
    }
    if (typeof target !== "function") {
      throw new Error(`Faker provider "${provider}" is not a function`);
    }

    // Calling the detached function directly would drop its receiver.
    return target.call(owner);
  }
}
76
+
77
/**
 * Factory for FakerTool.
 * @param {object} logger - Handed to the FakerTool constructor as `deps.logger`
 * @returns {FakerTool} A newly constructed tool
 */
export function createFakerTool(logger) {
  const deps = { logger };
  return new FakerTool(deps);
}
package/tools/sdv.js ADDED
@@ -0,0 +1,93 @@
1
+ /**
2
+ * SDV tool — generates statistically representative tabular data via Python subprocess.
3
+ */
4
+
5
+ import { join } from "path";
6
+ import { tmpdir } from "os";
7
+ import { randomUUID } from "crypto";
8
+
9
+ export class SdvTool {
10
+ /**
11
+ * @param {object} deps
12
+ * @param {object} deps.logger
13
+ * @param {Function} deps.execFileFn - async (cmd, args) => { stdout }
14
+ * @param {object} deps.fsFns - { writeFile, rm }
15
+ */
16
+ constructor({ logger, execFileFn, fsFns }) {
17
+ if (!logger) throw new Error("SdvTool requires logger");
18
+ if (!execFileFn) throw new Error("SdvTool requires execFileFn");
19
+ if (!fsFns) throw new Error("SdvTool requires fsFns");
20
+ this.logger = logger;
21
+ this.execFileFn = execFileFn;
22
+ this.fsFns = fsFns;
23
+ this.scriptPath = join(import.meta.dirname, "sdv_generate.py");
24
+ }
25
+
26
+ /**
27
+ * Check that Python 3 with SDV is available.
28
+ * @returns {Promise<boolean>}
29
+ */
30
+ async checkAvailability() {
31
+ try {
32
+ await this.execFileFn("python3", ["-c", "import sdv"]);
33
+ return true;
34
+ } catch {
35
+ throw new Error(
36
+ "SDV requires Python 3 with the sdv package. " +
37
+ "Install with: pip install sdv",
38
+ );
39
+ }
40
+ }
41
+
42
+ /**
43
+ * Generate tabular data preserving statistical properties from sample data.
44
+ * @param {object} config
45
+ * @param {string} config.name - Dataset name from DSL
46
+ * @param {string} config.metadata - Path to SDV metadata JSON
47
+ * @param {Object<string, string>} config.data - Map of table name → CSV path
48
+ * @param {number} [config.rows=1000] - Number of rows to generate
49
+ * @param {number} config.seed - RNG seed
50
+ * @returns {Promise<Dataset[]>}
51
+ */
52
+ async generate(config) {
53
+ const tmpConfig = join(tmpdir(), `sdv-config-${randomUUID()}.json`);
54
+ await this.fsFns.writeFile(
55
+ tmpConfig,
56
+ JSON.stringify({
57
+ metadata: config.metadata,
58
+ data: config.data,
59
+ rows: config.rows || 1000,
60
+ seed: config.seed,
61
+ }),
62
+ );
63
+
64
+ this.logger.info("sdv", `Running SDV: rows=${config.rows || 1000}`);
65
+ const { stdout } = await this.execFileFn("python3", [
66
+ this.scriptPath,
67
+ tmpConfig,
68
+ ]);
69
+ await this.fsFns.rm(tmpConfig);
70
+
71
+ // Parse newline-delimited JSON
72
+ return stdout
73
+ .trim()
74
+ .split("\n")
75
+ .map((line) => {
76
+ const obj = JSON.parse(line);
77
+ return {
78
+ name: `${config.name}_${obj.name}`,
79
+ schema: null,
80
+ records: obj.records,
81
+ metadata: { tool: "sdv", table: obj.name },
82
+ };
83
+ });
84
+ }
85
+ }
86
+
87
/**
 * Factory for SdvTool.
 * @param {object} deps - Forwarded unchanged to the SdvTool constructor
 * @returns {SdvTool} A newly constructed tool
 */
export function createSdvTool(deps) {
  const tool = new SdvTool(deps);
  return tool;
}
@@ -0,0 +1,29 @@
1
+ #!/usr/bin/env python3
2
+ """Bridge between fit-universe and SDV."""
3
+ import json
4
+ import sys
5
+ import pandas as pd
6
+ from sdv.metadata import Metadata
7
+ from sdv.single_table import GaussianCopulaSynthesizer
8
+
9
+
10
def main():
    """Fit one synthesizer per table and print each result as a JSON line.

    Reads the JSON config file named by the first command-line argument. The
    config supplies ``metadata`` (path to the SDV metadata JSON), ``data``
    (table name -> CSV path), ``rows`` and an optional ``seed`` (default 0).
    For every table in the metadata, one line of JSON with ``name`` and
    ``records`` is written to stdout.
    """
    if len(sys.argv) != 2:
        sys.exit(f"usage: {sys.argv[0]} CONFIG_JSON")

    # Context manager closes the config file instead of leaking the handle;
    # JSON files are read as UTF-8 regardless of the platform locale.
    with open(sys.argv[1], encoding="utf-8") as config_file:
        config = json.load(config_file)

    metadata = Metadata.load_from_json(config["metadata"])
    seed = config.get("seed", 0)

    for table_name in metadata.get_tables():
        data = pd.read_csv(config["data"][table_name])
        synth = GaussianCopulaSynthesizer(metadata, table_name=table_name)
        synth.fit(data)
        # NOTE(review): confirm the installed SDV version accepts `seed=` in
        # sample() and `table_name=` in the synthesizer constructor.
        samples = synth.sample(num_rows=config["rows"], seed=seed)

        output = {
            "name": table_name,
            # Round-trip through pandas' JSON encoder to get plain JSON values.
            "records": json.loads(samples.to_json(orient="records")),
        }
        # Flush per table so the parent process sees complete lines promptly.
        print(json.dumps(output), flush=True)


if __name__ == "__main__":
    main()
@@ -0,0 +1,126 @@
1
+ /**
2
+ * Synthea tool — generates FHIR R4 patient data via Java subprocess.
3
+ */
4
+
5
+ import { join } from "path";
6
+
7
/**
 * Runs the Synthea jar through `java` and regroups the exported FHIR bundles
 * into one dataset per resource type.
 */
export class SyntheaTool {
  /**
   * @param {object} deps
   * @param {object} deps.logger - Logger; `info(scope, message)` is called by generate()
   * @param {string} deps.syntheaJar - Absolute path to synthea-with-dependencies.jar
   * @param {Function} deps.execFileFn - async (cmd, args) => { stdout }
   * @param {object} deps.fsFns - { readFile, readdir, mkdtemp, rm }
   * @throws {Error} When any dependency is missing
   */
  constructor({ logger, syntheaJar, execFileFn, fsFns }) {
    if (!logger) throw new Error("SyntheaTool requires logger");
    if (!syntheaJar) throw new Error("SyntheaTool requires syntheaJar");
    if (!execFileFn) throw new Error("SyntheaTool requires execFileFn");
    if (!fsFns) throw new Error("SyntheaTool requires fsFns");
    this.logger = logger;
    this.syntheaJar = syntheaJar;
    this.execFileFn = execFileFn;
    this.fsFns = fsFns;
  }

  /**
   * Check that Java and the Synthea jar are available.
   * @returns {Promise<boolean>} Resolves to `true` when `java -version` runs
   *   and the jar can be read
   * @throws {Error} With install instructions; the underlying failure is
   *   attached as `cause`
   */
  async checkAvailability() {
    try {
      await this.execFileFn("java", ["-version"]);
      // NOTE(review): this reads the entire jar just to prove it exists; a
      // stat/access check would be cheaper if fsFns ever exposes one.
      await this.fsFns.readFile(this.syntheaJar);
      return true;
    } catch (cause) {
      throw new Error(
        `Synthea requires Java and ${this.syntheaJar}. ` +
          "Install Java (java.com) and download Synthea " +
          "(github.com/synthetichealth/synthea/releases). " +
          "Set SYNTHEA_JAR to the jar path.",
        { cause },
      );
    }
  }

  /**
   * Generate FHIR patient data, returning one dataset per resource type.
   *
   * The temporary export directory is removed whether or not the run
   * succeeds.
   *
   * @param {object} config
   * @param {string} config.name - Dataset name from DSL
   * @param {number} [config.population=100] - Number of patients
   * @param {string[]} [config.modules] - Synthea modules to enable
   * @param {number} config.seed - RNG seed
   * @returns {Promise<Dataset[]>} Datasets named `<name>_<resourcetype>`, in
   *   order of first appearance of each resource type
   */
  async generate(config) {
    const population = config.population || 100;
    // NOTE(review): a bare prefix makes fs.mkdtemp create the directory under
    // the current working directory — confirm whether the injected mkdtemp
    // already targets the OS temp dir.
    const tmpDir = await this.fsFns.mkdtemp("synthea-");

    try {
      const args = [
        "-jar",
        this.syntheaJar,
        "-p",
        String(population),
        "-s",
        String(config.seed),
        "--exporter.fhir.export",
        "true",
        "--exporter.baseDirectory",
        tmpDir,
      ];
      if (Array.isArray(config.modules) && config.modules.length > 0) {
        // Synthea takes ONE -m value holding a path-separator-delimited list;
        // repeating the flag would keep only the last module.
        const separator = process.platform === "win32" ? ";" : ":";
        args.push("-m", config.modules.join(separator));
      }

      this.logger.info("synthea", `Running Synthea: population=${population}`);
      await this.execFileFn("java", args);

      // Read FHIR bundles from output
      const fhirDir = join(tmpDir, "fhir");
      const dirEntries = await this.fsFns.readdir(fhirDir);
      const bundleFiles = dirEntries.filter((file) => file.endsWith(".json"));
      const bundles = await Promise.all(
        bundleFiles.map(async (file) =>
          JSON.parse(await this.fsFns.readFile(join(fhirDir, file), "utf-8")),
        ),
      );

      // Flatten bundles into per-resource-type lists; a Map keeps the order
      // in which types were first seen.
      const byType = new Map();
      for (const bundle of bundles) {
        for (const entry of bundle.entry || []) {
          const resource = entry?.resource;
          // Entries without a typed resource cannot be grouped; skip them.
          if (!resource?.resourceType) continue;
          const type = resource.resourceType;
          if (!byType.has(type)) byType.set(type, []);
          byType.get(type).push(resource);
        }
      }

      // Return one dataset per resource type
      return Array.from(byType, ([type, records]) => ({
        name: `${config.name}_${type.toLowerCase()}`,
        schema: null,
        records,
        metadata: { tool: "synthea", resourceType: type },
      }));
    } finally {
      // Clean up the export directory on success and on failure alike.
      await this.fsFns.rm(tmpDir, { recursive: true });
    }
  }
}
119
+
120
/**
 * Factory for SyntheaTool.
 * @param {object} deps - Forwarded unchanged to the SyntheaTool constructor
 * @returns {SyntheaTool} A newly constructed tool
 */
export function createSyntheaTool(deps) {
  const tool = new SyntheaTool(deps);
  return tool;
}