@forwardimpact/libsyntheticgen 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/dsl/index.js +36 -0
- package/dsl/parser.js +728 -0
- package/dsl/tokenizer.js +282 -0
- package/engine/activity.js +956 -0
- package/engine/entities.js +144 -0
- package/engine/names.js +290 -0
- package/engine/prose-keys.js +182 -0
- package/engine/rng.js +43 -0
- package/engine/tier0.js +63 -0
- package/index.js +7 -0
- package/package.json +35 -0
- package/test/activity.test.js +322 -0
- package/test/faker.test.js +98 -0
- package/test/parser-dataset.test.js +142 -0
- package/test/parser.test.js +596 -0
- package/test/rng.test.js +236 -0
- package/test/sdv.test.js +67 -0
- package/test/synthea.test.js +95 -0
- package/test/tokenizer.test.js +266 -0
- package/tools/faker.js +83 -0
- package/tools/sdv.js +93 -0
- package/tools/sdv_generate.py +29 -0
- package/tools/synthea.js +126 -0
package/tools/faker.js
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Faker tool — generates datasets using @faker-js/faker in-process.
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import { faker } from "@faker-js/faker";
|
|
6
|
+
|
|
7
|
+
/**
 * Faker-backed dataset generator. Runs entirely in-process via
 * @faker-js/faker, so no external runtime needs to be probed.
 */
export class FakerTool {
  /**
   * @param {object} deps
   * @param {object} deps.logger - Logger exposing `info(tag, message)`.
   */
  constructor({ logger }) {
    if (!logger) throw new Error("FakerTool requires logger");
    this.logger = logger;
  }

  /**
   * Faker is always available (JS dependency).
   * @returns {Promise<boolean>} Always resolves to true.
   */
  async checkAvailability() {
    return true;
  }

  /**
   * Generate a dataset from field definitions.
   * @param {object} config
   * @param {string} config.name - Dataset name from DSL
   * @param {number} config.rows - Number of records to generate
   * @param {Object<string, string>} config.fields - Field name → Faker provider path
   * @param {number} config.seed - RNG seed
   * @returns {Promise<Dataset[]>} Single-element array holding the dataset.
   */
  async generate(config) {
    faker.seed(config.seed);
    this.logger.info(
      "faker",
      `Generating ${config.rows} rows for ${config.name}`,
    );
    // Hoist the field list; provider calls still happen row-major, in
    // field order, so the seeded faker stream is consumed identically.
    const fieldEntries = Object.entries(config.fields);
    const records = [];
    let produced = 0;
    while (produced < config.rows) {
      records.push(
        Object.fromEntries(
          fieldEntries.map(([field, provider]) => [
            field,
            this.callProvider(provider),
          ]),
        ),
      );
      produced += 1;
    }
    return [
      {
        name: config.name,
        schema: null,
        records,
        metadata: { tool: "faker", fields: config.fields },
      },
    ];
  }

  /**
   * Resolve a dotted provider path like "person.fullName" to a Faker call.
   * @param {string} provider - Dotted path into the faker namespace.
   * @returns {*} Whatever the resolved provider function produces.
   * @throws {Error} When the path is unknown or not callable.
   */
  callProvider(provider) {
    let fn = faker;
    for (const segment of provider.split(".")) {
      fn = fn[segment];
      if (!fn) throw new Error(`Unknown Faker provider: ${provider}`);
    }
    if (typeof fn !== "function") {
      throw new Error(`Faker provider "${provider}" is not a function`);
    }
    return fn();
  }
}
|
|
76
|
+
|
|
77
|
+
/**
 * Convenience factory for {@link FakerTool}.
 * @param {object} logger - Logger exposing `info(tag, message)`.
 * @returns {FakerTool} A ready-to-use tool instance.
 */
export function createFakerTool(logger) {
  const deps = { logger };
  return new FakerTool(deps);
}
|
package/tools/sdv.js
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* SDV tool — generates statistically representative tabular data via Python subprocess.
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import { join } from "path";
|
|
6
|
+
import { tmpdir } from "os";
|
|
7
|
+
import { randomUUID } from "crypto";
|
|
8
|
+
|
|
9
|
+
export class SdvTool {
|
|
10
|
+
/**
|
|
11
|
+
* @param {object} deps
|
|
12
|
+
* @param {object} deps.logger
|
|
13
|
+
* @param {Function} deps.execFileFn - async (cmd, args) => { stdout }
|
|
14
|
+
* @param {object} deps.fsFns - { writeFile, rm }
|
|
15
|
+
*/
|
|
16
|
+
constructor({ logger, execFileFn, fsFns }) {
|
|
17
|
+
if (!logger) throw new Error("SdvTool requires logger");
|
|
18
|
+
if (!execFileFn) throw new Error("SdvTool requires execFileFn");
|
|
19
|
+
if (!fsFns) throw new Error("SdvTool requires fsFns");
|
|
20
|
+
this.logger = logger;
|
|
21
|
+
this.execFileFn = execFileFn;
|
|
22
|
+
this.fsFns = fsFns;
|
|
23
|
+
this.scriptPath = join(import.meta.dirname, "sdv_generate.py");
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
/**
|
|
27
|
+
* Check that Python 3 with SDV is available.
|
|
28
|
+
* @returns {Promise<boolean>}
|
|
29
|
+
*/
|
|
30
|
+
async checkAvailability() {
|
|
31
|
+
try {
|
|
32
|
+
await this.execFileFn("python3", ["-c", "import sdv"]);
|
|
33
|
+
return true;
|
|
34
|
+
} catch {
|
|
35
|
+
throw new Error(
|
|
36
|
+
"SDV requires Python 3 with the sdv package. " +
|
|
37
|
+
"Install with: pip install sdv",
|
|
38
|
+
);
|
|
39
|
+
}
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
/**
|
|
43
|
+
* Generate tabular data preserving statistical properties from sample data.
|
|
44
|
+
* @param {object} config
|
|
45
|
+
* @param {string} config.name - Dataset name from DSL
|
|
46
|
+
* @param {string} config.metadata - Path to SDV metadata JSON
|
|
47
|
+
* @param {Object<string, string>} config.data - Map of table name → CSV path
|
|
48
|
+
* @param {number} [config.rows=1000] - Number of rows to generate
|
|
49
|
+
* @param {number} config.seed - RNG seed
|
|
50
|
+
* @returns {Promise<Dataset[]>}
|
|
51
|
+
*/
|
|
52
|
+
async generate(config) {
|
|
53
|
+
const tmpConfig = join(tmpdir(), `sdv-config-${randomUUID()}.json`);
|
|
54
|
+
await this.fsFns.writeFile(
|
|
55
|
+
tmpConfig,
|
|
56
|
+
JSON.stringify({
|
|
57
|
+
metadata: config.metadata,
|
|
58
|
+
data: config.data,
|
|
59
|
+
rows: config.rows || 1000,
|
|
60
|
+
seed: config.seed,
|
|
61
|
+
}),
|
|
62
|
+
);
|
|
63
|
+
|
|
64
|
+
this.logger.info("sdv", `Running SDV: rows=${config.rows || 1000}`);
|
|
65
|
+
const { stdout } = await this.execFileFn("python3", [
|
|
66
|
+
this.scriptPath,
|
|
67
|
+
tmpConfig,
|
|
68
|
+
]);
|
|
69
|
+
await this.fsFns.rm(tmpConfig);
|
|
70
|
+
|
|
71
|
+
// Parse newline-delimited JSON
|
|
72
|
+
return stdout
|
|
73
|
+
.trim()
|
|
74
|
+
.split("\n")
|
|
75
|
+
.map((line) => {
|
|
76
|
+
const obj = JSON.parse(line);
|
|
77
|
+
return {
|
|
78
|
+
name: `${config.name}_${obj.name}`,
|
|
79
|
+
schema: null,
|
|
80
|
+
records: obj.records,
|
|
81
|
+
metadata: { tool: "sdv", table: obj.name },
|
|
82
|
+
};
|
|
83
|
+
});
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
/**
 * Convenience factory for {@link SdvTool}.
 * @param {object} deps - Dependencies forwarded to the constructor
 *   ({ logger, execFileFn, fsFns }).
 * @returns {SdvTool} A ready-to-use tool instance.
 */
export function createSdvTool(deps) {
  const tool = new SdvTool(deps);
  return tool;
}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Bridge between fit-universe and SDV."""
|
|
3
|
+
import json
|
|
4
|
+
import sys
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from sdv.metadata import Metadata
|
|
7
|
+
from sdv.single_table import GaussianCopulaSynthesizer
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def main():
    """Fit one SDV synthesizer per table and print one JSON object per
    table to stdout (newline-delimited).

    Reads a JSON config from ``sys.argv[1]`` with keys:
    ``metadata`` (path to SDV metadata JSON), ``data`` (table name ->
    CSV path), ``rows`` (int), and optional ``seed`` (default 0).
    """
    # Context manager so the config file handle is closed deterministically
    # (the previous json.load(open(...)) leaked the handle).
    with open(sys.argv[1]) as config_file:
        config = json.load(config_file)
    metadata = Metadata.load_from_json(config["metadata"])
    seed = config.get("seed", 0)

    for table_name in metadata.get_tables():
        data = pd.read_csv(config["data"][table_name])
        synth = GaussianCopulaSynthesizer(metadata, table_name=table_name)
        synth.fit(data)
        # NOTE(review): recent SDV releases' sample() signature is
        # (num_rows, max_tries_per_batch, batch_size, ...) and may not
        # accept a `seed` kwarg — confirm against the pinned sdv version.
        samples = synth.sample(num_rows=config["rows"], seed=seed)

        output = {
            "name": table_name,
            # DataFrame -> list[dict] with JSON-safe values (NaN -> null).
            "records": json.loads(samples.to_json(orient="records")),
        }
        print(json.dumps(output))


if __name__ == "__main__":
    main()
|
package/tools/synthea.js
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Synthea tool — generates FHIR R4 patient data via Java subprocess.
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import { join } from "path";
|
|
6
|
+
|
|
7
|
+
/**
 * Synthea tool — generates FHIR R4 patient data via Java subprocess.
 */
export class SyntheaTool {
  /**
   * @param {object} deps
   * @param {object} deps.logger - Logger exposing `info(tag, message)`.
   * @param {string} deps.syntheaJar - Absolute path to synthea-with-dependencies.jar
   * @param {Function} deps.execFileFn - async (cmd, args) => { stdout }
   * @param {object} deps.fsFns - { readFile, readdir, mkdtemp, rm }
   */
  constructor({ logger, syntheaJar, execFileFn, fsFns }) {
    if (!logger) throw new Error("SyntheaTool requires logger");
    if (!syntheaJar) throw new Error("SyntheaTool requires syntheaJar");
    if (!execFileFn) throw new Error("SyntheaTool requires execFileFn");
    if (!fsFns) throw new Error("SyntheaTool requires fsFns");
    this.logger = logger;
    this.syntheaJar = syntheaJar;
    this.execFileFn = execFileFn;
    this.fsFns = fsFns;
  }

  /**
   * Check that Java and the Synthea jar are available.
   * @returns {Promise<boolean>} true when both checks pass.
   * @throws {Error} With install instructions (original failure in `cause`).
   */
  async checkAvailability() {
    try {
      await this.execFileFn("java", ["-version"]);
      // readFile doubles as an existence/readability probe for the jar.
      await this.fsFns.readFile(this.syntheaJar);
      return true;
    } catch (err) {
      // Keep the underlying failure for debugging instead of discarding it.
      throw new Error(
        `Synthea requires Java and ${this.syntheaJar}. ` +
          "Install Java (java.com) and download Synthea " +
          "(github.com/synthetichealth/synthea/releases). " +
          "Set SYNTHEA_JAR to the jar path.",
        { cause: err },
      );
    }
  }

  /**
   * Generate FHIR patient data, returning one dataset per resource type.
   * @param {object} config
   * @param {string} config.name - Dataset name from DSL
   * @param {number} [config.population=100] - Number of patients
   * @param {string[]} [config.modules] - Synthea modules to enable
   * @param {number} config.seed - RNG seed
   * @returns {Promise<Dataset[]>} One dataset per FHIR resource type.
   */
  async generate(config) {
    // `??`, not `||`, so an explicit `population: 0` is respected.
    const population = config.population ?? 100;
    // NOTE(review): mkdtemp is called with a bare prefix, so the temp dir
    // is created relative to the process cwd — confirm that is intended
    // (fs.mkdtemp conventionally takes a path-qualified prefix).
    const tmpDir = await this.fsFns.mkdtemp("synthea-");
    try {
      const args = [
        "-jar",
        this.syntheaJar,
        "-p",
        String(population),
        "-s",
        String(config.seed),
        "--exporter.fhir.export",
        "true",
        "--exporter.baseDirectory",
        tmpDir,
      ];
      if (config.modules) {
        for (const mod of config.modules) {
          args.push("-m", mod);
        }
      }

      this.logger.info(
        "synthea",
        `Running Synthea: population=${population}`,
      );
      await this.execFileFn("java", args);

      // Read FHIR bundles from output
      const fhirDir = join(tmpDir, "fhir");
      const bundleFiles = (await this.fsFns.readdir(fhirDir)).filter((f) =>
        f.endsWith(".json"),
      );
      const bundles = await Promise.all(
        bundleFiles.map(async (f) =>
          JSON.parse(await this.fsFns.readFile(join(fhirDir, f), "utf-8")),
        ),
      );

      // Flatten bundles, grouping resources by FHIR resourceType.
      const byType = new Map();
      for (const bundle of bundles) {
        for (const entry of bundle.entry || []) {
          const resource = entry.resource;
          if (!resource) continue; // tolerate malformed bundle entries
          const type = resource.resourceType;
          if (!byType.has(type)) byType.set(type, []);
          byType.get(type).push(resource);
        }
      }

      // Return one dataset per resource type
      const datasets = [];
      for (const [type, records] of byType) {
        datasets.push({
          name: `${config.name}_${type.toLowerCase()}`,
          schema: null,
          records,
          metadata: { tool: "synthea", resourceType: type },
        });
      }
      return datasets;
    } finally {
      // Clean up the temp dir even when Synthea or bundle parsing fails
      // (previously a failure leaked the directory).
      await this.fsFns.rm(tmpDir, { recursive: true });
    }
  }
}
|
|
119
|
+
|
|
120
|
+
/**
 * Convenience factory for {@link SyntheaTool}.
 * @param {object} deps - Dependencies forwarded to the constructor
 *   ({ logger, syntheaJar, execFileFn, fsFns }).
 * @returns {SyntheaTool} A ready-to-use tool instance.
 */
export function createSyntheaTool(deps) {
  const tool = new SyntheaTool(deps);
  return tool;
}
|