@forwardimpact/libsyntheticgen 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,282 @@
1
+ /**
2
+ * DSL Tokenizer — converts universe DSL source to token stream.
3
+ *
4
+ * Token types:
5
+ * KEYWORD - reserved words (universe, department, team, etc.)
6
+ * IDENT - identifiers (variable / entity names)
7
+ * STRING - double-quoted string literals
8
+ * NUMBER - integer or decimal numbers, with an optional leading minus
9
+ * PERCENT - number followed by %
10
+ * DATE - YYYY-MM format
11
+ * AT_IDENT - @name references
12
+ * LBRACE - {
13
+ * RBRACE - }
14
+ * LBRACKET - [
15
+ * RBRACKET - ]
16
+ * COMMA - ,
17
+ * EOF - end of input
18
+ */
19
+
20
// Reserved words listed ahead of the dataset/output group, in declaration order.
const BASE_KEYWORDS = [
  "universe", "domain", "industry", "seed", "org", "department", "team",
  "name", "location", "parent", "headcount", "size", "manager", "repos",
  "people", "count", "names", "distribution", "disciplines", "project",
  "type", "phase", "teams", "timeline_start", "timeline_end", "prose_topic",
  "prose_tone", "snapshots", "quarterly_from", "quarterly_to", "account_id",
  "scenario", "timerange_start", "timerange_end", "affect", "github_commits",
  "github_prs", "dx_drivers", "trajectory", "magnitude", "evidence_skills",
  "evidence_floor", "framework", "proficiencies", "maturities", "capabilities",
  "levels", "behaviours", "drivers", "tracks", "stages", "skills", "title",
  "rank", "experience", "roleTitle", "specialization", "isProfessional",
  "core", "supporting", "broad", "validTracks", "content", "articles",
  "article_topics", "blogs", "faqs", "howtos", "howto_topics", "reviews",
  "comments", "courses", "events", "personas", "persona_levels",
  "briefings_per_persona", "notes_per_persona", "comments_per_snapshot",
];

// Dataset and output blocks
const DATASET_OUTPUT_KEYWORDS = [
  "dataset", "tool", "population", "modules", "metadata", "data", "rows",
  "fields", "output", "table", "path", "json", "yaml", "csv", "markdown",
  "parquet", "sql",
];

// Every reserved word of the DSL; matching is exact and case-sensitive.
const KEYWORDS = new Set([...BASE_KEYWORDS, ...DATASET_OUTPUT_KEYWORDS]);

// Matches a complete YYYY-MM date: four digits, a dash, two digits.
const DATE_RE = /^\d{4}-\d{2}$/;
120
+
121
+ /**
122
+ * @typedef {{ type: string, value: string, line: number }} Token
123
+ */
124
+
125
// Single-character punctuation mapped to its token type.
const PUNCTUATION_TYPES = new Map([
  ["{", "LBRACE"],
  ["}", "RBRACE"],
  ["[", "LBRACKET"],
  ["]", "RBRACKET"],
  [",", "COMMA"],
]);

// A bare four-digit run, i.e. a possible year at the start of a YYYY-MM date.
const YEAR_RE = /^\d{4}$/;

// True for ASCII digits 0-9 (false for undefined, i.e. past end of input).
function isDigit(ch) {
  return ch >= "0" && ch <= "9";
}

// True for characters that may start an identifier: ASCII letters and "_".
function isIdentStart(ch) {
  return (ch >= "a" && ch <= "z") || (ch >= "A" && ch <= "Z") || ch === "_";
}

// True for characters that may continue an identifier: letters, digits, "_".
function isIdentPart(ch) {
  return isIdentStart(ch) || isDigit(ch);
}

/**
 * Tokenize DSL source into a token stream.
 *
 * Spaces, tabs, carriage returns, newlines, `//` line comments and block
 * comments are skipped. Every token records the 1-based line on which it
 * starts; a string literal that spans several lines reports its first line
 * and advances the line counter for the tokens that follow it.
 *
 * Inside string literals `\n` and `\t` become newline and tab; any other
 * escaped character is taken literally (so `\"` yields a double quote).
 *
 * @param {string} source - DSL source text
 * @returns {Token[]} tokens in source order, always ending with one EOF token
 * @throws {Error} on an unexpected character, an unterminated string literal
 *   or block comment, a "-" that is not followed by a number, or an "@" that
 *   is not followed by a name
 */
export function tokenize(source) {
  const tokens = [];
  const length = source.length;
  let i = 0;
  let line = 1;

  while (i < length) {
    const ch = source[i];

    // Horizontal whitespace
    if (ch === " " || ch === "\t" || ch === "\r") {
      i++;
      continue;
    }

    // Newline
    if (ch === "\n") {
      line++;
      i++;
      continue;
    }

    // Single-line comment: stop at the newline so the branch above counts it
    if (ch === "/" && source[i + 1] === "/") {
      while (i < length && source[i] !== "\n") i++;
      continue;
    }

    // Multi-line comment
    if (ch === "/" && source[i + 1] === "*") {
      // Search from i + 2 so that "/*/" is not treated as self-closing.
      const end = source.indexOf("*/", i + 2);
      if (end === -1) {
        throw new Error(`Unterminated block comment starting at line ${line}`);
      }
      for (let j = i + 2; j < end; j++) {
        if (source[j] === "\n") line++;
      }
      i = end + 2;
      continue;
    }

    // String literal
    if (ch === '"') {
      const startLine = line;
      let str = "";
      let closed = false;
      i++;
      while (i < length) {
        const c = source[i];
        if (c === '"') {
          closed = true;
          break;
        }
        if (c === "\\" && i + 1 < length) {
          i++;
          const escaped = source[i];
          if (escaped === "n") {
            str += "\n";
          } else if (escaped === "t") {
            str += "\t";
          } else {
            // A backslash before a real newline keeps the newline.
            if (escaped === "\n") line++;
            str += escaped;
          }
        } else {
          // Keep the line counter accurate for tokens after this string.
          if (c === "\n") line++;
          str += c;
        }
        i++;
      }
      if (!closed) {
        throw new Error(
          `Unterminated string literal starting at line ${startLine}`,
        );
      }
      i++; // closing quote
      tokens.push({ type: "STRING", value: str, line: startLine });
      continue;
    }

    // Braces, brackets and comma
    const punctuationType = PUNCTUATION_TYPES.get(ch);
    if (punctuationType !== undefined) {
      tokens.push({ type: punctuationType, value: ch, line });
      i++;
      continue;
    }

    // @identifier
    if (ch === "@") {
      const nameStart = i + 1;
      let j = nameStart;
      while (j < length && isIdentPart(source[j])) j++;
      if (j === nameStart) {
        throw new Error(`Expected identifier after '@' at line ${line}`);
      }
      tokens.push({ type: "AT_IDENT", value: source.slice(nameStart, j), line });
      i = j;
      continue;
    }

    // Number, percent, date, or negative number
    if (isDigit(ch) || ch === "-") {
      const start = i;
      let sawDigit = false;
      if (ch === "-") i++;
      while (i < length && (isDigit(source[i]) || source[i] === ".")) {
        if (source[i] !== ".") sawDigit = true;
        i++;
      }
      if (!sawDigit) {
        // A "-" (optionally followed only by dots) is not a number.
        throw new Error(`Unexpected character '${ch}' at line ${line}`);
      }
      const num = source.slice(start, i);

      // Check for date (YYYY-MM). Look ahead without consuming: if the text
      // is not a date, the "-" starts a separate token, exactly as it does
      // after a number of any other width.
      if (source[i] === "-" && YEAR_RE.test(num)) {
        let j = i + 1;
        while (j < length && isDigit(source[j])) j++;
        const candidate = source.slice(start, j);
        if (DATE_RE.test(candidate)) {
          tokens.push({ type: "DATE", value: candidate, line });
          i = j;
          continue;
        }
      }

      // Check for percent; the "%" itself is not part of the token value
      if (source[i] === "%") {
        tokens.push({ type: "PERCENT", value: num, line });
        i++;
        continue;
      }

      tokens.push({ type: "NUMBER", value: num, line });
      continue;
    }

    // Identifier or keyword
    if (isIdentStart(ch)) {
      let j = i + 1;
      while (j < length && isIdentPart(source[j])) j++;
      const word = source.slice(i, j);
      tokens.push({
        type: KEYWORDS.has(word) ? "KEYWORD" : "IDENT",
        value: word,
        line,
      });
      i = j;
      continue;
    }

    throw new Error(`Unexpected character '${ch}' at line ${line}`);
  }

  tokens.push({ type: "EOF", value: "", line });
  return tokens;
}