@ucdjs/codegen 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026-PRESENT Lucas Nørgård
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,21 @@
1
+ # @ucdjs/codegen
2
+
3
+ [![npm version][npm-version-src]][npm-version-href]
4
+ [![npm downloads][npm-downloads-src]][npm-downloads-href]
5
+
6
+ This package provides utilities for generating code from UCD data and models.
7
+
8
+ ## Installation
9
+
10
+ ```bash
11
+ npm install @ucdjs/codegen
12
+ ```
13
+
14
+ ## 📄 License
15
+
16
+ Published under [MIT License](./LICENSE).
17
+
18
+ [npm-version-src]: https://img.shields.io/npm/v/@ucdjs/codegen?style=flat&colorA=18181B&colorB=4169E1
19
+ [npm-version-href]: https://npmjs.com/package/@ucdjs/codegen
20
+ [npm-downloads-src]: https://img.shields.io/npm/dm/@ucdjs/codegen?style=flat&colorA=18181B&colorB=4169E1
21
+ [npm-downloads-href]: https://npmjs.com/package/@ucdjs/codegen
@@ -0,0 +1,48 @@
1
+ import { LanguageModel } from "ai";
2
+
3
+ //#region src/index.d.ts
4
/**
 * Describes one input data file that code should be generated for.
 */
interface CodegenFile {
  /** Path of the data file on disk. */
  filePath: string;

  /** Version label associated with the data file. */
  version: string;
}
14
/**
 * Result produced for a single successfully processed data file.
 */
interface ProcessedFile {
  /** Field definitions extracted for the file. */
  fields: Array<{
    /** TypeScript type expression for the field. */
    type: string;
    /** Name of the field. */
    name: string;
    /** Human-readable description of the field. */
    description: string;
  }>;

  /** Generated source code for the file. */
  code: string;

  /** Base name of the input file, with a trailing `.txt` removed. */
  fileName: string;

  /** Version that was supplied for the input file. */
  version: string;
}
24
/**
 * Options accepted by `runCodegen`.
 *
 * At least one of `openaiKey` or `model` has to be supplied; `runCodegen`
 * fails with an error when both are missing.
 */
interface CodegenOptions {
  /** The data files to generate structures for. */
  files: CodegenFile[];

  /**
   * OpenAI API key used to generate the fields when no `model` is given.
   */
  openaiKey?: string;

  /**
   * Language model used to generate the fields.
   *
   * NOTE:
   * Supplying a model is handy for tests, where a mock model can stand in
   * for the real one.
   *
   * When omitted, an OpenAI instance is created from `openaiKey` and its
   * default model is used.
   *
   * SEE: https://ai-sdk.dev/docs/ai-sdk-core/testing
   */
  model?: LanguageModel;
}
46
/**
 * Generates code for every entry in `options.files`.
 *
 * Files that cannot be processed are left out of the result. The returned
 * promise rejects when neither `options.openaiKey` nor `options.model` is set.
 *
 * @param options - Input files plus the API key and/or model to use.
 * @returns The results for the files that were processed successfully.
 */
+ declare function runCodegen(options: CodegenOptions): Promise<ProcessedFile[]>;
47
+ //#endregion
48
+ export { CodegenFile, CodegenOptions, ProcessedFile, runCodegen };
package/dist/index.mjs ADDED
@@ -0,0 +1,243 @@
1
+ import { readFile } from "node:fs/promises";
2
+ import path from "node:path";
3
+ import { dedent, sanitizeIdentifier, toPascalCase, toSnakeCase } from "@luxass/utils";
4
+ import { createConcurrencyLimiter } from "@ucdjs-internal/shared";
5
+ import { RawDataFile } from "@unicode-utils/core";
6
+ import { createOpenAI } from "@ai-sdk/openai";
7
+ import { generateObject } from "ai";
8
+ import { z } from "zod";
9
+ //#region src/fields.ts
10
+ const SYSTEM_PROMPT = dedent`
11
+ <system_prompt>
12
+ <role>Expert TypeScript field extractor for Unicode data files</role>
13
+
14
+ <task>
15
+ <input>
16
+ Text description: {{INPUT}}
17
+ </input>
18
+ <output>JSON array of field objects with name, type, and description</output>
19
+ </task>
20
+
21
+ <critical_rule>
22
+ YOU MUST ALWAYS OUTPUT A VALID JSON ARRAY OF OBJECTS WITH THIS STRUCTURE:
23
+ [
24
+ {
25
+ "name": "actual_field_name",
26
+ "type": "valid_typescript_type",
27
+ "description": "Description of the field"
28
+ }
29
+ ]
30
+
31
+ NEVER use generic names like "field_0", "field_1", etc. - extract the ACTUAL field names.
32
+ NEVER use invalid TypeScript types like "union" - use proper union syntax with pipe symbol.
33
+ </critical_rule>
34
+
35
+ <field_detection>
36
+ For Unicode data files, fields are typically described in patterns like:
37
+ - Lines starting with "# Field 0: Name" - where "Name" is the actual field name
38
+ - Table headers or column definitions
39
+ - Property descriptions in documentation
40
+
41
+ DO NOT use "field_0", "field_1" as field names - extract the REAL field names.
42
+ DO NOT use "type_name" as a field name unless it's explicitly mentioned as a field.
43
+
44
+ Example: "# Field 0: Code_Point" should produce a field named "code_point" (NOT "field_0").
45
+ </field_detection>
46
+
47
+ <type_mapping>
48
+ ONLY USE THESE VALID TYPESCRIPT TYPES:
49
+ - string - For text, identifiers, character codes, etc.
50
+ - number - For numeric values, indices, counts
51
+ - boolean - For true/false flags
52
+ - string[] - For arrays of strings
53
+ - number[] - For arrays of numbers
54
+ - Array<string> - Alternative syntax for string arrays
55
+ - Array<number> - Alternative syntax for number arrays
56
+ - Record<string, string> - For string to string mappings
57
+ - Record<string, number> - For string to number mappings
58
+ - Record<string, unknown> - For objects with unknown structure
59
+ - unknown - When type cannot be determined
60
+ - any - Only as a last resort when type is truly variable
61
+
62
+ For union types with string literals, ALWAYS use double quotes and pipe symbol:
63
+ - "\"value1\" | \"value2\" | \"value3\""
64
+
65
+ CRITICAL: Special values handling:
66
+ 1. If a value contains angle brackets, like <none>, FIRST remove the angle brackets: "none" (NOT "<none>")
67
+ 2. THEN ALWAYS wrap the value in quotes if it's a string literal: "\"none\""
68
+ 3. For union types of string literals, EACH value MUST be in quotes:
69
+ - Correct: "\"R\" | \"L\" | \"D\" | \"C\" | \"U\" | \"T\""
70
+ - Incorrect: "R | L | D | C | U | T" (missing quotes around each value)
71
+ - Incorrect: "none" (missing quotes if it's a string literal)
72
+
73
+ NEVER use these invalid types:
74
+ - "union" - This is not a valid TypeScript type
75
+ - "object" - Too generic, use Record<> instead
76
+ - "array" - Too generic, use proper array syntax instead
77
+ - "map" - Use Record<> instead
78
+ - "none" - Use "\"none\"" or never instead
79
+ - "list" - Use proper array syntax instead
80
+
81
+ ALWAYS wrap the final converted type in double quotes in the JSON output.
82
+ ALWAYS wrap each value in a union type of string literals in quotes.
83
+ </type_mapping>
84
+
85
+ <examples>
86
+ Input example:
87
+ \`\`\`
88
+ # ArabicShaping.txt
89
+ # Field 0: Code point
90
+ # Field 1: Name
91
+ # Field 2: Joining_Type (R = Right_Joining, L = Left_Joining, D = Dual_Joining, C = Join_Causing, U = Non_Joining, T = Transparent)
92
+ # Field 3: Joining_Group
93
+ \`\`\`
94
+
95
+ Correct output:
96
+ [
97
+ {
98
+ "name": "code_point",
99
+ "type": "string",
100
+ "description": "The code point of a character, in hexadecimal form"
101
+ },
102
+ {
103
+ "name": "name",
104
+ "type": "string",
105
+ "description": "A short schematic name for the character"
106
+ },
107
+ {
108
+ "name": "joining_type",
109
+ "type": "\"R\" | \"L\" | \"D\" | \"C\" | \"U\" | \"T\"",
110
+ "description": "Defines the joining type (R = Right_Joining, L = Left_Joining, D = Dual_Joining, C = Join_Causing, U = Non_Joining, T = Transparent)"
111
+ },
112
+ {
113
+ "name": "joining_group",
114
+ "type": "string",
115
+ "description": "Defines the joining group, based schematically on character names"
116
+ }
117
+ ]
118
+
119
+ Example with special value:
120
+ Input: "# Field 4: Value (<none> or specific value)"
121
+
122
+ Correct output for this field:
123
+ {
124
+ "name": "value",
125
+ "type": "\"none\" | string",
126
+ "description": "The value, which can be none or a specific value"
127
+ }
128
+ </examples>
129
+
130
+ <validation>
131
+ Before outputting, verify:
132
+ - Field names are the ACTUAL field names from the documentation, NOT generic "field_0" style names
133
+ - All field names are in snake_case
134
+ - All types are valid TypeScript types (string, number, boolean, arrays, or proper union types)
135
+ - NEVER use the word "union" as a type - use proper TypeScript syntax with the pipe symbol
136
+ - String literal values in union types are ALWAYS wrapped in quotes (e.g., "\"value1\" | \"value2\"")
137
+ - Special values like "none" (from <none>) are properly quoted as string literals: "\"none\""
138
+ - Each field has a clear, specific description
139
+ - Output is a valid JSON array of objects
140
+ </validation>
141
+
142
+ <error_handling>
143
+ If NO fields can be detected in the input:
144
+ - Return an empty JSON array: []
145
+ - DO NOT return error messages or explanations in the JSON output
146
+ - DO NOT attempt to create fields when none are clearly defined
147
+
148
+ Example when no fields are detected:
149
+ Input: "This is some text without any field definitions"
150
+
151
+ Correct output:
152
+ []
153
+
154
+ If SOME fields are unclear but others are detectable:
155
+ - Only include the fields that can be clearly identified
156
+ - Omit any fields that cannot be confidently extracted
157
+ - Follow all validation rules for the fields that are included
158
+ </error_handling>
159
+
160
+ <format>JSON array of field objects</format>
161
+ </system_prompt>
162
+ `;
163
/**
 * Asks a language model to extract field definitions from a data file heading.
 *
 * The heading is substituted for the `{{INPUT}}` placeholder of `SYSTEM_PROMPT`
 * and the model is asked for an object of the shape
 * `{ fields: { name, type, description }[] }`.
 *
 * @param options - `datafile` (only its `heading` is read), plus an optional
 *   `apiKey` and/or `model`. When `model` is given it is used as-is; otherwise
 *   an OpenAI client is created from `apiKey` and `"gpt-4o-mini"` is used.
 * @returns The extracted fields, or `null` when neither `apiKey` nor `model`
 *   is provided or when generation fails (the error is logged).
 */
async function generateFields(options) {
  const { datafile, apiKey, model } = options;
  if (!apiKey && !model) return null;

  try {
    const languageModel = model ?? createOpenAI({ apiKey })("gpt-4o-mini");

    // A missing heading becomes an empty input instead of the text "undefined".
    const heading = String(datafile.heading ?? "");

    // Use a replacer function: with a plain string replacement, sequences such
    // as "$&", "$1" or "$$" inside the heading would be interpreted as
    // substitution patterns and silently corrupt the prompt.
    const prompt = SYSTEM_PROMPT.replace("{{INPUT}}", () => heading);

    const result = await generateObject({
      model: languageModel,
      schema: z.object({
        fields: z.array(z.object({
          name: z.string(),
          type: z.string(),
          description: z.string()
        }))
      }),
      prompt
    });
    return result.object.fields;
  } catch (err) {
    console.error("error generating fields:", err);
    return null;
  }
}
182
+ //#endregion
183
+ //#region src/index.ts
184
// Matches a trailing ".txt" extension; used to strip it from an input file's base name.
+ const TXT_EXTENSION_RE = /\.txt$/;
185
/**
 * Renders a TypeScript interface declaration as source text.
 *
 * @param name - Identifier of the interface.
 * @param fields - Object mapping property names to TypeScript type expressions.
 * @param opts - Optional settings; a truthy `export` prefixes the declaration
 *   with `export `.
 * @returns The interface source, one property per line.
 */
function buildInterface(name, fields, opts) {
  const plainIdentifier = /^[A-Za-z_$][\w$]*$/;
  const prefix = opts?.export ? "export " : "";

  const members = Object.entries(fields).map(([key, type]) => {
    // Keys that are not plain identifiers (e.g. "joining-type") must be quoted,
    // otherwise the emitted interface is not valid TypeScript.
    const propertyName = plainIdentifier.test(key) ? key : JSON.stringify(key);
    return ` ${propertyName}: ${type};`;
  });

  return `${prefix}interface ${name} {\n${members.join("\n")}\n}`;
}
188
/**
 * Renders a list of values as the source text of a string-array literal.
 *
 * @param values - Values to render; each one is converted to a string.
 * @returns Source text such as `["a", "b"]`, or `[]` for an empty list.
 */
function buildStringArray(values) {
  // JSON.stringify escapes quotes, backslashes and control characters, so a
  // value containing `"` can no longer break out of its string literal.
  const literals = values.map((value) => JSON.stringify(String(value)));
  return `[${literals.join(", ")}]`;
}
191
/**
 * Processes every file in `options.files`, with at most 10 files in flight at
 * once, and collects the results of the files that succeeded.
 *
 * @param options - `files` to process plus `openaiKey` and/or `model`.
 * @returns The non-null results of processing each file.
 * @throws {Error} When neither `openaiKey` nor `model` is provided.
 */
async function runCodegen(options) {
  const inputFiles = options.files;
  const limit = createConcurrencyLimiter(10);

  const hasCredentials = Boolean(options.openaiKey || options.model);
  if (!hasCredentials) {
    throw new Error("Either openaiKey or model must be provided");
  }

  const pending = inputFiles.map((file) => {
    const request = {
      filePath: file.filePath,
      openaiKey: options.openaiKey,
      version: file.version,
      model: options.model
    };
    return limit(() => processFile(request));
  });

  const settled = await Promise.all(pending);
  // Files that failed to process resolve to null and are dropped here.
  return settled.filter((entry) => entry !== null);
}
203
/**
 * Reads one data file, asks the model for its fields and renders the generated
 * TypeScript source for it.
 *
 * The generated code contains a header comment, an exported interface named
 * after the file, and an exported `<NAME>_FIELDS` array of the field names.
 * When a field name occurs more than once, only its first occurrence is used
 * in the generated code; the returned `fields` array is left as the model
 * produced it.
 *
 * @param request - `filePath` and `version` of the file, plus the `openaiKey`
 *   and/or `model` forwarded to field generation.
 * @returns `{ fields, fileName, version, code }`, or `null` when the fields
 *   could not be generated or any step throws (the error is logged).
 */
async function processFile(request) {
  const { filePath, openaiKey, version, model } = request;
  try {
    console.log(`Processing file: ${filePath}`);
    const content = await readFile(filePath, "utf-8");
    const fileName = path.basename(filePath).replace(TXT_EXTENSION_RE, "");
    const fields = await generateFields({
      datafile: new RawDataFile(content),
      apiKey: openaiKey,
      model
    });
    if (fields == null) {
      console.error(`Error generating fields for file: ${filePath}`);
      return null;
    }

    // A null-prototype object has no inherited keys, so field names such as
    // "constructor" or "toString" are not mistaken for duplicates.
    const properties = Object.create(null);
    // Insertion-ordered list of the names that made it into `properties`, so
    // the FIELDS constant matches the generated interface exactly.
    const uniqueNames = [];
    for (const field of fields) {
      if (field.name in properties) {
        console.error(`Duplicate field name ${field.name} in file ${filePath}. Skipping field.`);
        continue;
      }
      properties[field.name] = field.type;
      uniqueNames.push(field.name);
    }

    const interfaceName = sanitizeIdentifier(toPascalCase(fileName));
    const constantName = `${sanitizeIdentifier(toSnakeCase(fileName)).toUpperCase()}_FIELDS`;

    let code = ``;
    code += `// This file is generated by ucd codegen. Do not edit this file directly.\n`;
    code += `// Unicode Version: ${version}\n\n`;
    code += `${buildInterface(interfaceName, properties, { export: true })}\n\n`;
    code += `export const ${constantName} = ${buildStringArray(uniqueNames)};\n`;

    return {
      fields,
      fileName,
      version,
      code
    };
  } catch (err) {
    console.error(`Error processing file ${filePath}:`, err);
    return null;
  }
}
242
+ //#endregion
243
+ export { runCodegen };
package/package.json ADDED
@@ -0,0 +1,59 @@
1
+ {
2
+ "name": "@ucdjs/codegen",
3
+ "version": "0.0.0",
4
+ "type": "module",
5
+ "author": {
6
+ "name": "Lucas Nørgård",
7
+ "email": "lucasnrgaard@gmail.com",
8
+ "url": "https://luxass.dev"
9
+ },
10
+ "license": "MIT",
11
+ "homepage": "https://github.com/ucdjs/ucd",
12
+ "repository": {
13
+ "type": "git",
14
+ "url": "git+https://github.com/ucdjs/ucd.git",
15
+ "directory": "packages/codegen"
16
+ },
17
+ "bugs": {
18
+ "url": "https://github.com/ucdjs/ucd/issues"
19
+ },
20
+ "exports": {
21
+ ".": "./dist/index.mjs",
22
+ "./package.json": "./package.json"
23
+ },
24
+ "types": "./dist/index.d.mts",
25
+ "files": [
26
+ "dist"
27
+ ],
28
+ "engines": {
29
+ "node": ">=24.13"
30
+ },
31
+ "dependencies": {
32
+ "@ai-sdk/openai": "3.0.48",
33
+ "@luxass/utils": "2.7.3",
34
+ "@unicode-utils/core": "0.12.0-beta.27",
35
+ "ai": "6.0.138",
36
+ "zod": "4.3.6",
37
+ "@ucdjs-internal/shared": "0.1.1-beta.10"
38
+ },
39
+ "devDependencies": {
40
+ "@luxass/eslint-config": "7.4.2",
41
+ "eslint": "10.1.0",
42
+ "publint": "0.3.18",
43
+ "tsdown": "0.21.7",
44
+ "typescript": "6.0.2",
45
+ "vitest-testdirs": "4.4.3",
46
+ "@ucdjs-tooling/tsconfig": "1.0.0",
47
+ "@ucdjs-tooling/tsdown-config": "1.0.0"
48
+ },
49
+ "publishConfig": {
50
+ "access": "public"
51
+ },
52
+ "scripts": {
53
+ "build": "tsdown --tsconfig=./tsconfig.build.json",
54
+ "dev": "tsdown --watch",
55
+ "clean": "git clean -xdf dist node_modules",
56
+ "lint": "eslint .",
57
+ "typecheck": "tsc --noEmit -p tsconfig.build.json"
58
+ }
59
+ }