@ucdjs/codegen 0.0.0 → 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/fields.d.mts +2 -0
- package/dist/fields.mjs +2 -0
- package/dist/index.d.mts +2 -48
- package/dist/index.mjs +2 -243
- package/dist/run-Cgm8iCzh.d.mts +98 -0
- package/dist/run-bH3vZ-ci.mjs +142 -0
- package/package.json +3 -2
package/dist/fields.mjs
ADDED
package/dist/index.d.mts
CHANGED
|
@@ -1,48 +1,2 @@
|
|
|
1
|
-
import { LanguageModel } from "ai";
|
|
2
|
-
|
|
3
|
-
//#region src/index.d.ts
|
|
4
|
-
interface CodegenFile {
|
|
5
|
-
/**
|
|
6
|
-
* The filePath to the data file.
|
|
7
|
-
*/
|
|
8
|
-
filePath: string;
|
|
9
|
-
/**
|
|
10
|
-
* The version of the data file.
|
|
11
|
-
*/
|
|
12
|
-
version: string;
|
|
13
|
-
}
|
|
14
|
-
interface ProcessedFile {
|
|
15
|
-
fields: {
|
|
16
|
-
type: string;
|
|
17
|
-
name: string;
|
|
18
|
-
description: string;
|
|
19
|
-
}[];
|
|
20
|
-
code: string;
|
|
21
|
-
fileName: string;
|
|
22
|
-
version: string;
|
|
23
|
-
}
|
|
24
|
-
interface CodegenOptions {
|
|
25
|
-
/**
|
|
26
|
-
* Files to generate structures for.
|
|
27
|
-
*/
|
|
28
|
-
files: CodegenFile[];
|
|
29
|
-
/**
|
|
30
|
-
* The OpenAI API key to use for generating the fields.
|
|
31
|
-
*/
|
|
32
|
-
openaiKey?: string;
|
|
33
|
-
/**
|
|
34
|
-
* The OpenAI model to use for generating fields.
|
|
35
|
-
* NOTE:
|
|
36
|
-
* This is good for testing purposes, where you
|
|
37
|
-
* can provide a mock model to test the generation.
|
|
38
|
-
*
|
|
39
|
-
* If not provided, it will create a new OpenAI instance
|
|
40
|
-
* with the default model.
|
|
41
|
-
*
|
|
42
|
-
* SEE: https://ai-sdk.dev/docs/ai-sdk-core/testing
|
|
43
|
-
*/
|
|
44
|
-
model?: LanguageModel;
|
|
45
|
-
}
|
|
46
|
-
declare function runCodegen(options: CodegenOptions): Promise<ProcessedFile[]>;
|
|
47
|
-
//#endregion
|
|
48
|
-
export { CodegenFile, CodegenOptions, ProcessedFile, runCodegen };
|
|
1
|
+
import { a as generateFields, c as processFile, i as GenerateFieldsOptions, n as ProcessedFieldsFile, o as CodegenFile, r as runFieldsCodegen, s as ProcessDataFile, t as FieldsCodegenOptions } from "./run-Cgm8iCzh.mjs";
|
|
2
|
+
export { type CodegenFile, type FieldsCodegenOptions, type GenerateFieldsOptions, type ProcessDataFile, type ProcessedFieldsFile, generateFields, processFile, runFieldsCodegen };
|
package/dist/index.mjs
CHANGED
|
@@ -1,243 +1,2 @@
|
|
|
1
|
-
import { readFile } from "node:fs/promises";
|
|
2
|
-
|
|
3
|
-
import { dedent, sanitizeIdentifier, toPascalCase, toSnakeCase } from "@luxass/utils";
|
|
4
|
-
import { createConcurrencyLimiter } from "@ucdjs-internal/shared";
|
|
5
|
-
import { RawDataFile } from "@unicode-utils/core";
|
|
6
|
-
import { createOpenAI } from "@ai-sdk/openai";
|
|
7
|
-
import { generateObject } from "ai";
|
|
8
|
-
import { z } from "zod";
|
|
9
|
-
//#region src/fields.ts
|
|
10
|
-
const SYSTEM_PROMPT = dedent`
|
|
11
|
-
<system_prompt>
|
|
12
|
-
<role>Expert TypeScript field extractor for Unicode data files</role>
|
|
13
|
-
|
|
14
|
-
<task>
|
|
15
|
-
<input>
|
|
16
|
-
Text description: {{INPUT}}
|
|
17
|
-
</input>
|
|
18
|
-
<output>JSON array of field objects with name, type, and description</output>
|
|
19
|
-
</task>
|
|
20
|
-
|
|
21
|
-
<critical_rule>
|
|
22
|
-
YOU MUST ALWAYS OUTPUT A VALID JSON ARRAY OF OBJECTS WITH THIS STRUCTURE:
|
|
23
|
-
[
|
|
24
|
-
{
|
|
25
|
-
"name": "actual_field_name",
|
|
26
|
-
"type": "valid_typescript_type",
|
|
27
|
-
"description": "Description of the field"
|
|
28
|
-
}
|
|
29
|
-
]
|
|
30
|
-
|
|
31
|
-
NEVER use generic names like "field_0", "field_1", etc. - extract the ACTUAL field names.
|
|
32
|
-
NEVER use invalid TypeScript types like "union" - use proper union syntax with pipe symbol.
|
|
33
|
-
</critical_rule>
|
|
34
|
-
|
|
35
|
-
<field_detection>
|
|
36
|
-
For Unicode data files, fields are typically described in patterns like:
|
|
37
|
-
- Lines starting with "# Field 0: Name" - where "Name" is the actual field name
|
|
38
|
-
- Table headers or column definitions
|
|
39
|
-
- Property descriptions in documentation
|
|
40
|
-
|
|
41
|
-
DO NOT use "field_0", "field_1" as field names - extract the REAL field names.
|
|
42
|
-
DO NOT use "type_name" as a field name unless it's explicitly mentioned as a field.
|
|
43
|
-
|
|
44
|
-
Example: "# Field 0: Code_Point" should produce a field named "code_point" (NOT "field_0").
|
|
45
|
-
</field_detection>
|
|
46
|
-
|
|
47
|
-
<type_mapping>
|
|
48
|
-
ONLY USE THESE VALID TYPESCRIPT TYPES:
|
|
49
|
-
- string - For text, identifiers, character codes, etc.
|
|
50
|
-
- number - For numeric values, indices, counts
|
|
51
|
-
- boolean - For true/false flags
|
|
52
|
-
- string[] - For arrays of strings
|
|
53
|
-
- number[] - For arrays of numbers
|
|
54
|
-
- Array<string> - Alternative syntax for string arrays
|
|
55
|
-
- Array<number> - Alternative syntax for number arrays
|
|
56
|
-
- Record<string, string> - For string to string mappings
|
|
57
|
-
- Record<string, number> - For string to number mappings
|
|
58
|
-
- Record<string, unknown> - For objects with unknown structure
|
|
59
|
-
- unknown - When type cannot be determined
|
|
60
|
-
- any - Only as a last resort when type is truly variable
|
|
61
|
-
|
|
62
|
-
For union types with string literals, ALWAYS use double quotes and pipe symbol:
|
|
63
|
-
- "\"value1\" | \"value2\" | \"value3\""
|
|
64
|
-
|
|
65
|
-
CRITICAL: Special values handling:
|
|
66
|
-
1. If a value contains angle brackets, like <none>, FIRST remove the angle brackets: "none" (NOT "<none>")
|
|
67
|
-
2. THEN ALWAYS wrap the value in quotes if it's a string literal: "\"none\""
|
|
68
|
-
3. For union types of string literals, EACH value MUST be in quotes:
|
|
69
|
-
- Correct: "\"R\" | \"L\" | \"D\" | \"C\" | \"U\" | \"T\""
|
|
70
|
-
- Incorrect: "R | L | D | C | U | T" (missing quotes around each value)
|
|
71
|
-
- Incorrect: "none" (missing quotes if it's a string literal)
|
|
72
|
-
|
|
73
|
-
NEVER use these invalid types:
|
|
74
|
-
- "union" - This is not a valid TypeScript type
|
|
75
|
-
- "object" - Too generic, use Record<> instead
|
|
76
|
-
- "array" - Too generic, use proper array syntax instead
|
|
77
|
-
- "map" - Use Record<> instead
|
|
78
|
-
- "none" - Use "\"none\"" or never instead
|
|
79
|
-
- "list" - Use proper array syntax instead
|
|
80
|
-
|
|
81
|
-
ALWAYS wrap the final converted type in double quotes in the JSON output.
|
|
82
|
-
ALWAYS wrap each value in a union type of string literals in quotes.
|
|
83
|
-
</type_mapping>
|
|
84
|
-
|
|
85
|
-
<examples>
|
|
86
|
-
Input example:
|
|
87
|
-
\`\`\`
|
|
88
|
-
# ArabicShaping.txt
|
|
89
|
-
# Field 0: Code point
|
|
90
|
-
# Field 1: Name
|
|
91
|
-
# Field 2: Joining_Type (R = Right_Joining, L = Left_Joining, D = Dual_Joining, C = Join_Causing, U = Non_Joining, T = Transparent)
|
|
92
|
-
# Field 3: Joining_Group
|
|
93
|
-
\`\`\`
|
|
94
|
-
|
|
95
|
-
Correct output:
|
|
96
|
-
[
|
|
97
|
-
{
|
|
98
|
-
"name": "code_point",
|
|
99
|
-
"type": "string",
|
|
100
|
-
"description": "The code point of a character, in hexadecimal form"
|
|
101
|
-
},
|
|
102
|
-
{
|
|
103
|
-
"name": "name",
|
|
104
|
-
"type": "string",
|
|
105
|
-
"description": "A short schematic name for the character"
|
|
106
|
-
},
|
|
107
|
-
{
|
|
108
|
-
"name": "joining_type",
|
|
109
|
-
"type": "\"R\" | \"L\" | \"D\" | \"C\" | \"U\" | \"T\"",
|
|
110
|
-
"description": "Defines the joining type (R = Right_Joining, L = Left_Joining, D = Dual_Joining, C = Join_Causing, U = Non_Joining, T = Transparent)"
|
|
111
|
-
},
|
|
112
|
-
{
|
|
113
|
-
"name": "joining_group",
|
|
114
|
-
"type": "string",
|
|
115
|
-
"description": "Defines the joining group, based schematically on character names"
|
|
116
|
-
}
|
|
117
|
-
]
|
|
118
|
-
|
|
119
|
-
Example with special value:
|
|
120
|
-
Input: "# Field 4: Value (<none> or specific value)"
|
|
121
|
-
|
|
122
|
-
Correct output for this field:
|
|
123
|
-
{
|
|
124
|
-
"name": "value",
|
|
125
|
-
"type": "\"none\" | string",
|
|
126
|
-
"description": "The value, which can be none or a specific value"
|
|
127
|
-
}
|
|
128
|
-
</examples>
|
|
129
|
-
|
|
130
|
-
<validation>
|
|
131
|
-
Before outputting, verify:
|
|
132
|
-
- Field names are the ACTUAL field names from the documentation, NOT generic "field_0" style names
|
|
133
|
-
- All field names are in snake_case
|
|
134
|
-
- All types are valid TypeScript types (string, number, boolean, arrays, or proper union types)
|
|
135
|
-
- NEVER use the word "union" as a type - use proper TypeScript syntax with the pipe symbol
|
|
136
|
-
- String literal values in union types are ALWAYS wrapped in quotes (e.g., "\"value1\" | \"value2\"")
|
|
137
|
-
- Special values like "none" (from <none>) are properly quoted as string literals: "\"none\""
|
|
138
|
-
- Each field has a clear, specific description
|
|
139
|
-
- Output is a valid JSON array of objects
|
|
140
|
-
</validation>
|
|
141
|
-
|
|
142
|
-
<error_handling>
|
|
143
|
-
If NO fields can be detected in the input:
|
|
144
|
-
- Return an empty JSON array: []
|
|
145
|
-
- DO NOT return error messages or explanations in the JSON output
|
|
146
|
-
- DO NOT attempt to create fields when none are clearly defined
|
|
147
|
-
|
|
148
|
-
Example when no fields are detected:
|
|
149
|
-
Input: "This is some text without any field definitions"
|
|
150
|
-
|
|
151
|
-
Correct output:
|
|
152
|
-
[]
|
|
153
|
-
|
|
154
|
-
If SOME fields are unclear but others are detectable:
|
|
155
|
-
- Only include the fields that can be clearly identified
|
|
156
|
-
- Omit any fields that cannot be confidently extracted
|
|
157
|
-
- Follow all validation rules for the fields that are included
|
|
158
|
-
</error_handling>
|
|
159
|
-
|
|
160
|
-
<format>JSON array of field objects</format>
|
|
161
|
-
</system_prompt>
|
|
162
|
-
`;
|
|
163
|
-
async function generateFields(options) {
|
|
164
|
-
const { datafile, apiKey, model } = options;
|
|
165
|
-
if (!apiKey && !model) return null;
|
|
166
|
-
const openai = model != null ? null : createOpenAI({ apiKey });
|
|
167
|
-
try {
|
|
168
|
-
return (await generateObject({
|
|
169
|
-
model: model ?? openai("gpt-4o-mini"),
|
|
170
|
-
schema: z.object({ fields: z.array(z.object({
|
|
171
|
-
name: z.string(),
|
|
172
|
-
type: z.string(),
|
|
173
|
-
description: z.string()
|
|
174
|
-
})) }),
|
|
175
|
-
prompt: SYSTEM_PROMPT.replace("{{INPUT}}", datafile.heading)
|
|
176
|
-
})).object.fields;
|
|
177
|
-
} catch (err) {
|
|
178
|
-
console.error("error generating fields:", err);
|
|
179
|
-
return null;
|
|
180
|
-
}
|
|
181
|
-
}
|
|
182
|
-
//#endregion
|
|
183
|
-
//#region src/index.ts
|
|
184
|
-
const TXT_EXTENSION_RE = /\.txt$/;
|
|
185
|
-
function buildInterface(name, fields, opts) {
|
|
186
|
-
return `${opts?.export ? "export " : ""}interface ${name} {\n${Object.entries(fields).map(([key, type]) => ` ${key}: ${type};`).join("\n")}\n}`;
|
|
187
|
-
}
|
|
188
|
-
function buildStringArray(values) {
|
|
189
|
-
return `[${values.map((v) => `"${v}"`).join(", ")}]`;
|
|
190
|
-
}
|
|
191
|
-
async function runCodegen(options) {
|
|
192
|
-
const inputFiles = options.files;
|
|
193
|
-
const limit = createConcurrencyLimiter(10);
|
|
194
|
-
if (!options.openaiKey && !options.model) throw new Error("Either openaiKey or model must be provided");
|
|
195
|
-
const processPromises = inputFiles.map(({ filePath, version }) => limit(() => processFile({
|
|
196
|
-
filePath,
|
|
197
|
-
openaiKey: options.openaiKey,
|
|
198
|
-
version,
|
|
199
|
-
model: options.model
|
|
200
|
-
})));
|
|
201
|
-
return Promise.all(processPromises).then((results) => results.filter((result) => result !== null));
|
|
202
|
-
}
|
|
203
|
-
async function processFile(request) {
|
|
204
|
-
const { filePath, openaiKey, version, model } = request;
|
|
205
|
-
try {
|
|
206
|
-
console.log(`Processing file: ${filePath}`);
|
|
207
|
-
const content = await readFile(filePath, "utf-8");
|
|
208
|
-
const fileName = path.basename(filePath).replace(TXT_EXTENSION_RE, "");
|
|
209
|
-
const fields = await generateFields({
|
|
210
|
-
datafile: new RawDataFile(content),
|
|
211
|
-
apiKey: openaiKey,
|
|
212
|
-
model
|
|
213
|
-
});
|
|
214
|
-
if (fields == null) {
|
|
215
|
-
console.error(`Error generating fields for file: ${filePath}`);
|
|
216
|
-
return null;
|
|
217
|
-
}
|
|
218
|
-
let code = ``;
|
|
219
|
-
const properties = {};
|
|
220
|
-
for (const field of fields) {
|
|
221
|
-
if (properties[field.name] != null) {
|
|
222
|
-
console.error(`Duplicate field name ${field.name} in file ${filePath}. Skipping field.`);
|
|
223
|
-
continue;
|
|
224
|
-
}
|
|
225
|
-
properties[field.name] = field.type;
|
|
226
|
-
}
|
|
227
|
-
code += `// This file is generated by ucd codegen. Do not edit this file directly.\n`;
|
|
228
|
-
code += `// Unicode Version: ${version}\n\n`;
|
|
229
|
-
code += `${buildInterface(sanitizeIdentifier(toPascalCase(fileName)), properties, { export: true })}\n\n`;
|
|
230
|
-
code += `export const ${sanitizeIdentifier(toSnakeCase(fileName)).toUpperCase()}_FIELDS = ${buildStringArray(fields.map((f) => f.name))};\n`;
|
|
231
|
-
return {
|
|
232
|
-
fields,
|
|
233
|
-
fileName,
|
|
234
|
-
version,
|
|
235
|
-
code
|
|
236
|
-
};
|
|
237
|
-
} catch (err) {
|
|
238
|
-
console.error(`Error processing file ${filePath}:`, err);
|
|
239
|
-
return null;
|
|
240
|
-
}
|
|
241
|
-
}
|
|
242
|
-
//#endregion
|
|
243
|
-
export { runCodegen };
|
|
1
|
+
import { n as generateFields, r as processFile, t as runFieldsCodegen } from "./run-bH3vZ-ci.mjs";
|
|
2
|
+
export { generateFields, processFile, runFieldsCodegen };
|
|
package/dist/run-Cgm8iCzh.d.mts
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
import { RawDataFile } from "@unicode-utils/core";
|
|
2
|
+
import { LanguageModel } from "ai";
|
|
3
|
+
|
|
4
|
+
//#region src/process.d.ts
|
|
5
|
+
type CodegenFile = {
|
|
6
|
+
/**
|
|
7
|
+
* The file path to the UCD data file on disk.
|
|
8
|
+
*/
|
|
9
|
+
filePath: string;
|
|
10
|
+
/**
|
|
11
|
+
* The Unicode version this file belongs to.
|
|
12
|
+
*/
|
|
13
|
+
version: string;
|
|
14
|
+
} | {
|
|
15
|
+
/**
|
|
16
|
+
* Pre-loaded file content. When provided, no disk read is performed.
|
|
17
|
+
*/
|
|
18
|
+
content: string;
|
|
19
|
+
/**
|
|
20
|
+
* The file name (without extension) used to derive the interface and constant names.
|
|
21
|
+
*/
|
|
22
|
+
fileName: string;
|
|
23
|
+
/**
|
|
24
|
+
* The Unicode version this file belongs to.
|
|
25
|
+
*/
|
|
26
|
+
version: string;
|
|
27
|
+
};
|
|
28
|
+
type ProcessDataFile<T> = (datafile: RawDataFile, fileName: string, version: string) => Promise<T | null>;
|
|
29
|
+
/**
|
|
30
|
+
* Reads a UCD data file from disk, parses it into a {@link RawDataFile}, and
|
|
31
|
+
* passes the result to the provided `processor` callback.
|
|
32
|
+
*
|
|
33
|
+
* This is the shared file-loading primitive used by all codegen runners.
|
|
34
|
+
* Implement a custom `processor` to add new kinds of code generation without
|
|
35
|
+
* duplicating the I/O and error-handling logic.
|
|
36
|
+
*
|
|
37
|
+
* @example
|
|
38
|
+
* ```ts
|
|
39
|
+
* const result = await processFile("./data/ArabicShaping.txt", "16.0",
|
|
40
|
+
* async (datafile, fileName, version) => { ... }
|
|
41
|
+
* );
|
|
42
|
+
* ```
|
|
43
|
+
*/
|
|
44
|
+
declare function processFile<T>(filePath: string, version: string, processor: ProcessDataFile<T>): Promise<T | null>;
|
|
45
|
+
//#endregion
|
|
46
|
+
//#region src/fields/generate.d.ts
|
|
47
|
+
interface GenerateFieldsOptions {
|
|
48
|
+
/**
|
|
49
|
+
* The parsed UCD data file to extract fields from.
|
|
50
|
+
*/
|
|
51
|
+
datafile: RawDataFile;
|
|
52
|
+
/**
|
|
53
|
+
* The language model to use for field extraction.
|
|
54
|
+
*/
|
|
55
|
+
model: LanguageModel;
|
|
56
|
+
}
|
|
57
|
+
declare function generateFields(options: GenerateFieldsOptions): Promise<{
|
|
58
|
+
name: string;
|
|
59
|
+
type: string;
|
|
60
|
+
description: string;
|
|
61
|
+
}[] | null>;
|
|
62
|
+
//#endregion
|
|
63
|
+
//#region src/fields/run.d.ts
|
|
64
|
+
interface ProcessedFieldsFile {
|
|
65
|
+
fields: {
|
|
66
|
+
type: string;
|
|
67
|
+
name: string;
|
|
68
|
+
description: string;
|
|
69
|
+
}[];
|
|
70
|
+
code: string;
|
|
71
|
+
fileName: string;
|
|
72
|
+
version: string;
|
|
73
|
+
}
|
|
74
|
+
interface FieldsCodegenOptions {
|
|
75
|
+
/**
|
|
76
|
+
* Files to generate field definitions for.
|
|
77
|
+
*/
|
|
78
|
+
files: CodegenFile[];
|
|
79
|
+
/**
|
|
80
|
+
* OpenAI API key. Required when no custom `model` is provided.
|
|
81
|
+
*/
|
|
82
|
+
openaiKey?: string;
|
|
83
|
+
/**
|
|
84
|
+
* OpenAI model ID to use when `openaiKey` is provided.
|
|
85
|
+
* @default "gpt-4o-mini"
|
|
86
|
+
*/
|
|
87
|
+
modelId?: string;
|
|
88
|
+
/**
|
|
89
|
+
* A custom language model. Useful for testing or non-OpenAI providers.
|
|
90
|
+
* When provided, `openaiKey` and `modelId` are ignored.
|
|
91
|
+
*
|
|
92
|
+
* @see https://ai-sdk.dev/docs/ai-sdk-core/testing
|
|
93
|
+
*/
|
|
94
|
+
model?: LanguageModel;
|
|
95
|
+
}
|
|
96
|
+
declare function runFieldsCodegen(options: FieldsCodegenOptions): Promise<ProcessedFieldsFile[]>;
|
|
97
|
+
//#endregion
|
|
98
|
+
export { generateFields as a, processFile as c, GenerateFieldsOptions as i, ProcessedFieldsFile as n, CodegenFile as o, runFieldsCodegen as r, ProcessDataFile as s, FieldsCodegenOptions as t };
|
|
package/dist/run-bH3vZ-ci.mjs
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
import { createOpenAI } from "@ai-sdk/openai";
|
|
2
|
+
import { sanitizeIdentifier, toPascalCase, toSnakeCase } from "@luxass/utils";
|
|
3
|
+
import { createConcurrencyLimiter } from "@ucdjs-internal/shared";
|
|
4
|
+
import { RawDataFile } from "@unicode-utils/core";
|
|
5
|
+
import { readFile } from "node:fs/promises";
|
|
6
|
+
import path from "node:path";
|
|
7
|
+
import { Output, generateText } from "ai";
|
|
8
|
+
import { z } from "zod";
|
|
9
|
+
//#region src/knitwork.ts
|
|
10
|
+
function buildInterface(name, fields, opts) {
|
|
11
|
+
return `${opts?.export ? "export " : ""}interface ${name} {\n${Object.entries(fields).map(([key, type]) => ` ${key}: ${type};`).join("\n")}\n}`;
|
|
12
|
+
}
|
|
13
|
+
function buildStringArray(values) {
|
|
14
|
+
if (values.length === 0) return "[]";
|
|
15
|
+
return `[${values.map((v) => `"${v}"`).join(", ")}]`;
|
|
16
|
+
}
|
|
17
|
+
//#endregion
|
|
18
|
+
//#region src/process.ts
|
|
19
|
+
const TXT_EXTENSION_RE = /\.txt$/;
|
|
20
|
+
/**
|
|
21
|
+
* Reads a UCD data file from disk, parses it into a {@link RawDataFile}, and
|
|
22
|
+
* passes the result to the provided `processor` callback.
|
|
23
|
+
*
|
|
24
|
+
* This is the shared file-loading primitive used by all codegen runners.
|
|
25
|
+
* Implement a custom `processor` to add new kinds of code generation without
|
|
26
|
+
* duplicating the I/O and error-handling logic.
|
|
27
|
+
*
|
|
28
|
+
* @example
|
|
29
|
+
* ```ts
|
|
30
|
+
* const result = await processFile("./data/ArabicShaping.txt", "16.0",
|
|
31
|
+
* async (datafile, fileName, version) => { ... }
|
|
32
|
+
* );
|
|
33
|
+
* ```
|
|
34
|
+
*/
|
|
35
|
+
async function processFile(filePath, version, processor) {
|
|
36
|
+
try {
|
|
37
|
+
console.log(`Processing file: ${filePath}`);
|
|
38
|
+
const content = await readFile(filePath, "utf-8");
|
|
39
|
+
const fileName = path.basename(filePath).replace(TXT_EXTENSION_RE, "");
|
|
40
|
+
return await processor(new RawDataFile(content), fileName, version);
|
|
41
|
+
} catch (err) {
|
|
42
|
+
console.error(`Error processing file ${filePath}:`, err);
|
|
43
|
+
return null;
|
|
44
|
+
}
|
|
45
|
+
}
|
|
46
|
+
//#endregion
|
|
47
|
+
//#region src/fields/generate.ts
|
|
48
|
+
const SYSTEM_PROMPT = `
|
|
49
|
+
Extract TypeScript field definitions from a Unicode data file header.
|
|
50
|
+
|
|
51
|
+
## Field naming
|
|
52
|
+
- Use snake_case for all field names.
|
|
53
|
+
- Extract the REAL name from patterns like "# Field 0: Code_Point" → "code_point". Never use "field_0", "field_1", etc.
|
|
54
|
+
|
|
55
|
+
## Types
|
|
56
|
+
Valid TypeScript only: string, number, boolean, string[], number[], Array<string>, Array<number>, Record<string, string>, Record<string, number>, Record<string, unknown>, unknown.
|
|
57
|
+
- String literal unions: each value quoted with pipe — "\"R\" | \"L\" | \"D\""
|
|
58
|
+
- Angle-bracket values like <none> → remove brackets and quote: "\"none\""
|
|
59
|
+
- Never use: union, object, array, map, list, none (unquoted)
|
|
60
|
+
|
|
61
|
+
## Output
|
|
62
|
+
Return { "fields": [] } when no fields are detectable. Only include fields you can confidently identify.
|
|
63
|
+
|
|
64
|
+
## Example
|
|
65
|
+
Input:
|
|
66
|
+
# Field 0: Code_Point
|
|
67
|
+
# Field 1: Name
|
|
68
|
+
# Field 2: Joining_Type (R = Right_Joining, L = Left_Joining, D = Dual_Joining, C = Join_Causing, U = Non_Joining, T = Transparent)
|
|
69
|
+
# Field 3: Joining_Group
|
|
70
|
+
|
|
71
|
+
Output:
|
|
72
|
+
{
|
|
73
|
+
"fields": [
|
|
74
|
+
{ "name": "code_point", "type": "string", "description": "Unicode code point in hexadecimal" },
|
|
75
|
+
{ "name": "name", "type": "string", "description": "Schematic name for the character" },
|
|
76
|
+
{ "name": "joining_type", "type": "\\"R\\" | \\"L\\" | \\"D\\" | \\"C\\" | \\"U\\" | \\"T\\"", "description": "Joining type (R=Right, L=Left, D=Dual, C=Join_Causing, U=Non_Joining, T=Transparent)" },
|
|
77
|
+
{ "name": "joining_group", "type": "string", "description": "Joining group based on character names" }
|
|
78
|
+
]
|
|
79
|
+
}
|
|
80
|
+
`.trim();
|
|
81
|
+
async function generateFields(options) {
|
|
82
|
+
const { datafile, model } = options;
|
|
83
|
+
try {
|
|
84
|
+
return (await generateText({
|
|
85
|
+
model,
|
|
86
|
+
system: SYSTEM_PROMPT,
|
|
87
|
+
prompt: datafile.heading,
|
|
88
|
+
output: Output.object({ schema: z.object({ fields: z.array(z.object({
|
|
89
|
+
name: z.string(),
|
|
90
|
+
type: z.string(),
|
|
91
|
+
description: z.string()
|
|
92
|
+
})) }) })
|
|
93
|
+
})).output.fields;
|
|
94
|
+
} catch (err) {
|
|
95
|
+
console.error("error generating fields:", err);
|
|
96
|
+
return null;
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
//#endregion
|
|
100
|
+
//#region src/fields/run.ts
|
|
101
|
+
async function generateFieldsCode(datafile, fileName, version, model) {
|
|
102
|
+
const fields = await generateFields({
|
|
103
|
+
datafile,
|
|
104
|
+
model
|
|
105
|
+
});
|
|
106
|
+
if (fields == null) {
|
|
107
|
+
console.error(`Error generating fields for file: ${fileName}`);
|
|
108
|
+
return null;
|
|
109
|
+
}
|
|
110
|
+
const properties = {};
|
|
111
|
+
for (const field of fields) {
|
|
112
|
+
if (properties[field.name] != null) {
|
|
113
|
+
console.error(`Duplicate field name ${field.name} in file ${fileName}. Skipping field.`);
|
|
114
|
+
continue;
|
|
115
|
+
}
|
|
116
|
+
properties[field.name] = field.type;
|
|
117
|
+
}
|
|
118
|
+
let code = `// This file is generated by ucd codegen. Do not edit this file directly.\n`;
|
|
119
|
+
code += `// Unicode Version: ${version}\n\n`;
|
|
120
|
+
code += `${buildInterface(sanitizeIdentifier(toPascalCase(fileName)), properties, { export: true })}\n\n`;
|
|
121
|
+
code += `export const ${sanitizeIdentifier(toSnakeCase(fileName)).toUpperCase()}_FIELDS = ${buildStringArray(Object.keys(properties))};\n`;
|
|
122
|
+
return {
|
|
123
|
+
fields,
|
|
124
|
+
fileName,
|
|
125
|
+
version,
|
|
126
|
+
code
|
|
127
|
+
};
|
|
128
|
+
}
|
|
129
|
+
async function runFieldsCodegen(options) {
|
|
130
|
+
if (!options.openaiKey && !options.model) throw new Error("Either openaiKey or model must be provided");
|
|
131
|
+
const model = options.model ?? createOpenAI({ apiKey: options.openaiKey })(options.modelId ?? "gpt-4o-mini");
|
|
132
|
+
const limit = createConcurrencyLimiter(10);
|
|
133
|
+
const processPromises = options.files.map((file) => {
|
|
134
|
+
if ("content" in file) return limit(async () => {
|
|
135
|
+
return generateFieldsCode(new RawDataFile(file.content), file.fileName, file.version, model);
|
|
136
|
+
});
|
|
137
|
+
return limit(() => processFile(file.filePath, file.version, (datafile, fileName, ver) => generateFieldsCode(datafile, fileName, ver, model)));
|
|
138
|
+
});
|
|
139
|
+
return Promise.all(processPromises).then((results) => results.filter((r) => r !== null));
|
|
140
|
+
}
|
|
141
|
+
//#endregion
|
|
142
|
+
export { generateFields as n, processFile as r, runFieldsCodegen as t };
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ucdjs/codegen",
|
|
3
|
-
"version": "0.0.
|
|
3
|
+
"version": "0.0.1",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "Lucas Nørgård",
|
|
@@ -19,6 +19,7 @@
|
|
|
19
19
|
},
|
|
20
20
|
"exports": {
|
|
21
21
|
".": "./dist/index.mjs",
|
|
22
|
+
"./fields": "./dist/fields.mjs",
|
|
22
23
|
"./package.json": "./package.json"
|
|
23
24
|
},
|
|
24
25
|
"types": "./dist/index.d.mts",
|
|
@@ -34,7 +35,7 @@
|
|
|
34
35
|
"@unicode-utils/core": "0.12.0-beta.27",
|
|
35
36
|
"ai": "6.0.138",
|
|
36
37
|
"zod": "4.3.6",
|
|
37
|
-
"@ucdjs-internal/shared": "0.
|
|
38
|
+
"@ucdjs-internal/shared": "0.2.0"
|
|
38
39
|
},
|
|
39
40
|
"devDependencies": {
|
|
40
41
|
"@luxass/eslint-config": "7.4.2",
|