@ucdjs/schema-gen 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -2,6 +2,7 @@
2
2
 
3
3
  [![npm version][npm-version-src]][npm-version-href]
4
4
  [![npm downloads][npm-downloads-src]][npm-downloads-href]
5
+ [![codecov][codecov-src]][codecov-href]
5
6
 
6
7
  Utilities for working with the Unicode Character Database (UCD).
7
8
 
@@ -19,3 +20,5 @@ Published under [MIT License](./LICENSE).
19
20
  [npm-version-href]: https://npmjs.com/package/@ucdjs/schema-gen
20
21
  [npm-downloads-src]: https://img.shields.io/npm/dm/@ucdjs/schema-gen?style=flat&colorA=18181B&colorB=4169E1
21
22
  [npm-downloads-href]: https://npmjs.com/package/@ucdjs/schema-gen
23
+ [codecov-src]: https://img.shields.io/codecov/c/gh/ucdjs/ucd?style=flat&colorA=18181B&colorB=4169E1
24
+ [codecov-href]: https://codecov.io/gh/ucdjs/ucd
package/dist/index.d.ts CHANGED
@@ -1,17 +1,48 @@
1
- import { RawDataFile } from "@luxass/unicode-utils";
1
+ import { LanguageModel } from "ai";
2
2
 
3
- //#region src/fields.d.ts
4
- interface GenerateFieldsOptions {
5
- /**
6
- * The data file to generate fields for.
7
- */
8
- datafile: RawDataFile;
9
- /**
10
- * The OpenAI API key to use for generating fields.
11
- */
12
- apiKey: string;
3
+ //#region src/index.d.ts
4
+ interface SchemaGenFile {
5
+ /**
6
+ * The filePath to the data file.
7
+ */
8
+ filePath: string;
9
+ /**
10
+ * The version of the data file.
11
+ */
12
+ version: string;
13
13
  }
14
- declare function generateFields(options: GenerateFieldsOptions): Promise<string | null>;
15
-
14
+ interface ProcessedFile {
15
+ fields: {
16
+ type: string;
17
+ name: string;
18
+ description: string;
19
+ }[];
20
+ code: string;
21
+ fileName: string;
22
+ version: string;
23
+ }
24
+ interface SchemaGenOptions {
25
+ /**
26
+ * Files to generate structures for.
27
+ */
28
+ files: SchemaGenFile[];
29
+ /**
30
+ * The OpenAI API key to use for generating the schema.
31
+ */
32
+ openaiKey?: string;
33
+ /**
34
+ * The OpenAI model to use for generating fields.
35
+ * NOTE:
36
+ * This is good for testing purposes, where you
37
+ * can provide a mock model to test the generation.
38
+ *
39
+ * If not provided, it will create a new OpenAI instance
40
+ * with the default model.
41
+ *
42
+ * SEE: https://ai-sdk.dev/docs/ai-sdk-core/testing
43
+ */
44
+ model?: LanguageModel;
45
+ }
46
+ declare function runSchemagen(options: SchemaGenOptions): Promise<ProcessedFile[]>;
16
47
  //#endregion
17
- export { generateFields };
48
+ export { ProcessedFile, SchemaGenFile, SchemaGenOptions, runSchemagen };
package/dist/index.js CHANGED
@@ -1,71 +1,182 @@
1
+ import { readFile } from "node:fs/promises";
2
+ import path from "node:path";
3
+ import { RawDataFile } from "@luxass/unicode-utils-old";
4
+ import { dedent, sanitizeIdentifier, toPascalCase, toSnakeCase } from "@luxass/utils";
5
+ import { createConcurrencyLimiter } from "@ucdjs-internal/shared";
6
+ import { genArrayFromRaw, genInterface } from "knitwork";
1
7
  import { createOpenAI } from "@ai-sdk/openai";
2
- import { dedent } from "@luxass/utils";
3
8
  import { generateObject } from "ai";
4
9
  import { z } from "zod";
5
10
 
6
11
  //#region src/fields.ts
7
- const SYSTEM_PROMOT = dedent`
8
- <system_prompt>
9
- <role>Expert TypeScript code generator specializing in interfaces and documentation</role>
10
-
11
- <task>
12
- <input>Text description: {{INPUT}}</input>
13
- <output>TypeScript interface with comprehensive JSDoc comments</output>
14
- </task>
15
-
16
- <requirements>
17
- <field_processing>
18
- - Extract all relevant fields from text
19
- - Convert field names to snake_case
20
- - Preserve original order
21
- </field_processing>
22
-
23
- <documentation>
24
- - JSDoc for each property only
25
- - Document union types with double quotes (no enums)
26
- - Explain constraints and formats with examples
27
- - No JSDoc for the main interface
28
- </documentation>
29
-
30
- <structure>
31
- - Use the file name from the first line of the input (ignoring version numbers) as the interface name
32
- - Single interface named after the input file name without version numbers (e.g., ArabicShaping not ArabicShaping-16.0.0)
33
- - No additional interfaces or arrays
34
- - Create ordered keys array named [INTERFACE_NAME]_FIELDS using SCREAMING_SNAKE_CASE
35
- - Convert the interface name to SCREAMING_SNAKE_CASE by inserting underscores between camelCase words
36
- - Use double quotes for field names in the keys array
37
- - No example data or additional declarations
38
- - Export all variables and interfaces
39
- </structure>
40
-
41
- <formatting>
42
- - Use 2 space indentation
43
- - Use trailing commas in all multi-line structures
44
- - Use trailing commas in arrays and interfaces
45
- </formatting>
46
- </requirements>
47
-
48
- <format>Single TypeScript code block containing only the interface and fields array, both exported</format>
49
-
50
- <examples>
51
- - Interface: NamedSequences -> Fields array: NAMED_SEQUENCES_FIELDS
52
- - Interface: NamedSequencesProv -> Fields array: NAMED_SEQUENCES_PROV_FIELDS
53
- - Interface: ArabicShaping -> Fields array: ARABIC_SHAPING_FIELDS
54
- </examples>
55
- </system_prompt>
12
+ const SYSTEM_PROMPT = dedent`
13
+ <system_prompt>
14
+ <role>Expert TypeScript field extractor for Unicode data files</role>
15
+
16
+ <task>
17
+ <input>
18
+ Text description: {{INPUT}}
19
+ </input>
20
+ <output>JSON array of field objects with name, type, and description</output>
21
+ </task>
22
+
23
+ <critical_rule>
24
+ YOU MUST ALWAYS OUTPUT A VALID JSON ARRAY OF OBJECTS WITH THIS STRUCTURE:
25
+ [
26
+ {
27
+ "name": "actual_field_name",
28
+ "type": "valid_typescript_type",
29
+ "description": "Description of the field"
30
+ }
31
+ ]
32
+
33
+ NEVER use generic names like "field_0", "field_1", etc. - extract the ACTUAL field names.
34
+ NEVER use invalid TypeScript types like "union" - use proper union syntax with pipe symbol.
35
+ </critical_rule>
36
+
37
+ <field_detection>
38
+ For Unicode data files, fields are typically described in patterns like:
39
+ - Lines starting with "# Field 0: Name" - where "Name" is the actual field name
40
+ - Table headers or column definitions
41
+ - Property descriptions in documentation
42
+
43
+ DO NOT use "field_0", "field_1" as field names - extract the REAL field names.
44
+ DO NOT use "type_name" as a field name unless it's explicitly mentioned as a field.
45
+
46
+ Example: "# Field 0: Code_Point" should produce a field named "code_point" (NOT "field_0").
47
+ </field_detection>
48
+
49
+ <type_mapping>
50
+ ONLY USE THESE VALID TYPESCRIPT TYPES:
51
+ - string - For text, identifiers, character codes, etc.
52
+ - number - For numeric values, indices, counts
53
+ - boolean - For true/false flags
54
+ - string[] - For arrays of strings
55
+ - number[] - For arrays of numbers
56
+ - Array<string> - Alternative syntax for string arrays
57
+ - Array<number> - Alternative syntax for number arrays
58
+ - Record<string, string> - For string to string mappings
59
+ - Record<string, number> - For string to number mappings
60
+ - Record<string, unknown> - For objects with unknown structure
61
+ - unknown - When type cannot be determined
62
+ - any - Only as a last resort when type is truly variable
63
+
64
+ For union types with string literals, ALWAYS use double quotes and pipe symbol:
65
+ - "\"value1\" | \"value2\" | \"value3\""
66
+
67
+ CRITICAL: Special values handling:
68
+ 1. If a value contains angle brackets, like <none>, FIRST remove the angle brackets: "none" (NOT "<none>")
69
+ 2. THEN ALWAYS wrap the value in quotes if it's a string literal: "\"none\""
70
+ 3. For union types of string literals, EACH value MUST be in quotes:
71
+ - Correct: "\"R\" | \"L\" | \"D\" | \"C\" | \"U\" | \"T\""
72
+ - Incorrect: "R | L | D | C | U | T" (missing quotes around each value)
73
+ - Incorrect: "none" (missing quotes if it's a string literal)
74
+
75
+ NEVER use these invalid types:
76
+ - "union" - This is not a valid TypeScript type
77
+ - "object" - Too generic, use Record<> instead
78
+ - "array" - Too generic, use proper array syntax instead
79
+ - "map" - Use Record<> instead
80
+ - "none" - Use "\"none\"" or never instead
81
+ - "list" - Use proper array syntax instead
82
+
83
+ ALWAYS wrap the final converted type in double quotes in the JSON output.
84
+ ALWAYS wrap each value in a union type of string literals in quotes.
85
+ </type_mapping>
86
+
87
+ <examples>
88
+ Input example:
89
+ \`\`\`
90
+ # ArabicShaping.txt
91
+ # Field 0: Code point
92
+ # Field 1: Name
93
+ # Field 2: Joining_Type (R = Right_Joining, L = Left_Joining, D = Dual_Joining, C = Join_Causing, U = Non_Joining, T = Transparent)
94
+ # Field 3: Joining_Group
95
+ \`\`\`
96
+
97
+ Correct output:
98
+ [
99
+ {
100
+ "name": "code_point",
101
+ "type": "string",
102
+ "description": "The code point of a character, in hexadecimal form"
103
+ },
104
+ {
105
+ "name": "name",
106
+ "type": "string",
107
+ "description": "A short schematic name for the character"
108
+ },
109
+ {
110
+ "name": "joining_type",
111
+ "type": "\"R\" | \"L\" | \"D\" | \"C\" | \"U\" | \"T\"",
112
+ "description": "Defines the joining type (R = Right_Joining, L = Left_Joining, D = Dual_Joining, C = Join_Causing, U = Non_Joining, T = Transparent)"
113
+ },
114
+ {
115
+ "name": "joining_group",
116
+ "type": "string",
117
+ "description": "Defines the joining group, based schematically on character names"
118
+ }
119
+ ]
120
+
121
+ Example with special value:
122
+ Input: "# Field 4: Value (<none> or specific value)"
123
+
124
+ Correct output for this field:
125
+ {
126
+ "name": "value",
127
+ "type": "\"none\" | string",
128
+ "description": "The value, which can be none or a specific value"
129
+ }
130
+ </examples>
131
+
132
+ <validation>
133
+ Before outputting, verify:
134
+ - Field names are the ACTUAL field names from the documentation, NOT generic "field_0" style names
135
+ - All field names are in snake_case
136
+ - All types are valid TypeScript types (string, number, boolean, arrays, or proper union types)
137
+ - NEVER use the word "union" as a type - use proper TypeScript syntax with the pipe symbol
138
+ - String literal values in union types are ALWAYS wrapped in quotes (e.g., "\"value1\" | \"value2\"")
139
+ - Special values like "none" (from <none>) are properly quoted as string literals: "\"none\""
140
+ - Each field has a clear, specific description
141
+ - Output is a valid JSON array of objects
142
+ </validation>
143
+
144
+ <error_handling>
145
+ If NO fields can be detected in the input:
146
+ - Return an empty JSON array: []
147
+ - DO NOT return error messages or explanations in the JSON output
148
+ - DO NOT attempt to create fields when none are clearly defined
149
+
150
+ Example when no fields are detected:
151
+ Input: "This is some text without any field definitions"
152
+
153
+ Correct output:
154
+ []
155
+
156
+ If SOME fields are unclear but others are detectable:
157
+ - Only include the fields that can be clearly identified
158
+ - Omit any fields that cannot be confidently extracted
159
+ - Follow all validation rules for the fields that are included
160
+ </error_handling>
161
+
162
+ <format>JSON array of field objects</format>
163
+ </system_prompt>
56
164
  `;
57
165
  async function generateFields(options) {
58
- const { datafile, apiKey } = options;
166
+ const { datafile, apiKey, model } = options;
59
167
  if (datafile.heading == null) return null;
60
- if (!apiKey) return null;
61
- const openai = createOpenAI({ apiKey });
168
+ if (!apiKey && !model) return null;
169
+ const openai = model != null ? null : createOpenAI({ apiKey });
62
170
  try {
63
- const result = await generateObject({
64
- model: openai("gpt-4o-mini"),
65
- schema: z.object({ code: z.string().describe("A TypeScript code block containing the JSDoc commented interface definition and the corresponding keys array with original casing.") }),
66
- prompt: SYSTEM_PROMOT.replace("{{INPUT}}", datafile.heading)
67
- });
68
- return result.object.code;
171
+ return (await generateObject({
172
+ model: model ?? openai("gpt-4o-mini"),
173
+ schema: z.object({ fields: z.array(z.object({
174
+ name: z.string(),
175
+ type: z.string(),
176
+ description: z.string()
177
+ })) }),
178
+ prompt: SYSTEM_PROMPT.replace("{{INPUT}}", datafile.heading)
179
+ })).object.fields;
69
180
  } catch (err) {
70
181
  console.error("error generating fields:", err);
71
182
  return null;
@@ -73,4 +184,63 @@ async function generateFields(options) {
73
184
  }
74
185
 
75
186
  //#endregion
76
- export { generateFields };
187
+ //#region src/index.ts
188
+ async function runSchemagen(options) {
189
+ const inputFiles = options.files;
190
+ const limit = createConcurrencyLimiter(10);
191
+ if (!options.openaiKey && !options.model) throw new Error("Either openaiKey or model must be provided");
192
+ const processPromises = inputFiles.map(({ filePath, version }) => limit(() => processFile({
193
+ filePath,
194
+ openaiKey: options.openaiKey,
195
+ version,
196
+ model: options.model
197
+ })));
198
+ return Promise.all(processPromises).then((results) => results.filter((result) => result !== null));
199
+ }
200
+ async function processFile(request) {
201
+ const { filePath, openaiKey, version, model } = request;
202
+ try {
203
+ console.log(`Processing file: ${filePath}`);
204
+ const content = await readFile(filePath, "utf-8");
205
+ const fileName = path.basename(filePath).replace(/\.txt$/, "");
206
+ const datafile = new RawDataFile(content, fileName);
207
+ if (datafile.heading == null) {
208
+ console.error(`heading for file ${filePath} is null. Skipping file.`);
209
+ return null;
210
+ }
211
+ const fields = await generateFields({
212
+ datafile,
213
+ apiKey: openaiKey,
214
+ model
215
+ });
216
+ if (fields == null) {
217
+ console.error(`Error generating fields for file: ${filePath}`);
218
+ return null;
219
+ }
220
+ let code = ``;
221
+ const properties = {};
222
+ for (const field of fields) {
223
+ if (properties[field.name] != null) {
224
+ console.error(`Duplicate field name ${field.name} in file ${filePath}. Skipping field.`);
225
+ continue;
226
+ }
227
+ properties[field.name] = field.type;
228
+ }
229
+ code += `// This file is generated by ucd codegen. Do not edit this file directly.\n`;
230
+ code += `// Unicode Version: ${version}\n\n`;
231
+ code += `${genInterface(sanitizeIdentifier(toPascalCase(fileName)), properties, { export: true })}\n\n`;
232
+ code += `export const ${sanitizeIdentifier(toSnakeCase(fileName)).toUpperCase()}_FIELDS = ${genArrayFromRaw(fields.map((f) => `"${f.name}"`))};\n`;
233
+ return {
234
+ fields,
235
+ fileName,
236
+ version,
237
+ code
238
+ };
239
+ } catch (error) {
240
+ console.error(`Error processing file ${filePath}:`, error);
241
+ return null;
242
+ }
243
+ }
244
+
245
+ //#endregion
246
+ export { runSchemagen };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ucdjs/schema-gen",
3
- "version": "0.2.0",
3
+ "version": "0.2.2",
4
4
  "type": "module",
5
5
  "author": {
6
6
  "name": "Lucas Nørgård",
@@ -27,26 +27,33 @@
27
27
  "files": [
28
28
  "dist"
29
29
  ],
30
+ "engines": {
31
+ "node": ">=22.18"
32
+ },
30
33
  "dependencies": {
31
- "@ai-sdk/openai": "^1.3.20",
32
- "@luxass/unicode-utils": "^0.4.1",
33
- "@luxass/utils": "^1.4.0",
34
- "ai": "^4.3.10",
35
- "zod": "^3.24.3"
34
+ "@ai-sdk/openai": "2.0.32",
35
+ "@luxass/unicode-utils-old": "npm:@luxass/unicode-utils@0.11.0",
36
+ "@luxass/utils": "2.7.2",
37
+ "ai": "5.0.48",
38
+ "knitwork": "1.2.0",
39
+ "zod": "4.1.11",
40
+ "@ucdjs-internal/shared": "0.1.0"
36
41
  },
37
42
  "devDependencies": {
38
- "@luxass/eslint-config": "^4.18.1",
39
- "eslint": "^9.25.1",
40
- "publint": "^0.3.12",
41
- "tsdown": "v0.9.3",
42
- "typescript": "^5.8.3",
43
- "vitest-testdirs": "^3.0.1"
43
+ "@luxass/eslint-config": "6.0.1",
44
+ "eslint": "9.38.0",
45
+ "publint": "0.3.15",
46
+ "tsdown": "0.15.9",
47
+ "typescript": "5.9.3",
48
+ "vitest-testdirs": "4.2.1",
49
+ "@ucdjs-tooling/tsconfig": "1.0.0",
50
+ "@ucdjs-tooling/tsdown-config": "1.0.0"
44
51
  },
45
52
  "publishConfig": {
46
53
  "access": "public"
47
54
  },
48
55
  "scripts": {
49
- "build": "tsdown",
56
+ "build": "tsdown --tsconfig=./tsconfig.build.json",
50
57
  "dev": "tsdown --watch",
51
58
  "clean": "git clean -xdf dist node_modules",
52
59
  "lint": "eslint .",