@rebasepro/schema-inference 0.0.1-canary.09e5ec5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,404 @@
1
+ import {
2
+ InferencePropertyBuilderProps,
3
+ TypesCount,
4
+ TypesCountRecord,
5
+ ValuesCountEntry,
6
+ ValuesCountRecord
7
+ } from "./types";
8
+ import { buildStringProperty } from "./builders/string_property_builder";
9
+ import { buildValidation } from "./builders/validation_builder";
10
+ import { buildReferenceProperty } from "./builders/reference_property_builder";
11
+ import { extractEnumFromValues, mergeDeep, prettifyIdentifier, resolveEnumValues } from "./util";
12
+ import { DataType, EnumValues, Properties, Property, StringProperty } from "@rebasepro/types";
13
+
14
+ export type InferenceTypeBuilder = (value: any) => DataType;
15
+
16
/**
 * Infer a set of entity properties from sample documents.
 *
 * Every top-level key of every entry (except keys starting with `_`) is
 * tallied by inferred type and by observed value; both tallies are then
 * passed to `buildPropertiesFromCount`.
 *
 * @param data sample documents; falsy entries are skipped and a missing
 * array is treated as empty
 * @param getType callback mapping a raw value to its data type
 * @returns a promise resolving to the inferred properties
 */
export async function buildEntityPropertiesFromData(
    data: object[],
    getType: InferenceTypeBuilder
): Promise<Properties> {
    const typesCount: TypesCountRecord = {};
    const valuesCount: ValuesCountRecord = {};
    // Normalise once so the document count below cannot dereference a missing array
    const entries = data ?? [];
    entries.forEach((entry) => {
        if (!entry) return;
        for (const [key, value] of Object.entries(entry)) {
            if (key.startsWith("_")) continue; // Ignore properties starting with _
            increaseMapTypeCount(typesCount, key, value, getType);
            increaseValuesCount(valuesCount, key, value, getType);
        }
    });
    return buildPropertiesFromCount(entries.length, typesCount, valuesCount);
}
35
+
36
/**
 * Refine an existing property definition using sample values for it.
 *
 * If the property declares an `enum`, the result is the property with the
 * enum entries extracted from the observed values placed before the
 * declared ones. Otherwise a property is generated from the type and value
 * tallies and merged with the given property through `mergeDeep`.
 *
 * @param data sample values for the property; a missing array is treated
 * as empty
 * @param property the property definition to refine
 * @param getType callback mapping a raw value to its data type
 * @returns the refined property
 */
export function buildPropertyFromData(
    data: any[],
    property: Property,
    getType: InferenceTypeBuilder
): Property {
    // Single synthetic key under which all sample values are tallied
    const inferredKey = "inferred_prop";
    const typesCount: TypesCount = {};
    const valuesCount: ValuesCountRecord = {};
    const entries = data ?? [];
    entries.forEach((entry) => {
        increaseTypeCount(property.type, typesCount, entry, getType);
        increaseValuesCount(valuesCount, inferredKey, entry, getType);
    });

    const enumValues = "enum" in property ? resolveEnumValues(property["enum"] as EnumValues) : undefined;
    if (enumValues) {
        // With no samples there is no tally entry at all, so nothing can be extracted
        const observed = valuesCount[inferredKey]?.valuesCount;
        const newEnumValues = observed ? extractEnumFromValues(Array.from(observed.keys())) : [];
        return {
            ...property,
            enum: [...newEnumValues, ...enumValues]
        } as StringProperty;
    }

    const generatedProperty = buildPropertyFromCount(
        inferredKey,
        entries.length,
        property.type,
        typesCount,
        valuesCount[inferredKey]
    );
    return mergeDeep(generatedProperty, property);
}
66
+
67
/**
 * Compute a display order for property keys.
 *
 * Keys are ranked, highest first: explicit priority keys, keys equal to
 * "title"/"name", keys containing "title"/"name", keys containing
 * "image"/"picture", then everything else. All comparisons are
 * case-insensitive. Keys with the same rank keep the default `sort()` order.
 *
 * @param properties properties whose keys are ordered when no explicit
 * order is given
 * @param propertiesOrder optional list of keys to order instead; it is not
 * modified
 * @param priorityKeys optional keys that always come first
 * @returns a new array with the ordered keys
 */
export function buildPropertiesOrder(
    properties: Properties,
    propertiesOrder?: string[],
    priorityKeys?: string[]
): string[] {
    const lowerCasePriorityKeys = new Set((priorityKeys ?? []).map((key) => key.toLowerCase()));

    function propOrder(s: string): number {
        const k = s.toLowerCase();
        if (lowerCasePriorityKeys.has(k)) return 4;
        if (k === "title" || k === "name") return 3;
        if (k.includes("title") || k.includes("name")) return 2;
        if (k.includes("image") || k.includes("picture")) return 1;
        return 0;
    }

    // Copy before sorting: Array.prototype.sort works in place and would
    // otherwise reorder the caller's `propertiesOrder` array.
    const keys = [...(propertiesOrder ?? Object.keys(properties))];
    keys.sort(); // default order first, used as the tie-breaker
    keys.sort((a, b) => propOrder(b) - propOrder(a)); // stable sort keeps ties in that order
    return keys;
}
90
+
91
/**
 * Record one observation of `fieldValue` with the given type in `typesCount`.
 *
 * - Scalar types increment a numeric counter stored under the type name.
 * - "map" values are tallied per key into a nested record under `map`.
 * - "array" values are tallied into a nested tally under `array`, keyed by
 *   the most probable element type of that array.
 *
 * @param type data type already determined for `fieldValue`
 * @param typesCount tally to update in place
 * @param fieldValue the observed value
 * @param getType callback mapping a raw value to its data type
 */
function increaseTypeCount(
    type: DataType,
    typesCount: TypesCount,
    fieldValue: any,
    getType: InferenceTypeBuilder
) {
    // Scalar tally: the first sighting stores 1, later sightings add one
    const bump = (counts: TypesCount, counted: DataType) => {
        counts[counted] = counts[counted] ? Number(counts[counted]) + 1 : 1;
    };

    if (type !== "map" && type !== "array") {
        bump(typesCount, type);
        return;
    }

    if (type === "map") {
        // A falsy map value leaves the tally untouched
        if (!fieldValue) return;
        if (!typesCount[type]) typesCount[type] = {};
        const nestedRecord = typesCount[type] as TypesCountRecord;
        for (const [key, value] of Object.entries(fieldValue)) {
            increaseMapTypeCount(nestedRecord, key, value, getType);
        }
        return;
    }

    // Arrays: the nested tally is created even when the array is empty
    if (!typesCount[type]) typesCount[type] = {};
    const elementCounts = typesCount[type] as TypesCount;
    if (!fieldValue || !Array.isArray(fieldValue) || fieldValue.length === 0) return;

    const elementType = getMostProbableTypeInArray(fieldValue, getType);
    if (elementType !== "map") {
        bump(elementCounts, elementType);
        return;
    }

    const elementRecord = (elementCounts[elementType] || {}) as TypesCountRecord;
    for (const item of fieldValue) {
        // Only plain (non-array) objects contribute keys
        if (!item || typeof item !== "object" || Array.isArray(item)) continue;
        for (const [key, v] of Object.entries(item)) {
            increaseMapTypeCount(elementRecord, key, v, getType);
        }
    }
    elementCounts[elementType] = elementRecord;
}
145
+
146
/**
 * Tally the type of `fieldValue` under `key` in a per-key record.
 *
 * Keys starting with `_` are ignored. An entry for the key is created even
 * when the value is null or undefined, but such values are not counted.
 *
 * @param typesCountRecord per-key tallies, updated in place
 * @param key the field name
 * @param fieldValue the observed value
 * @param getType callback mapping a raw value to its data type
 */
function increaseMapTypeCount(
    typesCountRecord: TypesCountRecord,
    key: string,
    fieldValue: any,
    getType: InferenceTypeBuilder
) {
    if (key.startsWith("_")) return;

    const existing: TypesCount | undefined = typesCountRecord[key];
    const counts: TypesCount = existing ? existing : (typesCountRecord[key] = {});

    // null and undefined carry no type information
    if (fieldValue === null || fieldValue === undefined) return;

    increaseTypeCount(getType(fieldValue), counts, fieldValue, getType);
}
166
+
167
/**
 * Record the observed value(s) of a field in a per-key values tally.
 *
 * - Keys starting with `_` are ignored.
 * - "map" values recurse into a nested record stored under `mapValues`,
 *   which is the property `buildPropertyFromCount` reads.
 * - "array" values contribute each element individually.
 * - Any other non-null value is appended to `values` and counted in
 *   `valuesCount`.
 *
 * @param typeValuesRecord per-key value tallies, updated in place
 * @param key the field name
 * @param fieldValue the observed value
 * @param getType callback mapping a raw value to its data type
 */
function increaseValuesCount(
    typeValuesRecord: ValuesCountRecord,
    key: string,
    fieldValue: any,
    getType: InferenceTypeBuilder
) {
    if (key.startsWith("_")) return; // Ignore properties starting with _

    const type = getType(fieldValue);

    let valuesRecord: ValuesCountEntry = typeValuesRecord[key];
    if (!valuesRecord) {
        valuesRecord = {
            values: [],
            valuesCount: new Map()
        };
        typeValuesRecord[key] = valuesRecord;
    }
    const entry = valuesRecord;

    // Append one observation and bump its occurrence counter
    const record = (value: any) => {
        entry.values.push(value);
        entry.valuesCount.set(value, (entry.valuesCount.get(value) ?? 0) + 1);
    };

    if (type === "map") {
        // Must be stored under `mapValues`: that is where the property
        // builder looks for nested tallies.
        let mapValuesRecord: ValuesCountRecord | undefined = entry.mapValues;
        if (!mapValuesRecord) {
            mapValuesRecord = {};
            entry.mapValues = mapValuesRecord;
        }
        if (fieldValue) {
            for (const [subKey, value] of Object.entries(fieldValue)) {
                increaseValuesCount(mapValuesRecord, subKey, value, getType);
            }
        }
    } else if (type === "array") {
        if (Array.isArray(fieldValue)) {
            fieldValue.forEach((value) => record(value));
        }
    } else if (fieldValue !== null && fieldValue !== undefined) {
        record(fieldValue);
    }
}
215
+
216
/**
 * Return the largest count found anywhere in a types tally.
 *
 * Nested "map" records and "array" tallies are searched recursively; every
 * other entry is read as a number. An empty tally yields 0.
 *
 * @param typesCount the tally to inspect
 * @returns the highest count, never below 0
 */
function getHighestTypesCount(typesCount: TypesCount): number {
    let best = 0;
    for (const [type, count] of Object.entries(typesCount)) {
        let current = 0;
        switch (type) {
            case "map":
                current = getHighestRecordCount(count as TypesCountRecord);
                break;
            case "array":
                current = getHighestTypesCount(count as TypesCount);
                break;
            default:
                current = Number(count);
        }
        if (current > best) best = current;
    }
    return best;
}
234
+
235
/**
 * Return the largest count found in any of the per-key tallies of a record.
 *
 * @param record per-key type tallies
 * @returns the highest count across all keys, or 0 for an empty record
 */
function getHighestRecordCount(record: TypesCountRecord): number {
    let highest = 0;
    for (const typesCount of Object.values(record)) {
        highest = Math.max(highest, getHighestTypesCount(typesCount));
    }
    return highest;
}
240
+
241
/**
 * Pick the type with the highest count in a tally.
 *
 * "map" and "array" entries are scored by the highest count nested inside
 * them. On a tie the entry encountered first wins, and an empty tally
 * falls back to "string".
 *
 * @param typesCount the tally to inspect
 * @returns the winning data type
 */
function getMostProbableType(typesCount: TypesCount): DataType {
    let winner: DataType = "string"; // default
    let winnerScore = -1;
    for (const [type, count] of Object.entries(typesCount)) {
        let score: number;
        switch (type) {
            case "map":
                score = getHighestRecordCount(count as TypesCountRecord);
                break;
            case "array":
                score = getHighestTypesCount(count as TypesCount);
                break;
            default:
                score = Number(count);
        }
        if (score > winnerScore) {
            winnerScore = score;
            winner = type as DataType;
        }
    }
    return winner;
}
260
+
261
/**
 * Build a property definition from the tallies collected for one field.
 *
 * - "map": when `checkTypesCountHighVariability` reports high variability
 *   the result is a `keyValue` map with no child properties; otherwise
 *   child properties are built from the nested tallies.
 * - "array": the element property (`of`) is built recursively from the
 *   nested array tally.
 * - "string" and "reference" are delegated to their dedicated builders;
 *   any other type yields a bare `{ type }` property. These leaf
 *   properties also receive the result of `buildValidation`, if any.
 *
 * @param key field name; a prettified form is used as the display name
 * @param totalDocsCount number of documents that were sampled
 * @param mostProbableType the type chosen for this field
 * @param typesCount type tally for this field
 * @param valuesResult value tally for this field, if available
 * @returns the generated property
 */
function buildPropertyFromCount(
    key: string,
    totalDocsCount: number,
    mostProbableType: DataType,
    typesCount: TypesCount,
    valuesResult?: ValuesCountEntry
): Property {
    const title: string | undefined = key ? prettifyIdentifier(key) : undefined;
    const name = title ?? key ?? "";

    if (mostProbableType === "map") {
        // Return right away: previously this result was overwritten by the
        // regular map result below, so `keyValue` could never be produced.
        if (checkTypesCountHighVariability(typesCount)) {
            return {
                type: "map",
                name,
                keyValue: true,
                properties: {}
            };
        }
        const properties = buildPropertiesFromCount(
            totalDocsCount,
            // No nested tally exists when no map value was ever observed
            (typesCount.map ?? {}) as TypesCountRecord,
            valuesResult ? valuesResult.mapValues : undefined
        );
        return {
            type: "map",
            name,
            properties
        };
    }

    if (mostProbableType === "array") {
        // Same guard as above for arrays that were never observed
        const arrayTypesCount = (typesCount.array ?? {}) as TypesCount;
        const of = buildPropertyFromCount(
            key,
            totalDocsCount,
            getMostProbableType(arrayTypesCount),
            arrayTypesCount,
            valuesResult
        );
        return {
            type: "array",
            name,
            of
        };
    }

    const propertyProps: InferencePropertyBuilderProps = {
        name: key,
        totalDocsCount,
        valuesResult
    };

    let result: Property;
    if (mostProbableType === "string") {
        result = buildStringProperty(propertyProps);
    } else if (mostProbableType === "reference") {
        result = buildReferenceProperty(propertyProps);
    } else {
        result = {
            type: mostProbableType
        } as Property;
    }

    if (title) {
        result.name = title;
    }

    const validation = buildValidation(propertyProps);
    if (validation) {
        result.validation = validation;
    }

    return result;
}
340
+
341
/**
 * Build one property per key of a types record.
 *
 * For each key the most probable type is chosen and the property is
 * generated with `buildPropertyFromCount`, passing the matching value
 * tally when a values record is supplied.
 *
 * @param totalDocsCount number of documents that were sampled
 * @param typesCountRecord per-key type tallies
 * @param valuesCountRecord optional per-key value tallies
 * @returns the generated properties keyed by field name
 */
function buildPropertiesFromCount(
    totalDocsCount: number,
    typesCountRecord: TypesCountRecord,
    valuesCountRecord?: ValuesCountRecord
): Properties {
    const res: Properties = {};
    for (const [key, typesCount] of Object.entries(typesCountRecord)) {
        const valuesResult = valuesCountRecord ? valuesCountRecord[key] : undefined;
        res[key] = buildPropertyFromCount(
            key,
            totalDocsCount,
            getMostProbableType(typesCount),
            typesCount,
            valuesResult
        );
    }
    return res;
}
359
+
360
/**
 * Return the largest numeric count found at any depth of a tally.
 *
 * Object values are searched recursively; every other value is read as a
 * number. An empty tally yields 0.
 *
 * @param typesCount the tally (or nested record) to inspect
 */
function countMaxDocumentsUnder(typesCount: TypesCount) {
    let max = 0;
    for (const value of Object.values(typesCount)) {
        const candidate = typeof value === "object"
            ? countMaxDocumentsUnder(value as TypesCountRecord)
            : Number(value);
        max = Math.max(max, candidate);
    }
    return max;
}
371
+
372
/**
 * Determine the most frequent element type in an array.
 *
 * Each element is tallied with `increaseTypeCount`, and the winner is
 * selected by `getMostProbableType`.
 *
 * @param array the values to inspect
 * @param getType callback mapping a raw value to its data type
 * @returns the most probable element type
 */
function getMostProbableTypeInArray(
    array: any[],
    getType: InferenceTypeBuilder
): DataType {
    const tally = array.reduce((counts: TypesCount, element) => {
        increaseTypeCount(getType(element), counts, element, getType);
        return counts;
    }, {} as TypesCount);
    return getMostProbableType(tally);
}
382
+
383
/**
 * Report whether the keys of a map tally vary widely between documents.
 *
 * A key counts as sparse when its highest count is below a third of the
 * highest count found anywhere in the tally. The result is true when more
 * than half of the map keys are sparse. With no map keys the ratio is
 * 0/0 (NaN), so the comparison is false.
 *
 * @param typesCount tally whose `map` entry is inspected
 */
function checkTypesCountHighVariability(typesCount: TypesCount) {
    const maxCount = countMaxDocumentsUnder(typesCount);
    const mapEntries = Object.entries(typesCount.map ?? {});
    const sparseKeys = mapEntries.filter(
        ([, value]) => countMaxDocumentsUnder(value) < maxCount / 3
    ).length;
    return sparseKeys / mapEntries.length > 0.5;
}
394
+
395
+
396
/**
 * Map a raw JavaScript value to a data type.
 *
 * Numbers and booleans map to their own type, arrays to "array", and any
 * other non-null object to "map". Everything else, including null and
 * undefined, maps to "string".
 *
 * @param value the value to classify
 * @returns the inferred data type
 */
export function inferTypeFromValue(value: any): DataType {
    switch (typeof value) {
        case "number":
            return "number";
        case "boolean":
            return "boolean";
        case "object":
            if (value === null) return "string";
            return Array.isArray(value) ? "array" : "map";
        default:
            // strings, undefined and every remaining primitive kind
            return "string";
    }
}
package/src/index.ts ADDED
@@ -0,0 +1,3 @@
1
+ export * from "./collection_builder";
2
+ export * from "./util";
3
+ export * from "./strings";
package/src/strings.ts ADDED
@@ -0,0 +1,104 @@
1
+ import { ValuesCountEntry } from "./types";
2
+
3
/**
 * Parse a reference string in one of two formats:
 * - Simple: "path/entityId"
 * - With database: "database_name:::path/entityId"
 *
 * The returned `path` is everything before the last slash. `database` is
 * undefined when no database is given or when it is "(default)". Returns
 * null for an empty value or when the path part contains no slash.
 */
export function parseReferenceString(value: string): { path: string; database?: string } | null {
    if (!value) return null;

    const separator = ":::";
    let database: string | undefined;
    let fullPath: string | undefined = value;

    if (value.includes(separator)) {
        // Only the first two segments are considered
        const segments = value.split(separator);
        const dbName = segments[0];
        fullPath = segments[1];
        if (dbName && dbName !== "(default)") {
            database = dbName;
        }
    }

    // A reference must contain at least one slash
    if (!fullPath || !fullPath.includes("/")) return null;

    const lastSlash = fullPath.lastIndexOf("/");
    return {
        path: fullPath.slice(0, lastSlash),
        database
    };
}
35
+
36
/**
 * Check whether a value is a string that `parseReferenceString` accepts.
 */
export function looksLikeReference(value: any): boolean {
    return typeof value === "string" && parseReferenceString(value) !== null;
}
43
+
44
/**
 * Find a collection path shared by most of the observed values.
 *
 * Each value is reduced to a path string: the value itself when it is a
 * string, or its `slug` when it is an object with a non-empty string slug.
 * A "database:::" prefix is stripped. The candidate is the part before the
 * last slash of the first path that contains a slash; it is returned only
 * when more than two thirds of all values have a path starting with it.
 *
 * @param valuesCount value tally for a field
 * @returns the shared path prefix, or undefined when there is none
 */
export function findCommonInitialStringInPath(valuesCount?: ValuesCountEntry) {

    if (!valuesCount) return undefined;

    function getPath(value: any): string | undefined {
        // Array fields can contribute null/undefined elements; they have no path
        if (value === null || value === undefined) return undefined;

        let pathString: string;
        if (typeof value === "string") {
            pathString = value;
        } else if (typeof value.slug === "string" && value.slug) {
            pathString = value.slug;
        } else {
            console.warn("findCommonInitialStringInPath: value is not a string or document with path", value);
            return undefined;
        }

        if (!pathString) return undefined;

        // Parse the new format: database_name:::path/entityId
        // Keep just the path portion for comparison
        if (pathString.includes(":::")) {
            const [, pathPart] = pathString.split(":::");
            pathString = pathPart;
        }

        return pathString;
    }

    // Resolve every value once; unusable values stay as undefined/empty so
    // the threshold below is still measured against all values.
    const paths = valuesCount.values.map((v) => getPath(v));

    const pathWithSlash = paths.find((p) => !!p && p.includes("/"));
    if (!pathWithSlash)
        return undefined;

    const searchedPath = pathWithSlash.substring(0, pathWithSlash.lastIndexOf("/"));

    const matching = paths.filter((p) => !!p && p.startsWith(searchedPath)).length;
    const isCommon = matching > valuesCount.values.length / 3 * 2;

    return isCommon ? searchedPath : undefined;

}
89
+
90
/**
 * Strip one trailing slash and then one leading slash from a string.
 */
export function removeInitialAndTrailingSlashes(s: string): string {
    const withoutTrailing = removeTrailingSlash(s);
    return removeInitialSlash(withoutTrailing);
}
93
+
94
/**
 * Remove a single leading slash, if present.
 */
export function removeInitialSlash(s: string) {
    return s.startsWith("/") ? s.slice(1) : s;
}
99
+
100
/**
 * Remove a single trailing slash, if present.
 */
export function removeTrailingSlash(s: string) {
    return s.endsWith("/") ? s.slice(0, -1) : s;
}