goldenflow 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,360 @@
1
+ /**
2
+ * Core types for GoldenFlow JS — ported from Python dataclasses and Pydantic models.
3
+ * The data-shape interfaces are readonly; MutableManifest is the mutable exception, used while a pipeline runs. Use the make* factory functions for construction.
4
+ */
5
+ /** Dispatch mode of a transform; it selects which call shape the transform's TransformFunction has. */ type TransformMode = "expr" | "series" | "dataframe";
6
+ /** Metadata for a registered transform — the value type of registry() and the result type of getTransform()/listTransforms(). */
7
+ interface TransformInfo {
8
+ readonly name: string;
9
+ readonly func: TransformFunction; // callable whose expected shape depends on `mode`
10
+ readonly inputTypes: readonly string[]; // NOTE(review): presumably the column types this transform accepts — confirm
11
+ readonly autoApply: boolean; // NOTE(review): presumably opts the transform into automatic selection — confirm
12
+ readonly priority: number; // NOTE(review): ordering direction (higher vs. lower first) is not visible here
13
+ readonly mode: TransformMode;
14
+ }
15
+ /**
16
+ * A transform function. Shape depends on mode:
17
+ * - expr: (column: string, ...params: unknown[]) => ExprTransform
18
+ * - series: (values: readonly ColumnValue[], ...params: unknown[]) => ColumnValue[] | [ColumnValue[], number[]]
19
+ * - dataframe: (rows: readonly Row[], column: string) => Row[]
20
+ * The alias itself is untyped (`any` in, `any` out), so none of the shapes above is compiler-checked; `ExprTransform` is not declared in this file. NOTE(review): the meaning of the `number[]` in the series tuple is not visible here. */
21
+ type TransformFunction = (...args: any[]) => any;
22
+ /** A single cell value, as exposed by TabularData.column(). */ type ColumnValue = string | number | boolean | null;
23
+ /** One record: a read-only object keyed by string whose values are of unknown type. */ type Row = Readonly<Record<string, unknown>>;
24
+ /** Column type label, as returned by TabularData.dtype(). */ type Dtype = "string" | "integer" | "float" | "boolean" | "date" | "datetime" | "null";
25
+ /** One entry of Manifest.records: a transform name recorded against a column, together with row counts. */ interface TransformRecord {
26
+ readonly column: string;
27
+ readonly transform: string;
28
+ readonly affectedRows: number;
29
+ readonly totalRows: number;
30
+ readonly sampleBefore: readonly string[]; // NOTE(review): presumably example values before the transform — confirm
31
+ readonly sampleAfter: readonly string[]; // NOTE(review): presumably the same examples after the transform — confirm
32
+ }
33
+ /** Factory for TransformRecord: `column`, `transform`, `affectedRows` and `totalRows` are required; `sampleBefore`/`sampleAfter` may be omitted (the defaults applied are not visible in this declaration). */ declare function makeTransformRecord(input: Pick<TransformRecord, "column" | "transform" | "affectedRows" | "totalRows"> & Partial<Pick<TransformRecord, "sampleBefore" | "sampleAfter">>): TransformRecord;
34
+ /** One entry of Manifest.errors; its four fields match the arguments of MutableManifest.addError(). */ interface TransformError {
35
+ readonly column: string;
36
+ readonly transform: string;
37
+ readonly row: number; // NOTE(review): 0- vs. 1-based indexing is not visible here
38
+ readonly error: string;
39
+ }
40
+ /** Read-only log of a transform run: a source label, the transform records, the errors and a creation timestamp. This is the type of TransformResult.manifest. */ interface Manifest {
41
+ readonly source: string;
42
+ readonly records: readonly TransformRecord[];
43
+ readonly errors: readonly TransformError[];
44
+ readonly createdAt: string; // NOTE(review): string format (e.g. ISO 8601) is not visible here
45
+ }
46
+ /** Factory returning a MutableManifest for the given `source`. */ declare function makeManifest(source: string): MutableManifest;
47
+ /** Mutable manifest used during transform pipeline, then frozen. Implements Manifest, but exposes `records` and `errors` as mutable arrays rather than the `readonly` arrays declared on Manifest. */
48
+ declare class MutableManifest implements Manifest {
49
+ readonly source: string;
50
+ readonly records: TransformRecord[];
51
+ readonly errors: TransformError[];
52
+ readonly createdAt: string;
53
+ constructor(source: string);
54
+ addRecord(record: TransformRecord): void;
55
+ addError(column: string, transform: string, row: number, error: string): void; // parameters mirror the fields of TransformError
56
+ toDict(): Record<string, unknown>; // NOTE(review): presumably a plain-object snapshot for serialization — confirm
57
+ }
58
+ /** Result type of TransformEngine.transformDf() and of StreamProcessor's transformOne()/transformBatch()/streamRows(). */ interface TransformResult {
59
+ readonly rows: readonly Row[];
60
+ readonly columns: readonly string[]; // NOTE(review): presumably the column names present in `rows` — confirm
61
+ readonly manifest: Manifest;
62
+ }
63
+ /** Profile of a single column: element type of DatasetProfile.columns and the input type of selectTransforms() and profileSimilarity(). */ interface ColumnProfile {
64
+ readonly name: string;
65
+ readonly inferredType: string; // typed as plain string, not the Dtype union
66
+ readonly rowCount: number;
67
+ readonly nullCount: number;
68
+ readonly nullPct: number; // NOTE(review): scale (0–1 vs. 0–100) is not visible here
69
+ readonly uniqueCount: number;
70
+ readonly uniquePct: number; // NOTE(review): scale (0–1 vs. 0–100) is not visible here
71
+ readonly sampleValues: readonly string[];
72
+ readonly detectedFormat: string | null;
73
+ }
74
+ /** Factory for ColumnProfile: the name, inferred type and the count/percentage fields are required; `sampleValues` and `detectedFormat` may be omitted (the defaults applied are not visible in this declaration). */ declare function makeColumnProfile(input: Pick<ColumnProfile, "name" | "inferredType" | "rowCount" | "nullCount" | "nullPct" | "uniqueCount" | "uniquePct"> & Partial<Pick<ColumnProfile, "sampleValues" | "detectedFormat">>): ColumnProfile;
75
+ /** Dataset-level profile returned by profileDataframe(): a file-path label, row and column counts, and a list of ColumnProfile entries. */ interface DatasetProfile {
76
+ readonly filePath: string;
77
+ readonly rowCount: number;
78
+ readonly columnCount: number;
79
+ readonly columns: readonly ColumnProfile[];
80
+ }
81
+ /** Per-column detail stored as the values of DiffResult.columnDetails. */ interface ColumnDiffDetail {
82
+ readonly changedRows: number;
83
+ }
84
+ /** Result of diffDataframes() and input of printDiff(): a change total, changed/added/removed column lists, row counts before and after, and per-column details. */ interface DiffResult {
85
+ readonly totalChanges: number;
86
+ readonly changedColumns: readonly string[];
87
+ readonly addedColumns: readonly string[];
88
+ readonly removedColumns: readonly string[];
89
+ readonly rowCountBefore: number;
90
+ readonly rowCountAfter: number;
91
+ readonly columnDetails: Readonly<Record<string, ColumnDiffDetail>>; // NOTE(review): keys are presumably column names — confirm
92
+ }
93
+ /** Element type of GoldenFlowConfig.transforms: a column plus a list of op strings. */ interface TransformSpec {
94
+ readonly column: string;
95
+ readonly ops: readonly string[]; // NOTE(review): presumably transform names in the form parseTransformName() accepts — confirm
96
+ }
97
+ /** Element type of GoldenFlowConfig.splits: one source, a list of targets and a method string. */ interface SplitSpec {
98
+ readonly source: string;
99
+ readonly target: readonly string[];
100
+ readonly method: string; // NOTE(review): the set of accepted method names is not visible here
101
+ }
102
+ /** Element type of GoldenFlowConfig.filters: a column and a condition string. */ interface FilterSpec {
103
+ readonly column: string;
104
+ readonly condition: string; // NOTE(review): the condition syntax is not visible here
105
+ }
106
+ /** Type of GoldenFlowConfig.dedup (which may also be null): a column list and a keep policy. */ interface DedupSpec {
107
+ readonly columns: readonly string[];
108
+ readonly keep: "first" | "last"; // NOTE(review): presumably which of a set of duplicates survives — confirm
109
+ }
110
+ /** Element type of GoldenFlowConfig.mappings: a source, one target or a list of targets, and a transform given as a string, a list of strings, or null. */ interface MappingSpec {
111
+ readonly source: string;
112
+ readonly target: string | readonly string[];
113
+ readonly transform: string | readonly string[] | null;
114
+ }
115
+ /** Full configuration object: held by TransformEngine as `config`, and returned by makeConfig(), validateConfig(), loadConfigFromString(), mergeConfigs(), learnConfig() and SchemaMapper.toConfig(). */ interface GoldenFlowConfig {
116
+ readonly source: string | null;
117
+ readonly output: string | null;
118
+ readonly transforms: readonly TransformSpec[];
119
+ readonly splits: readonly SplitSpec[];
120
+ readonly renames: Readonly<Record<string, string>>; // NOTE(review): presumably old column name → new column name — confirm
121
+ readonly drop: readonly string[];
122
+ readonly filters: readonly FilterSpec[];
123
+ readonly dedup: DedupSpec | null;
124
+ readonly mappings: readonly MappingSpec[];
125
+ }
126
+ /** Factory for GoldenFlowConfig; `input` and each of its fields are optional. The default values filled in are not visible in this declaration. */ declare function makeConfig(input?: Partial<GoldenFlowConfig>): GoldenFlowConfig;
127
+ /** One source/target column pairing with a confidence score; returned by SchemaMapper.map() and accepted by SchemaMapper.toConfig(). */ interface ColumnMapping {
128
+ readonly source: string;
129
+ readonly target: string;
130
+ readonly confidence: number; // NOTE(review): range is presumably comparable to SchemaMapper's thresholds — confirm
131
+ readonly transform: string | null;
132
+ }
133
+ /** A named, described bundle of transform strings plus a default GoldenFlowConfig; the resolved type of loadDomain(). */ interface DomainPack {
134
+ readonly name: string;
135
+ readonly description: string;
136
+ readonly transforms: readonly string[];
137
+ readonly defaultConfig: GoldenFlowConfig;
138
+ }
139
+ /** Summary of a single run: id, source, timestamp, counts, duration, and optional config hash and manifest path. No function or class declared in this file takes or returns it. */ interface RunRecord {
140
+ readonly runId: string;
141
+ readonly source: string;
142
+ readonly timestamp: string;
143
+ readonly rows: number;
144
+ readonly columns: number;
145
+ readonly transformsApplied: number;
146
+ readonly errors: number;
147
+ readonly durationSeconds: number;
148
+ readonly configHash: string | null;
149
+ readonly manifestPath: string | null;
150
+ }
151
+
152
+ /**
153
+ * TabularData — edge-safe Polars replacement.
154
+ * Wraps Record<string, unknown>[] and provides column operations.
155
+ */
156
+
157
+ /** Type guard: a `true` result narrows `v` to `null | undefined`. */ declare function isNullish(v: unknown): v is null | undefined;
158
+ /** Converts an arbitrary value to a ColumnValue. NOTE(review): the coercion rules are not visible in this declaration. */ declare function toColumnValue(v: unknown): ColumnValue;
159
+ /** Wrapper around an array of Row objects that exposes per-column accessors, numeric aggregates and casts; filter(), head() and sample() return a TabularData. */ declare class TabularData {
160
+ private readonly _rows;
161
+ private _columnCache;
162
+ constructor(rows: readonly Row[]);
163
+ get rows(): readonly Row[];
164
+ get columns(): readonly string[];
165
+ get rowCount(): number;
166
+ column(name: string): readonly ColumnValue[];
167
+ /** Raw column access — preserves original values without null coercion.
168
+ * Use for profiling where "N/A" should remain a string, not become null. NOTE(review): declared to return ColumnValue[] although Row values are `unknown` — confirm how raw values outside ColumnValue are represented. */
169
+ rawColumn(name: string): readonly ColumnValue[];
170
+ nullCount(col: string): number;
171
+ dropNulls(col: string): ColumnValue[];
172
+ dtype(col: string): Dtype;
173
+ nUnique(col: string): number;
174
+ valueCounts(col: string): Map<ColumnValue, number>;
175
+ /** Implementation note: MUST use a loop — spreading a large array into Math.min(...array) can exceed the engine's argument limit (reported here as crashing on >65K elements). NOTE(review): when `null` is returned is not visible here. */
176
+ min(col: string): number | null;
177
+ /** Implementation note: MUST use a loop — spreading a large array into Math.max(...array) can exceed the engine's argument limit (reported here as crashing on >65K elements). NOTE(review): when `null` is returned is not visible here. */
178
+ max(col: string): number | null;
179
+ mean(col: string): number | null;
180
+ std(col: string): number | null; // NOTE(review): sample vs. population standard deviation is not visible here
181
+ filter(predicate: (row: Row) => boolean): TabularData;
182
+ head(n: number): TabularData;
183
+ sample(n: number, seed?: number): TabularData; // NOTE(review): `seed` presumably makes the sample reproducible — confirm
184
+ strContains(col: string, pattern: RegExp): boolean[];
185
+ strLengths(col: string): number[];
186
+ castFloat(col: string): (number | null)[];
187
+ castInt(col: string): (number | null)[];
188
+ numericValues(col: string): number[];
189
+ stringValues(col: string): string[];
190
+ sortedNumeric(col: string): number[];
191
+ isSorted(col: string, descending?: boolean): boolean;
192
+ }
193
+
194
+ /**
195
+ * Transform registry — TS equivalent of goldenflow/transforms/__init__.py.
196
+ * Transforms self-register via registerTransform().
197
+ */
198
+
199
+ /** Options accepted by registerTransform(). `name` and `inputTypes` are required; `autoApply`, `priority` and `mode` are optional (their defaults are not visible here). The field names and types match those of TransformInfo. */ interface RegisterOptions {
200
+ name: string;
201
+ inputTypes: readonly string[];
202
+ autoApply?: boolean;
203
+ priority?: number;
204
+ mode?: TransformMode;
205
+ }
206
+ /** Registers `func` as a transform described by `opts`. NOTE(review): behaviour when `opts.name` is already registered is not visible here. */ declare function registerTransform(opts: RegisterOptions, func: TransformFunction): void;
207
+ /** Looks up a transform by name. NOTE(review): `undefined` presumably means no transform is registered under `name` — confirm. */ declare function getTransform(name: string): TransformInfo | undefined;
208
+ /** Returns TransformInfo entries as an array. NOTE(review): ordering is not visible here. */ declare function listTransforms(): TransformInfo[];
209
+ /** Parses a raw transform string into a [string, string[]] pair. NOTE(review): presumably [transform name, parameter strings]; the accepted syntax is not visible here. */ declare function parseTransformName(raw: string): [string, string[]];
210
+ /** Returns a read-only map from string keys to TransformInfo. */ declare function registry(): ReadonlyMap<string, TransformInfo>;
211
+
212
+ /**
213
+ * TransformEngine — the main orchestrator.
214
+ * Dispatches transforms by mode (expr/series/dataframe) on Row[] data.
215
+ */
216
+
217
+ /** Engine class: constructed from an optional partial config, exposes a complete GoldenFlowConfig as `config`, and turns rows into a TransformResult via transformDf(). */ declare class TransformEngine {
218
+ readonly config: GoldenFlowConfig;
219
+ constructor(config?: Partial<GoldenFlowConfig>);
220
+ transformDf(rows: readonly Row[], source?: string): TransformResult; // NOTE(review): `source` is presumably the label recorded as Manifest.source — confirm
221
+ private _applyConfigTransforms;
222
+ private _applyAutoTransforms;
223
+ private _applySingleTransform;
224
+ private _applyFilter;
225
+ }
226
+
227
+ /**
228
+ * Profiler bridge — infer column types from data using regex heuristics.
229
+ * Edge-safe (no Node dependencies).
230
+ */
231
+
232
+ /** Profiles `rows` and returns a DatasetProfile; `filePath` is optional (its default is not visible here). */ declare function profileDataframe(rows: readonly Row[], filePath?: string): DatasetProfile;
233
+
234
+ /**
235
+ * Selector — pick auto-applicable transforms for a column based on its profile.
236
+ */
237
+
238
+ /** GoldenCheck finding check → transform name mapping. Typed as a read-only record from a string key to a read-only list of strings. */
239
+ declare const FINDING_TRANSFORM_MAP: Readonly<Record<string, readonly string[]>>;
240
+ /** Returns the TransformInfo entries selected for a column `profile`. NOTE(review): the underscore-prefixed `_confidenceThreshold` suggests the parameter is currently unused — confirm. */ declare function selectTransforms(profile: ColumnProfile, _confidenceThreshold?: number): TransformInfo[];
241
+ /** Derives lists of strings, grouped under string keys, from a list of finding records. NOTE(review): presumably column name → transform names, and the expected shape of a finding is not visible here — confirm. */ declare function selectFromFindings(findings: readonly Record<string, unknown>[]): Record<string, string[]>;
242
+
243
+ /**
244
+ * Differ — compare two row arrays and report differences.
245
+ */
246
+
247
+ /** Compares the `before` and `after` row arrays and returns the differences as a DiffResult. */ declare function diffDataframes(before: readonly Row[], after: readonly Row[]): DiffResult;
248
+
249
+ /**
250
+ * StreamProcessor — incremental transform processing.
251
+ */
252
+
253
+ /** Incremental processor built from an optional partial config: transforms a single record, a batch, or a row array chunk by chunk, and exposes a count of batches processed. */ declare class StreamProcessor {
254
+ private readonly engine;
255
+ private _batchCount;
256
+ constructor(config?: Partial<GoldenFlowConfig>);
257
+ /** Transform a single record. */
258
+ transformOne(record: Record<string, unknown>): TransformResult;
259
+ /** Transform a batch of rows. */
260
+ transformBatch(rows: readonly Row[]): TransformResult;
261
+ /** Process rows in chunks, yielding TransformResult per chunk. `chunkSize` is optional; its default is not visible here. */
262
+ streamRows(rows: readonly Row[], chunkSize?: number): Generator<TransformResult>;
263
+ get batchesProcessed(): number; // NOTE(review): which of the methods above increment this count is not visible here
264
+ }
265
+
266
+ /**
267
+ * Config schema — re-exports types and provides runtime validation.
268
+ */
269
+
270
+ /** Basic runtime validation (not as strict as Pydantic, but catches common errors). Takes an untyped record and returns a GoldenFlowConfig. NOTE(review): how invalid input is reported (throw vs. fallback) is not visible here. */
271
+ declare function validateConfig(raw: Record<string, unknown>): GoldenFlowConfig;
272
+
273
+ /**
274
+ * Config loader — YAML load/save/merge.
275
+ * YAML is an optional peer dependency.
276
+ */
277
+
278
+ /** Builds a GoldenFlowConfig from config text (YAML, per this section's header comment). */ declare function loadConfigFromString(content: string): GoldenFlowConfig;
279
+ /** Serializes a GoldenFlowConfig to a string (YAML, per this section's header comment). */ declare function saveConfigToString(config: GoldenFlowConfig): string;
280
+ /** Combines a file-loaded config with partial CLI overrides into one GoldenFlowConfig. NOTE(review): overrides presumably take precedence — confirm. */ declare function mergeConfigs(fileConfig: GoldenFlowConfig, cliOverrides: Partial<GoldenFlowConfig>): GoldenFlowConfig;
281
+
282
+ /**
283
+ * Config learner — generate a config from data profiles.
284
+ */
285
+
286
+ /** Generates a GoldenFlowConfig from `rows`; `source` is optional. */ declare function learnConfig(rows: readonly Row[], source?: string): GoldenFlowConfig;
287
+
288
+ /**
289
+ * Name similarity — fuzzy column name matching with alias support.
290
+ */
291
+ /** Returns a numeric similarity score for two column names. NOTE(review): the score range is not visible here. */ declare function nameSimilarity(source: string, target: string): number;
292
+
293
+ /**
294
+ * Profile similarity — score how similar two column profiles are.
295
+ */
296
+
297
+ /** Returns a numeric similarity score for two column profiles. NOTE(review): the score range is not visible here. */ declare function profileSimilarity(source: ColumnProfile, target: ColumnProfile): number;
298
+
299
+ /**
300
+ * SchemaMapper — auto-map source columns to target columns.
301
+ */
302
+
303
+ /** Maps source columns to target columns from two row arrays and can turn the resulting mappings into a GoldenFlowConfig. Holds two read-only thresholds set through optional constructor arguments (defaults not visible here). */ declare class SchemaMapper {
304
+ readonly autoThreshold: number;
305
+ readonly suggestThreshold: number;
306
+ constructor(autoThreshold?: number, suggestThreshold?: number);
307
+ map(sourceRows: readonly Row[], targetRows: readonly Row[]): ColumnMapping[];
308
+ toConfig(mappings: readonly ColumnMapping[]): GoldenFlowConfig;
309
+ }
310
+
311
+ /** Asynchronously resolves a DomainPack by name, or null. NOTE(review): null presumably means no pack exists under `name` — confirm. */ declare function loadDomain(name: string): Promise<DomainPack | null>;
312
+ /** Returns a list of strings. NOTE(review): presumably the names accepted by loadDomain() — confirm. */ declare function listDomains(): string[];
313
+
314
+ /**
315
+ * JSON reporter — serialize manifest to JSON.
316
+ */
317
+
318
+ /** Serializes `manifest` to a JSON string. */ declare function manifestToJson(manifest: Manifest): string;
319
+
320
+ /** Prints a DatasetProfile; returns nothing. NOTE(review): the output destination (e.g. console) is not visible here. */ declare function printProfile(profile: DatasetProfile): void;
321
+ /** Prints a Manifest; returns nothing. NOTE(review): the output destination (e.g. console) is not visible here. */ declare function printManifest(manifest: Manifest): void;
322
+ /** Prints a DiffResult; returns nothing. NOTE(review): the output destination (e.g. console) is not visible here. */ declare function printDiff(diff: DiffResult): void;
323
+
324
+ /**
325
+ * LLM-assisted categorical correction -- edge-safe (raw fetch, no SDK imports).
326
+ *
327
+ * Registers `category_llm_correct` as a passthrough transform. The actual LLM
328
+ * interaction happens in the async helpers `prepareLlmCorrections()` and
329
+ * `applyLlmCorrections()`, which the engine or CLI should call explicitly before/after the sync transform pipeline.
330
+ */
331
+
332
+ /**
333
+ * Fetch LLM-based corrections for a column's values and cache them.
334
+ *
335
+ * Usage:
336
+ * ```ts
337
+ * await prepareLlmCorrections("status", rows.map(r => r.status));
338
+ * // ... then run the sync transform pipeline which includes category_llm_correct
339
+ * ```
340
+ * NOTE(review): if `rows` is typed as Row[], `r.status` is `unknown` and must be converted to ColumnValue (e.g. with toColumnValue) before this call type-checks.
341
+ * Returns the corrections map (also cached internally). NOTE(review): keys are presumably original values and values their corrections — confirm.
342
+ */
343
+ declare function prepareLlmCorrections(columnName: string, values: readonly ColumnValue[]): Promise<Record<string, string>>;
344
+ /**
345
+ * High-level async helper: fetch corrections then apply them to `values`.
346
+ *
347
+ * Returns corrected values array (same length as input). Skips the LLM call
348
+ * if corrections are already cached for `columnName`. NOTE(review): how values without a correction, and non-string values, are handled is not visible here.
349
+ */
350
+ declare function applyLlmCorrections(columnName: string, values: readonly ColumnValue[]): Promise<ColumnValue[]>;
351
+
352
+ /**
353
+ * Notebook HTML rendering — generates HTML tables for TransformResult, Manifest, DatasetProfile.
354
+ */
355
+
356
+ /** Renders a TransformResult as an HTML string. NOTE(review): whether cell values are HTML-escaped is not visible here. */ declare function transformResultToHtml(result: TransformResult): string;
357
+ /** Renders a Manifest as an HTML string. NOTE(review): whether values are HTML-escaped is not visible here. */ declare function manifestToHtml(manifest: Manifest): string;
358
+ /** Renders a DatasetProfile as an HTML string. NOTE(review): whether values are HTML-escaped is not visible here. */ declare function profileToHtml(profile: DatasetProfile): string;
359
+
360
+ export { type ColumnDiffDetail, type ColumnMapping, type ColumnProfile, type ColumnValue, type DatasetProfile, type DedupSpec, type DiffResult, type DomainPack, type Dtype, FINDING_TRANSFORM_MAP, type FilterSpec, type GoldenFlowConfig, type Manifest, type MappingSpec, MutableManifest, type Row, type RunRecord, SchemaMapper, type SplitSpec, StreamProcessor, TabularData, TransformEngine, type TransformError, type TransformFunction, type TransformInfo, type TransformMode, type TransformRecord, type TransformResult, type TransformSpec, applyLlmCorrections, diffDataframes, getTransform, isNullish, learnConfig, listDomains, listTransforms, loadConfigFromString, loadDomain, makeColumnProfile, makeConfig, makeManifest, makeTransformRecord, manifestToHtml, manifestToJson, mergeConfigs, nameSimilarity, parseTransformName, prepareLlmCorrections, printDiff, printManifest, printProfile, profileDataframe, profileSimilarity, profileToHtml, registerTransform, registry, saveConfigToString, selectFromFindings, selectTransforms, toColumnValue, transformResultToHtml, validateConfig };